subtree(web/planet-mars): import project from previous upstream

This project is moving into the depot. Upstream is
github/thkoch2001/planet-mars.

This commit does not yet add a Nix build, only imports the code and matches
formatting requirements.

The import has been josh-filtered, which will allow us to continue publishing
the history to the previous repo.

Change-Id: I9cb184b5af3f74a0b4079bac499b4db039b7939b
This commit is contained in:
Vincent Ambo 2025-01-13 11:46:08 +03:00
commit 0a83e6567b
13 changed files with 2869 additions and 0 deletions

2
web/planet-mars/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
/mars.toml

2085
web/planet-mars/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,25 @@
[package]
name = "planet-mars"
version = "0.1.1"
edition = "2021"
authors = ["Thomas Koch <thomas@koch.ro>"]
description = "Feed aggregation planet like Planet Venus, produces static HTML and ATOM feed from fetched feeds."
homepage = "https://github.com/thkoch2001/planet-mars"
license = "AGPL-3.0-or-later"
keywords = ["atom", "rss", "planet", "feed", "blogging"]
categories = ["web-programming"]
[dependencies]
anyhow = "1"
clap = { version = "4", features = ["derive"] }
env_logger = "0"
feed-rs = "2"
log = "0"
ron = "0"
serde = { version = "1", features = ["derive"] }
slug = "0"
tera = "1"
toml = "0"
ureq = { version = "3.0.0-rc5", features = ["brotli", "charset", "gzip", "native-tls"]}
url = "2"

1
web/planet-mars/OWNERS Normal file
View file

@ -0,0 +1 @@
thk

28
web/planet-mars/README.md Normal file
View file

@ -0,0 +1,28 @@
Simple successor to Planet Venus but in Rust and maintained.
Please see the rustdoc of main.rs for further information.
## Todo
* find and use a nice lib to process the config file
* should check whether dirs exist and are writable
* should check whether feed urls can be parsed
## Planet Venus
Planet Venus is used by many planets on the internet. However its code has not
been maintained since ~2011 and it uses Python 2.
Planet Mars should be a lightweight successor to Planet Venus.
Still the Planet Venus documentation contains some useful information on
[Etiquette](https://intertwingly.net/code/venus/docs/etiquette.html) for
Planet hosters.
## Credits
While writing this, I read and also copied code from:
* [agro](https://docs.rs/crate/agro/0.1.1)
* [hades](https://github.com/kitallis/hades)
* [planetrs](https://github.com/djc/planetrs)

View file

@ -0,0 +1,8 @@
# Example configuration for planet-mars.
# Every top-level key below is required: the program's Config struct declares
# no defaults, so leaving one out makes the config file fail to parse.

# Name announced as the first part of the User-Agent header when fetching.
bot_name = "planet-mars"
# Existing directory in which fetched feeds and their caching metadata are kept.
feed_dir = "/var/lib/planet-mars/feeds"
# Contact address sent in the FROM header of every request.
from = "thomas@koch.ro"
# Maximum number of entries included in the generated planet.
max_entries = 50
# Existing directory into which the rendered templates are written.
out_dir = "/var/lib/planet-mars/out"
# Existing directory whose files are all rendered as templates.
templates_dir = "/var/lib/planet-mars/templates"

# One [[feeds]] table per aggregated feed.
[[feeds]]
url = "https://blog.fefe.de/rss.xml"

View file

@ -0,0 +1,96 @@
/* Base rules that apply at every viewport width. */

/* Cap the width of running text and headings. */
p, h1, h2, h3, h4, h5, h6, small {
max-width: 48em;
}
/* Links inside headings inherit the heading color and are not underlined. */
h1 a, h2 a, h3 a, h4 a, h5 a, h6 a {
color: inherit !important;
text-decoration: none;
}
ul, ol {
/* account for the 1em -webkit-margin-start for the list icon */
max-width: 45em;
}
/* Tight vertical rhythm for lists and paragraphs. */
ul,ol,dl, p {
margin-top: 0.3em;
margin-bottom: 0.3em;
line-height: 1.2;
}
/* Indentation of list items. */
ul, ol {
padding-inline-start: 1.5em;
}
/* Page wrapper: centered horizontally, limited to 80em. */
#bodydiv {
margin: auto;
max-width: 80em;
}
/* Images in the sidebar are limited to 10em in width. */
#maincontainer aside img {
max-width: 10em;
}
/* Quotes in entries: no left indent, and a 10px gray bar drawn to the right
   of the box via an offset box-shadow (margin-right leaves room for it). */
#maincontainer main blockquote {
margin-left: 0;
margin-right: 10px;
box-shadow: 10px 0px 0px 0px #C4C4C4;
}
/* Quotes and code blocks are shown as light gray block-level boxes. */
blockquote, pre code {
padding: 0.5ex 0;
display: block;
background-color: #EEE;
}
/* Nothing inside the main column may be wider than the column itself. */
#maincontainer main * {
max-width: 100%;
}
/* Wide preformatted text scrolls horizontally inside its box. */
#maincontainer main pre {
overflow-x: auto;
}
/* Space between the entry meta line and the entry body. */
.entry_meta {
margin-bottom: 1em;
}
/* Rules for screens at least 1024px wide: main column and sidebar side by side.
   NOTE(review): as the braces are written, every rule down to the final closing
   brace is part of this media query, including the entry header, meta and
   separator styling. Confirm that those are meant to apply to wide screens only. */
@media only screen and (min-width: 1024px) {
/* Lay out main column and sidebar in a row. */
#maincontainer {
display: flex;
}
/* Main column takes five flex shares, capped at 50em. */
#maincontainer main {
max-width: 50em;
flex: 5;
}
/* Sidebar takes one flex share, capped at 15em, with a 5em gap to the left. */
#maincontainer aside {
margin-left: 5em;
max-width: 15em;
flex: 1;
}
/* Sidebar images become centered blocks. */
#maincontainer aside img {
margin: auto;
display: block;
}
/* Small gap between an entry title and its meta line. */
article > h2.entry_header {
margin-bottom: 3px;
}
/* The meta line gets a light blue background bar. */
.entry_meta {
padding: 3px 0;
background-color: LightBlue;
}
/* The separator between entries has no rule line ... */
hr.entry_sep {
border: none;
}
/* ... and shows a centered "* * *" instead. */
hr.entry_sep::before {
content: '* * *';
display: block;
text-align: center;
}
}

View file

@ -0,0 +1,185 @@
use anyhow::bail;
use anyhow::Result;
use feed_rs::model::Entry;
use feed_rs::model::Feed;
use ron::ser::{to_string_pretty, PrettyConfig};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::convert::AsRef;
use std::fs;
use std::io::BufReader;
use std::path::PathBuf;
use ureq::http::HeaderMap;
use ureq::http::Response;
use ureq::Body;
use url::Url;
/// HTTP cache validators remembered for one feed between runs.
///
/// `FeedStore::store` fills both fields from response headers and writes the
/// struct as TOML; `FeedStore::load_fetchdata` reads it back, falling back to
/// the `Default` (both fields empty) when nothing has been stored yet. An
/// empty string means "no value recorded".
#[derive(Deserialize, Serialize, Default)]
pub struct FetchData {
/// Entity tag header value; the fetcher sends it back as `If-None-Match`.
pub etag: String,
/// Last modification header value; the fetcher sends it back as
/// `If-Modified-Since`.
pub last_modified: String,
}
/// File-based storage for fetched feeds.
///
/// All files of a feed live directly in `dir`; their names are derived from
/// the feed URL (see `slugify_url`).
pub struct FeedStore {
/// Directory that holds the stored feeds; validated by `FeedStore::new`.
pub dir: PathBuf,
}
impl FeedStore {
pub fn new(dir: &str) -> Self {
Self {
dir: super::to_checked_pathbuf(dir),
}
}
fn slugify_url(url: &Url) -> Result<String> {
let Some(domain) = url.domain() else {
bail!("Url has no domain: '{url}'.")
};
let query = url.query().unwrap_or("");
Ok(slug::slugify(format!("{domain}{}{query}", url.path())))
}
fn generic_path(&self, url: &Url, ext: &str) -> Result<String> {
Ok(format!(
"{}/{}{ext}",
self.dir.display(),
Self::slugify_url(url)?
))
}
fn feed_path(&self, url: &Url) -> Result<String> {
self.generic_path(url, "")
}
fn fetchdata_path(&self, url: &Url) -> Result<String> {
self.generic_path(url, ".toml")
}
pub fn load_fetchdata(&self, url: &Url) -> Result<FetchData> {
let path = self.fetchdata_path(url)?;
if !fs::exists(path.clone())? {
return Ok(FetchData::default());
}
Ok(toml::from_str(&fs::read_to_string(path)?)?)
}
fn has_changed(&self, url: &Url, new_feed: &Feed) -> Result<bool> {
let Some(old_feed) = self.load_feed(url, false)? else {
return Ok(true);
};
let mut old_iter = old_feed.entries.iter();
for new in &new_feed.entries {
let Some(old) = old_iter.next() else {
return Ok(true);
};
if old != new {
return Ok(true);
}
}
// ignoring any entries left in old_iter
Ok(false)
}
fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>(
path: P,
contents: C,
) -> std::io::Result<()> {
if fs::exists(&path)? {
fs::rename(&path, format!("{path}.backup"))?;
}
fs::write(path, contents)
}
pub fn store(&self, url: &Url, mut response: Response<Body>) -> Result<bool> {
let headers = response.headers();
let fetchdata = FetchData {
etag: hv(headers, "etag"),
last_modified: hv(headers, "last_modified"),
};
let body = response.body_mut().with_config().read_to_vec()?;
let feed = match feed_rs::parser::parse(body.as_slice()) {
Ok(f) => f,
Err(e) => {
warn!("Error when parsing feed for {url}: {e:?}");
return Ok(false);
}
};
if !self.has_changed(url, &feed)? {
return Ok(false);
}
debug!("Storing feed for {url}.");
// todo don't serialize to string but to writer
Self::write(
self.generic_path(url, ".ron")?,
to_string_pretty(&feed, PrettyConfig::default())?,
)?;
Self::write(self.feed_path(url)?, body)?;
Self::write(self.fetchdata_path(url)?, toml::to_string(&fetchdata)?)?;
Ok(true)
}
fn load_feed(&self, url: &Url, sanitize: bool) -> Result<Option<Feed>> {
let parser = feed_rs::parser::Builder::new()
.sanitize_content(sanitize)
.build();
let path = self.feed_path(url)?;
if !fs::exists(path.clone())? {
return Ok(None);
}
let file = fs::File::open(path)?;
Ok(Some(parser.parse(BufReader::new(file))?))
}
pub fn collect(
&self,
feed_configs: &Vec<super::FeedConfig>,
max_entries: usize,
) -> (HashMap<String, Feed>, Vec<Entry>) {
let mut feeds = HashMap::new();
let mut entries = Vec::new();
for feed_config in feed_configs {
let mut feed = match (|| {
let url = Url::parse(&feed_config.url)?;
self.load_feed(&url, true)
})() {
Err(e) => {
warn!(
"Problem parsing feed file for feed {}: {e:?}",
feed_config.url
);
continue;
}
Ok(None) => continue,
Ok(Some(f)) => f,
};
for entry in &mut feed.entries {
entry.source = Some(feed_config.url.clone());
}
entries.append(&mut std::mem::take(&mut feed.entries));
feeds.insert(feed_config.url.clone(), feed);
// optimization to reduce memory usage
if entries.len() > 4 * max_entries {
entries = trim_entries(entries, max_entries);
}
}
(feeds, trim_entries(entries, max_entries))
}
}
/// Orders `entries` newest first and keeps at most `max_entries` of them.
///
/// An entry is dated by its `updated` timestamp, falling back to `published`
/// and finally to the timestamp type's default value.
fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {
    let timestamp = |entry: &Entry| entry.updated.or(entry.published).unwrap_or_default();
    // Comparing `b` against `a` yields descending order; the sort is stable.
    entries.sort_by(|a, b| timestamp(b).cmp(&timestamp(a)));
    entries.truncate(max_entries);
    entries
}
/// Returns the value of header `key` as an owned string.
///
/// Yields an empty string when the header is absent or when its value cannot
/// be represented as a string.
fn hv(headers: &HeaderMap, key: &str) -> String {
    headers
        .get(key)
        .map(|value| value.to_str().unwrap_or_default().to_string())
        .unwrap_or_default()
}

View file

@ -0,0 +1,76 @@
use anyhow::Result;
use std::time::Instant;
use ureq::tls::{TlsConfig, TlsProvider};
use ureq::Agent;
use url::Url;
use crate::FeedStore;
/// Downloads feeds over HTTP and hands the responses to a `FeedStore`.
pub struct Fetcher {
// HTTP agent built once in `Fetcher::new` and reused for every request.
agent: Agent,
/// FROM header for requests
from: String,
}
impl Fetcher {
pub fn new(bot_name: &str, from: &str) -> Fetcher {
let ua_name = format!(
"{bot_name}/{} {} software: {}",
env!("CARGO_PKG_VERSION"),
env!("CARGO_PKG_HOMEPAGE"),
env!("CARGO_PKG_NAME")
);
info!("useragent: {ua_name}");
let agent = Agent::config_builder()
.http_status_as_error(false)
.user_agent(ua_name)
.tls_config(
TlsConfig::builder()
.provider(TlsProvider::NativeTls)
.build(),
)
.build()
.into();
Fetcher {
agent,
from: from.to_string(),
}
}
pub fn fetch(&self, url: Url, feed_store: &FeedStore) -> Result<bool> {
let fetchdata = feed_store.load_fetchdata(&url)?;
let mut builder = self
.agent
.get(url.to_string())
.header("FROM", self.from.clone());
if !fetchdata.etag.is_empty() {
builder = builder.header("If-None-Match", fetchdata.etag);
}
if !fetchdata.last_modified.is_empty() {
builder = builder.header("If-Modified-Since", fetchdata.last_modified);
}
let start_instant = Instant::now();
let result = builder.call();
let duration = start_instant.elapsed();
let response = result?;
debug!(
"fetched with status {} in {} ms: {url}",
response.status(),
duration.as_millis()
);
let status = response.status();
match status.as_u16() {
304 => Ok(false), // Not Modified -> nothing to do
200 => feed_store.store(&url, response),
_ => {
warn!(
"HTTP Status {} not implemented for {url}",
response.status()
);
Ok(false)
}
}
}
}

136
web/planet-mars/src/main.rs Normal file
View file

@ -0,0 +1,136 @@
//! Planet software to aggregate many feeds into one
//!
//! Input feeds are defined in a toml config file given as cmdline
//! argument. See the [`Config`] struct and the mars.toml.example file.
//!
//! The program iterates over all [feed urls], fetches them, stores them in
//! [feed_dir] and only rebuilds when at least one feed has updates. The
//! fetcher implements HTTP ETag and Last-Modified caching.
//!
//! During rebuild, all files in [templates_dir] are processed and written to
//! [out_dir].
//!
//! The software is meant to be run periodically, for example every 15 minutes.
//!
//! Use a reserved (sub)domain to publish the planet! Although this software
//! tries to sanitize input feeds, there could still be bugs that open the
//! planet's domain to cross-site attacks.
//!
//! [templates_dir]: Config#structfield.templates_dir
//! [feed_dir]: Config#structfield.feed_dir
//! [out_dir]: Config#structfield.out_dir
//! [feed urls]: Config#structfield.feeds
#[macro_use]
extern crate log;
use crate::feed_store::FeedStore;
use crate::fetcher::Fetcher;
use anyhow::Result;
use clap::Parser;
use serde::Deserialize;
use std::fs;
use std::path::PathBuf;
use url::Url;
//mod atom_serializer;
mod feed_store;
mod fetcher;
mod template_engine;
// Command line arguments.
//
// Plain `//` comments are used on purpose for everything added here: clap's
// derive macro turns `///` doc comments into help text, so adding one would
// change the program's `--help` output.
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Args {
/// config file in toml format
#[arg(
short,
long,
default_value_t = String::from("mars.toml")
)]
config: String,
// When set, `main` skips fetching and always rebuilds the output from the
// feeds that are already stored.
#[arg(long, default_value_t = false)]
no_fetch: bool,
}
/// Config to be parsed from the toml file given as cmdline option
///
/// No field declares a serde default, so every key has to be present in the
/// file.
#[derive(Deserialize)]
struct Config {
/// to be used as part of the fetcher's User-Agent header
bot_name: String,
/// where to store downloaded feeds and their metadata
feed_dir: String,
/// feeds to be aggregated
feeds: Vec<FeedConfig>,
/// Email address to use for the FROM header when fetching feeds
from: String,
/// where to build the output files
out_dir: String,
/// templates folder
templates_dir: String,
/// How many feed entries should be included in the planet
max_entries: usize,
}
/// Converts `dir` into a `PathBuf` after verifying that it names a directory.
///
/// # Panics
///
/// Panics if the metadata of `dir` cannot be read (for instance because the
/// path does not exist) or if the path is not a directory.
pub fn to_checked_pathbuf(dir: &str) -> PathBuf {
    let path = PathBuf::from(dir);
    match path.metadata() {
        Ok(meta) => assert!(meta.is_dir(), "Not a dir: {}", path.display()),
        Err(_) => panic!("Could not get metadata of dir: {}", path.display()),
    }
    path
}
/// Config for one individual input feed
///
/// This is a separate struct in case one wants to configure additional
/// information in the future.
#[derive(Deserialize)]
struct FeedConfig {
/// URL of an Atom, RSS or JSON feed. The feed store also uses this string
/// as the key of the feed and as the `source` of each of its entries.
url: String,
}
fn fetch(config: &Config, feed_store: &FeedStore) -> Result<bool> {
let fetcher = Fetcher::new(&config.bot_name, &config.from);
let mut rebuild = false;
for feed in &config.feeds {
let url = match Url::parse(&feed.url) {
Ok(x) => x,
Err(e) => {
error!("Error parsing url '{}': {e:?}", feed.url);
continue;
}
};
rebuild |= fetcher.fetch(url, feed_store)?;
}
info!("Done fetching. Rebuild needed: {rebuild}");
Ok(rebuild)
}
fn main() -> Result<()> {
env_logger::init();
info!("starting up");
let args = Args::parse();
let config_path = &args.config;
if !fs::exists(config_path)? {
panic!("Configuration file {config_path} does not exist!");
}
let config: Config = toml::from_str(&fs::read_to_string(config_path)?)?;
// only check here to avoid fetching with broken config
// todo: get a config lib that provides validation!
let _ = to_checked_pathbuf(&config.templates_dir);
let _ = to_checked_pathbuf(&config.out_dir);
let feed_store = FeedStore::new(&config.feed_dir);
let should_build = if args.no_fetch {
true
} else {
fetch(&config, &feed_store)?
};
if should_build {
template_engine::build(&config, &feed_store)?;
}
Ok(())
}

View file

@ -0,0 +1,90 @@
use crate::feed_store::FeedStore;
use crate::to_checked_pathbuf;
use crate::Config;
use anyhow::Result;
use feed_rs::model::Feed;
use std::collections::HashMap;
use std::fs::File;
use tera::{from_value, Tera};
/// Renders every template found in the configured templates directory into a
/// file of the same name in the output directory.
///
/// Templates get the variables `feeds`, `entries`, `PKG_AUTHORS`,
/// `PKG_HOMEPAGE`, `PKG_NAME` and `PKG_VERSION`, plus the function
/// `get_author`.
///
/// # Errors
///
/// Fails if the templates cannot be loaded, an output file cannot be created
/// or a template fails to render.
pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
    let mut engine = create_tera(&config.templates_dir)?;
    let target_dir = to_checked_pathbuf(&config.out_dir);

    let collected: (HashMap<String, Feed>, _) =
        feed_store.collect(&config.feeds, config.max_entries);
    let (feeds, entries) = collected;

    let mut context = tera::Context::new();
    context.insert("feeds", &feeds);
    context.insert("entries", &entries);
    let package_info = [
        ("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS")),
        ("PKG_HOMEPAGE", env!("CARGO_PKG_HOMEPAGE")),
        ("PKG_NAME", env!("CARGO_PKG_NAME")),
        ("PKG_VERSION", env!("CARGO_PKG_VERSION")),
    ];
    for (key, value) in package_info {
        context.insert(key, value);
    }

    // The function takes ownership of the feeds, so it is registered only
    // after the feeds have been copied into the context above.
    engine.register_function("get_author", GetAuthorFunction { feeds });

    for name in engine.get_template_names() {
        debug!("Processing template {name}");
        let target = format!("{}/{name}", target_dir.display());
        let file = File::create(target)?;
        engine.render_to(name, &context, file)?;
    }
    Ok(())
}
fn create_tera(templates_dir: &str) -> Result<Tera> {
let dir = to_checked_pathbuf(templates_dir);
let mut tera = tera::Tera::new(&format!("{}/*", &dir.display()))?;
// disable autoescape as this would corrupt urls or the entriy contents. todo check this!
tera.autoescape_on(vec![]);
Ok(tera)
}
/// Template function `get_author(entry=...)`: yields the author names of an
/// entry as one comma-separated string, falling back to the authors of the
/// entry's feed when the entry itself names no usable author.
struct GetAuthorFunction {
/// Feeds keyed by feed URL; consulted for the fallback lookup.
feeds: HashMap<String, Feed>,
}
impl tera::Function for GetAuthorFunction {
fn call(&self, args: &HashMap<String, tera::Value>) -> Result<tera::Value, tera::Error> {
let entry_val: tera::Map<_, _> = match args.get("entry") {
None => {
return Err(tera::Error::msg(
"No argument of name 'entry' given to function.",
))
}
Some(val) => from_value(val.clone())?,
};
let feed_url: String = from_value(entry_val.get("source").unwrap().clone())?;
let authors_val: Vec<tera::Map<_, _>> =
from_value(entry_val.get("authors").unwrap().clone())?;
let mut authors: Vec<String> = Vec::new();
for author_val in authors_val {
let name: String = from_value(author_val.get("name").unwrap().clone())?;
if is_valid_name(&name) {
authors.push(name.clone());
}
}
if authors.is_empty() {
authors.append(&mut self.find_authors_from_feed(&feed_url));
}
Ok(tera::Value::String(authors.join(", ")))
}
}
impl GetAuthorFunction {
fn find_authors_from_feed(&self, feed_url: &str) -> Vec<String> {
let feed = self.feeds.get(feed_url).unwrap();
feed.authors
.clone()
.into_iter()
.map(|x| x.name)
.filter(is_valid_name)
.collect()
}
}
/// Tells whether `n` is usable as an author name, i.e. it is neither empty
/// nor one of the placeholders `unknown` and `author`.
///
/// The comparison is exact and case-sensitive. The `&String` parameter is
/// kept so that the function can still be passed directly to
/// `Iterator::filter` over owned strings.
fn is_valid_name(n: &String) -> bool {
    !matches!(n.as_str(), "" | "unknown" | "author")
}

View file

@ -0,0 +1,52 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
{# Template of the planet's Atom feed. The template engine is set up with
   autoescaping disabled, so every feed-supplied string is passed through
   `escape` explicitly to keep the document well-formed XML. -#}
<title>Planet TVL</title>
<link href="https://planet.tvl.fyi"/>
<updated>{{now()|date(format="%Y-%m-%dT%H:%M:%SZ")}}</updated>
<id>https://planet.tvl.fyi</id>
<generator uri="{{ PKG_HOMEPAGE|escape }}" version="{{ PKG_VERSION|escape }}">
{{ PKG_NAME|escape }} by {{ PKG_AUTHORS|escape }}
</generator>
<icon>https://planet.tvl.fyi/logo.svg</icon>
{% for entry in entries %}
<entry>
<id>{{ entry.id|escape }}/planet.tvl.fyi</id>
{% if entry.title -%}
<title>{{ entry.title.content|escape }}</title>
{% endif -%}
{% for link in entry.links %}
<link href="{{ link.href|escape }}" {% if link.rel %}rel="{{ link.rel|escape }}"{% endif %}/>
{% endfor %}
{% if entry.updated %}
<updated>{{ entry.updated }}</updated>
{% endif %}
{% if entry.published %}
<published>{{ entry.published }}</published>
{% endif %}
{% if entry.summary -%}
<summary>
{{ entry.summary.content|escape }}
</summary>
{% endif -%}
{% for author in entry.authors %}
<author>
{% if author.name -%}
<name>{{ author.name|escape }}</name>
{% endif -%}
{% if author.email -%}
<email>{{ author.email|escape }}</email>
{% endif -%}
{% if author.uri -%}
<uri>{{ author.uri|escape }}</uri>
{% endif -%}
</author>
{% endfor %}
{% if entry.content -%}
{# NOTE(review): confirm that `type` and `src` are the field names (and plain
   string values) the feed model really serializes for an entry's content. -#}
<content {% if entry.content.type %}type="{{ entry.content.type }}"{% endif %} {% if entry.content.src %}src="{{ entry.content.src }}"{% endif %}>
{{ entry.content.body|escape }}
</content>
{% endif -%}
</entry>
{% endfor %}
</feed>

View file

@ -0,0 +1,85 @@
{% set dateformat = "%d.%m.%Y %H:%M" -%}
{# Template of the planet's HTML page. The template engine is set up with
   autoescaping disabled: entry bodies are inserted as HTML on purpose, while
   feed-supplied URLs placed in attributes are escaped explicitly so that they
   cannot break out of the attribute. -#}
<html>
<head>
<title>Planet TVL</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="generator" content="planet-mars">
<link rel="shortcut icon" href="/favicon.ico">
<link rel="stylesheet" href="planet.css" type="text/css">
<link rel="alternate" type="application/atom+xml" title="Planet TVL Atom Feed" href="atom.xml">
</head>
<body>
<div id="bodydiv">
<header>
<h1>Planet TVL</h1>
</header>
<div id="maincontainer">
<main>
{% for entry in entries -%}
{% if loop.index > 1 -%}
<hr class="entry_sep">
{% endif -%}
{% if entry.links.0 -%}
{% set link = entry.links.0.href -%}
{% else -%}
{% set link = "" -%}
{% endif -%}
<article>
<h2 class="entry_header">
<a {% if link -%}href="{{ link | escape }}"{% endif -%}>
{% if entry.title -%}
{{ entry.title.content|striptags }}
{% else -%}
NO TITLE
{% endif -%}
</a>
</h2>
<div class="entry_meta">
<date>
{% if entry.updated -%}
<span>{{ entry.updated | date(format=dateformat) }}</span>
{% else -%}
<span>{{ entry.published | date(format=dateformat) }}</span>
{% endif -%}
</date>
{% set author = get_author(entry=entry) -%}
{% if author -%}
&mdash; <span class="entry_author">{{ author | striptags }}</span>
{% endif -%}
</div>
{% if entry.content -%}
<div class="entry_content">
{{ entry.content.body }}
</div>
{% elif entry.summary -%}
<div class="entry_summary">
{{ entry.summary.content }}
</div>
{% endif -%}
</article>
{% endfor -%}
</main>
<aside>
<img src="logo.svg" alt="Planet TVL logo">
<p>Last updated: {{now()|date(format="%Y-%m-%d %H:%M")}}</p>
<ul>
{% for feed_url, feed in feeds %}
<li>
<a {% if feed.links.0 %}href="{{ feed.links.0.href | escape }}"{% endif -%}>
{% if feed.title -%}
{{feed.title.content|striptags}}
{% elif feed.authors.0 and feed.authors.0.name %}
{{ feed.authors.0.name | striptags }}
{% endif -%}
</a>
(<a href="{{ feed_url | escape }}">feed</a>)
</li>
{% endfor %}
</ul>
</aside>
</div>
</div>
</body>
</html>