feat(sterni/mn2html): reimplement mnote-html in Rust

Reimplement the MIME body extraction and HTML rewriting from mblog in
Rust so that shelling out to it becomes viable. The problem with
mnote-html is mainly that it – being written in CL – requires a ~300MB
executable and is a bit sluggish starting.

Change-Id: I5c1adc1a7ab5f3dde207f9a1f67ace685bd3f69f
Reviewed-on: https://cl.tvl.fyi/c/depot/+/13014
Tested-by: BuildkiteCI
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: sterni <sternenseemann@systemli.org>
This commit is contained in:
sterni 2024-12-11 00:22:25 +01:00 committed by clbot
parent 84bdb1e89a
commit 3224488a29
6 changed files with 759 additions and 0 deletions

1
users/sterni/mn2html/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
target

534
users/sterni/mn2html/Cargo.lock generated Normal file
View file

@ -0,0 +1,534 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "amn2html"
version = "0.1.0"
dependencies = [
"lol_html",
"mail-parser",
"memmap2",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cssparser"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"matches",
"phf",
"proc-macro2",
"quote",
"smallvec",
"syn 1.0.109",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.90",
]
[[package]]
name = "derive_more"
version = "0.99.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.90",
]
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "foldhash"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2"
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.168"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "lol_html"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2872b88213f3cd4b04f719ec8f2e0b37c98882a7c4aa6fc13ba2f19f5eba1bd2"
dependencies = [
"bitflags 2.6.0",
"cfg-if",
"cssparser",
"encoding_rs",
"hashbrown",
"lazy_static",
"lazycell",
"memchr",
"mime",
"selectors",
"thiserror",
]
[[package]]
name = "mail-parser"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93c3b9e5d8b17faf573330bbc43b37d6e918c0a3bf8a88e7d0a220ebc84af9fc"
dependencies = [
"encoding_rs",
]
[[package]]
name = "matches"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memmap2"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
dependencies = [
"libc",
]
[[package]]
name = "mime"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "nodrop"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_macros",
"phf_shared",
"proc-macro-hack",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom",
"libc",
"rand_chacha",
"rand_core",
"rand_hc",
"rand_pcg",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core",
]
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
[[package]]
name = "selectors"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
dependencies = [
"bitflags 1.3.2",
"cssparser",
"derive_more",
"fxhash",
"log",
"matches",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
"thin-slice",
]
[[package]]
name = "semver"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
[[package]]
name = "servo_arc"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
dependencies = [
"nodrop",
"stable_deref_trait",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thin-slice"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.90",
]
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.90",
]

View file

@ -0,0 +1,14 @@
[package]
name = "mn2html"
version = "0.0.0"
edition = "2021"
license = "GPL-3.0-only"
[dependencies]
lol_html = "2.1.0"
mail-parser = { version = "0.9.4", features = [ "encoding_rs" ] }
memmap2 = "0.9.5"
[[bin]]
name = "mn2html"
path = "mn2html.rs"

View file

@ -0,0 +1,20 @@
# mn2html
Convert mail notes authored e.g. by the iOS/macOS Notes application,
into HTML suitable for standard browsers. Instead of full documents,
mn2html emits HTML fragments that can easily be embedded into other
documents or postprocessed using a templating engine.
## History
mn2html is a reimplementation mnote-html from //users/sterni/mblog.
The reason for this was mainly avoiding the startup cost associated
with Common Lisp programs, so the program would be suitable for
shell scripting.
## Tasks
- [ ] Properly handle `text/plain` bodies (from e.g. notemap)
- [ ] Add man page
- [ ] Help screen
- [ ] Improve error reporting

View file

@ -0,0 +1,25 @@
{ pkgs, lib, ... }:
pkgs.rustPlatform.buildRustPackage rec {
pname = "mn2hmtl";
version = "canon";
src = lib.fileset.toSource {
root = ./.;
fileset = lib.fileset.unions [
./Cargo.lock
./Cargo.toml
./mn2html.rs
];
};
cargoLock.lockFile = ./Cargo.lock;
passthru.shell = pkgs.mkShell {
name = "${pname}-shell";
nativeBuildInputs = [
pkgs.buildPackages.cargo
pkgs.buildPackages.rustc
];
};
}

View file

@ -0,0 +1,165 @@
// SPDX-FileCopyrightText: Copyright © 2024 sterni
// SPDX-License-Identifier: GPL-3.0-only
use lol_html::html_content::ContentType;
use lol_html::{element, HtmlRewriter, Settings};
use mail_parser::{Message, MessageParser, MimeHeaders};
use memmap2::Mmap;
use std::collections::HashMap;
use std::env;
use std::error::Error;
use std::fmt;
use std::fs::File;
use std::io::Write;
type CidMap<'a> = HashMap<&'a str, &'a str>;
#[derive(Debug)]
enum Mn2htmlError {
MimeParseFail,
NoMailNote,
MissingAttachment(String),
}
impl fmt::Display for Mn2htmlError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Mn2htmlError::MimeParseFail => {
write!(f, "Could not parse given file as a MIME message")
}
Mn2htmlError::NoMailNote => {
write!(f, "Given MIME message does not appear to be a Mail Note")
}
Mn2htmlError::MissingAttachment(cid) => write!(
f,
"Given object's Content-Id {} doesn't match any attachment",
cid
),
}
}
}
impl Error for Mn2htmlError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
None
}
}
fn warn(msg: &str) {
eprintln!("mn2html: {}", msg);
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
for arg in env::args_os().skip(1) {
// TODO(sterni): flags, --help and such
let msg_file = File::open(arg)?;
let msg_raw = unsafe { Mmap::map(&msg_file) }?;
let msg_parsed = MessageParser::default()
.parse(msg_raw.as_ref())
.ok_or(Mn2htmlError::MimeParseFail)?;
if !matches!(
msg_parsed
.header("X-Uniform-Type-Identifier")
.and_then(|h| h.as_text()),
Some("com.apple.mail-note")
) {
return Err(Box::new(Mn2htmlError::NoMailNote));
}
let cid_map = index_attachments(&msg_parsed);
let html_body = msg_parsed
.html_bodies()
.nth(0)
.ok_or(Mn2htmlError::NoMailNote)?
.contents();
rewrite_html(html_body, &cid_map)?;
}
Ok(())
}
// At some point, it was a consideration to move this out of the Rust program.
// mn2html would have been a shell script with mblaze(7) tools finding the
// attachments and their content ideas passing the information to a Rust HTML
// rewriter via CLI args. It is unclear how much (if at all?) slower this would
// have been. In the end, it just seemed cleaner to do it in the Rust program,
// especially since the HTML rewriter would not really have been useful on its
// own.
fn index_attachments<'a>(msg: &'a Message) -> CidMap<'a> {
let mut map = HashMap::new();
for a in msg.attachments() {
match (a.content_id(), a.attachment_name()) {
(Some(cid), Some(filename)) => {
if let Some(_) = map.insert(cid, filename) {
warn("multiple attachments share the same Content-Id");
}
}
(_, _) => warn("attachment without Content-Id and/or filename in Content-Disposition"),
}
}
map
}
fn rewrite_html(html_body: &[u8], cid_map: &CidMap) -> Result<(), Box<dyn std::error::Error>> {
let mut stdout = std::io::stdout();
let mut rewriter = HtmlRewriter::new(
Settings {
element_content_handlers: vec![
element!("head", |el| {
el.remove();
Ok(())
}),
element!("body", |el| {
el.remove_and_keep_content();
Ok(())
}),
element!("html", |el| {
el.remove_and_keep_content();
Ok(())
}),
element!("object[type][data]", |el| {
if el
.get_attribute("type")
.expect("element! matched object[type] without type attribute")
!= "application/x-apple-msg-attachment"
{
warn("encountered object with unknown type attribute, ignoring");
return Ok(());
}
match el
.get_attribute("data")
.expect("element! matched object[data] without data attribute")
.split_at_checked(4)
{
Some(("cid:", cid)) => match cid_map.get(cid) {
Some(filename) => el.replace(
&format!(r#"<img src="{}">"#, filename),
ContentType::Html,
),
_ => {
return Err(Box::new(Mn2htmlError::MissingAttachment(
cid.to_string(),
)))
}
},
_ => warn("encountered object with malformed data attribute, ignoring"),
};
Ok(())
}),
],
..Settings::new()
},
|c: &[u8]| stdout.write_all(c).expect("Can't write to stdout"),
);
rewriter.write(html_body)?;
rewriter.end()?;
Ok(())
}