feat(users/Profpatsch/whatcd-resolver): add a simple torrent parser

The `bencode` library is anything but production-grade, but it’s
enough to parse the torrent files generated by Gazelle lol.

This should help with … I haven’t really figured out yet what it helps
with I guess. But it was fun. I like the `Parse` abstraction very
much. It can parse XML and it can parse Bencode. Good.

Change-Id: If1331de423eab3e91ce9db6e2a7eb84da51b18a2
Reviewed-on: https://cl.tvl.fyi/c/depot/+/13211
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
This commit is contained in:
Profpatsch 2025-03-06 23:59:27 +01:00
parent d64c53d051
commit dde78515f6
8 changed files with 245 additions and 8 deletions

View file

@ -11,6 +11,7 @@ import Data.Aeson.BetterErrors qualified as Json
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.Aeson.Types qualified
import Data.ByteString.Base64 qualified as Base64
import Data.Error.Tree
import Data.Map.Strict qualified as Map
import Data.Maybe (catMaybes)
@ -333,6 +334,10 @@ instance ToJSON EmptyObject where
mkJsonArray :: [Value] -> Value
mkJsonArray xs = xs & Vector.fromList & Array
-- | Encode a 'ByteString' as a base64-encoded json string
mkBase64Bytes :: ByteString -> Value
mkBase64Bytes = String . bytesToTextUtf8Unsafe . Base64.encode
data RestrictJsonOpts = RestrictJsonOpts
{ maxDepth :: Natural,
maxSizeObject :: Natural,

View file

@ -7,6 +7,7 @@ import Control.Selective (Selective)
import Data.Error.Tree
import Data.Functor.Compose
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Monoid (First (..))
import Data.Semigroup.Traversable
import Data.Semigroupoid qualified as Semigroupoid
@ -72,8 +73,8 @@ mkParsePushContext toPush f = Parse $ \(ctx, from) -> case f (ctx, from) of
Right to -> Success (addContext toPush ctx, to)
Left err -> Failure $ singleton err
mkParseNoContext :: (from -> Either ErrorTree to) -> Parse from to
mkParseNoContext f = Parse $ \(ctx, from) -> case f from of
mkParseNoContext :: ((Context, from) -> Either ErrorTree to) -> Parse from to
mkParseNoContext f = Parse $ \(ctx, from) -> case f (ctx, from) of
Right to -> Success (ctx, to)
Left err -> Failure $ singleton err
@ -161,6 +162,24 @@ findAll inner = Parse $ \(ctx, xs) ->
Success (ctx, [])
(_miss, (hitCtx, hit) : hits) -> Success (hitCtx, hit : (hits <&> snd))
-- | Find the given element in the map, and parse it with the given parser.
mapLookup :: (Coercible Text key, Ord key) => key -> Parse from to -> Parse (Map key from) to
mapLookup key inner = do
let keyT :: Text = coerce key
Parse $ \(ctx, m) -> case Map.lookup (coerce key) m of
Nothing -> Failure $ singleton [fmt|Key "{keyT}" not found in map at {showContext ctx}|]
Just a -> runParse' inner (addContext keyT ctx, a)
-- | Find the given element in the map, and parse it with the given parser.
--
-- Use this instead of `rmap` to add the map key to the error context.
mapLookupMay :: (Coercible Text key, Ord key) => key -> Parse from to -> Parse (Map key from) (Maybe to)
mapLookupMay key inner = do
let keyT :: Text = coerce key
Parse $ \(ctx, m) -> case Map.lookup (coerce key) m of
Nothing -> Success (addContext keyT ctx, Nothing)
Just a -> runParse' (Just <$> inner) (addContext keyT ctx, a)
-- | convert a 'FieldParser' into a 'Parse'.
fieldParser :: FieldParser from to -> Parse from to
fieldParser fp = Parse $ \(ctx, from) -> case Field.runFieldParser fp from of

View file

@ -14,6 +14,7 @@ pkgs.mkShell {
h.async
h.aeson-better-errors
h.blaze-html
h.bencode
h.conduit-extra
h.error
h.monad-logger

View file

@ -12,6 +12,7 @@ let
./Main.hs
./src/WhatcdResolver.hs
./src/AppT.hs
./src/Bencode.hs
./src/JsonLd.hs
./src/Optional.hs
./src/Html.hs
@ -29,6 +30,7 @@ let
pkgs.haskellPackages.pa-field-parser
pkgs.haskellPackages.pa-run-command
pkgs.haskellPackages.aeson-better-errors
pkgs.haskellPackages.bencode
pkgs.haskellPackages.blaze-html
pkgs.haskellPackages.hs-opentelemetry-sdk
pkgs.haskellPackages.http-conduit

View file

@ -0,0 +1,109 @@
{-# LANGUAGE QuasiQuotes #-}
module Bencode where
import Aeson (jsonArray)
import Data.Aeson qualified as Json
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.BEncode (BEncode)
import Data.BEncode qualified as Bencode
import Data.ByteString.Lazy (LazyByteString)
import Data.ByteString.Lazy.Char8 qualified as Char8.Lazy
import Data.Char qualified as Char
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import FieldParser qualified as Field
import Json (RestrictJsonOpts (RestrictJsonOpts), mkJsonArray)
import Json qualified
import MyPrelude
import Parse (Parse, fieldParser, mkParseNoContext, showContext)
import Pretty
import Text.Printf (printf)
import Prelude hiding (span)
bencodeBytes :: Parse BEncode ByteString
bencodeBytes = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BString bs -> Right (toStrictBytes bs)
_ -> Left $ [fmt|Expected a bencode byte-string, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeInteger :: Parse BEncode Integer
bencodeInteger = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BInt i -> Right i
_ -> Left $ [fmt|Expected a bencode integer, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeNatural :: Parse BEncode Natural
bencodeNatural = bencodeInteger >>> Parse.fieldParser Field.integralToNatural
bencodeDict :: Parse BEncode (Map Text BEncode)
bencodeDict = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BDict d -> Right $ Map.mapKeys stringToText d
_ -> Left $ [fmt|Expected a bencode dict, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeList :: Parse BEncode [BEncode]
bencodeList = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BList l -> Right $ l
_ -> Left $ [fmt|Expected a bencode list, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
parseBencode :: Parse ByteString BEncode
parseBencode = Parse.mkParseNoContext $ \(ctx, bs) -> do
let lazy = toLazyBytes bs
case Bencode.bRead lazy of
Nothing -> Left $ [fmt|Failed to parse bencode: {Bencode.BString lazy & prettyBencodeRestricted}, at {showContext ctx}|]
Just a -> Right a
bencodeTextLenient :: Parse BEncode Text
bencodeTextLenient = Parse.mkParseNoContext $ \(ctx, bencode) -> do
case bencode of
Bencode.BString bs -> Right (bs & toStrictBytes & bytesToTextUtf8Lenient)
_ -> Left $ [fmt|Expected a bencode string, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
prettyBencodeRestricted :: BEncode -> Text
prettyBencodeRestricted =
showPrettyJson . Json.restrictJson restriction . bencodeToJsonValue
where
restriction =
RestrictJsonOpts
{ maxDepth = 3,
maxSizeObject = 10,
maxSizeArray = 10,
maxStringLength = 100
}
bencodeToJsonValue :: BEncode -> Json.Value
bencodeToJsonValue = \case
Bencode.BString bs -> case bs & bytesToTextUtf8Lazy of
-- If its not valid utf-8, lets at least display a hexdump
Left _ -> mkJsonArray $ "hexdump of bytes:" : (hexdump 0 bs <&> Json.String)
Right a -> Json.String $ a & toStrict
Bencode.BInt i -> Json.Number (fromIntegral @Integer @Scientific i)
Bencode.BDict m -> Json.Object $ m & Map.toList <&> bimap Key.fromString bencodeToJsonValue & KeyMap.fromList
Bencode.BList l -> jsonArray $ l <&> bencodeToJsonValue
-- | Unfold using f until the predicate becomes true.
unfoldUntil :: (b -> Bool) -> (b -> (a, b)) -> b -> [a]
unfoldUntil p f = List.unfoldr (\x -> guard (not (p x)) >> pure (f x))
-- | Return hex characters for the byte value.
bytehex :: Int -> String
bytehex n = printf "%02x" n
-- | Return a printable character, or a dot.
prChar :: Char -> Char
prChar ch
| Char.ord ch >= 32 && Char.ord ch < 128 = ch
| otherwise = '.'
-- | Return a string containing a pretty hexdump of xs using addresses
-- starting at n.
hexdump :: Int -> LazyByteString -> [Text]
hexdump n xs = zipWith hexLine addrs dlines
where
addrs = [n, n + 16 ..]
dlines = unfoldUntil null (splitAt 16) (Char8.Lazy.unpack xs)
hexLine :: Int -> String -> Text
hexLine addr xs' = stringToText $ printf "%08x |%-23s %-23s| %s" addr h1 h2 s
where
h1 = unwords $ map (bytehex . Char.ord) $ take 8 xs'
h2 = unwords $ map (bytehex . Char.ord) $ drop 8 xs'
s = map prChar xs'

View file

@ -4,6 +4,7 @@ module Redacted where
import AppT
import Arg
import Bencode
import Builder
import Comparison
import Control.Monad.Logger.CallStack
@ -12,10 +13,14 @@ import Data.Aeson qualified as Json
import Data.Aeson.BetterErrors qualified as Json
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.BEncode (BEncode)
import Data.Error.Tree
import Data.List qualified as List
import Data.List.NonEmpty qualified as NonEmpty
import Data.Maybe (catMaybes)
import Data.Text.IO qualified as Text.IO
import Data.Time (NominalDiffTime, UTCTime)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Database.PostgreSQL.Simple (Binary (Binary), Only (..))
import Database.PostgreSQL.Simple.Types (PGArray (PGArray))
import FieldParser qualified as Field
@ -27,10 +32,12 @@ import Network.HTTP.Types
import Network.Wai.Parse qualified as Wai
import OpenTelemetry.Trace qualified as Otel hiding (getTracer, inSpan, inSpan')
import Optional
import Parse (Parse, mapLookup, mapLookupMay, runParse)
import Parse qualified
import Postgres.Decoder qualified as Dec
import Postgres.MonadPostgres
import Pretty
import Prelude hiding (span)
import Prelude hiding (length, span)
class MonadRedacted m where
getRedactedApiKey :: m ByteString
@ -781,3 +788,93 @@ test =
(Json.asValue)
<&> Pretty.showPrettyJsonColored
>>= liftIO . putStderrLn
readTorrentFile :: (MonadIO m, MonadPostgres m) => m ()
readTorrentFile = runTransaction $ do
torrentBytes <-
queryWith
[sql|
SELECT torrent_file from redacted.torrents where torrent_file is not null limit 10 |]
()
Dec.bytea
liftIO $ for_ torrentBytes $ \b -> case testBencode b of
Left e -> do
Text.IO.putStrLn $ prettyErrorTree e
Right a -> printPretty a
liftIO $ print $ lengthNatural torrentBytes
testBencode :: ByteString -> (Either ErrorTree TorrentFile)
testBencode bs = Parse.runParse "cannot parse bencode" (parseBencode >>> bencodeTorrentParser) bs
-- | A torrent file
--
-- from wikipedia:
--
-- * announce—the URL of the high tracker
-- * info—this maps to a dictionary whose keys are very dependent on whether one or more files are being shared:
-- - files—a list of dictionaries each corresponding to a file (only when multiple files are being shared). Each dictionary has the following keys:
-- * length—size of the file in bytes.
-- * path—a list of strings corresponding to subdirectory names, the last of which is the actual file name
-- - length—size of the file in bytes (only when one file is being shared though)
-- - name—suggested filename where the file is to be saved (if one file)/suggested directory name where the files are to be saved (if multiple files)
-- - piece length—number of bytes per piece. This is commonly 28 KiB = 256 KiB = 262,144 B.
-- - pieces—a hash list, i.e., a concatenation of each piece's SHA-1 hash. As SHA-1 returns a 160-bit hash, pieces will be a string whose length is a multiple of 20 bytes. If the torrent contains multiple files, the pieces are formed by concatenating the files in the order they appear in the files dictionary (i.e., all pieces in the torrent are the full piece length except for the last piece, which may be shorter).
data TorrentFile = TorrentFile
{ announce :: Text,
comment :: Maybe Text,
createdBy :: Maybe Text,
creationDate :: Maybe UTCTime,
encoding :: Maybe Text,
info :: Info
}
deriving stock (Eq, Show)
data Info = Info
{ name :: Text,
files :: [File],
pieceLength :: Natural,
pieces :: ByteString,
private :: Maybe Bool,
source :: Maybe Text
}
deriving stock (Eq, Show)
data File = File
{ length_ :: Natural,
path :: [Text]
}
deriving stock (Eq, Show)
bencodeTorrentParser :: Parse BEncode TorrentFile
bencodeTorrentParser =
bencodeDict >>> do
announce <- mapLookup "announce" bencodeTextLenient
comment <- mapLookupMay "comment" bencodeTextLenient
createdBy <- mapLookupMay "created by" bencodeTextLenient
creationDate <- mapLookupMay "creation date" (bencodeInteger <&> posixSecondsToUTCTime . fromInteger @NominalDiffTime)
encoding <- mapLookupMay "encoding" bencodeTextLenient
info <-
mapLookup "info" $
bencodeDict >>> do
name <- mapLookup "name" bencodeTextLenient
files <-
mapLookup "files" $
bencodeList
>>> ( Parse.multiple $
bencodeDict >>> do
length_ <- mapLookup "length" bencodeNatural
path <- mapLookup "path" $ bencodeList >>> Parse.multiple bencodeTextLenient
pure $ File {..}
)
pieceLength <- mapLookup "piece length" bencodeNatural
pieces <- mapLookup "pieces" bencodeBytes
private <-
mapLookupMay "private" bencodeInteger
<&> fmap
( \case
0 -> False
_ -> True
)
source <- mapLookupMay "source" bencodeTextLenient
pure Info {..}
pure TorrentFile {..}

View file

@ -54,7 +54,7 @@ import OpenTelemetry.Attributes qualified as Otel
import OpenTelemetry.Context.ThreadLocal qualified as Otel
import OpenTelemetry.Trace qualified as Otel hiding (getTracer, inSpan, inSpan')
import OpenTelemetry.Trace.Monad qualified as Otel
import Parse (Parse)
import Parse (Parse, showContext)
import Parse qualified
import Postgres.Decoder qualified as Dec
import Postgres.MonadPostgres
@ -650,17 +650,17 @@ textToURI =
uriToHttpClientRequest :: Parse URI Http.Request
uriToHttpClientRequest =
Parse.mkParseNoContext
( \url ->
( \(ctx, url) ->
(url & Http.requestFromURI)
& runCatch
& first (checkException @Http.HttpException)
& \case
Left (Right (Http.InvalidUrlException urlText reason)) ->
Left [fmt|Unable to set the url "{urlText}" as request URL, reason: {reason}|]
Left [fmt|Unable to set the url "{urlText}" as request URL, reason: {reason}, at {Parse.showContext ctx}|]
Left (Right exc@(Http.HttpExceptionRequest _ _)) ->
Left [fmt|Weird! Should not get a HttpExceptionRequest when parsing an URL (bad library design), was {exc & displayException}|]
Left [fmt|Weird! Should not get a HttpExceptionRequest when parsing an URL (bad library design), was {exc & displayException}, at {Parse.showContext ctx}|]
Left (Left someExc) ->
Left [fmt|Weird! Should not get anyhting but a HttpException when parsing an URL (bad library design), was {someExc & displayException}|]
Left [fmt|Weird! Should not get anyhting but a HttpException when parsing an URL (bad library design), was {someExc & displayException}, at {Parse.showContext ctx}|]
Right req -> pure req
)

View file

@ -65,6 +65,7 @@ library
exposed-modules:
WhatcdResolver
AppT
Bencode
JsonLd
Optional
Http
@ -84,6 +85,7 @@ library
pa-run-command,
aeson-better-errors,
aeson,
bencode,
blaze-html,
bytestring,
case-insensitive,
@ -108,7 +110,9 @@ library
postgresql-simple,
punycode,
tmp-postgres,
time,
unliftio,
selective,
wai-extra,
wai,
warp,