feat(users/Profpatsch/whatcd-resolver): add a simple torrent parser

The `bencode` library is anything but production-grade, but it’s
enough to parse the torrent files generated by Gazelle lol.

This should help with … I haven’t really figured out yet what it helps
with I guess. But it was fun. I like the `Parse` abstraction very
much. It can parse XML and it can parse Bencode. Good.

Change-Id: If1331de423eab3e91ce9db6e2a7eb84da51b18a2
Reviewed-on: https://cl.tvl.fyi/c/depot/+/13211
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
This commit is contained in:
Profpatsch 2025-03-06 23:59:27 +01:00
parent d64c53d051
commit dde78515f6
8 changed files with 245 additions and 8 deletions

View file

@ -0,0 +1,109 @@
{-# LANGUAGE QuasiQuotes #-}
module Bencode where
import Aeson (jsonArray)
import Data.Aeson qualified as Json
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.BEncode (BEncode)
import Data.BEncode qualified as Bencode
import Data.ByteString.Lazy (LazyByteString)
import Data.ByteString.Lazy.Char8 qualified as Char8.Lazy
import Data.Char qualified as Char
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import FieldParser qualified as Field
import Json (RestrictJsonOpts (RestrictJsonOpts), mkJsonArray)
import Json qualified
import MyPrelude
import Parse (Parse, fieldParser, mkParseNoContext, showContext)
import Pretty
import Text.Printf (printf)
import Prelude hiding (span)
bencodeBytes :: Parse BEncode ByteString
bencodeBytes = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BString bs -> Right (toStrictBytes bs)
_ -> Left $ [fmt|Expected a bencode byte-string, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeInteger :: Parse BEncode Integer
bencodeInteger = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BInt i -> Right i
_ -> Left $ [fmt|Expected a bencode integer, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeNatural :: Parse BEncode Natural
bencodeNatural = bencodeInteger >>> Parse.fieldParser Field.integralToNatural
bencodeDict :: Parse BEncode (Map Text BEncode)
bencodeDict = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BDict d -> Right $ Map.mapKeys stringToText d
_ -> Left $ [fmt|Expected a bencode dict, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
bencodeList :: Parse BEncode [BEncode]
bencodeList = Parse.mkParseNoContext $ \(ctx, bencode) -> case bencode of
Bencode.BList l -> Right $ l
_ -> Left $ [fmt|Expected a bencode list, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
parseBencode :: Parse ByteString BEncode
parseBencode = Parse.mkParseNoContext $ \(ctx, bs) -> do
let lazy = toLazyBytes bs
case Bencode.bRead lazy of
Nothing -> Left $ [fmt|Failed to parse bencode: {Bencode.BString lazy & prettyBencodeRestricted}, at {showContext ctx}|]
Just a -> Right a
bencodeTextLenient :: Parse BEncode Text
bencodeTextLenient = Parse.mkParseNoContext $ \(ctx, bencode) -> do
case bencode of
Bencode.BString bs -> Right (bs & toStrictBytes & bytesToTextUtf8Lenient)
_ -> Left $ [fmt|Expected a bencode string, but got {bencode & prettyBencodeRestricted}, at {showContext ctx}|]
prettyBencodeRestricted :: BEncode -> Text
prettyBencodeRestricted =
showPrettyJson . Json.restrictJson restriction . bencodeToJsonValue
where
restriction =
RestrictJsonOpts
{ maxDepth = 3,
maxSizeObject = 10,
maxSizeArray = 10,
maxStringLength = 100
}
bencodeToJsonValue :: BEncode -> Json.Value
bencodeToJsonValue = \case
Bencode.BString bs -> case bs & bytesToTextUtf8Lazy of
-- If its not valid utf-8, lets at least display a hexdump
Left _ -> mkJsonArray $ "hexdump of bytes:" : (hexdump 0 bs <&> Json.String)
Right a -> Json.String $ a & toStrict
Bencode.BInt i -> Json.Number (fromIntegral @Integer @Scientific i)
Bencode.BDict m -> Json.Object $ m & Map.toList <&> bimap Key.fromString bencodeToJsonValue & KeyMap.fromList
Bencode.BList l -> jsonArray $ l <&> bencodeToJsonValue
-- | Unfold using f until the predicate becomes true.
unfoldUntil :: (b -> Bool) -> (b -> (a, b)) -> b -> [a]
unfoldUntil p f = List.unfoldr (\x -> guard (not (p x)) >> pure (f x))
-- | Return hex characters for the byte value.
bytehex :: Int -> String
bytehex n = printf "%02x" n
-- | Return a printable character, or a dot.
prChar :: Char -> Char
prChar ch
| Char.ord ch >= 32 && Char.ord ch < 128 = ch
| otherwise = '.'
-- | Return a string containing a pretty hexdump of xs using addresses
-- starting at n.
hexdump :: Int -> LazyByteString -> [Text]
hexdump n xs = zipWith hexLine addrs dlines
where
addrs = [n, n + 16 ..]
dlines = unfoldUntil null (splitAt 16) (Char8.Lazy.unpack xs)
hexLine :: Int -> String -> Text
hexLine addr xs' = stringToText $ printf "%08x |%-23s %-23s| %s" addr h1 h2 s
where
h1 = unwords $ map (bytehex . Char.ord) $ take 8 xs'
h2 = unwords $ map (bytehex . Char.ord) $ drop 8 xs'
s = map prChar xs'

View file

@ -4,6 +4,7 @@ module Redacted where
import AppT
import Arg
import Bencode
import Builder
import Comparison
import Control.Monad.Logger.CallStack
@ -12,10 +13,14 @@ import Data.Aeson qualified as Json
import Data.Aeson.BetterErrors qualified as Json
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.BEncode (BEncode)
import Data.Error.Tree
import Data.List qualified as List
import Data.List.NonEmpty qualified as NonEmpty
import Data.Maybe (catMaybes)
import Data.Text.IO qualified as Text.IO
import Data.Time (NominalDiffTime, UTCTime)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Database.PostgreSQL.Simple (Binary (Binary), Only (..))
import Database.PostgreSQL.Simple.Types (PGArray (PGArray))
import FieldParser qualified as Field
@ -27,10 +32,12 @@ import Network.HTTP.Types
import Network.Wai.Parse qualified as Wai
import OpenTelemetry.Trace qualified as Otel hiding (getTracer, inSpan, inSpan')
import Optional
import Parse (Parse, mapLookup, mapLookupMay, runParse)
import Parse qualified
import Postgres.Decoder qualified as Dec
import Postgres.MonadPostgres
import Pretty
import Prelude hiding (span)
import Prelude hiding (length, span)
class MonadRedacted m where
getRedactedApiKey :: m ByteString
@ -781,3 +788,93 @@ test =
(Json.asValue)
<&> Pretty.showPrettyJsonColored
>>= liftIO . putStderrLn
readTorrentFile :: (MonadIO m, MonadPostgres m) => m ()
readTorrentFile = runTransaction $ do
torrentBytes <-
queryWith
[sql|
SELECT torrent_file from redacted.torrents where torrent_file is not null limit 10 |]
()
Dec.bytea
liftIO $ for_ torrentBytes $ \b -> case testBencode b of
Left e -> do
Text.IO.putStrLn $ prettyErrorTree e
Right a -> printPretty a
liftIO $ print $ lengthNatural torrentBytes
testBencode :: ByteString -> (Either ErrorTree TorrentFile)
testBencode bs = Parse.runParse "cannot parse bencode" (parseBencode >>> bencodeTorrentParser) bs
-- | A torrent file
--
-- from wikipedia:
--
-- * announce—the URL of the high tracker
-- * info—this maps to a dictionary whose keys are very dependent on whether one or more files are being shared:
-- - files—a list of dictionaries each corresponding to a file (only when multiple files are being shared). Each dictionary has the following keys:
-- * length—size of the file in bytes.
-- * path—a list of strings corresponding to subdirectory names, the last of which is the actual file name
-- - length—size of the file in bytes (only when one file is being shared though)
-- - name—suggested filename where the file is to be saved (if one file)/suggested directory name where the files are to be saved (if multiple files)
-- - piece length—number of bytes per piece. This is commonly 28 KiB = 256 KiB = 262,144 B.
-- - pieces—a hash list, i.e., a concatenation of each piece's SHA-1 hash. As SHA-1 returns a 160-bit hash, pieces will be a string whose length is a multiple of 20 bytes. If the torrent contains multiple files, the pieces are formed by concatenating the files in the order they appear in the files dictionary (i.e., all pieces in the torrent are the full piece length except for the last piece, which may be shorter).
data TorrentFile = TorrentFile
{ announce :: Text,
comment :: Maybe Text,
createdBy :: Maybe Text,
creationDate :: Maybe UTCTime,
encoding :: Maybe Text,
info :: Info
}
deriving stock (Eq, Show)
data Info = Info
{ name :: Text,
files :: [File],
pieceLength :: Natural,
pieces :: ByteString,
private :: Maybe Bool,
source :: Maybe Text
}
deriving stock (Eq, Show)
data File = File
{ length_ :: Natural,
path :: [Text]
}
deriving stock (Eq, Show)
bencodeTorrentParser :: Parse BEncode TorrentFile
bencodeTorrentParser =
bencodeDict >>> do
announce <- mapLookup "announce" bencodeTextLenient
comment <- mapLookupMay "comment" bencodeTextLenient
createdBy <- mapLookupMay "created by" bencodeTextLenient
creationDate <- mapLookupMay "creation date" (bencodeInteger <&> posixSecondsToUTCTime . fromInteger @NominalDiffTime)
encoding <- mapLookupMay "encoding" bencodeTextLenient
info <-
mapLookup "info" $
bencodeDict >>> do
name <- mapLookup "name" bencodeTextLenient
files <-
mapLookup "files" $
bencodeList
>>> ( Parse.multiple $
bencodeDict >>> do
length_ <- mapLookup "length" bencodeNatural
path <- mapLookup "path" $ bencodeList >>> Parse.multiple bencodeTextLenient
pure $ File {..}
)
pieceLength <- mapLookup "piece length" bencodeNatural
pieces <- mapLookup "pieces" bencodeBytes
private <-
mapLookupMay "private" bencodeInteger
<&> fmap
( \case
0 -> False
_ -> True
)
source <- mapLookupMay "source" bencodeTextLenient
pure Info {..}
pure TorrentFile {..}

View file

@ -54,7 +54,7 @@ import OpenTelemetry.Attributes qualified as Otel
import OpenTelemetry.Context.ThreadLocal qualified as Otel
import OpenTelemetry.Trace qualified as Otel hiding (getTracer, inSpan, inSpan')
import OpenTelemetry.Trace.Monad qualified as Otel
import Parse (Parse)
import Parse (Parse, showContext)
import Parse qualified
import Postgres.Decoder qualified as Dec
import Postgres.MonadPostgres
@ -650,17 +650,17 @@ textToURI =
uriToHttpClientRequest :: Parse URI Http.Request
uriToHttpClientRequest =
Parse.mkParseNoContext
( \url ->
( \(ctx, url) ->
(url & Http.requestFromURI)
& runCatch
& first (checkException @Http.HttpException)
& \case
Left (Right (Http.InvalidUrlException urlText reason)) ->
Left [fmt|Unable to set the url "{urlText}" as request URL, reason: {reason}|]
Left [fmt|Unable to set the url "{urlText}" as request URL, reason: {reason}, at {Parse.showContext ctx}|]
Left (Right exc@(Http.HttpExceptionRequest _ _)) ->
Left [fmt|Weird! Should not get a HttpExceptionRequest when parsing an URL (bad library design), was {exc & displayException}|]
Left [fmt|Weird! Should not get a HttpExceptionRequest when parsing an URL (bad library design), was {exc & displayException}, at {Parse.showContext ctx}|]
Left (Left someExc) ->
Left [fmt|Weird! Should not get anyhting but a HttpException when parsing an URL (bad library design), was {someExc & displayException}|]
Left [fmt|Weird! Should not get anyhting but a HttpException when parsing an URL (bad library design), was {someExc & displayException}, at {Parse.showContext ctx}|]
Right req -> pure req
)