From cb5a74523339dc796584dc68b7ebc2c33e01056f Mon Sep 17 00:00:00 2001 From: Profpatsch Date: Mon, 10 Mar 2025 17:47:36 +0100 Subject: [PATCH] fix(users/Profpatsch/whatcd-resolver): speed up artist_ids lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need `artist_ids` in most of the queries, so let’s make them a generated column and put an index on them. This reduces times for getArtistNameById from ~300ms to ~20ms and for the main “latest” table from ~250ms to ~60ms. The `artist_has_been_snatched` logic moves into the torrent filter, so we don’t have to work on such large records further down. Change-Id: I5a1929bd9c81ea0031e512d01897c0e460ded077 Reviewed-on: https://cl.tvl.fyi/c/depot/+/13240 Tested-by: BuildkiteCI Reviewed-by: Profpatsch --- .../whatcd-resolver/src/Redacted.hs | 58 ++++++++----------- .../whatcd-resolver/src/WhatcdResolver.hs | 39 ++++++++++++- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/users/Profpatsch/whatcd-resolver/src/Redacted.hs b/users/Profpatsch/whatcd-resolver/src/Redacted.hs index b191ed8d3..41e55e320 100644 --- a/users/Profpatsch/whatcd-resolver/src/Redacted.hs +++ b/users/Profpatsch/whatcd-resolver/src/Redacted.hs @@ -638,11 +638,28 @@ getBestTorrents :: getBestTorrents opts = do queryWith ( [sql| - WITH filtered_torrents AS ( + WITH + artist_has_been_snatched AS ( + SELECT DISTINCT artist_id + FROM ( + SELECT + UNNEST(artist_ids) as artist_id, + t.torrent_file IS NOT NULL as has_torrent_file + FROM redacted.torrents t) as _ + WHERE has_torrent_file + ), + filtered_torrents AS ( SELECT DISTINCT ON (torrent_group) id FROM redacted.torrents + JOIN LATERAL + -- filter everything that’s not a favourite if requested + (SELECT ( + artist_ids && ARRAY(SELECT artist_id FROM redacted.artist_favourites) + OR artist_ids && ARRAY(SELECT artist_id FROM artist_has_been_snatched) + ) as is_favourite) as _ + ON (NOT ?::bool OR is_favourite) WHERE -- filter by artist id (?::bool OR (to_jsonb(?::int) <@ (jsonb_path_query_array(full_json_result, '$.artists[*].id')))) @@ -667,6 +684,7 @@ getBestTorrents opts = do tg.full_json_result->'artists', '[]'::jsonb ) as artists, + t.artist_ids || tg.artist_ids as artist_ids, tg.full_json_result->>'groupName' AS group_name, tg.full_json_result->>'groupYear' AS group_year, t.torrent_file IS NOT NULL AS has_torrent_file, @@ -677,16 +695,6 @@ getBestTorrents opts = do JOIN redacted.torrent_groups tg ON tg.id = t.torrent_group WHERE tg.full_json_result->>'releaseType' <> ALL (?::text[]) - ), - prepare2 AS MATERIALIZED ( - -- extract the json artist ids field into an array of ints - SELECT *, array(select id from jsonb_to_recordset(artists) as (id int)) as artist_ids - FROM prepare1 - ), - artist_has_been_snatched AS MATERIALIZED ( - SELECT DISTINCT artist_id - FROM (SELECT UNNEST(artist_ids) as artist_id, has_torrent_file from prepare2) as _ - WHERE has_torrent_file ) SELECT group_id, @@ -699,14 +707,7 @@ getBestTorrents opts = do has_torrent_file, transmission_torrent_hash, torrent_format - FROM prepare2 - JOIN LATERAL - (SELECT ( - artist_ids && ARRAY(SELECT artist_id FROM artist_has_been_snatched) - OR artist_ids && ARRAY(SELECT artist_id FROM redacted.artist_favourites) - ) as is_favourite) as _ - -- filter everything that’s not a favourite if requested - ON (NOT ?::bool OR is_favourite) + FROM prepare1 |] <> case opts.ordering of BySeedingWeight -> [fmt|ORDER BY seeding_weight DESC|] <> "\n" @@ -722,12 +723,12 @@ getBestTorrents opts = do let (onlyTheseTorrentsB, onlyTheseTorrents) = case opts.onlyTheseTorrents of Nothing -> (True, PGArray []) Just a -> (False, a <&> (.torrentId) & PGArray) - ( onlyArtistB :: Bool, + ( opts.onlyFavourites :: Bool, + onlyArtistB :: Bool, onlyArtistId :: Int, onlyTheseTorrentsB :: Bool, onlyTheseTorrents, (opts.disallowedReleaseTypes & concatMap (\rt -> [rt.stringKey, rt.intKey & buildText intDecimalT]) & PGArray :: PGArray Text), - opts.onlyFavourites :: Bool, opts.limitResults <&> naturalToInteger :: Maybe Integer ) ) @@ -772,19 +773,8 @@ getArtistNameById :: (MonadPostgres m, HasField "artistId" r Int) => r -> Transa getArtistNameById dat = do queryFirstRowWithMaybe [sql| - explain analyze WITH mapping as ( - SELECT x.id, x.name FROM - redacted.torrents t - join LATERAL - jsonb_to_recordset(full_json_result->'artists') as x(id int, name text) on true - UNION - SELECT x.id, x.name FROM - redacted.torrent_groups tg - join LATERAL - jsonb_to_recordset(full_json_result->'artists') as x(id int, name text) on true - ) - SELECT name FROM mapping - WHERE id = ?::int + SELECT artist_name FROM redacted.artist_names + WHERE artist_id = ?::int LIMIT 1 |] (getLabel @"artistId" dat) diff --git a/users/Profpatsch/whatcd-resolver/src/WhatcdResolver.hs b/users/Profpatsch/whatcd-resolver/src/WhatcdResolver.hs index a2af7f6bf..a516136c8 100644 --- a/users/Profpatsch/whatcd-resolver/src/WhatcdResolver.hs +++ b/users/Profpatsch/whatcd-resolver/src/WhatcdResolver.hs @@ -1107,7 +1107,6 @@ migrate = inSpan "Database Migration" $ do ALTER TABLE redacted.torrents_json ADD COLUMN IF NOT EXISTS transmission_torrent_hash text NULL; - -- the seeding weight is used to find the best torrent in a group. CREATE OR REPLACE FUNCTION calc_seeding_weight(full_json_result jsonb) RETURNS int AS $$ BEGIN @@ -1147,6 +1146,19 @@ migrate = inSpan "Database Migration" $ do ALTER TABLE redacted.torrents_json ADD COLUMN IF NOT EXISTS seeding_weight int NOT NULL GENERATED ALWAYS AS (calc_seeding_weight(full_json_result)) STORED; + CREATE OR REPLACE FUNCTION artist_record_to_id(artists jsonb) RETURNS int[] + as $$ + SELECT array_agg(x::int) from jsonb_path_query(artists, '$[*].id') j(x); + $$ LANGUAGE sql IMMUTABLE; + + ALTER TABLE redacted.torrents_json + ADD COLUMN IF NOT EXISTS artist_ids int[] NOT NULL GENERATED ALWAYS AS (COALESCE(artist_record_to_id(full_json_result->'artists'), ARRAY[]::int[])) STORED; + ALTER TABLE redacted.torrent_groups + ADD COLUMN IF NOT EXISTS artist_ids int[] NOT NULL GENERATED ALWAYS AS (COALESCE(artist_record_to_id(full_json_result->'artists'), ARRAY[]::int[])) STORED; + + CREATE INDEX IF NOT EXISTS torrents_json_artist_ids ON redacted.torrents_json USING GIN (artist_ids); + CREATE INDEX IF NOT EXISTS torrent_groups_artist_ids ON redacted.torrent_groups USING GIN (artist_ids); + -- inflect out values of the full json CREATE OR REPLACE VIEW redacted.torrents AS SELECT @@ -1157,10 +1169,10 @@ migrate = inSpan "Database Migration" $ do t.seeding_weight, t.full_json_result, t.torrent_file, - t.transmission_torrent_hash + t.transmission_torrent_hash, + t.artist_ids FROM redacted.torrents_json t; - CREATE INDEX IF NOT EXISTS torrents_json_seeding ON redacted.torrents_json(((full_json_result->'seeding')::integer)); CREATE INDEX IF NOT EXISTS torrents_json_snatches ON redacted.torrents_json(((full_json_result->'snatches')::integer)); @@ -1169,6 +1181,27 @@ migrate = inSpan "Database Migration" $ do artist_id INTEGER NOT NULL, UNIQUE(artist_id) ); + + -- for easier query lookup, a mapping from artist ids to names + CREATE OR REPLACE VIEW redacted.artist_names AS + SELECT + t.artist_id, x.name as artist_name + FROM + (SELECT unnest(artist_ids) as artist_id, * FROM redacted.torrents t) as t + join LATERAL + jsonb_to_recordset(full_json_result->'artists') as x(id int, name text) + ON x.id = t.artist_id + WHERE x.id = t.artist_id + UNION ALL + SELECT + t.artist_id, x.name as artist_name + FROM + (SELECT unnest(artist_ids) as artist_id, * FROM redacted.torrent_groups t) as t + join LATERAL + jsonb_to_recordset(full_json_result->'artists') as x(id int, name text) + ON x.id = t.artist_id + WHERE x.id = t.artist_id; + |] ()