refactor(users/flokli): unify archeology and archivist

This merges the two directories together, and explains the two different
AWS profiles and what they're used for.

Change-Id: Ieaa09be5af02491056f5ad83b1d639e2de9a218b
Reviewed-on: https://cl.snix.dev/c/snix/+/30102
Autosubmit: Florian Klink <flokli@flokli.de>
Reviewed-by: Ryan Lahfa <masterancpp@gmail.com>
Tested-by: besadii
This commit is contained in:
Florian Klink 2025-03-18 13:39:41 +00:00 committed by clbot
parent 580f03f6fd
commit 2bdb497c85
12 changed files with 74 additions and 74 deletions

View file

@ -0,0 +1,23 @@
# archivist
This directory contains various scripts and helpers used for nix-archivist tasks.
It's used from some of the archivist EC2 instances, as well as standalone.
## AWS Profile setup
There are two AWS accounts, both reachable via the nixos.awsapps.com SSO portal.
### archeologist
This profile assumes the `archeologist` AWS role in the main NixOS account.
### archivist
This is a separate AWS Account, only for the archivist project. We can assume
`AWSAdministratorAccess` in there.
## Machine
The `archivist-ec2` machine is currently deployed in the main NixOS account.
It regularly processes S3 bucket logs and dumps them in parquet format into
another bucket.
In the future, we want to move this machine to the dedicated `archivist` AWS
account.

View file

@ -2,7 +2,41 @@
, pkgs
, ...
}:
let
# Minimal clickhouse config file that sets `use_environment_credentials` to
# true in the `<s3>` section. It is passed to `clickhouse-local` via `-C` by
# the wrapper defined below.
# NOTE(review): presumably this makes clickhouse pick up AWS credentials from
# the environment (e.g. the active AWS profile) — confirm against the
# clickhouse S3 documentation.
clickhouseConfigAWS = builtins.toFile "clickhouse-local.xml" ''
<clickhouse>
<s3>
<use_environment_credentials>true</use_environment_credentials>
</s3>
</clickhouse>
'';
# clickhouse has a very odd AWS config concept.
# Configure it to be a bit more sane.
# This derivation provides a `clickhouse-local` wrapper in `$out/bin` which
# appends `-C <clickhouseConfigAWS>` to every invocation, so the config file
# defined in `clickhouseConfigAWS` is always loaded.
clickhouseLocalFixedAWS = pkgs.runCommand "clickhouse-local-fixed"
{
nativeBuildInputs = [ pkgs.makeWrapper ];
} ''
mkdir -p $out/bin
makeWrapper ${pkgs.clickhouse}/bin/clickhouse-local $out/bin/clickhouse-local \
--append-flags "-C ${clickhouseConfigAWS}"
'';
in
depot.nix.readTree.drvTargets {
inherit clickhouseLocalFixedAWS;
# Builds ./parse_bucket_logs.rs with `pkgs.writers.writeRust` and installs it
# as `archivist-parse-bucket-logs`. The wrapper prefixes PATH with
# `clickhouseLocalFixedAWS`, because the Rust program invokes
# `clickhouse-local` by name.
parse-bucket-logs = pkgs.runCommand "archivist-parse-bucket-logs"
{
nativeBuildInputs = [ pkgs.makeWrapper ];
} ''
mkdir -p $out/bin
makeWrapper ${(pkgs.writers.writeRust "parse-bucket-logs-unwrapped" {} ./parse_bucket_logs.rs)} $out/bin/archivist-parse-bucket-logs \
--prefix PATH : ${pkgs.lib.makeBinPath [ clickhouseLocalFixedAWS ]}
'';
# A shell, pointing us to the archivist SSO profile / account by default.
shell = pkgs.mkShell {
name = "archivist-shell";
packages = with pkgs; [ awscli2 ];
@ -14,15 +48,15 @@ depot.nix.readTree.drvTargets {
sso_start_url = https://nixos.awsapps.com/start
sso_registration_scopes = sso:account:access
[profile "archivist"]
sso_session = nixos
sso_account_id = 286553126452
sso_role_name = AWSAdministratorAccess
[profile "archeologist"]
sso_session = nixos
sso_account_id = 080433136561
sso_account_id = 080433136561 # nixos root
sso_role_name = archeologist
[profile "archivist"]
sso_session = nixos
sso_account_id = 286553126452 # archivist
sso_role_name = AWSAdministratorAccess
'';
};
}

View file

@ -0,0 +1,42 @@
use std::env;
use std::process::Command;
use std::process::ExitCode;
/// Converts S3 bucket access logs into a Parquet file by running a query
/// through `clickhouse-local`.
///
/// Expects exactly two command-line arguments:
///
/// 1. the input S3 URL (may be a glob) of the log files to read, and
/// 2. the path of the Parquet file to write.
///
/// Returns `ExitCode::SUCCESS` only if `clickhouse-local` could be started
/// *and* exited successfully. Wrong argument count, failure to spawn the
/// process, or a non-zero exit status all yield `ExitCode::FAILURE`.
fn main() -> ExitCode {
    let args: Vec<String> = env::args().collect();
    if args.len() != 3 {
        eprintln!("needs two args, input s3 url (glob) and output pq file");
        return ExitCode::FAILURE;
    }
    let input_files = &args[1];
    let output_file = &args[2];

    // `clickhouse-local` is looked up via PATH.
    let mut cmd = Command::new("clickhouse-local");

    // The query reads every log line with the `Regexp` input format (lines
    // not matching `format_regexp` are skipped), maps the `-` placeholder
    // to NULL for the selected columns, parses the timestamp, sorts by it
    // and writes the result as zstd-compressed Parquet.
    //
    // NOTE(review): both arguments are interpolated verbatim into
    // single-quoted SQL string literals; a `'` in either one breaks the query.
    cmd.arg("--progress")
        .arg("-q")
        .arg(format!(r#"SELECT
key,
toInt64(nullif(http_status, '-')) AS http_status,
toInt64(nullif(object_size_str, '-')) AS object_size,
toInt64(nullif(bytes_sent_str, '-')) AS bytes_sent,
nullif(user_agent, '-') AS user_agent,
operation,
nullif(requester, '-') AS requester,
parseDateTime(timestamp_str, '%d/%b/%Y:%k:%i:%s %z') AS timestamp
FROM s3(
'{}',
'Regexp',
'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
)
ORDER BY timestamp ASC
SETTINGS
format_regexp_skip_unmatched = 1,
format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ("\\S+") (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*',
output_format_parquet_compression_method = 'zstd'
INTO OUTFILE '{}' FORMAT Parquet"#, input_files, output_file));

    // `status()` only returns `Err` when the process could not be run at
    // all; a non-zero exit of clickhouse-local arrives as `Ok(status)`, so
    // the status has to be inspected explicitly.
    match cmd.status() {
        Ok(status) if status.success() => ExitCode::SUCCESS,
        Ok(status) => {
            eprintln!("clickhouse-local failed: {}", status);
            ExitCode::FAILURE
        }
        Err(err) => {
            eprintln!("failed to run clickhouse-local: {}", err);
            ExitCode::FAILURE
        }
    }
}