chore(ops): move archivist machine to ops and contrib

contrib/ gets the clickhouse patching, the bucket log parsing code and
the awscli setup and shell.

ops/ gets the machine config itself.

Change-Id: If8b8f8cce5ca9c2b4d19e17be9a8b895ac35e84a
Reviewed-on: https://cl.snix.dev/c/snix/+/30163
Autosubmit: Florian Klink <flokli@flokli.de>
Tested-by: besadii
Reviewed-by: Ryan Lahfa <masterancpp@gmail.com>
This commit is contained in:
Florian Klink 2025-03-19 23:42:36 +00:00 committed by clbot
parent c3de9e21eb
commit ae4d967288
14 changed files with 21 additions and 40 deletions

1
contrib/archivist/OWNERS Normal file
View file

@ -0,0 +1 @@
edef

View file

@ -0,0 +1,23 @@
# archivist
This directory contains various scripts and helpers used for nix-archivist tasks.
It's used from the archivist EC2 instance, as well as standalone.
## AWS Profile setup
There are two AWS accounts, both reachable via the nixos.awsapps.com SSO portal.
### archeologist
This profile assumes the `archeologist` AWS role in the main NixOS account.
### archivist
This is a separate AWS Account, only for the archivist project. We can assume
`AWSAdministratorAccess` in there.
## archivist-ec2 Machine
The `archivist-ec2` machine is currently deployed in the main NixOS account.
It regularly processes S3 bucket logs and dumps them in parquet format into
another bucket.
In the future, we want to move this machine to the dedicated `archivist` AWS
account.

View file

@ -0,0 +1,61 @@
# Build targets for the archivist tooling: a wrapped clickhouse-local, the
# bucket log parser built from ./parse_bucket_logs.rs, and a shell with
# awscli2 and a generated AWS config.
{ depot
, pkgs
, ...
}:
let
# Config file for clickhouse-local that sets
# <s3><use_environment_credentials> to true.
clickhouseConfigAWS = builtins.toFile "clickhouse-local.xml" ''
<clickhouse>
<s3>
<use_environment_credentials>true</use_environment_credentials>
</s3>
</clickhouse>
'';
# clickhouse has a very odd AWS config concept.
# Configure it to be a bit more sane.
# This wraps pkgs.clickhouse's clickhouse-local so that every invocation gets
# `-C <clickhouseConfigAWS>` appended to its flags.
clickhouseLocalFixedAWS = pkgs.runCommand "clickhouse-local-fixed"
{
nativeBuildInputs = [ pkgs.makeWrapper ];
} ''
mkdir -p $out/bin
makeWrapper ${pkgs.clickhouse}/bin/clickhouse-local $out/bin/clickhouse-local \
--append-flags "-C ${clickhouseConfigAWS}"
'';
in
# NOTE(review): drvTargets presumably marks the attributes below as build
# targets for readTree — confirm against depot.nix.readTree.
depot.nix.readTree.drvTargets {
inherit clickhouseLocalFixedAWS;
# Compiles ./parse_bucket_logs.rs with pkgs.writers.writeRust and wraps the
# result as bin/archivist-parse-bucket-logs, with the wrapped clickhouse-local
# prefixed onto PATH so the program can invoke it by name.
parse-bucket-logs = pkgs.runCommand "archivist-parse-bucket-logs"
{
nativeBuildInputs = [ pkgs.makeWrapper ];
} ''
mkdir -p $out/bin
makeWrapper ${(pkgs.writers.writeRust "parse-bucket-logs-unwrapped" {} ./parse_bucket_logs.rs)} $out/bin/archivist-parse-bucket-logs \
--prefix PATH : ${pkgs.lib.makeBinPath [ clickhouseLocalFixedAWS ]}
'';
# A shell pointing us to the archivist SSO profile / account by default.
# It provides awscli2 and sets AWS_CONFIG_FILE to a generated config that
# defines the `nixos` SSO session plus the `archeologist` and `archivist`
# profiles; AWS_PROFILE selects `archivist`.
shell = pkgs.mkShell {
name = "archivist-shell";
packages = with pkgs; [ awscli2 ];
AWS_PROFILE = "archivist";
AWS_CONFIG_FILE = pkgs.writeText "aws-config" ''
[sso-session nixos]
sso_region = eu-north-1
sso_start_url = https://nixos.awsapps.com/start
sso_registration_scopes = sso:account:access
[profile "archeologist"]
sso_session = nixos
sso_account_id = 080433136561 # nixos root
sso_role_name = archeologist
[profile "archivist"]
sso_session = nixos
sso_account_id = 286553126452 # archivist
sso_role_name = AWSAdministratorAccess
'';
};
}

View file

@ -0,0 +1,42 @@
use std::env;
use std::process::Command;
use std::process::ExitCode;
/// Builds the `clickhouse-local` query that reads S3 bucket access logs from
/// `input_files` (an S3 URL, may be a glob), parses each line with a regular
/// expression, and writes the selected columns, ordered by timestamp, to
/// `output_file` as zstd-compressed Parquet.
///
/// NOTE: both arguments are interpolated into single-quoted SQL string
/// literals without any escaping, so they must not contain `'`.
fn build_query(input_files: &str, output_file: &str) -> String {
    format!(r#"SELECT
key,
toInt64(nullif(http_status, '-')) AS http_status,
toInt64(nullif(object_size_str, '-')) AS object_size,
toInt64(nullif(bytes_sent_str, '-')) AS bytes_sent,
nullif(user_agent, '-') AS user_agent,
operation,
nullif(requester, '-') AS requester,
parseDateTime(timestamp_str, '%d/%b/%Y:%k:%i:%s %z') AS timestamp
FROM s3(
'{}',
'Regexp',
'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
)
ORDER BY timestamp ASC
SETTINGS
format_regexp_skip_unmatched = 1,
format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ("\\S+") (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*',
output_format_parquet_compression_method = 'zstd'
INTO OUTFILE '{}' FORMAT Parquet"#, input_files, output_file)
}

/// Entry point: expects exactly two arguments, the input S3 URL (glob) and the
/// output Parquet file, and runs `clickhouse-local` (looked up via `PATH`)
/// with the query from [`build_query`].
///
/// Returns `ExitCode::SUCCESS` only if `clickhouse-local` ran and exited
/// successfully. A wrong argument count, a failure to start the process, or a
/// non-zero exit status all produce a message on stderr and
/// `ExitCode::FAILURE`.
fn main() -> ExitCode {
    let args: Vec<String> = env::args().collect();
    if args.len() != 3 {
        eprintln!("needs two args, input s3 url (glob) and output pq file");
        return ExitCode::FAILURE;
    }

    let input_files = &args[1];
    let output_file = &args[2];

    let mut cmd = Command::new("clickhouse-local");
    cmd.arg("--progress")
        .arg("-q")
        .arg(build_query(input_files, output_file));

    // `status()` only errors when the process cannot be started; a run that
    // starts but fails is reported through the returned exit status, which
    // must be checked explicitly so failures are not reported as success.
    match cmd.status() {
        Ok(status) if status.success() => ExitCode::SUCCESS,
        Ok(status) => {
            eprintln!("clickhouse-local failed: {}", status);
            ExitCode::FAILURE
        }
        Err(err) => {
            eprintln!("failed to run clickhouse-local: {}", err);
            ExitCode::FAILURE
        }
    }
}