feat(corp/data-import): insert OpenCorpora data into SQLite

This is an initial and rather naive table structure; some massaging
still needs to be done before it makes more sense.

Change-Id: I441288b684ef86be507099bcc4ebf984598789c8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-01-18 14:52:53 +03:00 committed by tazjin
parent 0196555f07
commit 6986aa5824
2 changed files with 155 additions and 9 deletions

View file

@ -55,11 +55,13 @@
//! between the lemmas "два" and "второй".
use log::{error, info};
use rusqlite::{Connection, Result};
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::io::BufReader;
mod db_setup;
mod oc_parser;
fn main() {
@ -77,18 +79,34 @@ fn main() {
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
let mut out = BufWriter::new(std::io::stdout().lock());
let conn = Connection::open("out.db").ensure("failed to open DB connection");
db_setup::initial_schema(&conn);
// afterwards:
// add actual IDs to grammemes
// properly reference keys internally
// add foreign key constraint on lemma_grammemes.grammeme
let mut tx = conn
.unchecked_transaction()
.ensure("failed to start transaction");
let mut count = 0;
while let Some(elem) = parser.next_element() {
if let oc_parser::OcElement::Lemma(lemma) = elem {
if lemma.lemma.word == "тяжёлый" {
writeln!(out, "{:?}", lemma).ensure("writing output failed");
break;
}
// commit every 1000 things
if count % 1000 == 0 {
tx.commit().ensure("transaction failed");
tx = conn
.unchecked_transaction()
.ensure("failed to start new transaction");
info!("transaction committed at watermark {}", count);
}
}
out.flush().ensure("flushing the out buffer failed");
db_setup::insert_oc_element(&tx, elem);
count += 1;
}
}
/// It's like `expect`, but through `log::error`.