feat(corp/data-import): add import of OpenRussian 'words' table
This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table). Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
ee0c0ee951
commit
429c0d00c4
6 changed files with 349 additions and 31 deletions
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
use super::{bail, Ensure};
|
||||
use crate::oc_parser::*;
|
||||
use crate::or_parser;
|
||||
use log::{debug, info};
|
||||
use rusqlite::Connection;
|
||||
|
||||
|
|
@ -69,7 +70,7 @@ CREATE TABLE oc_links (
|
|||
|
||||
"#,
|
||||
)
|
||||
.ensure("setting up initial table schema failed");
|
||||
.ensure("setting up OpenCorpora table schema failed");
|
||||
|
||||
info!("set up initial table schema for OpenCorpora import");
|
||||
}
|
||||
|
|
@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
|||
|
||||
debug!("inserted lemma {}", lemma.id);
|
||||
}
|
||||
|
||||
/// Sets up an initial schema for the OpenRussian data.
|
||||
pub fn initial_or_schema(conn: &Connection) {
|
||||
conn.execute_batch(
|
||||
r#"
|
||||
CREATE TABLE or_words (
|
||||
id INTEGER PRIMARY KEY,
|
||||
bare TEXT NOT NULL,
|
||||
accented TEXT,
|
||||
derived_from_word_id INTEGER,
|
||||
rank TEXT,
|
||||
word_type TEXT,
|
||||
level TEXT
|
||||
) STRICT;
|
||||
"#,
|
||||
)
|
||||
.ensure("setting up OpenRussian table schema failed");
|
||||
|
||||
info!("set up initial table schema for OpenRussian import");
|
||||
}
|
||||
|
||||
pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
|
||||
let mut stmt = conn
|
||||
.prepare_cached(
|
||||
"
|
||||
INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
|
||||
",
|
||||
)
|
||||
.ensure("failed to prepare OR words statement");
|
||||
let mut count = 0;
|
||||
|
||||
for word in words {
|
||||
stmt.execute((
|
||||
word.id,
|
||||
word.bare,
|
||||
word.accented,
|
||||
word.derived_from_word_id,
|
||||
word.rank,
|
||||
word.word_type,
|
||||
word.level,
|
||||
))
|
||||
.ensure("failed to insert OR word");
|
||||
count += 1;
|
||||
}
|
||||
|
||||
info!("inserted {} OpenRussian words", count);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
//! This program imports Russian language data from OpenCorpora and
|
||||
//! OpenRussian ("Открытый корпус") into a SQLite database that can be
|
||||
//! used for [//corp/russian][corp-russian] projects.
|
||||
//! This program imports Russian language data from OpenCorpora
|
||||
//! ("Открытый корпус") and OpenRussian into a SQLite database that
|
||||
//! can be used for [//corp/russian][corp-russian] projects.
|
||||
//!
|
||||
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
|
||||
//!
|
||||
|
|
@ -112,42 +112,77 @@ use std::io::BufReader;
|
|||
|
||||
mod db_setup;
|
||||
mod oc_parser;
|
||||
mod or_parser;
|
||||
|
||||
fn main() {
|
||||
env_logger::builder()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.init();
|
||||
/// Command-line configuration of the importer.
///
/// A field holding an empty string counts as "not provided" (see
/// `Args::populated`).
struct Args {
    /// Path of the SQLite database to write (`--output`).
    output: String,

    /// Location of the OpenRussian data (`--or-input`); defaults to
    /// the `OPENRUSSIAN_DATA` environment variable.
    or_input: String,

    /// Location of the OpenCorpora input file (`--oc-input`);
    /// defaults to the `OPENCORPORA_DATA` environment variable.
    oc_input: String,
}
|
||||
|
||||
let (input_path, output_path) = {
|
||||
let mut args = env::args().collect::<Vec<_>>();
|
||||
impl Args {
|
||||
fn populated(&self) -> bool {
|
||||
!(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty())
|
||||
}
|
||||
}
|
||||
|
||||
if args.len() != 3 {
|
||||
bail(format!(
|
||||
"usage: {} <input-file> <output-file>",
|
||||
args.first().map(String::as_str).unwrap_or("data-import")
|
||||
));
|
||||
}
|
||||
fn usage(binary_name: &str) {
|
||||
bail(format!(
|
||||
"usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
|
||||
binary_name
|
||||
));
|
||||
}
|
||||
|
||||
(args.remove(1), args.remove(1))
|
||||
fn parse_args() -> Args {
|
||||
let mut args_iter = env::args();
|
||||
let binary_name = args_iter.next().unwrap();
|
||||
|
||||
let mut args = Args {
|
||||
output: "".into(),
|
||||
or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
|
||||
oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
|
||||
};
|
||||
|
||||
info!("reading from {input_path}; writing output to {output_path}");
|
||||
let input_file = File::open(input_path).ensure("failed to open input file");
|
||||
loop {
|
||||
if args.populated() {
|
||||
break;
|
||||
}
|
||||
|
||||
while let Some(arg) = args_iter.next() {
|
||||
match arg.as_str() {
|
||||
"--output" => {
|
||||
args.output = args_iter.next().unwrap();
|
||||
}
|
||||
|
||||
"--or-input" => {
|
||||
args.or_input = args_iter.next().unwrap();
|
||||
}
|
||||
|
||||
"--oc-input" => {
|
||||
args.oc_input = args_iter.next().unwrap();
|
||||
}
|
||||
|
||||
_ => usage(&binary_name),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() {
|
||||
usage(&binary_name);
|
||||
}
|
||||
|
||||
args
|
||||
}
|
||||
|
||||
fn open_corpora(conn: &Connection, args: &Args) {
|
||||
let input_file = File::open(&args.oc_input).ensure("failed to open input file");
|
||||
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
||||
|
||||
let conn = Connection::open(output_path).ensure("failed to open DB connection");
|
||||
|
||||
db_setup::initial_oc_schema(&conn);
|
||||
|
||||
// afterwards:
|
||||
// add actual IDs to grammemes
|
||||
// properly reference keys internally
|
||||
// add foreign key constraint on lemma_grammemes.grammeme
|
||||
|
||||
let mut tx = conn
|
||||
.unchecked_transaction()
|
||||
.ensure("failed to start transaction");
|
||||
|
||||
let mut count = 0;
|
||||
|
||||
while let Some(elem) = parser.next_element() {
|
||||
|
|
@ -165,7 +200,46 @@ fn main() {
|
|||
count += 1;
|
||||
}
|
||||
|
||||
tx.commit().ensure("final commit failed");
|
||||
tx.commit().ensure("final OpenCorpora commit failed");
|
||||
|
||||
info!("finished OpenCorpora import");
|
||||
}
|
||||
|
||||
fn open_russian(conn: &Connection, args: &Args) {
|
||||
let parser = or_parser::OpenRussianParser::new(&args.or_input);
|
||||
|
||||
db_setup::initial_or_schema(conn);
|
||||
|
||||
let tx = conn
|
||||
.unchecked_transaction()
|
||||
.ensure("failed to start transaction");
|
||||
|
||||
db_setup::insert_or_words(&tx, parser.words());
|
||||
tx.commit().ensure("OpenRussian words commit failed");
|
||||
|
||||
info!("finished OpenRussian import");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
env_logger::builder()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.init();
|
||||
|
||||
let args = parse_args();
|
||||
|
||||
info!("output path: {}", args.output);
|
||||
info!("OpenCorpora input path: {}", args.oc_input);
|
||||
info!("OpenRussian input path: {}", args.or_input);
|
||||
|
||||
let conn = Connection::open(&args.output).ensure("failed to open DB connection");
|
||||
|
||||
open_corpora(&conn, &args);
|
||||
open_russian(&conn, &args);
|
||||
|
||||
// afterwards:
|
||||
// add actual IDs to grammemes
|
||||
// properly reference keys internally
|
||||
// add foreign key constraint on lemma_grammemes.grammeme
|
||||
}
|
||||
|
||||
/// It's like `expect`, but through `log::error`.
|
||||
|
|
|
|||
73
corp/russian/data-import/src/or_parser.rs
Normal file
73
corp/russian/data-import/src/or_parser.rs
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
//! Parser for the OpenRussian data format.
|
||||
//!
|
||||
//! Note that when exporting OpenRussian data from the project you
|
||||
//! have to choose an encoding. We choose tab-separated CSV files, as
|
||||
//! tabs have a very low probability of actually appearing in the
|
||||
//! input data and this skips some potential encoding issues.
|
||||
|
||||
use super::Ensure;
|
||||
use serde::Deserialize;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// A word from the `words` table.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Word {
|
||||
pub id: usize,
|
||||
pub position: String, // TODO: unknown
|
||||
pub bare: String, // TODO: unknown
|
||||
pub accented: String, // TODO: unknown
|
||||
pub derived_from_word_id: Option<usize>,
|
||||
pub rank: String, // TODO: unknown
|
||||
pub disabled: String, // TODO: unknown
|
||||
pub audio: String, // TODO: unknown
|
||||
pub usage_en: String, // TODO: unknown
|
||||
pub usage_de: String, // TODO: unknown
|
||||
pub number_value: String, // TODO: unknown
|
||||
|
||||
#[serde(rename = "type")]
|
||||
pub word_type: String, // TODO: unknown
|
||||
|
||||
pub level: String, // TODO: unknown
|
||||
pub created_at: String, // TODO: unknown
|
||||
}
|
||||
|
||||
pub struct OpenRussianParser {
|
||||
or_directory: PathBuf,
|
||||
}
|
||||
|
||||
pub type DynIter<T> = Box<dyn Iterator<Item = T>>;
|
||||
|
||||
impl OpenRussianParser {
|
||||
pub fn new<P: Into<PathBuf>>(path: P) -> Self {
|
||||
OpenRussianParser {
|
||||
or_directory: path.into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn words(&self) -> DynIter<Word> {
|
||||
self.parser_for("words.csv")
|
||||
}
|
||||
|
||||
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||
&self,
|
||||
file_name: &str,
|
||||
) -> Box<dyn Iterator<Item = T>> {
|
||||
let mut path = self.or_directory.clone();
|
||||
path.push(file_name);
|
||||
|
||||
let reader = csv::ReaderBuilder::new()
|
||||
.delimiter(b'\t')
|
||||
.from_reader(BufReader::new(
|
||||
File::open(&path).ensure("failed to open words.csv"),
|
||||
));
|
||||
|
||||
Box::new(reader.into_deserialize().map(|result| {
|
||||
result.ensure(format!(
|
||||
"failed to deserialize {}",
|
||||
std::any::type_name::<T>()
|
||||
))
|
||||
}))
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue