feat(corp/russian/data-import): new OpenCorpora data import tool
Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
032ab16bbb
commit
ee7616d956
6 changed files with 829 additions and 0 deletions
126
corp/russian/data-import/src/main.rs
Normal file
126
corp/russian/data-import/src/main.rs
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
//! This program imports Russian language data from OpenCorpora
|
||||
//! ("Открытый корпус") into a SQLite database that can be used for
|
||||
//! [//corp/russian][corp-russian] projects.
|
||||
//!
|
||||
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
|
||||
//!
|
||||
//! Ideally, running this on an OpenCorpora dump should yield a fully
|
||||
//! functional SQLite database compatible with all other tools
|
||||
//! consuming it.
|
||||
//!
|
||||
//! ## OpenCorpora format
|
||||
//!
|
||||
//! The format used is partially documented on the [OpenCorpora
|
||||
//! website][format-docs]. This seems to be a slightly outdated
|
||||
//! format, however, hence some information about what the format
|
||||
//! seems to be today.
|
||||
//!
|
||||
//! [format-docs]: http://opencorpora.org/?page=export
|
||||
//!
|
||||
//! The format is an XML file, which has several categories of data,
|
||||
//! each with their own schema:
|
||||
//!
|
||||
//! * `grammemes`: These define units of grammar. They're *likely* pretty
|
||||
//! static, and we'll *likely* want to map them into a custom set of
|
||||
//! (simpler) categories.
|
||||
//!
|
||||
//! They form some kind of internal hierarchy, where some of them have a
|
||||
//! `parent` attribute set to some other grammeme's `name`.
|
||||
//!
|
||||
//! There's a ridiculous number of these.
|
||||
//!
|
||||
//! * `restrictions`: Unclear, not documented on the page. They describe
|
||||
//! something about the relationship between grammemes.
|
||||
//!
|
||||
//! * `lemmata`: this lists the actual lemmas, as well as all their
|
||||
//! included morphological variants
|
||||
//!
|
||||
//! Each lemma has an `id` attribute uniquely identifying its dictionary
|
||||
//! form, as well as a number of sub-elements:
|
||||
//!
|
||||
//! * the `l` element contains the lemma itself
|
||||
//! * the `f` elements contain morphological variations
|
||||
//!
|
||||
//! Each of these sub elements again contains a number of `g` elements,
|
||||
//! which refer to the IDs of grammemes in their `v` attributes.
|
||||
//!
|
||||
//! * `<link_types>` These list possible "relationships between lemmas",
|
||||
//! basically just assigning them IDs and names. There's only 27 of
|
||||
//! these.
|
||||
//!
|
||||
//! * `<links>`: Using the types defined above, this establishes links
|
||||
//! between lemmas that have some kind of relationship.
|
||||
//!
|
||||
//! For example, a relationship `cardinal/ordinal` might be established
|
||||
//! between the lemmas "два" and "второй".
|
||||
|
||||
use log::{error, info};
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, BufWriter, Write};
|
||||
|
||||
mod oc_parser;
|
||||
|
||||
fn main() {
|
||||
env_logger::builder()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.init();
|
||||
|
||||
let input_path = env::args()
|
||||
.skip(1)
|
||||
.next()
|
||||
.ensure("must specify the input filename as the only argument");
|
||||
|
||||
info!("reading from {input_path}");
|
||||
let input_file = File::open(input_path).ensure("failed to open input file");
|
||||
|
||||
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
||||
|
||||
let mut out = BufWriter::new(std::io::stdout().lock());
|
||||
|
||||
while let Some(elem) = parser.next_element() {
|
||||
match elem {
|
||||
oc_parser::OcElement::Grammeme(g) => {
|
||||
writeln!(out, "{:?}", g).ensure("writing element failed")
|
||||
}
|
||||
oc_parser::OcElement::Lemma(_) => continue,
|
||||
}
|
||||
}
|
||||
|
||||
out.flush().ensure("flushing the out buffer failed");
|
||||
}
|
||||
|
||||
/// Unwrapping helper in the spirit of `expect`: implementors yield the
/// contained value on success, and otherwise report `msg` through
/// `log::error` instead of panicking.
trait Ensure<T> {
    /// Return the contained value, or report `msg` as an error.
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>;
}
|
||||
|
||||
impl<T, E: Display> Ensure<T> for Result<T, E> {
|
||||
fn ensure<S: Into<String>>(self, msg: S) -> T {
|
||||
match self {
|
||||
Ok(x) => x,
|
||||
Err(err) => {
|
||||
error!("{}: {}", msg.into(), err);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Ensure<T> for Option<T> {
|
||||
fn ensure<S: Into<String>>(self, msg: S) -> T {
|
||||
match self {
|
||||
Some(x) => x,
|
||||
None => {
|
||||
error!("{}", msg.into());
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn bail<S: Into<String>>(msg: S) -> ! {
|
||||
error!("{}", msg.into());
|
||||
std::process::exit(1);
|
||||
}
|
||||
262
corp/russian/data-import/src/oc_parser.rs
Normal file
262
corp/russian/data-import/src/oc_parser.rs
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
use super::{bail, Ensure};
|
||||
use log::info;
|
||||
use xml::attribute::OwnedAttribute;
|
||||
use xml::name::OwnedName;
|
||||
use xml::reader::XmlEvent;
|
||||
use xml::EventReader;
|
||||
|
||||
/// A unit of grammar, as read from a `<grammeme>` element.
#[derive(Debug, Default)]
pub struct Grammeme {
    /// Value of the `parent` attribute; `None` when the attribute is
    /// absent or empty.
    parent: Option<String>,

    /// Text content of the `<name>` child element.
    name: String,

    /// Text content of the `<alias>` child element.
    alias: String,

    /// Text content of the `<description>` child element.
    description: String,
}
|
||||
|
||||
/// Placeholder for a parsed `<lemma>` element; it carries no data yet.
#[derive(Debug)]
pub struct Lemma {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum OcElement {
|
||||
Grammeme(Grammeme),
|
||||
Lemma(Lemma),
|
||||
}
|
||||
|
||||
/// Which part of the document the parser is currently positioned in.
#[derive(PartialEq, Debug)]
enum ParserState {
    /// Outside of any section; the parser is waiting for a section
    /// start tag.
    Init,

    /// Inside the `grammemes` section.
    Grammemes,

    /// Inside the `lemmata` section.
    Lemmata,

    /// The end of the document has been seen; no further elements
    /// are available.
    Ended,
}
|
||||
|
||||
pub struct OpenCorporaParser<R: std::io::Read> {
|
||||
reader: EventReader<R>,
|
||||
state: ParserState,
|
||||
}
|
||||
|
||||
/// How the parser treats a top-level section of the dump.
#[derive(PartialEq)]
enum SectionState {
    /// The section's contents are parsed.
    Active,

    /// The section is recognised, but its contents are currently
    /// ignored.
    Inactive,

    /// The section name is not recognised (probably a bug).
    Unknown,
}

/// Classify a section by its tag name.
fn section_state(section: &str) -> SectionState {
    const ACTIVE: [&str; 2] = ["grammemes", "lemmata"];
    const INACTIVE: [&str; 3] = ["restrictions", "link_types", "links"];

    if ACTIVE.contains(&section) {
        SectionState::Active
    } else if INACTIVE.contains(&section) {
        SectionState::Inactive
    } else {
        SectionState::Unknown
    }
}
|
||||
|
||||
impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
let config = xml::ParserConfig::new().trim_whitespace(true);
|
||||
let reader = EventReader::new_with_config(reader, config);
|
||||
|
||||
Self {
|
||||
reader,
|
||||
state: ParserState::Init,
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull an `OcElement` out of the parser. Returns `None` if the
|
||||
/// parser stream has ended.
|
||||
pub fn next_element(&mut self) -> Option<OcElement> {
|
||||
if self.state == ParserState::Ended {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Pull the next element to determine what context to enter
|
||||
// next.
|
||||
loop {
|
||||
match &self.next() {
|
||||
// no-op events that do not affect parser state
|
||||
XmlEvent::Comment(_)
|
||||
| XmlEvent::Whitespace(_)
|
||||
| XmlEvent::ProcessingInstruction { .. }
|
||||
| XmlEvent::StartDocument { .. } => continue,
|
||||
XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
|
||||
if name.local_name == "dictionary" =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// end of the file, nothing more to return
|
||||
XmlEvent::EndDocument => {
|
||||
self.state = ParserState::Ended;
|
||||
return None;
|
||||
}
|
||||
|
||||
// some sections are skipped
|
||||
XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
|
||||
if section_state(&name.local_name) == SectionState::Inactive =>
|
||||
{
|
||||
info!("skipping {} section", name.local_name);
|
||||
self.skip_section(&name.local_name);
|
||||
}
|
||||
|
||||
// active section events start specific parser states ...
|
||||
XmlEvent::StartElement { name, .. }
|
||||
if section_state(&name.local_name) == SectionState::Active =>
|
||||
{
|
||||
self.state = match name.local_name.as_str() {
|
||||
"grammemes" => ParserState::Grammemes,
|
||||
"lemmata" => ParserState::Lemmata,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
// ... or end them
|
||||
XmlEvent::EndElement { name, .. }
|
||||
if section_state(&name.local_name) == SectionState::Active =>
|
||||
{
|
||||
// TODO: assert that the right section ended
|
||||
self.state = ParserState::Init;
|
||||
}
|
||||
|
||||
// actual beginning of an actual element, dispatch accordingly
|
||||
event @ XmlEvent::StartElement {
|
||||
name, attributes, ..
|
||||
} => match self.state {
|
||||
ParserState::Grammemes => {
|
||||
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
|
||||
}
|
||||
ParserState::Lemmata => {
|
||||
return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
|
||||
}
|
||||
|
||||
ParserState::Init | ParserState::Ended => bail(format!(
|
||||
"parser received an unexpected start element while in state {:?}: {:?}",
|
||||
self.state, event
|
||||
)),
|
||||
},
|
||||
|
||||
// finally, events that indicate a bug if they're
|
||||
// encountered here
|
||||
event @ XmlEvent::EndElement { .. }
|
||||
| event @ XmlEvent::CData(_)
|
||||
| event @ XmlEvent::Characters(_) => {
|
||||
bail(format!("unexpected XML event: {:?}", event))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Skip a section by advancing the parser state until we see an
|
||||
/// end element for the skipped section.
|
||||
fn skip_section(&mut self, section: &str) {
|
||||
loop {
|
||||
match self.next() {
|
||||
XmlEvent::EndElement { name } if name.local_name == section => return,
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn next(&mut self) -> XmlEvent {
|
||||
self.reader.next().ensure("XML parsing failed")
|
||||
}
|
||||
|
||||
/// Parse a tag that should have plain string content.
|
||||
fn parse_string(&mut self, tag_name: &str) -> String {
|
||||
let mut out = String::new();
|
||||
|
||||
loop {
|
||||
match self.next() {
|
||||
// ignore irrelevant things
|
||||
XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
|
||||
|
||||
// set the content
|
||||
XmlEvent::Characters(content) => {
|
||||
out = content;
|
||||
}
|
||||
|
||||
// expect the end of the element
|
||||
XmlEvent::EndElement { name } if name.local_name == tag_name => return out,
|
||||
|
||||
// fail on everything unexpected
|
||||
event => bail(format!(
|
||||
"unexpected element while parsing <{}>: {:?}",
|
||||
tag_name, event
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
|
||||
if name.local_name != "grammeme" {
|
||||
bail(format!(
|
||||
"expected to parse a grammeme, but found <{}>",
|
||||
name.local_name
|
||||
));
|
||||
}
|
||||
|
||||
let mut grammeme = Grammeme::default();
|
||||
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "parent" && !attr.value.is_empty() {
|
||||
grammeme.parent = Some(attr.value.clone());
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
match self.next() {
|
||||
// ignore irrelevant things
|
||||
XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
|
||||
|
||||
// expect known tags
|
||||
XmlEvent::StartElement { name, .. } if name.local_name == "name" => {
|
||||
grammeme.name = self.parse_string("name");
|
||||
}
|
||||
|
||||
XmlEvent::StartElement { name, .. } if name.local_name == "alias" => {
|
||||
grammeme.alias = self.parse_string("alias");
|
||||
}
|
||||
|
||||
XmlEvent::StartElement { name, .. } if name.local_name == "description" => {
|
||||
grammeme.description = self.parse_string("description");
|
||||
}
|
||||
|
||||
// handle end of the grammeme
|
||||
XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,
|
||||
|
||||
// fail on everything unexpected
|
||||
event => bail(format!(
|
||||
"unexpected element while parsing <grammeme>: {:?}",
|
||||
event
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
grammeme
|
||||
}
|
||||
|
||||
fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma {
|
||||
if name.local_name != "lemma" {
|
||||
bail(format!(
|
||||
"expected to parse a lemma, but found <{}>",
|
||||
name.local_name
|
||||
));
|
||||
}
|
||||
|
||||
self.skip_section("lemma");
|
||||
|
||||
Lemma {}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue