Merge pull request #37 from oscar-corpus/dev
Ungoliant v1.1.0
Uinelj authored Feb 28, 2022
2 parents eb8ccf8 + c2d7cdf commit aff79ed
Showing 30 changed files with 1,646 additions and 179 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
ut1-blacklists/
debug/
target/
Cargo.lock
14 changes: 11 additions & 3 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "ungoliant"
version = "1.0.0"
version = "1.1.0"
authors = ["Julien Abadji <[email protected]>, Pedro J. Ortiz <[email protected]>"]
edition = "2021"
description = "The pipeline for the OSCAR corpus."
@@ -20,8 +20,8 @@ log = "*"
itertools = "0.10.0"
tokio = { version = "1", features = ["full"] }
tokio-util = {version="0.6.6", features=["compat"]}
warc = {git = "https://github.com/jedireza/warc", features=["with_serde"], version = "0.3"}
ut1_blocklist = {git = "https://github.com/oscar-corpus/ut1-rs"}
warc = {version="0.3.0", features=["with_serde"]}
ut1_blocklist = "0.1.0"
fasttext = "0.6"
bytes = "1"
rayon = "1"
@@ -36,6 +36,10 @@ runiq-lib = "1.2.2"
rand = "0.8.4"
url = "2.2.2"
avro-rs = { version = "0.13.0", features = ["snappy"] }
unicode-script = "0.5.4"
unicode-segmentation = "1.8.0"
csv = "1.1.6"
unic-ucd = "0.9.0"

[dev-dependencies]
rand_distr = "0.4.2"
@@ -51,3 +55,7 @@ harness = false
[[bench]]
name = "pipeline_bench_rayon"
harness = false

[[bench]]
name = "annotate_noisy"
harness = false
32 changes: 32 additions & 0 deletions benches/annotate_noisy.rs
@@ -0,0 +1,32 @@
use std::collections::HashMap;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use ungoliant::{
pipelines::oscardoc::types::{Document, Metadata},
transformers::{Annotate, Noisy},
};
pub fn noisy(c: &mut Criterion) {
let documents: Vec<Document> = [
"//////////////////////////////////////////////.",
"lorem ipsum dolor sit ////////////////////////.",
"lore////mmm////m ipsum d///////olor//////sit a.",
"lorem ipsum dolor sit amet.",
]
.into_iter()
.map(String::from)
.map(|content| Document::new(content, HashMap::new(), Metadata::default()))
.collect();

let a = Noisy::default();
c.bench_function("noisy_annotate", |b| {
b.iter(|| {
let documents = documents.clone();
for mut d in documents {
a.annotate(black_box(&mut d))
}
})
});
}

criterion_group!(benches, noisy);
criterion_main!(benches);
9 changes: 9 additions & 0 deletions src/cli.rs
@@ -21,8 +21,17 @@ pub enum Ungoliant {
Package(Package),
#[structopt(about = "rebuild the corpus for a given language")]
Rebuild(Rebuild),
#[structopt(about = "check for corpus validity")]
Check(Check),
}

#[derive(Debug, StructOpt)]
pub struct Check {
#[structopt(parse(from_os_str), help = "Corpus file")]
pub src: PathBuf,
#[structopt(parse(from_os_str), help = "csv file destination")]
pub dst: PathBuf,
}
#[derive(Debug, StructOpt)]
pub struct Rebuild {
#[structopt(parse(from_os_str), help = "source rebuild file (not directory)")]
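A minimal sketch of how the new check subcommand surfaces on the command line, assuming the Check struct above is in scope; the helper name and the file paths are placeholders, not part of this diff. StructOpt fills the positional fields in declaration order, so the invocation shape is ungoliant check <Corpus file> <csv file destination>.

use std::path::PathBuf;
use structopt::StructOpt;

// Hypothetical helper (not in the diff). The first element stands in for argv[0]
// and is skipped by StructOpt; the remaining two fill `src` and `dst` in order.
fn parse_check_example() -> (PathBuf, PathBuf) {
    let args = Check::from_iter(["ungoliant-check", "my_corpus_file", "my_checks.csv"]);
    (args.src, args.dst)
}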
7 changes: 7 additions & 0 deletions src/error.rs
@@ -18,6 +18,13 @@ pub enum Error {
FastText(String),
IncompleteLocation(IncompleteLocation),
Avro(avro_rs::Error),
Csv(csv::Error),
}

impl From<csv::Error> for Error {
fn from(v: csv::Error) -> Self {
Self::Csv(v)
}
}

impl From<avro_rs::Error> for Error {
10 changes: 5 additions & 5 deletions src/filtering/filter.rs
@@ -1,14 +1,14 @@
//! Filtering traits.
/// immutable, pure filter (=2 successive equal inputs -> 2 equal outputs)
/// immutable, pure filter (2 successive equal inputs -> 2 equal outputs)
pub trait Filter<T>: Default {
fn detect(&self, item: T) -> bool;
}

/// mutable filter (that holds state)
// Note that the function name is different,
// Because some filters may be able to use both traits
// (it is possible to keep same naming but the ergonomics are weird)
/// mutable filter (that holds state).
/// Note that the function name is different,
/// Because some filters may be able to use both traits
/// (it is possible to keep same naming but the ergonomics are weird)
pub trait FilterMut<T>: Default {
fn detect_mut(&mut self, item: T) -> bool;
}
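Illustrative only, not part of the diff: a toy implementation of the pure Filter trait above. The MinLen type and its threshold are invented for the example; the point is that detect takes &self and always answers the same for the same input.

// Toy pure filter over &str: keep items of at least `min_len` bytes.
// With the derived Default (min_len = 0) everything passes, so callers would
// construct it with an explicit threshold.
#[derive(Default)]
struct MinLen {
    min_len: usize,
}

impl Filter<&str> for MinLen {
    fn detect(&self, item: &str) -> bool {
        item.len() >= self.min_len
    }
}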
2 changes: 1 addition & 1 deletion src/filtering/mod.rs
@@ -8,7 +8,7 @@ Filters implement [filter::Filter], [filter::FilterMut] or both:
Both can be implemented for a given filter,
in order to provide a mutable detection that could be used to "train" the filter, then an immutable one to effectively filter content.
! */
!*/
mod filter;
pub mod record;
pub mod sentence;
11 changes: 7 additions & 4 deletions src/filtering/record.rs
@@ -1,4 +1,6 @@
//! Document-level filtering.
//!
//! Those filters take a WARC [warc::Record] as a parameter.
use std::convert::TryFrom;

use warc::{BufferedBody, Record};
@@ -27,14 +29,15 @@ impl Filter<&Record<BufferedBody>> for FilterKind {
/// Filters out documents whose content is not sufficiently contained in long newline-separated strings.
///
/// For each document, we compute the size (in bytes) of newline-separated strings, that we bucket in two bins
/// depending on their size (<>min_length).
/// If the >min_length bin makes for at least sentence_threshold of the document, we keep it.
/// depending on their size. The threshold size is specified in the [PFilter::min_length].
/// If the `>min_length` bin makes for at least sentence_threshold of the document, we keep it.
pub struct PFilter {
sentence_threshold: f64,
sentence_filter: Length,
}

impl PFilter {
/// Create a new PFilter with specific parameters.
pub fn new(sentence_threshold: f64, sentence_filter: Length) -> Self {
PFilter {
sentence_threshold,
@@ -83,8 +86,8 @@ impl Filter<&Record<BufferedBody>> for PFilter {
}

impl Default for PFilter {
/// inits PFilter with a threshold of 0.6 (that means, at least 60% of content is from long sentences)
/// sentence filter's default long sentence threshold (100 codepoints).
/// inits PFilter with a threshold of `0.6` (that means, at least 60% of content is from long sentences)
/// sentence filter's default long sentence threshold (`100 codepoints`).
fn default() -> Self {
PFilter {
sentence_threshold: 0.6,
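The decision PFilter documents above can be sketched as follows; this is an approximation reconstructed from the doc comment, not the implementation elided by this diff, and it buckets by byte length directly whereas the real filter presumably delegates the length cutoff to its Length sentence filter (100 codepoints by default).

// Keep a document when long newline-separated strings account for at least
// `sentence_threshold` of its total bytes (defaults documented above: 0.6 and 100).
fn keeps_document(doc: &str, min_length: usize, sentence_threshold: f64) -> bool {
    let (long, total) = doc.lines().fold((0usize, 0usize), |(long, total), line| {
        let bytes = line.len();
        if line.chars().count() > min_length {
            (long + bytes, total + bytes)
        } else {
            (long, total + bytes)
        }
    });
    total > 0 && long as f64 / total as f64 >= sentence_threshold
}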
2 changes: 1 addition & 1 deletion src/filtering/sentence.rs
@@ -1,4 +1,4 @@
//! sentence-level filtering
//! Sentence-level filtering
use super::filter::FilterMut;
use super::Filter;
use std::convert::TryInto;
115 changes: 111 additions & 4 deletions src/identifiers/fasttext.rs
@@ -1,10 +1,10 @@
//! Fasttext identifier
use std::path::Path;
use std::{collections::HashMap, path::Path, str::Lines};

use crate::error::Error;
use crate::{error::Error, lang::Lang};
use fasttext::{FastText as FastTextLib, Prediction};

use super::{identifier, Identification};
use super::{identifier, Identification, Identifier};

/// Clean the prediction label field from `__label__xx` into `xx`.
///
@@ -92,9 +92,66 @@ impl FastText {
))
}
}

/// Identifies each line, then returns both identifications for each line _and_
/// a HashMap holding (byte_count, sum(byte_count*prob) / total count).
pub fn get_weighted_ids(
&self,
lines: Lines,
) -> Result<
(
Vec<Option<Identification>>,
HashMap<Option<Lang>, (usize, f32)>,
usize,
),
Error,
> {
// per-lang and total byte counts
// lang_count maps Lang -> (lang_byte_count, sum(byte_count*prob))
let mut lang_count = HashMap::new();
let mut total_count = 0;

// filter out unicode null chars
// this prevents fasttext errors and hopefully improves
// corpus quality
let lines = lines.map(|l| l.replace(char::from(0), ""));
let ids: Vec<Option<Identification>> = lines
.map(|line| {
// identify
let id = self.identify(line.as_str());

// add to byte count for document-level identification
if let Ok(ref ide) = id {
// map Identification to its lang, or keep None to store the "None" language identification
let ide_label = ide.as_ref().map(|i| *i.label());
let ide_prob = ide.as_ref().map(|i| *i.prob());
// get length of current line
let byte_count = line.bytes().count();

lang_count
.entry(ide_label)
.and_modify(|(count, count_times_prob)| {
*count += byte_count;
*count_times_prob += byte_count as f32 * ide_prob.unwrap_or(1.0f32);
})
.or_insert((byte_count, byte_count as f32 * ide_prob.unwrap_or(1.0f32)));

total_count += byte_count;
}
id
})
.collect::<Result<_, Error>>()?;

// divide by total count to get probs between 0 and 1.
for (_, count_times_prob) in lang_count.values_mut() {
*count_times_prob /= total_count as f32;
}

Ok((ids, lang_count, total_count))
}
}

impl identifier::Identifier for FastText {
impl identifier::Identifier<&str> for FastText {
fn identify(&self, sentence: &str) -> Result<Option<Identification>, Error> {
let prediction = self
.predictor
@@ -111,6 +168,7 @@ impl identifier::Identifier for FastText {
}
}
}

#[cfg(test)]
mod tests {
use super::*;
@@ -173,6 +231,55 @@ mod tests {
assert!(cls2.is_err());
}

#[test]
fn test_weighted_ids() {
let classifier = FastText::new_lid().expect("could not instantiate a classifier");
let document = "This sentence is a long, long sentence that happens to be in english.
This one too, what a coincidence, whew. The quick brown fox jumps over the lazy dog. This one too, what a coincidence, whew. The quick brown fox jumps over the lazy dog.
Phrase courte en français
il y en a 3 mais moins de contenu que les anglaises
héhé c'est vrai que c'est petit
qdlskfjqmfdjlmkj";

let lines: Vec<&str> = document.lines().collect();
let en_count = lines[0].len() + lines[1].len();
let fr_count = lines[2].len() + lines[3].len() + lines[4].len();
let unk_count = lines[5].len();
let total_count = en_count + fr_count + unk_count;
let (ids, langs, total_size) = classifier.get_weighted_ids(document.lines()).unwrap();

println!("{:?}", ids);
println!("{:?}", langs);
println!("{:?}", total_size);
// assert correct byte counts
assert_eq!(langs.get(&Some(Lang::En)).unwrap().0, en_count);
assert_eq!(langs.get(&Some(Lang::Fr)).unwrap().0, fr_count);
assert_eq!(langs.get(&None).unwrap().0, unk_count);

// assert correct language ids
let expected_ids = [
Some(Lang::En),
Some(Lang::En),
Some(Lang::Fr),
Some(Lang::Fr),
Some(Lang::Fr),
None,
];

assert_eq!(
ids.into_iter()
.map(|id| id.map(|x| *x.label()))
.collect::<Vec<Option<Lang>>>(),
expected_ids
);

// assert total count
assert_eq!(total_count, total_size);

//assert sum lengths
let (lengths, _): (Vec<usize>, Vec<f32>) = langs.values().map(|v| (v.0, v.1)).unzip();
assert_eq!(lengths.iter().sum::<usize>(), total_count);
}
// #[test]
// fn test_clean_prediction_invalid() -> {

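A toy check of the weighting described in get_weighted_ids' doc comment above: per language, the second tuple field is sum(byte_count * prob) over that language's lines, divided by the total byte count. The line sizes and probabilities below are invented for illustration.

fn main() {
    // (language, byte_count, per-line probability) for three made-up lines
    let lines = [("en", 40usize, 0.90f32), ("en", 20, 0.80), ("fr", 40, 0.95)];
    let total: usize = lines.iter().map(|(_, bytes, _)| *bytes).sum();

    // weighted English score: sum(byte_count * prob) over English lines / total bytes
    let en_score: f32 = lines
        .iter()
        .filter(|(lang, _, _)| *lang == "en")
        .map(|(_, bytes, prob)| *bytes as f32 * prob)
        .sum::<f32>()
        / total as f32;

    // 40 * 0.90 + 20 * 0.80 = 52 weighted bytes out of 100 total
    assert!((en_score - 0.52).abs() < 1e-6);
    println!("en weighted score: {}", en_score);
}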
7 changes: 4 additions & 3 deletions src/identifiers/identifier.rs
@@ -5,11 +5,12 @@ All identifiers should implement [Identifier] to be useable in processing and pi
use std::str::FromStr;

use fasttext::Prediction;
use schemars::JsonSchema;

use crate::{error::Error, lang::Lang};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema)]
#[serde(from = "IdentificationSer", into = "IdentificationSer")]
pub struct Identification {
label: Lang,
Expand Down Expand Up @@ -62,9 +63,9 @@ impl From<Prediction> for Identification {
}
}
}
pub trait Identifier {
pub trait Identifier<T> {
/// returns a language identification token (from [crate::lang::LANG]).
fn identify(&self, sentence: &str) -> Result<Option<Identification>, Error>;
fn identify(&self, sentence: T) -> Result<Option<Identification>, Error>;
}

#[cfg(test)]
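A short sketch of what the new type parameter on Identifier buys, assuming Identifier, Identification and Error from this module are in scope; ConstId is a made-up type, not part of the diff. One identifier type can now implement the trait for several input shapes, for example a single line and a batch of lines.

// Made-up identifier that never commits to a language; it only shows the two impls.
struct ConstId;

impl Identifier<&str> for ConstId {
    fn identify(&self, _sentence: &str) -> Result<Option<Identification>, Error> {
        Ok(None)
    }
}

impl Identifier<&[String]> for ConstId {
    fn identify(&self, _sentences: &[String]) -> Result<Option<Identification>, Error> {
        Ok(None)
    }
}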
3 changes: 3 additions & 0 deletions src/identifiers/mod.rs
@@ -6,7 +6,10 @@ The current identifier used is [fasttext](https://fasttext.cc)
!*/
mod fasttext;
mod identifier;
mod multilingual;

pub use self::fasttext::FastText;
pub use identifier::Identification;
pub use identifier::Identifier;
pub use multilingual::Multilingual;
pub use multilingual::StrictMultilingual;