Merge pull request #37 from oscar-corpus/dev
Ungoliant v1.1.0
Uinelj authored Feb 28, 2022
2 parents eb8ccf8 + c2d7cdf commit aff79ed
Showing 30 changed files with 1,646 additions and 179 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
ut1-blacklists/
debug/
target/
Cargo.lock
14 changes: 11 additions & 3 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "ungoliant"
version = "1.0.0"
version = "1.1.0"
authors = ["Julien Abadji <[email protected]>, Pedro J. Ortiz <[email protected]>"]
edition = "2021"
description = "The pipeline for the OSCAR corpus."
@@ -20,8 +20,8 @@ log = "*"
itertools = "0.10.0"
tokio = { version = "1", features = ["full"] }
tokio-util = {version="0.6.6", features=["compat"]}
warc = {git = "https://github.com/jedireza/warc", features=["with_serde"], version = "0.3"}
ut1_blocklist = {git = "https://github.com/oscar-corpus/ut1-rs"}
warc = {version="0.3.0", features=["with_serde"]}
ut1_blocklist = "0.1.0"
fasttext = "0.6"
bytes = "1"
rayon = "1"
@@ -36,6 +36,10 @@ runiq-lib = "1.2.2"
rand = "0.8.4"
url = "2.2.2"
avro-rs = { version = "0.13.0", features = ["snappy"] }
unicode-script = "0.5.4"
unicode-segmentation = "1.8.0"
csv = "1.1.6"
unic-ucd = "0.9.0"

[dev-dependencies]
rand_distr = "0.4.2"
@@ -51,3 +55,7 @@ harness = false
[[bench]]
name = "pipeline_bench_rayon"
harness = false

[[bench]]
name = "annotate_noisy"
harness = false
32 changes: 32 additions & 0 deletions benches/annotate_noisy.rs
@@ -0,0 +1,32 @@
use std::collections::HashMap;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use ungoliant::{
pipelines::oscardoc::types::{Document, Metadata},
transformers::{Annotate, Noisy},
};
pub fn noisy(c: &mut Criterion) {
let documents: Vec<Document> = [
"//////////////////////////////////////////////.",
"lorem ipsum dolor sit ////////////////////////.",
"lore////mmm////m ipsum d///////olor//////sit a.",
"lorem ipsum dolor sit amet.",
]
.into_iter()
.map(String::from)
.map(|content| Document::new(content, HashMap::new(), Metadata::default()))
.collect();

let a = Noisy::default();
c.bench_function("noisy_annotate", |b| {
b.iter(|| {
let documents = documents.clone();
for mut d in documents {
a.annotate(black_box(&mut d))
}
})
});
}

criterion_group!(benches, noisy);
criterion_main!(benches);
9 changes: 9 additions & 0 deletions src/cli.rs
@@ -21,8 +21,17 @@ pub enum Ungoliant {
Package(Package),
#[structopt(about = "rebuild the corpus for a given language")]
Rebuild(Rebuild),
#[structopt(about = "check for corpus validity")]
Check(Check),
}

#[derive(Debug, StructOpt)]
pub struct Check {
#[structopt(parse(from_os_str), help = "Corpus file")]
pub src: PathBuf,
#[structopt(parse(from_os_str), help = "csv file destination")]
pub dst: PathBuf,
}
#[derive(Debug, StructOpt)]
pub struct Rebuild {
#[structopt(parse(from_os_str), help = "source rebuild file (not directory)")]
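A minimal sketch of how the new check subcommand surfaces on the command line, assuming the Check struct above is in scope; the helper name and the file paths are placeholders, not part of this diff. StructOpt fills the positional fields in declaration order, so the invocation shape is ungoliant check <Corpus file> <csv file destination>.

use std::path::PathBuf;
use structopt::StructOpt;

// Hypothetical helper (not in the diff). The first element stands in for argv[0]
// and is skipped by StructOpt; the remaining two fill `src` and `dst` in order.
fn parse_check_example() -> (PathBuf, PathBuf) {
    let args = Check::from_iter(["ungoliant-check", "my_corpus_file", "my_checks.csv"]);
    (args.src, args.dst)
}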
7 changes: 7 additions & 0 deletions src/error.rs
@@ -18,6 +18,13 @@ pub enum Error {
FastText(String),
IncompleteLocation(IncompleteLocation),
Avro(avro_rs::Error),
Csv(csv::Error),
}

impl From<csv::Error> for Error {
fn from(v: csv::Error) -> Self {
Self::Csv(v)
}
}

impl From<avro_rs::Error> for Error {
10 changes: 5 additions & 5 deletions src/filtering/filter.rs
@@ -1,14 +1,14 @@
//! Filtering traits.
/// immutable, pure filter (=2 successive equal inputs -> 2 equal outputs)
/// immutable, pure filter (2 successive equal inputs -> 2 equal outputs)
pub trait Filter<T>: Default {
fn detect(&self, item: T) -> bool;
}

/// mutable filter (that holds state)
// Note that the function name is different,
// Because some filters may be able to use both traits
// (it is possible to keep same naming but the ergonomics are weird)
/// mutable filter (that holds state).
/// Note that the function name is different,
/// Because some filters may be able to use both traits
/// (it is possible to keep same naming but the ergonomics are weird)
pub trait FilterMut<T>: Default {
fn detect_mut(&mut self, item: T) -> bool;
}
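Illustrative only, not part of the diff: a toy implementation of the pure Filter trait above. The MinLen type and its threshold are invented for the example; the point is that detect takes &self and always answers the same for the same input.

// Toy pure filter over &str: keep items of at least `min_len` bytes.
// With the derived Default (min_len = 0) everything passes, so callers would
// construct it with an explicit threshold.
#[derive(Default)]
struct MinLen {
    min_len: usize,
}

impl Filter<&str> for MinLen {
    fn detect(&self, item: &str) -> bool {
        item.len() >= self.min_len
    }
}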
2 changes: 1 addition & 1 deletion src/filtering/mod.rs
@@ -8,7 +8,7 @@ Filters implement [filter::Filter], [filter::FilterMut] or both:
Both can be implemented for a given filter,
in order to provide a mutable detection that could be used to "train" the filter, then an immutable one to effectively filter content.
! */
!*/
mod filter;
pub mod record;
pub mod sentence;
11 changes: 7 additions & 4 deletions src/filtering/record.rs
@@ -1,4 +1,6 @@
//! Document-level filtering.
//!
//! Those filters take a WARC [warc::Record] as a parameter.
use std::convert::TryFrom;

use warc::{BufferedBody, Record};
@@ -27,14 +29,15 @@ impl Filter<&Record<BufferedBody>> for FilterKind {
/// Filters out documents whose content is not sufficiently contained in long newline-separated strings.
///
/// For each document, we compute the size (in bytes) of newline-separated strings, that we bucket in two bins
/// depending on their size (<>min_length).
/// If the >min_length bin makes for at least sentence_threshold of the document, we keep it.
/// depending on their size. The threshold size is specified in the [PFilter::min_length].
/// If the `>min_length` bin makes for at least sentence_threshold of the document, we keep it.
pub struct PFilter {
sentence_threshold: f64,
sentence_filter: Length,
}

impl PFilter {
/// Create a new PFilter with specific parameters.
pub fn new(sentence_threshold: f64, sentence_filter: Length) -> Self {
PFilter {
sentence_threshold,
@@ -83,8 +86,8 @@ impl Filter<&Record<BufferedBody>> for PFilter {
}

impl Default for PFilter {
/// inits PFilter with a threshold of 0.6 (that means, at least 60% of content is from long sentences)
/// sentence filter's default long sentence threshold (100 codepoints).
/// inits PFilter with a threshold of `0.6` (that means, at least 60% of content is from long sentences)
/// sentence filter's default long sentence threshold (`100 codepoints`).
fn default() -> Self {
PFilter {
sentence_threshold: 0.6,
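The decision PFilter documents above can be sketched as follows; this is an approximation reconstructed from the doc comment, not the implementation elided by this diff, and it buckets by byte length directly whereas the real filter presumably delegates the length cutoff to its Length sentence filter (100 codepoints by default).

// Keep a document when long newline-separated strings account for at least
// `sentence_threshold` of its total bytes (defaults documented above: 0.6 and 100).
fn keeps_document(doc: &str, min_length: usize, sentence_threshold: f64) -> bool {
    let (long, total) = doc.lines().fold((0usize, 0usize), |(long, total), line| {
        let bytes = line.len();
        if line.chars().count() > min_length {
            (long + bytes, total + bytes)
        } else {
            (long, total + bytes)
        }
    });
    total > 0 && long as f64 / total as f64 >= sentence_threshold
}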
2 changes: 1 addition & 1 deletion src/filtering/sentence.rs
@@ -1,4 +1,4 @@
//! sentence-level filtering
//! Sentence-level filtering
use super::filter::FilterMut;
use super::Filter;
use std::convert::TryInto;
115 changes: 111 additions & 4 deletions src/identifiers/fasttext.rs
@@ -1,10 +1,10 @@
//! Fasttext identifier
use std::path::Path;
use std::{collections::HashMap, path::Path, str::Lines};

use crate::error::Error;
use crate::{error::Error, lang::Lang};
use fasttext::{FastText as FastTextLib, Prediction};

use super::{identifier, Identification};
use super::{identifier, Identification, Identifier};

/// Clean the prediction label field from `__label__xx` into `xx`.
///
@@ -92,9 +92,66 @@ impl FastText {
))
}
}

/// Identifies each line, then returns both identifications for each line _and_
/// a HashMap holding (byte_count, sum(byte_count*prob) / total count).
pub fn get_weighted_ids(
&self,
lines: Lines,
) -> Result<
(
Vec<Option<Identification>>,
HashMap<Option<Lang>, (usize, f32)>,
usize,
),
Error,
> {
// per-lang and total byte counts
// lang_count maps Lang -> (lang_byte_count, sum(byte_count*prob))
let mut lang_count = HashMap::new();
let mut total_count = 0;

// filter out unicode null chars
// this prevents fasttext errors and hopefully improves
// corpus quality
let lines = lines.map(|l| l.replace(char::from(0), ""));
let ids: Vec<Option<Identification>> = lines
.map(|line| {
// identify
let id = self.identify(line.as_str());

// add to byte count for document-level identification
if let Ok(ref ide) = id {
// map Identification to its lang, or keep None to store the "None" language identification
let ide_label = ide.as_ref().map(|i| *i.label());
let ide_prob = ide.as_ref().map(|i| *i.prob());
// get length of current line
let byte_count = line.bytes().count();

lang_count
.entry(ide_label)
.and_modify(|(count, count_times_prob)| {
*count += byte_count;
*count_times_prob += byte_count as f32 * ide_prob.unwrap_or(1.0f32);
})
.or_insert((byte_count, byte_count as f32 * ide_prob.unwrap_or(1.0f32)));

total_count += byte_count;
}
id
})
.collect::<Result<_, Error>>()?;

// divide by total count to get probs between 0 and 1.
for (_, count_times_prob) in lang_count.values_mut() {
*count_times_prob /= total_count as f32;
}

Ok((ids, lang_count, total_count))
}
}

impl identifier::Identifier for FastText {
impl identifier::Identifier<&str> for FastText {
fn identify(&self, sentence: &str) -> Result<Option<Identification>, Error> {
let prediction = self
.predictor
@@ -111,6 +168,7 @@ impl identifier::Identifier for FastText {
}
}
}

#[cfg(test)]
mod tests {
use super::*;
@@ -173,6 +231,55 @@ mod tests {
assert!(cls2.is_err());
}

#[test]
fn test_weighted_ids() {
let classifier = FastText::new_lid().expect("could not instantiate a classifier");
let document = "This sentence is a long, long sentence that happens to be in english.
This one too, what a coincidence, whew. The quick brown fox jumps over the lazy dog. This one too, what a coincidence, whew. The quick brown fox jumps over the lazy dog.
Phrase courte en français
il y en a 3 mais moins de contenu que les anglaises
héhé c'est vrai que c'est petit
qdlskfjqmfdjlmkj";

let lines: Vec<&str> = document.lines().collect();
let en_count = lines[0].len() + lines[1].len();
let fr_count = lines[2].len() + lines[3].len() + lines[4].len();
let unk_count = lines[5].len();
let total_count = en_count + fr_count + unk_count;
let (ids, langs, total_size) = classifier.get_weighted_ids(document.lines()).unwrap();

println!("{:?}", ids);
println!("{:?}", langs);
println!("{:?}", total_size);
// assert correct byte counts
assert_eq!(langs.get(&Some(Lang::En)).unwrap().0, en_count);
assert_eq!(langs.get(&Some(Lang::Fr)).unwrap().0, fr_count);
assert_eq!(langs.get(&None).unwrap().0, unk_count);

// assert correct language ids
let expected_ids = [
Some(Lang::En),
Some(Lang::En),
Some(Lang::Fr),
Some(Lang::Fr),
Some(Lang::Fr),
None,
];

assert_eq!(
ids.into_iter()
.map(|id| id.map(|x| *x.label()))
.collect::<Vec<Option<Lang>>>(),
expected_ids
);

// assert total count
assert_eq!(total_count, total_size);

//assert sum lengths
let (lengths, _): (Vec<usize>, Vec<f32>) = langs.values().map(|v| (v.0, v.1)).unzip();
assert_eq!(lengths.iter().sum::<usize>(), total_count);
}
// #[test]
// fn test_clean_prediction_invalid() -> {

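A toy check of the weighting described in get_weighted_ids' doc comment above: per language, the second tuple field is sum(byte_count * prob) over that language's lines, divided by the total byte count. The line sizes and probabilities below are invented for illustration.

fn main() {
    // (language, byte_count, per-line probability) for three made-up lines
    let lines = [("en", 40usize, 0.90f32), ("en", 20, 0.80), ("fr", 40, 0.95)];
    let total: usize = lines.iter().map(|(_, bytes, _)| *bytes).sum();

    // weighted English score: sum(byte_count * prob) over English lines / total bytes
    let en_score: f32 = lines
        .iter()
        .filter(|(lang, _, _)| *lang == "en")
        .map(|(_, bytes, prob)| *bytes as f32 * prob)
        .sum::<f32>()
        / total as f32;

    // 40 * 0.90 + 20 * 0.80 = 52 weighted bytes out of 100 total
    assert!((en_score - 0.52).abs() < 1e-6);
    println!("en weighted score: {}", en_score);
}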
7 changes: 4 additions & 3 deletions src/identifiers/identifier.rs
@@ -5,11 +5,12 @@ All identifiers should implement [Identifier] to be useable in processing and pi
use std::str::FromStr;

use fasttext::Prediction;
use schemars::JsonSchema;

use crate::{error::Error, lang::Lang};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema)]
#[serde(from = "IdentificationSer", into = "IdentificationSer")]
pub struct Identification {
label: Lang,
Expand Down Expand Up @@ -62,9 +63,9 @@ impl From<Prediction> for Identification {
}
}
}
pub trait Identifier {
pub trait Identifier<T> {
/// returns a language identification token (from [crate::lang::LANG]).
fn identify(&self, sentence: &str) -> Result<Option<Identification>, Error>;
fn identify(&self, sentence: T) -> Result<Option<Identification>, Error>;
}

#[cfg(test)]
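A short sketch of what the new type parameter on Identifier buys, assuming Identifier, Identification and Error from this module are in scope; ConstId is a made-up type, not part of the diff. One identifier type can now implement the trait for several input shapes, for example a single line and a batch of lines.

// Made-up identifier that never commits to a language; it only shows the two impls.
struct ConstId;

impl Identifier<&str> for ConstId {
    fn identify(&self, _sentence: &str) -> Result<Option<Identification>, Error> {
        Ok(None)
    }
}

impl Identifier<&[String]> for ConstId {
    fn identify(&self, _sentences: &[String]) -> Result<Option<Identification>, Error> {
        Ok(None)
    }
}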
3 changes: 3 additions & 0 deletions src/identifiers/mod.rs
@@ -6,7 +6,10 @@ The current identifier used is [fasttext](https://fasttext.cc)
!*/
mod fasttext;
mod identifier;
mod multilingual;

pub use self::fasttext::FastText;
pub use identifier::Identification;
pub use identifier::Identifier;
pub use multilingual::Multilingual;
pub use multilingual::StrictMultilingual;