-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #37 from oscar-corpus/dev
Ungoliant v1.1.0
- Loading branch information
Showing
30 changed files
with
1,646 additions
and
179 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
ut1-blacklists/ | ||
debug/ | ||
target/ | ||
Cargo.lock | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[package] | ||
name = "ungoliant" | ||
version = "1.0.0" | ||
version = "1.1.0" | ||
authors = ["Julien Abadji <[email protected]>, Pedro J. Ortiz <[email protected]>"] | ||
edition = "2021" | ||
description = "The pipeline for the OSCAR corpus." | ||
|
@@ -20,8 +20,8 @@ log = "*" | |
itertools = "0.10.0" | ||
tokio = { version = "1", features = ["full"] } | ||
tokio-util = {version="0.6.6", features=["compat"]} | ||
warc = {git = "https://github.com/jedireza/warc", features=["with_serde"], version = "0.3"} | ||
ut1_blocklist = {git = "https://github.com/oscar-corpus/ut1-rs"} | ||
warc = {version="0.3.0", features=["with_serde"]} | ||
ut1_blocklist = "0.1.0" | ||
fasttext = "0.6" | ||
bytes = "1" | ||
rayon = "1" | ||
|
@@ -36,6 +36,10 @@ runiq-lib = "1.2.2" | |
rand = "0.8.4" | ||
url = "2.2.2" | ||
avro-rs = { version = "0.13.0", features = ["snappy"] } | ||
unicode-script = "0.5.4" | ||
unicode-segmentation = "1.8.0" | ||
csv = "1.1.6" | ||
unic-ucd = "0.9.0" | ||
|
||
[dev-dependencies] | ||
rand_distr = "0.4.2" | ||
|
@@ -51,3 +55,7 @@ harness = false | |
[[bench]] | ||
name = "pipeline_bench_rayon" | ||
harness = false | ||
|
||
[[bench]] | ||
name = "annotate_noisy" | ||
harness = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use std::collections::HashMap; | ||
|
||
use criterion::{black_box, criterion_group, criterion_main, Criterion}; | ||
use ungoliant::{ | ||
pipelines::oscardoc::types::{Document, Metadata}, | ||
transformers::{Annotate, Noisy}, | ||
}; | ||
/// Criterion benchmark for `Noisy::annotate`. | ||
/// | ||
/// Builds four `Document`s from fixed strings (each with an empty `HashMap` | ||
/// and `Metadata::default()`), then registers the `"noisy_annotate"` benchmark, | ||
/// which annotates every document with a default-constructed `Noisy`. | ||
pub fn noisy(c: &mut Criterion) { | ||
// Fixed inputs: three strings containing runs of `/` and one plain sentence. | ||
let documents: Vec<Document> = [ | ||
"//////////////////////////////////////////////.", | ||
"lorem ipsum dolor sit ////////////////////////.", | ||
"lore////mmm////m ipsum d///////olor//////sit a.", | ||
"lorem ipsum dolor sit amet.", | ||
] | ||
.into_iter() | ||
.map(String::from) | ||
.map(|content| Document::new(content, HashMap::new(), Metadata::default())) | ||
.collect(); | ||
|
||
// The annotator is built once, outside the benchmarked closure. | ||
let a = Noisy::default(); | ||
c.bench_function("noisy_annotate", |b| { | ||
b.iter(|| { | ||
// `annotate` takes `&mut Document`, so each iteration works on a fresh copy. | ||
// NOTE(review): this clone runs inside `b.iter`, so its cost is part of the | ||
// measured time — consider `iter_batched` if only `annotate` should be timed. | ||
let documents = documents.clone(); | ||
for mut d in documents { | ||
a.annotate(black_box(&mut d)) | ||
} | ||
}) | ||
}); | ||
} | ||
|
||
criterion_group!(benches, noisy); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
//! Filtering traits. | ||
/// immutable, pure filter (=2 successive equal inputs -> 2 equal outputs) | ||
/// Immutable, pure filter: two successive equal inputs yield two equal outputs. | ||
pub trait Filter<T>: Default { | ||
/// Runs the filter on `item` through a shared (`&self`) borrow. | ||
/// NOTE(review): what a `true` result means is not visible here — confirm with implementors. | ||
fn detect(&self, item: T) -> bool; | ||
} | ||
|
||
/// mutable filter (that holds state) | ||
// Note that the function name is different, | ||
// Because some filters may be able to use both traits | ||
// (it is possible to keep same naming but the ergonomics are weird) | ||
/// Mutable filter (one that holds state). | ||
/// Note that the method name differs from `Filter::detect` | ||
/// because some filters may be able to implement both traits | ||
/// (keeping the same name is possible, but the ergonomics are awkward). | ||
pub trait FilterMut<T>: Default { | ||
/// Runs the filter on `item` through an exclusive (`&mut self`) borrow. | ||
fn detect_mut(&mut self, item: T) -> bool; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.