diff --git a/Project.toml b/Project.toml
index 84d9468..71d41ff 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "SimString"
 uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
 authors = ["Bernard Brenyah"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
diff --git a/README.md b/README.md
index 900293d..8a0fc95 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,9 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Fast algorithm for string matching
 - [X] 100% exact retrieval
 - [X] Support for unicodes
+- [X] Support for building databases directly from text files
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -41,7 +41,7 @@ pkg> add SimString
 The few (and selected) brave ones can simply grab the current experimental features by simply adding the master branch to your development environment after invoking the package manager with `]`:
 
 ```julia
-pkg> add SimString#master
+pkg> add SimString#main
 ```
 
 You are good to go with bleeding edge features and breakages!
diff --git a/docs/src/index.md b/docs/src/index.md
index 06d50b2..e253381 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -7,16 +7,18 @@ CurrentModule = SimString
 Documentation for [SimString](https://github.com/PyDataBlog/SimString.jl).
 A native Julia implementation of the CPMerge algorithm, which is designed for approximate string matching.
 
-This package is be particulary useful for natural language processing tasks which demand the retrieval of strings/texts from a very large corpora (big amounts of texts). Currently, this package supports both Character and Word based N-grams feature generations and there are plans to open the package up for custom user defined feature generation methods.
+This package is be particulary useful for natural language processing tasks which require the retrieval of strings/texts from a very large corpora (big amounts of texts). Currently, this package supports both Character and Word based N-grams feature generations and there are plans to open the package up for custom user defined feature generation methods.
+
+CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10-1096/)
 
 ## Features
 
 - [X] Fast algorithm for string matching
 - [X] 100% exact retrieval
 - [X] Support for unicodes
+- [X] Support for building databases directly from text files
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -82,6 +84,7 @@ desc = describe_collection(db)
 
 ## Release History
 - 0.1.0 Initial release.
+- 0.2.0 Added support for unicodes
 
 ```@index
 ```
diff --git a/src/dictdb.jl b/src/dictdb.jl
index 463c936..4b932d3 100644
--- a/src/dictdb.jl
+++ b/src/dictdb.jl
@@ -129,7 +129,7 @@ end
 Internal function to lookup feature sets by size and feature
 """
 function lookup_feature_set_by_size_feature(db::DictDB, size, feature)
-    if feature ∉ keys(db.lookup_cache[size])
+    if !haskey(db.lookup_cache[size], feature)
         db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())
     end
     return db.lookup_cache[size][feature]
diff --git a/src/features.jl b/src/features.jl
index c2024b4..b796195 100644
--- a/src/features.jl
+++ b/src/features.jl
@@ -10,7 +10,6 @@ end
 Internal function to pad AbstractVector types with specified padder
 """
 function pad_string(x::AbstractVector, padder::AbstractString)
-    # TODO: Insert a padder as the first and last element of x with undef
     insert!(x, 1, padder)
     push!(x, padder)
     return x
@@ -96,7 +95,6 @@ end
 Internal function to count and pad generated character-level ngrams (including duplicates)
 """
 function cummulative_ngram_count(x)
-    # TODO: Use length of x initiate non allocated ngrams
     counter = Dict{eltype(x), Int}()
 
     return map(x) do val
diff --git a/src/search.jl b/src/search.jl
index fa0ddb4..89cb2a3 100644
--- a/src/search.jl
+++ b/src/search.jl
@@ -102,11 +102,6 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, query
     # Generate features from query string
     features = extract_features(db_collection.feature_extractor, query)
 
-    # Metadata from the generated features (length, min & max sizes)
-    # length_of_features = length(features)
-    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
-
     results = String[]
 
     # Generate and return results from the potential candidate size pool
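
For reviewers who want to sanity-check the `src/dictdb.jl` change, here is a minimal usage sketch in Julia that drives the `haskey`-guarded lookup cache through the public API. It follows the usage shown in the package README/docs (`DictDB`, `CharacterNGrams`, `describe_collection`); the `Dice()` measure and the `α`/`ranked` keywords are taken from the documented examples and should be treated as assumptions rather than as part of this patch.

```julia
using SimString

# Build an in-memory database over padded character 2-grams.
db = DictDB(CharacterNGrams(2, " "))

# Index a few strings; this populates db.string_feature_map by feature size.
append!(db, ["foo", "bar", "fooo"])

# Approximate search with the Dice measure: candidates scoring at least α are
# returned. Internally this goes through lookup_feature_set_by_size_feature,
# which this patch switches to a haskey-guarded cache fill.
results = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Summary of the indexed collection, as referenced in docs/src/index.md.
desc = describe_collection(db)
```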