Skip to content

Commit

Permalink
Add limit parameter for when to use bitvector for terms searching in …
Browse files Browse the repository at this point in the history
…disk indexes.
  • Loading branch information
geirst committed Nov 27, 2024
1 parent c458729 commit 3e66324
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 17 deletions.
31 changes: 18 additions & 13 deletions searchcore/src/tests/proton/matching/matching_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1262,13 +1262,13 @@ TEST_F(MatchingTest, require_that_docsum_matcher_can_extract_matching_elements_f

using FMA = vespalib::FuzzyMatchingAlgorithm;

struct AttributeBlueprintParamsFixture {
struct CreateBlueprintParamsFixture {
BlueprintFactory factory;
search::fef::test::IndexEnvironment index_env;
RankSetup rank_setup;
Properties rank_properties;
AttributeBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor,
FMA fuzzy_matching_algorithm)
CreateBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor,
FMA fuzzy_matching_algorithm)
: factory(),
index_env(),
rank_setup(factory, index_env),
Expand All @@ -1281,52 +1281,57 @@ struct AttributeBlueprintParamsFixture {
}
void set_query_properties(std::string_view lower_limit, std::string_view upper_limit,
std::string_view target_hits_max_adjustment_factor,
const std::string & fuzzy_matching_algorithm) {
std::string_view fuzzy_matching_algorithm,
std::string_view disk_index_bitvector_limit) {
rank_properties.add(GlobalFilterLowerLimit::NAME, lower_limit);
rank_properties.add(GlobalFilterUpperLimit::NAME, upper_limit);
rank_properties.add(TargetHitsMaxAdjustmentFactor::NAME, target_hits_max_adjustment_factor);
rank_properties.add(FuzzyAlgorithm::NAME, fuzzy_matching_algorithm);
rank_properties.add(DiskIndexBitvectorLimit::NAME, disk_index_bitvector_limit);
}
~AttributeBlueprintParamsFixture();
~CreateBlueprintParamsFixture();
CreateBlueprintParams extract(uint32_t active_docids = 9, uint32_t docid_limit = 10) const {
return MatchToolsFactory::extract_create_blueprint_params(rank_setup, rank_properties, active_docids, docid_limit);
}
};

AttributeBlueprintParamsFixture::~AttributeBlueprintParamsFixture() = default;
CreateBlueprintParamsFixture::~CreateBlueprintParamsFixture() = default;

TEST_F(MatchingTest, attribute_blueprint_params_are_extracted_from_rank_profile)
TEST_F(MatchingTest, create_blueprint_params_are_extracted_from_rank_profile)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.rank_setup.set_disk_index_bitvector_limit(0.04);
auto params = f.extract();
EXPECT_EQ(0.2, params.global_filter_lower_limit);
EXPECT_EQ(0.8, params.global_filter_upper_limit);
EXPECT_EQ(5.0, params.target_hits_max_adjustment_factor);
EXPECT_EQ(FMA::DfaTable, params.fuzzy_matching_algorithm);
EXPECT_EQ(0.04, params.disk_index_bitvector_limit);
}

TEST_F(MatchingTest, attribute_blueprint_params_are_extracted_from_query)
TEST_F(MatchingTest, create_blueprint_params_are_extracted_from_query)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.set_query_properties("0.15", "0.75", "3.0", "dfa_explicit");
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.set_query_properties("0.15", "0.75", "3.0", "dfa_explicit", "0.02");
auto params = f.extract();
EXPECT_EQ(0.15, params.global_filter_lower_limit);
EXPECT_EQ(0.75, params.global_filter_upper_limit);
EXPECT_EQ(3.0, params.target_hits_max_adjustment_factor);
EXPECT_EQ(FMA::DfaExplicit, params.fuzzy_matching_algorithm);
EXPECT_EQ(0.02, params.disk_index_bitvector_limit);
}

TEST_F(MatchingTest, global_filter_params_are_scaled_with_active_hit_ratio)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
auto params = f.extract(5, 10);
EXPECT_EQ(0.12, params.global_filter_lower_limit);
EXPECT_EQ(0.48, params.global_filter_upper_limit);
}

TEST_F(MatchingTest, weak_and_stop_word_strategy_is_resolved_correctly)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
EXPECT_EQ(WeakAndStopWordAdjustLimit::DEFAULT_VALUE, 1.0);
EXPECT_EQ(WeakAndStopWordDropLimit::DEFAULT_VALUE, 1.0);
EXPECT_EQ(f.rank_setup.get_weakand_stop_word_adjust_limit(), 1.0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ MatchToolsFactory::extract_create_blueprint_params(const RankSetup& rank_setup,
double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range());
double weakand_stop_word_adjust_limit = WeakAndStopWordAdjustLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_adjust_limit());
double weakand_stop_word_drop_limit = WeakAndStopWordDropLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_drop_limit());
double disk_index_bitvector_limit = DiskIndexBitvectorLimit::lookup(rank_properties, rank_setup.get_disk_index_bitvector_limit());

// Note that we count the reserved docid 0 as active.
// This ensures that when searchable-copies=1, the ratio is 1.0.
Expand All @@ -367,7 +368,8 @@ MatchToolsFactory::extract_create_blueprint_params(const RankSetup& rank_setup,
fuzzy_matching_algorithm,
weakand_range,
StopWordStrategy(weakand_stop_word_adjust_limit,
weakand_stop_word_drop_limit, docid_limit)};
weakand_stop_word_drop_limit, docid_limit),
disk_index_bitvector_limit};
}

AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext,
Expand Down
2 changes: 2 additions & 0 deletions searchlib/src/tests/ranksetup/ranksetup_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ TEST_F(RankSetupTest, rank_setup)
env.getProperties().add(matching::FuzzyAlgorithm::NAME, "dfa_implicit");
env.getProperties().add(matching::WeakAndStopWordAdjustLimit::NAME, "0.05");
env.getProperties().add(matching::WeakAndStopWordDropLimit::NAME, "0.5");
env.getProperties().add(matching::DiskIndexBitvectorLimit::NAME, "0.04");

RankSetup rs(_factory, env);
EXPECT_FALSE(rs.has_match_features());
Expand Down Expand Up @@ -608,6 +609,7 @@ TEST_F(RankSetupTest, rank_setup)
EXPECT_EQ(rs.get_fuzzy_matching_algorithm(), vespalib::FuzzyMatchingAlgorithm::DfaImplicit);
EXPECT_EQ(rs.get_weakand_stop_word_adjust_limit(), 0.05);
EXPECT_EQ(rs.get_weakand_stop_word_drop_limit(), 0.5);
EXPECT_EQ(rs.get_disk_index_bitvector_limit(), 0.04);
}

bool
Expand Down
7 changes: 7 additions & 0 deletions searchlib/src/vespa/searchlib/fef/indexproperties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,13 @@ double WeakAndStopWordDropLimit::lookup(const Properties &props, double defaultV
return lookupDouble(props, NAME, defaultValue);
}

// Property name under which the disk index bitvector limit is looked up.
const std::string DiskIndexBitvectorLimit::NAME = "vespa.matching.diskindex.bitvector_limit";

// Default handed to the property lookup when the caller does not supply one.
const double DiskIndexBitvectorLimit::DEFAULT_VALUE = 1.0;

// Looks up the limit in the given properties, passing DEFAULT_VALUE as the default.
double
DiskIndexBitvectorLimit::lookup(const Properties& props)
{
    return lookupDouble(props, NAME, DEFAULT_VALUE);
}

// Looks up the limit in the given properties, passing the caller-supplied default.
double
DiskIndexBitvectorLimit::lookup(const Properties& props, double default_value)
{
    return lookupDouble(props, NAME, default_value);
}

const std::string TargetHitsMaxAdjustmentFactor::NAME("vespa.matching.nns.target_hits_max_adjustment_factor");

const double TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE(20.0);
Expand Down
11 changes: 11 additions & 0 deletions searchlib/src/vespa/searchlib/fef/indexproperties.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,17 @@ namespace matching {
static double lookup(const Properties &props, double defaultValue);
};

/**
 * Use bitvector posting list for terms searching in disk indexes that match more than this limit of the corpus.
 * If a bitvector is not available for the term, mask the posocc posting list as a bitvector iterator.
 *
 * The value is looked up as a double under the property name NAME
 * ("vespa.matching.diskindex.bitvector_limit"); DEFAULT_VALUE is 1.0.
 *
 * NOTE(review): the limit looks like a ratio of the corpus (the tests use values
 * such as 0.04 and 0.02) - confirm the valid range.
 **/
struct DiskIndexBitvectorLimit {
// Name of the property that holds the limit.
static const std::string NAME;
// Default passed to the lookup when the caller does not provide one.
static const double DEFAULT_VALUE;
// Looks up the limit in props, using DEFAULT_VALUE as the default.
static double lookup(const Properties& props);
// Looks up the limit in props, using default_value as the default
// (e.g. the value configured in the rank setup, which query properties can then override).
static double lookup(const Properties& props, double default_value);
};

/**
* Property to control the algorithm using for fuzzy matching.
**/
Expand Down
2 changes: 2 additions & 0 deletions searchlib/src/vespa/searchlib/fef/ranksetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i
_weakand_range(0.0),
_weakand_stop_word_adjust_limit(matching::WeakAndStopWordAdjustLimit::DEFAULT_VALUE),
_weakand_stop_word_drop_limit(matching::WeakAndStopWordDropLimit::DEFAULT_VALUE),
_disk_index_bitvector_limit(matching::DiskIndexBitvectorLimit::DEFAULT_VALUE),
_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::DfaTable),
_mutateOnMatch(),
_mutateOnFirstPhase(),
Expand Down Expand Up @@ -136,6 +137,7 @@ RankSetup::configure()
set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties()));
set_weakand_stop_word_adjust_limit(matching::WeakAndStopWordAdjustLimit::lookup(_indexEnv.getProperties()));
set_weakand_stop_word_drop_limit(matching::WeakAndStopWordDropLimit::lookup(_indexEnv.getProperties()));
set_disk_index_bitvector_limit(matching::DiskIndexBitvectorLimit::lookup(_indexEnv.getProperties()));
_mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties());
_mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties());
_mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties());
Expand Down
3 changes: 3 additions & 0 deletions searchlib/src/vespa/searchlib/fef/ranksetup.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class RankSetup
double _weakand_range;
double _weakand_stop_word_adjust_limit;
double _weakand_stop_word_drop_limit;
double _disk_index_bitvector_limit;
vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm;
MutateOperation _mutateOnMatch;
MutateOperation _mutateOnFirstPhase;
Expand Down Expand Up @@ -418,6 +419,8 @@ class RankSetup
double get_weakand_stop_word_adjust_limit() const { return _weakand_stop_word_adjust_limit; }
void set_weakand_stop_word_drop_limit(double v) { _weakand_stop_word_drop_limit = v; }
double get_weakand_stop_word_drop_limit() const { return _weakand_stop_word_drop_limit; }
/** Stores the disk index bitvector limit; configure() sets it from matching::DiskIndexBitvectorLimit. */
void set_disk_index_bitvector_limit(double v) { _disk_index_bitvector_limit = v; }
/** Returns the stored disk index bitvector limit (initialized to matching::DiskIndexBitvectorLimit::DEFAULT_VALUE). */
double get_disk_index_bitvector_limit() const { return _disk_index_bitvector_limit; }

/**
* This method may be used to indicate that certain features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@ struct CreateBlueprintParams
vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm;
double weakand_range;
queryeval::wand::StopWordStrategy weakand_stop_word_strategy;
double disk_index_bitvector_limit;

CreateBlueprintParams(double global_filter_lower_limit_in,
double global_filter_upper_limit_in,
double target_hits_max_adjustment_factor_in,
vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in,
double weakand_range_in,
queryeval::wand::StopWordStrategy weakand_stop_word_strategy_in)
queryeval::wand::StopWordStrategy weakand_stop_word_strategy_in,
double disk_index_bitvector_limit_in)
: global_filter_lower_limit(global_filter_lower_limit_in),
global_filter_upper_limit(global_filter_upper_limit_in),
target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in),
fuzzy_matching_algorithm(fuzzy_matching_algorithm_in),
weakand_range(weakand_range_in),
weakand_stop_word_strategy(weakand_stop_word_strategy_in)
weakand_stop_word_strategy(weakand_stop_word_strategy_in),
disk_index_bitvector_limit(disk_index_bitvector_limit_in)
{
}

Expand All @@ -41,7 +44,8 @@ struct CreateBlueprintParams
fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE,
fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE,
fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE,
queryeval::wand::StopWordStrategy::none())
queryeval::wand::StopWordStrategy::none(),
fef::indexproperties::matching::DiskIndexBitvectorLimit::DEFAULT_VALUE)
{
}
};
Expand Down

0 comments on commit 3e66324

Please sign in to comment.