Skip to content

Commit

Permalink
Merge pull request #33156 from vespa-engine/geirst/filter-threshold-p…
Browse files Browse the repository at this point in the history
…er-index-field

Support filter threshold setting per index field in an index environm…
  • Loading branch information
toregge authored Jan 22, 2025
2 parents 320533c + af8432c commit 028ed29
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/eval/eval/value_cache/constant_value.h>
#include <vespa/searchcore/proton/matching/indexenvironment.h>
#include <vespa/searchlib/fef/indexproperties.h>
#include <vespa/searchlib/fef/onnx_models.h>
#include <vespa/searchlib/fef/ranking_expressions.h>
#include <vespa/vespalib/gtest/gtest.h>
Expand All @@ -14,6 +15,8 @@ using search::fef::OnnxModel;
using search::fef::OnnxModels;
using search::fef::Properties;
using search::fef::RankingExpressions;
using search::fef::indexproperties::IsFilterField;
using search::fef::indexproperties::matching::FilterThreshold;
using search::index::Schema;
using search::index::schema::CollectionType;
using search::index::schema::DataType;
Expand Down Expand Up @@ -86,10 +89,10 @@ struct Fixture {
MyRankingAssetsRepo repo;
Schema::UP schema;
IndexEnvironment env;
explicit Fixture(Schema::UP schema_)
explicit Fixture(Schema::UP schema_, Properties props = Properties())
: repo(make_expressions(), make_models()),
schema(std::move(schema_)),
env(7, *schema, Properties(), repo)
env(7, *schema, props, repo)
{
}
~Fixture();
Expand Down Expand Up @@ -237,4 +240,52 @@ TEST(IndexEnvironmentTest, require_that_external_ranking_expressions_can_be_obta
EXPECT_TRUE(expr3.empty());
}

// Creates a schema holding the three string index fields "a", "b" and "c",
// added in that order.
Schema::UP
schema_with_index_fields()
{
    auto schema = std::make_unique<Schema>();
    for (const char* field_name : {"a", "b", "c"}) {
        schema->addIndexField(SIF(field_name, DataType::STRING));
    }
    return schema;
}



// Without any rank properties, no index field is flagged as a filter and
// every field keeps the default filter threshold of 1.0.
TEST(IndexEnvironmentTest, no_filter_threshold_settings_are_default)
{
    Fixture f(schema_with_index_fields());
    auto a = f.env.getFieldByName("a");
    auto b = f.env.getFieldByName("b");
    auto c = f.env.getFieldByName("c");
    // Stop the test with a regular failure instead of dereferencing a null
    // pointer if one of the fields is not known to the index environment.
    ASSERT_TRUE(a != nullptr);
    ASSERT_TRUE(b != nullptr);
    ASSERT_TRUE(c != nullptr);
    EXPECT_FALSE(a->isFilter());
    EXPECT_FALSE(b->isFilter());
    EXPECT_FALSE(c->isFilter());
    EXPECT_FLOAT_EQ(1.0, a->get_filter_threshold().threshold());
    EXPECT_FLOAT_EQ(1.0, b->get_filter_threshold().threshold());
    EXPECT_FLOAT_EQ(1.0, c->get_filter_threshold().threshold());
}

// Verifies the precedence used when extracting index fields:
//   1. the 'is filter' flag of a field,
//   2. the per field filter threshold,
//   3. the overall filter threshold.
TEST(IndexEnvironmentTest, is_filter_and_filter_threshold_settings_are_extracted_in_precedence_order)
{
    Properties p;
    {
        IsFilterField::set(p, "a");
        FilterThreshold::set(p, "0.1");
        // Note: 'is filter' setting has precedence over 'filter threshold' setting.
        FilterThreshold::set_for_field(p, "a", "0.3");
        FilterThreshold::set_for_field(p, "b", "0.2");
    }
    Fixture f(schema_with_index_fields(), p);
    auto a = f.env.getFieldByName("a");
    auto b = f.env.getFieldByName("b");
    auto c = f.env.getFieldByName("c");
    // Stop the test with a regular failure instead of dereferencing a null
    // pointer if one of the fields is not known to the index environment.
    ASSERT_TRUE(a != nullptr);
    ASSERT_TRUE(b != nullptr);
    ASSERT_TRUE(c != nullptr);
    EXPECT_TRUE(a->isFilter());
    EXPECT_FALSE(b->isFilter());
    EXPECT_FALSE(c->isFilter());
    // 'a': the 'is filter' flag wins over its per field threshold (0.3).
    EXPECT_FLOAT_EQ(0.0, a->get_filter_threshold().threshold());
    // 'b': the per field threshold wins over the overall threshold (0.1).
    EXPECT_FLOAT_EQ(0.2, b->get_filter_threshold().threshold());
    // 'c': no field specific setting, so the overall threshold applies.
    EXPECT_FLOAT_EQ(0.1, c->get_filter_threshold().threshold());
}

GTEST_MAIN_RUN_ALL_TESTS()
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,18 @@ IndexEnvironment::extractFields(const search::index::Schema &schema)
fieldInfo.set_data_type(field.getDataType());
insertField(fieldInfo);
}
auto filter_threshold = indexproperties::matching::FilterThreshold::lookup(_properties);
for (uint32_t i = 0; i < schema.getNumIndexFields(); ++i) {
const SchemaField &field = schema.getIndexField(i);
FieldInfo fieldInfo(FieldType::INDEX, field.getCollectionType(), field.getName(), _fields.size());
fieldInfo.set_data_type(field.getDataType());
auto field_filter_threshold = indexproperties::matching::FilterThreshold::lookup_for_field(_properties, field.getName());
if (indexproperties::IsFilterField::check(_properties, field.getName())) {
fieldInfo.setFilter(true);
} else if (field_filter_threshold.has_value()) {
fieldInfo.set_filter_threshold(search::fef::FilterThreshold(field_filter_threshold.value()));
} else if (filter_threshold.has_value()) {
fieldInfo.set_filter_threshold(search::fef::FilterThreshold(filter_threshold.value()));
}
auto itr = _fieldNames.find(field.getName());
if (itr != _fieldNames.end()) { // override the attribute field
Expand Down
25 changes: 25 additions & 0 deletions searchlib/src/tests/fef/fef_test.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/vespalib/testkit/test_kit.h>
#include <vespa/searchlib/fef/fef.h>
#include <vespa/searchlib/fef/filter_threshold.h>
#include <vespa/searchlib/fef/objectstore.h>

#include <vespa/log/log.h>
Expand Down Expand Up @@ -98,4 +99,28 @@ TEST("verify size of essential fef classes") {
EXPECT_EQUAL(48u, sizeof(search::fef::FeatureExecutor));
}

TEST("FilterThreshold can represent a boolean is filter value")
{
    // A default constructed threshold is not an unconditional filter.
    FilterThreshold by_default;
    EXPECT_FALSE(by_default.is_filter());

    // Constructing from a bool is reflected directly by is_filter().
    FilterThreshold not_a_filter(false);
    EXPECT_FALSE(not_a_filter.is_filter());

    FilterThreshold always_a_filter(true);
    EXPECT_TRUE(always_a_filter.is_filter());
}

TEST("FilterThreshold can represent a threshold value")
{
    // With the default threshold not even a relative document frequency
    // of 1.0 turns the field into a filter.
    FilterThreshold by_default;
    EXPECT_FALSE(by_default.is_filter(1.0));

    FilterThreshold half(0.5);
    EXPECT_EQUAL(0.5f, half.threshold());
    EXPECT_FALSE(half.is_filter());
    // The relative document frequency must be strictly above the threshold.
    EXPECT_FALSE(half.is_filter(0.5));
    EXPECT_TRUE(half.is_filter(0.51));
}

TEST_MAIN() { TEST_RUN_ALL(); }
16 changes: 16 additions & 0 deletions searchlib/src/tests/fef/properties/properties_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -644,4 +644,20 @@ TEST(PropertiesTest, second_phase_rank_score_drop_limit)
EXPECT_EQ(std::optional<search::feature_t>(123456789.12345), hitcollector::SecondPhaseRankScoreDropLimit::lookup(p, 4.0));
}

// The overall filter threshold has no value until it is explicitly set.
TEST(PropertiesTest, filter_threshold_setting)
{
    Properties props;
    EXPECT_EQ(std::nullopt, matching::FilterThreshold::lookup(props));

    matching::FilterThreshold::set(props, "0.5");
    const std::optional<double> expected(0.5);
    EXPECT_EQ(expected, matching::FilterThreshold::lookup(props));
}

// The filter threshold of a single field has no value until it is explicitly set.
TEST(PropertiesTest, per_field_filter_threshold_setting)
{
    Properties props;
    EXPECT_EQ(std::nullopt, matching::FilterThreshold::lookup_for_field(props, "foo"));

    matching::FilterThreshold::set_for_field(props, "foo", "0.4");
    const std::optional<double> expected(0.4);
    EXPECT_EQ(expected, matching::FilterThreshold::lookup_for_field(props, "foo"));
}

GTEST_MAIN_RUN_ALL_TESTS()
2 changes: 1 addition & 1 deletion searchlib/src/vespa/searchlib/fef/fieldinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ FieldInfo::FieldInfo(FieldType type_in, CollectionType collection_in,
_collection(collection_in),
_name(name_in),
_id(id_in),
_isFilter(false),
_threshold(false),
_hasAttribute(type_in == FieldType::ATTRIBUTE)
{
}
Expand Down
22 changes: 13 additions & 9 deletions searchlib/src/vespa/searchlib/fef/fieldinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#pragma once

#include "fieldtype.h"
#include "filter_threshold.h"
#include <vespa/searchcommon/common/datatype.h>
#include <cstdint>
#include <string>
Expand All @@ -22,13 +23,13 @@ class FieldInfo
using DataType = search::index::schema::DataType;
using string = std::string;
private:
FieldType _type;
DataType _data_type;
CollectionType _collection;
string _name;
uint32_t _id;
bool _isFilter;
bool _hasAttribute;
FieldType _type;
DataType _data_type;
CollectionType _collection;
string _name;
uint32_t _id;
FilterThreshold _threshold;
bool _hasAttribute;

public:
/**
Expand Down Expand Up @@ -96,15 +97,18 @@ class FieldInfo
*
* @param flag true if this field should be treated as a filter
**/
void setFilter(bool flag) { _isFilter = flag; }
void setFilter(bool flag) { _threshold = FilterThreshold(flag); }

/**
* Obtain the flag indicating whether this field should be treated
* as a filter field (fast searching and low complexity ranking).
*
* @return true if this field should be treated as a filter
**/
bool isFilter() const { return _isFilter; }
bool isFilter() const { return _threshold.is_filter(); }

/**
 * Set the filter threshold of this field. This replaces the value
 * stored by an earlier call to this function or to setFilter().
 *
 * @param threshold the filter threshold to use for this field
 **/
void set_filter_threshold(FilterThreshold threshold) { _threshold = threshold; }
/**
 * Obtain the filter threshold of this field.
 *
 * @return a copy of the filter threshold currently stored for this field
 **/
FilterThreshold get_filter_threshold() const { return _threshold; }
};

}
32 changes: 32 additions & 0 deletions searchlib/src/vespa/searchlib/fef/filter_threshold.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

namespace search::fef {

/**
 * Describes when a field is treated as a filter during query evaluation.
 *
 * The state is a single threshold value:
 *  - 0.0 marks a field that is always considered a filter.
 *  - Otherwise the field is only considered a filter for a query term whose
 *    relative document frequency is strictly above the threshold.
 *
 * The threshold is expected to be in the range [0.0, 1.0]; the value is stored as given.
 */
class FilterThreshold {
public:
    /** Creates a threshold of 1.0. */
    FilterThreshold() noexcept : FilterThreshold(1.0f) { }

    /** Creates a threshold of 0.0 when 'is_filter_in' is true, and 1.0 otherwise. */
    FilterThreshold(bool is_filter_in) noexcept : FilterThreshold(is_filter_in ? 0.0f : 1.0f) { }

    /** Creates a threshold with the given value. */
    FilterThreshold(float threshold) noexcept : _threshold(threshold) { }

    /** Creates a threshold with the given value, converted to float. */
    FilterThreshold(double threshold) noexcept : _threshold(static_cast<float>(threshold)) { }

    /** Returns the stored threshold value. */
    float threshold() const noexcept { return _threshold; }

    /** Returns whether this is always considered a filter, that is, whether the threshold is exactly 0.0. */
    bool is_filter() const noexcept { return _threshold == 0.0f; }

    /**
     * Returns whether this is considered a filter for a query term with the given
     * relative document frequency (in the range [0.0, 1.0]).
     */
    bool is_filter(float rel_doc_freq) const noexcept { return _threshold < rel_doc_freq; }

private:
    // A number in the range [0.0, 1.0] encapsulating whether a field should be considered a filter or not.
    float _threshold;
};

}
15 changes: 15 additions & 0 deletions searchlib/src/vespa/searchlib/fef/indexproperties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,21 @@ double DiskIndexBitvectorLimit::lookup(const Properties& props, double default_v
return lookupDouble(props, NAME, default_value);
}

// Name of the property holding the overall filter threshold.
const std::string FilterThreshold::NAME("vespa.matching.filter_threshold");
// No threshold is reported when the property is not set.
const std::optional<double> FilterThreshold::DEFAULT_VALUE(std::nullopt);

namespace {

// Builds the name of the property holding the filter threshold of a single field: "<NAME>.<field_name>".
// Used by both set_for_field() and lookup_for_field() so that writer and reader always agree on the name.
std::string filter_threshold_name_for_field(const std::string& field_name) {
    return FilterThreshold::NAME + "." + field_name;
}

}

// Returns the overall filter threshold, or DEFAULT_VALUE (std::nullopt) when it is not set.
std::optional<double> FilterThreshold::lookup(const Properties& props) {
    return lookup_opt_double(props, NAME, DEFAULT_VALUE);
}

// Returns the filter threshold set for the given field, or DEFAULT_VALUE (std::nullopt) when it is not set.
std::optional<double> FilterThreshold::lookup_for_field(const Properties& props, const std::string& field_name) {
    return lookup_opt_double(props, filter_threshold_name_for_field(field_name), DEFAULT_VALUE);
}

// Adds the overall filter threshold (given in its string form) to the properties.
void FilterThreshold::set(Properties& props, const std::string& threshold) {
    props.add(NAME, threshold);
}

// Adds the filter threshold (given in its string form) for the given field to the properties.
void FilterThreshold::set_for_field(Properties& props, const std::string& field_name, const std::string& threshold) {
    props.add(filter_threshold_name_for_field(field_name), threshold);
}

const std::string TargetHitsMaxAdjustmentFactor::NAME("vespa.matching.nns.target_hits_max_adjustment_factor");

const double TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE(20.0);
Expand Down
13 changes: 13 additions & 0 deletions searchlib/src/vespa/searchlib/fef/indexproperties.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,19 @@ namespace matching {
static double lookup(const Properties& props, double default_value);
};

/**
 * Property to extract the filter threshold settings for a query (see search::fef::FilterThreshold for details).
 * The per field filter threshold has precedence over the overall filter threshold.
 */
struct FilterThreshold {
    /// Name of the property holding the overall filter threshold.
    /// The threshold of a single field is stored under "<NAME>.<field name>".
    static const std::string NAME;
    /// Value returned by the lookup functions when the property is not set (no value).
    /// Spelled with 'double' to match the definition and the return type of the lookup functions.
    static const std::optional<double> DEFAULT_VALUE;
    /// Returns the overall filter threshold, or no value if it is not set.
    static std::optional<double> lookup(const Properties& props);
    /// Returns the filter threshold set for the given field, or no value if it is not set.
    static std::optional<double> lookup_for_field(const Properties& props, const std::string& field_name);
    /// Adds the overall filter threshold, given in its string form.
    static void set(Properties& props, const std::string& threshold);
    /// Adds the filter threshold for the given field, given in its string form.
    static void set_for_field(Properties& props, const std::string& field_name, const std::string& threshold);
};

/**
* Property to control the algorithm used for fuzzy matching.
**/
Expand Down

0 comments on commit 028ed29

Please sign in to comment.