Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend field length calculator to track average element length. #33263

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions searchcore/src/tests/proton/index/indexcollection_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ class IndexCollectionTest : public ::testing::Test,

IndexCollectionTest()
: _selector(std::make_shared<FixedSourceSelector>(0, "fs1")),
_source1(std::make_shared<MockIndexSearchable>(FieldLengthInfo(3, 5))),
_source2(std::make_shared<MockIndexSearchable>(FieldLengthInfo(7, 11))),
_source1(std::make_shared<MockIndexSearchable>(FieldLengthInfo(3.0, 3.0, 5))),
_source2(std::make_shared<MockIndexSearchable>(FieldLengthInfo(7.0, 7.0, 11))),
_fusion_source(std::make_shared<FakeIndexSearchable>()),
_executor(1),
_warmup(std::make_shared<FakeIndexSearchable>())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ WrappedFieldWriter::open()
_fieldWriter->open(minSkipDocs, minChunkDocs,
_dynamicK, _encode_interleaved_features,
_schema, _indexId,
FieldLengthInfo(4.5, 42),
FieldLengthInfo(4.5, 4.5, 42),
tuneFileWrite, fileHeaderContext);
}

Expand Down
2 changes: 1 addition & 1 deletion searchlib/src/tests/diskindex/fusion/fusion_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ namespace diskindex {
class MyMockFieldLengthInspector : public IFieldLengthInspector {
FieldLengthInfo get_field_length_info(const std::string& field_name) const override {
if (field_name == "f0") {
return FieldLengthInfo(3.5, 21);
return FieldLengthInfo(3.5, 3.5, 21);
} else {
return FieldLengthInfo();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,36 +26,40 @@ TEST(FieldLengthCalculatorTest, empty_is_zero)
// Below max_num_samples the calculator reports plain arithmetic averages.
// Every sample passes both a field length and an element count, matching the
// two-argument add_field_length(field_length, elements) signature.
TEST(FieldLengthCalculatorTest, startup_is_average)
{
    FieldLengthCalculator calc;
    calc.add_field_length(3, 1);
    EXPECT_DOUBLE_EQ(3.0, calc.get_average_field_length());
    EXPECT_DOUBLE_EQ(3.0, calc.get_average_element_length());
    EXPECT_EQ(1, calc.get_num_samples());
    calc.add_field_length(4, 1);
    EXPECT_DOUBLE_EQ(3.5, calc.get_average_field_length());
    EXPECT_DOUBLE_EQ(3.5, calc.get_average_element_length());
    EXPECT_EQ(2, calc.get_num_samples());
    calc.add_field_length(7, 1);
    EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_field_length());
    EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_element_length());
    EXPECT_EQ(3, calc.get_num_samples());
    // Last sample has 3 elements: 4 samples in total, but 1 + 1 + 1 + 3 = 6 elements.
    calc.add_field_length(9, 3);
    EXPECT_DOUBLE_EQ((3 + 4 + 7 + 9)/4.0, calc.get_average_field_length());
    EXPECT_DOUBLE_EQ((3 + 4 + 7 + 9)/6.0, calc.get_average_element_length());
    EXPECT_EQ(4, calc.get_num_samples());
}

TEST(FieldLengthCalculatorTest, average_until_max_num_samples)
{
const uint32_t max_num_samples = 5;
FieldLengthCalculator calc(0.0, 0, max_num_samples);
FieldLengthCalculator calc(0.0, 0.0, 0, max_num_samples);
static constexpr double epsilon = 0.000000001; // Allowed difference
for (uint32_t i = 0; i + 1 < max_num_samples; ++i) {
calc.add_field_length(i + 1);
calc.add_field_length(i + 1, 1);
}
// Arithmetic average
EXPECT_NEAR(arith_avg(max_num_samples - 1), calc.get_average_field_length(), epsilon);
EXPECT_EQ(max_num_samples - 1, calc.get_num_samples());
calc.add_field_length(max_num_samples);
calc.add_field_length(max_num_samples, 1);
// Arithmetic average
EXPECT_NEAR(arith_avg(max_num_samples), calc.get_average_field_length(), epsilon);
EXPECT_EQ(max_num_samples, calc.get_num_samples());
calc.add_field_length(max_num_samples + 1);
calc.add_field_length(max_num_samples + 1, 1);
// No longer arithmetic average
EXPECT_LT(arith_avg(max_num_samples + 1), calc.get_average_field_length());
// Switched to exponential decay
Expand All @@ -65,12 +69,34 @@ TEST(FieldLengthCalculatorTest, average_until_max_num_samples)

// get_info() must expose exactly the state the calculator was constructed with.
TEST(FieldLengthCalculatorTest, calculator_can_return_info_object)
{
    // Arguments: average field length, average element length, number of samples.
    FieldLengthCalculator calc(3.0, 2.0, 5);
    auto info = calc.get_info();
    EXPECT_DOUBLE_EQ(3.0, info.get_average_field_length());
    EXPECT_DOUBLE_EQ(2.0, info.get_average_element_length());
    EXPECT_EQ(5, info.get_num_samples());
}

// Average element length is the average field length divided by the average
// number of elements per sample.
TEST(FieldLengthCalculatorTest, average_element_length_is_calculated)
{
    FieldLengthCalculator calculator;
    calculator.add_field_length(7, 1);   // field length 7 in 1 element
    calculator.add_field_length(9, 3);   // field length 9 spread over 3 elements
    const auto snapshot = calculator.get_info();
    // (7 + 9) / 2 samples
    EXPECT_DOUBLE_EQ(8.0, snapshot.get_average_field_length());
    // (7 + 9) / (1 + 3) elements
    EXPECT_DOUBLE_EQ(4.0, snapshot.get_average_element_length());
    EXPECT_EQ(2, snapshot.get_num_samples());
}

// A calculator built from a FieldLengthInfo continues averaging from that state.
TEST(FieldLengthCalculatorTest, calculator_can_restore_state)
{
    // Restored state: 1 sample, field length 9, element length 3 (i.e. 3 elements).
    const FieldLengthInfo restored(9.0, 3.0, 1);
    FieldLengthCalculator calculator(restored);
    calculator.add_field_length(47, 1);
    const auto snapshot = calculator.get_info();
    // (9 + 47) / 2 samples
    EXPECT_DOUBLE_EQ(28.0, snapshot.get_average_field_length());
    // Average elements is (3 + 1) / 2 = 2, so 28 / 2
    EXPECT_DOUBLE_EQ(14.0, snapshot.get_average_element_length());
    EXPECT_EQ(2, snapshot.get_num_samples());
}

}

GTEST_MAIN_RUN_ALL_TESTS()
Original file line number Diff line number Diff line change
Expand Up @@ -552,8 +552,8 @@ TEST(MemoryIndexTest, require_that_we_can_fake_bit_vector)
TEST(MemoryIndexTest, field_length_info_can_be_retrieved_per_field)
{
Index index(MySetup().field(title).field(body)
.field_length("title", FieldLengthInfo(3, 5))
.field_length("body", FieldLengthInfo(7, 11)));
.field_length("title", FieldLengthInfo(3.0, 3.0, 5))
.field_length("body", FieldLengthInfo(7.0, 7.0, 11)));

EXPECT_EQ(3, index.index.get_field_length_info("title").get_average_field_length());
EXPECT_EQ(5, index.index.get_field_length_info("title").get_num_samples());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,15 @@ std::string field_length_infix = "field_length.";
// Names of the header tags that carry the field length statistics.
// The constructor derives all names from the given prefix.
struct FieldLengthKeys {
    std::string _average;                 // tag name for the average field length
    std::string _samples;                 // tag name for the number of samples
    std::string _average_element_length;  // tag name for the average element length

    FieldLengthKeys(const std::string& prefix);
    ~FieldLengthKeys();
};

// Builds each tag name as prefix + field_length_infix + a fixed suffix.
// Exactly one initializer per member, listed in declaration order.
FieldLengthKeys::FieldLengthKeys(const std::string &prefix)
    : _average(prefix + field_length_infix + "average"),
      _samples(prefix + field_length_infix + "samples"),
      _average_element_length(prefix + field_length_infix + "average_element_length")
{
}

Expand Down Expand Up @@ -189,7 +191,15 @@ PosOccFieldParams::readHeader(const GenericHeader &header,
const auto &field_length_samples_tag = header.getTag(field_length_keys._samples);
if (average_field_length_tag.getType() == Tag::Type::TYPE_FLOAT &&
field_length_samples_tag.getType() == Tag::Type::TYPE_INTEGER) {
_field_length_info = index::FieldLengthInfo(average_field_length_tag.asFloat(), field_length_samples_tag.asInteger());
double average_field_length = average_field_length_tag.asFloat();
double average_element_length = average_field_length;
if (header.hasTag(field_length_keys._average_element_length)) {
const auto& average_element_length_tag = header.getTag(field_length_keys._average_element_length);
if (average_element_length_tag.getType() == Tag::Type::TYPE_FLOAT) {
average_element_length = average_element_length_tag.asFloat();
}
}
_field_length_info = index::FieldLengthInfo(average_field_length, average_element_length, field_length_samples_tag.asInteger());
}
}
}
Expand Down Expand Up @@ -223,6 +233,7 @@ PosOccFieldParams::writeHeader(GenericHeader &header,
header.putTag(Tag(avgElemLenKey, _avgElemLen));
header.putTag(Tag(field_length_keys._average, _field_length_info.get_average_field_length()));
header.putTag(Tag(field_length_keys._samples, static_cast<int64_t>(_field_length_info.get_num_samples())));
header.putTag(Tag(field_length_keys._average_element_length,_field_length_info.get_average_element_length()));
toregge marked this conversation as resolved.
Show resolved Hide resolved
}

}
55 changes: 37 additions & 18 deletions searchlib/src/vespa/searchlib/index/field_length_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,53 +8,72 @@
#include <cstdint>

namespace search::index {

/**
* Class used to calculate average field length, with a bias towards
* the latest field lengths when max_num_samples samples have been reached.
*/
class FieldLengthCalculator {
std::atomic<double> _average_field_length;
std::atomic<uint32_t> _num_samples; // Capped by _max_num_samples
std::atomic<double> _average_element_length;
std::atomic<uint32_t> _num_samples; // Capped by _max_num_samples
uint32_t _max_num_samples;
double _average_elements;

static double calc_average_elements(double average_field_length, double average_element_length,
uint32_t num_samples) {
return (num_samples == 0) ? 0.0 : average_field_length / average_element_length;
}

static double calc_decay(double old_value, double new_value, uint32_t num_samples) {
return (old_value * (num_samples - 1) + new_value) / num_samples;
}

public:
FieldLengthCalculator()
: FieldLengthCalculator(0.0, 0)
{
: FieldLengthCalculator(0.0, 0.0, 0) {
}

FieldLengthCalculator(double average_field_length, uint32_t num_samples, uint32_t max_num_samples = 100000)
FieldLengthCalculator(double average_field_length, double average_element_length, uint32_t num_samples,
uint32_t max_num_samples = 100000)
: _average_field_length(average_field_length),
_average_element_length(average_element_length),
_num_samples(std::min(num_samples, max_num_samples)),
_max_num_samples(max_num_samples)
{
_max_num_samples(max_num_samples),
_average_elements(calc_average_elements(average_field_length, average_element_length, num_samples)) {
}

FieldLengthCalculator(const FieldLengthInfo& info, uint32_t max_num_samples = 100000)
: _average_field_length(info.get_average_field_length()),
_num_samples(std::min(info.get_num_samples(), max_num_samples)),
_max_num_samples(max_num_samples)
: _average_field_length(info.get_average_field_length()),
_average_element_length(info.get_average_element_length()),
_num_samples(std::min(info.get_num_samples(), max_num_samples)),
_max_num_samples(max_num_samples),
_average_elements(calc_average_elements(info.get_average_field_length(),
info.get_average_element_length(),
info.get_num_samples()))
{
}

double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); }
uint32_t get_num_samples() const { return _num_samples.load(std::memory_order_relaxed); }
uint32_t get_max_num_samples() const { return _max_num_samples; }
double get_average_field_length() const noexcept { return _average_field_length.load(std::memory_order_relaxed); }
double get_average_element_length() const noexcept { return _average_element_length.load(std::memory_order_relaxed); }
uint32_t get_num_samples() const noexcept { return _num_samples.load(std::memory_order_relaxed); }
uint32_t get_max_num_samples() const noexcept { return _max_num_samples; }

FieldLengthInfo get_info() const {
return FieldLengthInfo(get_average_field_length(), get_num_samples());
FieldLengthInfo get_info() const noexcept {
return FieldLengthInfo(get_average_field_length(), get_average_element_length(), get_num_samples());
}

void add_field_length(uint32_t field_length) {
void add_field_length(uint32_t field_length, uint32_t elements) noexcept {
auto num_samples = get_num_samples();
if (num_samples < _max_num_samples) {
++num_samples;
_num_samples.store(num_samples, std::memory_order_relaxed);
}
_average_field_length.store((_average_field_length.load(std::memory_order_relaxed) * (num_samples - 1) + field_length) / num_samples, std::memory_order_relaxed);
auto average_field_length = calc_decay(_average_field_length.load(std::memory_order_relaxed),
field_length, num_samples);
_average_field_length.store(average_field_length, std::memory_order_relaxed);
_average_elements = calc_decay(_average_elements, elements, num_samples);
_average_element_length.store(average_field_length / _average_elements, std::memory_order_relaxed);
}

};

}
7 changes: 5 additions & 2 deletions searchlib/src/vespa/searchlib/index/field_length_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,24 @@ namespace search::index {
class FieldLengthInfo {
private:
    // Average field length over the samples.
    double   _average_field_length;
    // Average element length over the samples.
    double   _average_element_length;
    // Number of samples the averages are based on.
    uint32_t _num_samples;

public:
    // Creates an empty info object: both averages 0.0 and no samples.
    FieldLengthInfo() noexcept
        : FieldLengthInfo(0.0, 0.0, 0)
    {
    }

    // Stores the given averages and sample count as-is (no validation).
    FieldLengthInfo(double average_field_length, double average_element_length, uint32_t num_samples) noexcept
        : _average_field_length(average_field_length),
          _average_element_length(average_element_length),
          _num_samples(num_samples)
    {
    }

    [[nodiscard]] double get_average_field_length() const noexcept { return _average_field_length; }
    [[nodiscard]] double get_average_element_length() const noexcept { return _average_element_length; }
    [[nodiscard]] uint32_t get_num_samples() const noexcept { return _num_samples; }
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ FieldInverter::endDoc()
++itr;
}
}
_calculator.add_field_length(field_length);
_calculator.add_field_length(field_length, _elem);
uint32_t newPosSize = static_cast<uint32_t>(_positions.size());
_pendingDocs.insert({ _docId, { _oldPosSize, newPosSize - _oldPosSize } });
_docId = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ namespace {
class MockFieldLengthInspector : public IFieldLengthInspector {
FieldLengthInfo get_field_length_info(const std::string& field_name) const override {
if (field_name == "f1") {
return {3.5, 21};
return {3.5, 3.5, 21};
} else if (field_name == "f2") {
return {4.0, 23};
return {4.0, 4.0, 23};
} else {
return {};
}
Expand Down