Skip to content

Commit

Permalink
Merge pull request #33263 from vespa-engine/toregge/extend-field-length-calculator-to-track-average-element-length
Browse files Browse the repository at this point in the history

Extend field length calculator to track average element length.
  • Loading branch information
toregge authored Feb 5, 2025
2 parents 2e3be8f + 77c919b commit 20bc2f7
Show file tree
Hide file tree
Showing 10 changed files with 100 additions and 41 deletions.
4 changes: 2 additions & 2 deletions searchcore/src/tests/proton/index/indexcollection_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ class IndexCollectionTest : public ::testing::Test,

IndexCollectionTest()
: _selector(std::make_shared<FixedSourceSelector>(0, "fs1")),
_source1(std::make_shared<MockIndexSearchable>(FieldLengthInfo(3, 5))),
_source2(std::make_shared<MockIndexSearchable>(FieldLengthInfo(7, 11))),
_source1(std::make_shared<MockIndexSearchable>(FieldLengthInfo(3.0, 3.0, 5))),
_source2(std::make_shared<MockIndexSearchable>(FieldLengthInfo(7.0, 7.0, 11))),
_fusion_source(std::make_shared<FakeIndexSearchable>()),
_executor(1),
_warmup(std::make_shared<FakeIndexSearchable>())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ WrappedFieldWriter::open()
_fieldWriter->open(minSkipDocs, minChunkDocs,
_dynamicK, _encode_interleaved_features,
_schema, _indexId,
FieldLengthInfo(4.5, 42),
FieldLengthInfo(4.5, 4.5, 42),
tuneFileWrite, fileHeaderContext);
}

Expand Down
2 changes: 1 addition & 1 deletion searchlib/src/tests/diskindex/fusion/fusion_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ namespace diskindex {
class MyMockFieldLengthInspector : public IFieldLengthInspector {
FieldLengthInfo get_field_length_info(const std::string& field_name) const override {
if (field_name == "f0") {
return FieldLengthInfo(3.5, 21);
return FieldLengthInfo(3.5, 3.5, 21);
} else {
return FieldLengthInfo();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,36 +26,40 @@ TEST(FieldLengthCalculatorTest, empty_is_zero)
TEST(FieldLengthCalculatorTest, startup_is_average)
{
FieldLengthCalculator calc;
calc.add_field_length(3);
calc.add_field_length(3, 1);
EXPECT_DOUBLE_EQ(3.0, calc.get_average_field_length());
EXPECT_DOUBLE_EQ(3.0, calc.get_average_element_length());
EXPECT_EQ(1, calc.get_num_samples());
calc.add_field_length(4);
calc.add_field_length(4, 1);
EXPECT_DOUBLE_EQ(3.5, calc.get_average_field_length());
EXPECT_DOUBLE_EQ(3.5, calc.get_average_element_length());
EXPECT_EQ(2, calc.get_num_samples());
calc.add_field_length(7);
calc.add_field_length(7, 1);
EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_field_length());
EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_element_length());
EXPECT_EQ(3, calc.get_num_samples());
calc.add_field_length(9);
calc.add_field_length(9, 3);
EXPECT_DOUBLE_EQ((3 + 4 + 7 + 9)/4.0, calc.get_average_field_length());
EXPECT_DOUBLE_EQ((3 + 4 + 7 + 9)/6.0, calc.get_average_element_length());
EXPECT_EQ(4, calc.get_num_samples());
}

TEST(FieldLengthCalculatorTest, average_until_max_num_samples)
{
const uint32_t max_num_samples = 5;
FieldLengthCalculator calc(0.0, 0, max_num_samples);
FieldLengthCalculator calc(0.0, 0.0, 0, max_num_samples);
static constexpr double epsilon = 0.000000001; // Allowed difference
for (uint32_t i = 0; i + 1 < max_num_samples; ++i) {
calc.add_field_length(i + 1);
calc.add_field_length(i + 1, 1);
}
// Arithmetic average
EXPECT_NEAR(arith_avg(max_num_samples - 1), calc.get_average_field_length(), epsilon);
EXPECT_EQ(max_num_samples - 1, calc.get_num_samples());
calc.add_field_length(max_num_samples);
calc.add_field_length(max_num_samples, 1);
// Arithmetic average
EXPECT_NEAR(arith_avg(max_num_samples), calc.get_average_field_length(), epsilon);
EXPECT_EQ(max_num_samples, calc.get_num_samples());
calc.add_field_length(max_num_samples + 1);
calc.add_field_length(max_num_samples + 1, 1);
// No longer arithmetic average
EXPECT_LT(arith_avg(max_num_samples + 1), calc.get_average_field_length());
// Switched to exponential decay
Expand All @@ -65,12 +69,34 @@ TEST(FieldLengthCalculatorTest, average_until_max_num_samples)

TEST(FieldLengthCalculatorTest, calculator_can_return_info_object)
{
FieldLengthCalculator calc(3, 5);
FieldLengthCalculator calc(3.0, 2.0, 5);
auto info = calc.get_info();
EXPECT_EQ(3, info.get_average_field_length());
EXPECT_DOUBLE_EQ(3.0, info.get_average_field_length());
EXPECT_DOUBLE_EQ(2.0, info.get_average_element_length());
EXPECT_EQ(5, info.get_num_samples());
}

// Checks that the calculator reports the average element length as the
// average field length divided by the average number of elements per sample.
TEST(FieldLengthCalculatorTest, average_element_length_is_calculated)
{
FieldLengthCalculator calc;
// Two samples: field lengths 7 and 9, with 1 and 3 elements respectively.
calc.add_field_length(7, 1);
calc.add_field_length(9, 3);
auto info = calc.get_info();
// Average field length: (7 + 9) / 2 = 8.
EXPECT_DOUBLE_EQ(8.0, info.get_average_field_length());
// Average elements: (1 + 3) / 2 = 2, so average element length is 8 / 2 = 4.
EXPECT_DOUBLE_EQ(4.0, info.get_average_element_length());
EXPECT_EQ(2, info.get_num_samples());
}

// Checks that a calculator constructed from a FieldLengthInfo continues the
// running averages from that state when further samples are added.
TEST(FieldLengthCalculatorTest, calculator_can_restore_state)
{
// Restored state: one sample, average field length 9, average element
// length 3, which implies an average of 9 / 3 = 3 elements.
FieldLengthCalculator calc(FieldLengthInfo(9.0, 3.0, 1));
calc.add_field_length(47, 1);
auto info = calc.get_info();
// Average field length: (9 + 47) / 2 = 28.
EXPECT_DOUBLE_EQ(28.0, info.get_average_field_length());
// Average elements: (3 + 1) / 2 = 2, so average element length is 28 / 2 = 14.
EXPECT_DOUBLE_EQ(14.0, info.get_average_element_length());
EXPECT_EQ(2, info.get_num_samples());
}

}

GTEST_MAIN_RUN_ALL_TESTS()
Original file line number Diff line number Diff line change
Expand Up @@ -552,8 +552,8 @@ TEST(MemoryIndexTest, require_that_we_can_fake_bit_vector)
TEST(MemoryIndexTest, field_length_info_can_be_retrieved_per_field)
{
Index index(MySetup().field(title).field(body)
.field_length("title", FieldLengthInfo(3, 5))
.field_length("body", FieldLengthInfo(7, 11)));
.field_length("title", FieldLengthInfo(3.0, 3.0, 5))
.field_length("body", FieldLengthInfo(7.0, 7.0, 11)));

EXPECT_EQ(3, index.index.get_field_length_info("title").get_average_field_length());
EXPECT_EQ(5, index.index.get_field_length_info("title").get_num_samples());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,15 @@ std::string field_length_infix = "field_length.";
/**
* Names of the header tags that carry field length info for one field.
* Each name is built by the constructor as
* prefix + field_length_infix + a fixed suffix.
*/
struct FieldLengthKeys {
// Tag name for the average field length ("average" suffix).
std::string _average;
// Tag name for the number of samples ("samples" suffix).
std::string _samples;
// Tag name for the average element length ("average_element_length" suffix).
std::string _average_element_length;
FieldLengthKeys(const std::string &prefix);
~FieldLengthKeys();
};

FieldLengthKeys::FieldLengthKeys(const std::string &prefix)
: _average(prefix + field_length_infix + "average"),
_samples(prefix + field_length_infix + "samples")
_samples(prefix + field_length_infix + "samples"),
_average_element_length(prefix + field_length_infix + "average_element_length")
{
}

Expand Down Expand Up @@ -189,7 +191,15 @@ PosOccFieldParams::readHeader(const GenericHeader &header,
const auto &field_length_samples_tag = header.getTag(field_length_keys._samples);
if (average_field_length_tag.getType() == Tag::Type::TYPE_FLOAT &&
field_length_samples_tag.getType() == Tag::Type::TYPE_INTEGER) {
_field_length_info = index::FieldLengthInfo(average_field_length_tag.asFloat(), field_length_samples_tag.asInteger());
double average_field_length = average_field_length_tag.asFloat();
double average_element_length = average_field_length;
if (header.hasTag(field_length_keys._average_element_length)) {
const auto& average_element_length_tag = header.getTag(field_length_keys._average_element_length);
if (average_element_length_tag.getType() == Tag::Type::TYPE_FLOAT) {
average_element_length = average_element_length_tag.asFloat();
}
}
_field_length_info = index::FieldLengthInfo(average_field_length, average_element_length, field_length_samples_tag.asInteger());
}
}
}
Expand Down Expand Up @@ -223,6 +233,7 @@ PosOccFieldParams::writeHeader(GenericHeader &header,
header.putTag(Tag(avgElemLenKey, _avgElemLen));
header.putTag(Tag(field_length_keys._average, _field_length_info.get_average_field_length()));
header.putTag(Tag(field_length_keys._samples, static_cast<int64_t>(_field_length_info.get_num_samples())));
header.putTag(Tag(field_length_keys._average_element_length, _field_length_info.get_average_element_length()));
}

}
55 changes: 37 additions & 18 deletions searchlib/src/vespa/searchlib/index/field_length_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,53 +8,72 @@
#include <cstdint>

namespace search::index {

/**
* Class used to calculate average field length and average element length,
* with a bias towards the latest samples when max_num_samples samples have
* been reached.
*/
class FieldLengthCalculator {
std::atomic<double> _average_field_length;
std::atomic<uint32_t> _num_samples; // Capped by _max_num_samples
std::atomic<double> _average_element_length;
std::atomic<uint32_t> _num_samples; // Capped by _max_num_samples
uint32_t _max_num_samples;
double _average_elements;

/**
 * Derives the average number of elements per sample from the average field
 * length and the average element length (field length / element length).
 *
 * Returns 0.0 when there are no samples, and also when the average element
 * length is not a positive number (zero, negative or NaN). Dividing in that
 * case would produce inf or NaN, and since the result seeds the running
 * average of elements, every later average element length would be
 * poisoned as well.
 */
static double calc_average_elements(double average_field_length, double average_element_length,
                                    uint32_t num_samples) {
    // !(x > 0.0) is deliberately used instead of (x <= 0.0) so that NaN is rejected too.
    if (num_samples == 0 || !(average_element_length > 0.0)) {
        return 0.0;
    }
    return average_field_length / average_element_length;
}

/**
* Folds new_value into the running average old_value, giving the new value
* a weight of 1 / num_samples and the old value a weight of
* (num_samples - 1) / num_samples.
*
* While the caller keeps incrementing num_samples this is the arithmetic
* mean; once num_samples stops growing (capped by the caller) older
* samples decay exponentially.
*
* NOTE(review): num_samples == 0 divides by zero and (num_samples - 1)
* wraps around as unsigned arithmetic. This looks reachable only when the
* calculator is constructed with max_num_samples == 0 - confirm that no
* caller does that.
*/
static double calc_decay(double old_value, double new_value, uint32_t num_samples) {
return (old_value * (num_samples - 1) + new_value) / num_samples;
}

public:
FieldLengthCalculator()
: FieldLengthCalculator(0.0, 0)
{
: FieldLengthCalculator(0.0, 0.0, 0) {
}

FieldLengthCalculator(double average_field_length, uint32_t num_samples, uint32_t max_num_samples = 100000)
FieldLengthCalculator(double average_field_length, double average_element_length, uint32_t num_samples,
uint32_t max_num_samples = 100000)
: _average_field_length(average_field_length),
_average_element_length(average_element_length),
_num_samples(std::min(num_samples, max_num_samples)),
_max_num_samples(max_num_samples)
{
_max_num_samples(max_num_samples),
_average_elements(calc_average_elements(average_field_length, average_element_length, num_samples)) {
}

FieldLengthCalculator(const FieldLengthInfo& info, uint32_t max_num_samples = 100000)
: _average_field_length(info.get_average_field_length()),
_num_samples(std::min(info.get_num_samples(), max_num_samples)),
_max_num_samples(max_num_samples)
: _average_field_length(info.get_average_field_length()),
_average_element_length(info.get_average_element_length()),
_num_samples(std::min(info.get_num_samples(), max_num_samples)),
_max_num_samples(max_num_samples),
_average_elements(calc_average_elements(info.get_average_field_length(),
info.get_average_element_length(),
info.get_num_samples()))
{
}

double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); }
uint32_t get_num_samples() const { return _num_samples.load(std::memory_order_relaxed); }
uint32_t get_max_num_samples() const { return _max_num_samples; }
double get_average_field_length() const noexcept { return _average_field_length.load(std::memory_order_relaxed); }
double get_average_element_length() const noexcept { return _average_element_length.load(std::memory_order_relaxed); }
uint32_t get_num_samples() const noexcept { return _num_samples.load(std::memory_order_relaxed); }
uint32_t get_max_num_samples() const noexcept { return _max_num_samples; }

FieldLengthInfo get_info() const {
return FieldLengthInfo(get_average_field_length(), get_num_samples());
FieldLengthInfo get_info() const noexcept {
return FieldLengthInfo(get_average_field_length(), get_average_element_length(), get_num_samples());
}

void add_field_length(uint32_t field_length) {
void add_field_length(uint32_t field_length, uint32_t elements) noexcept {
auto num_samples = get_num_samples();
if (num_samples < _max_num_samples) {
++num_samples;
_num_samples.store(num_samples, std::memory_order_relaxed);
}
_average_field_length.store((_average_field_length.load(std::memory_order_relaxed) * (num_samples - 1) + field_length) / num_samples, std::memory_order_relaxed);
auto average_field_length = calc_decay(_average_field_length.load(std::memory_order_relaxed),
field_length, num_samples);
_average_field_length.store(average_field_length, std::memory_order_relaxed);
_average_elements = calc_decay(_average_elements, elements, num_samples);
_average_element_length.store(average_field_length / _average_elements, std::memory_order_relaxed);
}

};

}
7 changes: 5 additions & 2 deletions searchlib/src/vespa/searchlib/index/field_length_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,24 @@ namespace search::index {
class FieldLengthInfo {
private:
double _average_field_length;
double _average_element_length;
uint32_t _num_samples;

public:
FieldLengthInfo() noexcept
: FieldLengthInfo(0.0, 0)
: FieldLengthInfo(0.0, 0.0, 0)
{
}

FieldLengthInfo(double average_field_length, uint32_t num_samples) noexcept
FieldLengthInfo(double average_field_length, double average_element_length, uint32_t num_samples) noexcept
: _average_field_length(average_field_length),
_average_element_length(average_element_length),
_num_samples(num_samples)
{
}

[[nodiscard]] double get_average_field_length() const noexcept { return _average_field_length; }
[[nodiscard]] double get_average_element_length() const noexcept { return _average_element_length; }
[[nodiscard]] uint32_t get_num_samples() const noexcept { return _num_samples; }
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ FieldInverter::endDoc()
++itr;
}
}
_calculator.add_field_length(field_length);
_calculator.add_field_length(field_length, _elem);
uint32_t newPosSize = static_cast<uint32_t>(_positions.size());
_pendingDocs.insert({ _docId, { _oldPosSize, newPosSize - _oldPosSize } });
_docId = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ namespace {
class MockFieldLengthInspector : public IFieldLengthInspector {
FieldLengthInfo get_field_length_info(const std::string& field_name) const override {
if (field_name == "f1") {
return {3.5, 21};
return {3.5, 3.5, 21};
} else if (field_name == "f2") {
return {4.0, 23};
return {4.0, 4.0, 23};
} else {
return {};
}
Expand Down

0 comments on commit 20bc2f7

Please sign in to comment.