Skip to content

Commit

Permalink
[tantivy] cached column to reduce disk reads
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkeldenker committed Nov 27, 2024
1 parent 761eba7 commit 46016ab
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 3 deletions.
5 changes: 3 additions & 2 deletions crates/core/src/webgraph/query/raw/links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use tantivy::{columnar::Column, postings::SegmentPostings, DocSet};
use tantivy::{columnar::CachedColumn, postings::SegmentPostings, DocSet};

use crate::webgraph::{
schema::{Field, FieldEnum},
Expand Down Expand Up @@ -114,7 +114,7 @@ impl tantivy::query::Weight for LinksWeight {

struct LinksScorer {
postings: SegmentPostings,
dedup_column: Option<Column<u128>>,
dedup_column: Option<CachedColumn<u128>>,
last_dedup_val: Option<u128>,
self_dedup_val: u128,
skip_self_links: bool,
Expand All @@ -134,6 +134,7 @@ impl LinksScorer {
.segment(&reader.segment_id())
.u128(f)
.unwrap()
.to_cached()
});

Ok(reader
Expand Down
59 changes: 59 additions & 0 deletions crates/tantivy/src/columnar/column/cached.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
use std::cell::Cell;
use std::fmt::Debug;

use crate::columnar::RowId;

use super::Column;

/// Outcome of the most recent lookup, stored together with the row it belongs to.
///
/// `value` is an `Option` so that "this row has no value" is remembered as well;
/// otherwise repeated lookups of such a row would go back to the underlying column
/// every single time.
#[derive(Clone, Copy)]
struct CachedValue<T> {
    value: Option<T>,
    row_id: RowId,
}

/// A column that caches the result of the last lookup to avoid re-reading it from the
/// underlying column.
///
/// Only a single entry is kept: looking up a different row replaces it. The entry is
/// held in a [`Cell`], which is why lookups work through `&self`; as with any `Cell`,
/// this makes the type `!Sync`.
pub struct CachedColumn<T> {
    column: Column<T>,
    cache: Cell<Option<CachedValue<T>>>,
}

impl<T> CachedColumn<T> {
    /// Wraps `column` with an initially empty cache.
    pub fn new(column: Column<T>) -> Self {
        Self {
            column,
            cache: Cell::new(None),
        }
    }
}

impl<T> CachedColumn<T>
where
    T: PartialOrd + Copy + Debug + Send + Sync + 'static,
{
    /// Forwards to `num_docs` of the wrapped column.
    pub fn num_docs(&self) -> RowId {
        self.column.num_docs()
    }

    /// Forwards to `min_value` of the wrapped column.
    pub fn min_value(&self) -> T {
        self.column.min_value()
    }

    /// Forwards to `max_value` of the wrapped column.
    pub fn max_value(&self) -> T {
        self.column.max_value()
    }

    /// Returns what the wrapped column's `first` returns for `row_id`.
    ///
    /// If `row_id` is the row that was looked up last, the remembered result is returned
    /// without touching the wrapped column. Otherwise the wrapped column is queried and
    /// its answer, including `None`, becomes the new cache entry.
    #[inline]
    pub fn first(&self, row_id: RowId) -> Option<T> {
        if let Some(cached) = self.cache.get() {
            if cached.row_id == row_id {
                return cached.value;
            }
        }

        let value = self.column.first(row_id);
        // Store the result unconditionally so that a missing value is cached too.
        self.cache.set(Some(CachedValue { value, row_id }));
        value
    }
}
8 changes: 8 additions & 0 deletions crates/tantivy/src/columnar/column/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod cached;
mod dictionary_encoded;
mod serialize;

Expand All @@ -7,6 +8,7 @@ use std::ops::{Range, RangeInclusive};
use std::sync::Arc;

use crate::common::BinarySerializable;
pub use cached::CachedColumn;
pub use dictionary_encoded::BytesColumn;
pub use serialize::{
open_column_bytes, open_column_u128, open_column_u64, serialize_column_mappable_to_u128,
Expand Down Expand Up @@ -60,6 +62,12 @@ impl<T: MonotonicallyMappableToU128> Column<T> {
}
}

impl<T> Column<T> {
    /// Consumes this column and wraps it in a [`CachedColumn`], a wrapper that caches
    /// the last accessed value.
    pub fn to_cached(self) -> CachedColumn<T> {
        let column = self;
        CachedColumn::<T>::new(column)
    }
}

impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
pub fn num_docs(&self) -> RowId {
match &self.index {
Expand Down
2 changes: 1 addition & 1 deletion crates/tantivy/src/columnar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ mod value;

use crate::sstable::VoidSSTable;
pub use block_accessor::ColumnBlockAccessor;
pub use column::{BytesColumn, Column};
pub use column::{BytesColumn, CachedColumn, Column};
pub use column_index::ColumnIndex;
pub use column_values::{
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
Expand Down
4 changes: 4 additions & 0 deletions crates/tantivy/src/core/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,8 @@ fn test_u128_columnar_values() {
assert_eq!(postings.term_freq(), 1u32);
let column = segment_reader.column_fields().u128("u128").unwrap();
assert_eq!(column.first(0).unwrap(), 1u128);

let cached_column = column.to_cached();
assert_eq!(cached_column.first(0).unwrap(), 1u128);
assert_eq!(cached_column.first(0).unwrap(), 1u128);
}

0 comments on commit 46016ab

Please sign in to comment.