From 7e3012910c3762097ac31af1606d478c6308219e Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Thu, 5 Dec 2024 11:15:37 +0100 Subject: [PATCH] document the different searchers --- crates/core/src/entrypoint/search_server.rs | 2 +- crates/core/src/index.rs | 8 ++++ crates/core/src/searcher/distributed.rs | 10 ++--- crates/core/src/searcher/local/guard.rs | 33 --------------- crates/core/src/searcher/local/inner.rs | 44 ++++++++++---------- crates/core/src/searcher/local/mod.rs | 46 ++++++--------------- crates/core/src/searcher/mod.rs | 6 +++ 7 files changed, 53 insertions(+), 96 deletions(-) delete mode 100644 crates/core/src/searcher/local/guard.rs diff --git a/crates/core/src/entrypoint/search_server.rs b/crates/core/src/entrypoint/search_server.rs index 9804e74c..c166216c 100644 --- a/crates/core/src/entrypoint/search_server.rs +++ b/crates/core/src/entrypoint/search_server.rs @@ -137,7 +137,7 @@ impl_search!([ ]); pub struct SearchService { - local_searcher: LocalSearcher>>, + local_searcher: LocalSearcher, // dropping the handle leaves the cluster #[allow(unused)] cluster_handle: Arc, diff --git a/crates/core/src/index.rs b/crates/core/src/index.rs index 793fa02f..b57cf8bd 100644 --- a/crates/core/src/index.rs +++ b/crates/core/src/index.rs @@ -51,6 +51,14 @@ impl Index { }) } + pub fn inverted_index(&self) -> &InvertedIndex { + &self.inverted_index + } + + pub fn region_count(&self) -> &Mutex { + &self.region_count + } + pub fn path(&self) -> PathBuf { PathBuf::from(&self.path) } diff --git a/crates/core/src/searcher/distributed.rs b/crates/core/src/searcher/distributed.rs index 2978bbfd..523b6f62 100644 --- a/crates/core/src/searcher/distributed.rs +++ b/crates/core/src/searcher/distributed.rs @@ -31,7 +31,6 @@ use crate::{ entity_search_server, live_index::LiveIndexService, search_server::{self, RetrieveReq, SearchService} }, generic_query::{self, Collector}, - index::Index, inverted_index::{RetrievedWebpage, ShardId, WebpagePointer}, 
ranking::pipeline::{PrecisionRankingWebpage, RecallRankingWebpage}, Result, @@ -44,7 +43,7 @@ use futures::{future::join_all, stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use std::future::Future; use thiserror::Error; -use tokio::sync::{Mutex, RwLock}; +use tokio::sync::Mutex; use super::{InitialWebsiteResult, LocalSearcher, SearchQuery}; @@ -284,6 +283,7 @@ impl ReusableClientManager for LiveIndexService { } } +/// A searcher that runs the search on a remote cluster. pub struct DistributedSearcher { client: Mutex>, } @@ -584,9 +584,9 @@ impl SearchClient for DistributedSearcher { } /// This should only be used for testing and benchmarks. -pub struct LocalSearchClient(LocalSearcher>>); -impl From>>> for LocalSearchClient { - fn from(searcher: LocalSearcher>>) -> Self { +pub struct LocalSearchClient(LocalSearcher); +impl From for LocalSearchClient { + fn from(searcher: LocalSearcher) -> Self { Self(searcher) } } diff --git a/crates/core/src/searcher/local/guard.rs b/crates/core/src/searcher/local/guard.rs deleted file mode 100644 index 2b771274..00000000 --- a/crates/core/src/searcher/local/guard.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Stract is an open source web search engine. -// Copyright (C) 2024 Stract ApS -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . 
- -use tokio::sync::OwnedRwLockReadGuard; - -use crate::index::Index; -use crate::inverted_index::InvertedIndex; - -pub trait ReadGuard: Send + Sync { - fn search_index(&self) -> &Index; - fn inverted_index(&self) -> &InvertedIndex { - &self.search_index().inverted_index - } -} - -impl ReadGuard for OwnedRwLockReadGuard { - fn search_index(&self) -> &Index { - self - } -} diff --git a/crates/core/src/searcher/local/inner.rs b/crates/core/src/searcher/local/inner.rs index f3ef0947..62bb66eb 100644 --- a/crates/core/src/searcher/local/inner.rs +++ b/crates/core/src/searcher/local/inner.rs @@ -16,32 +16,31 @@ use crate::{ generic_query::{self, GenericQuery}, + index::Index, inverted_index, ranking::{LocalRanker, SignalComputer}, searcher::InitialWebsiteResult, Result, }; use std::sync::Arc; +use tokio::sync::{OwnedRwLockReadGuard, RwLock}; use crate::{ config::CollectorConfig, models::dual_encoder::DualEncoder, query::Query, ranking::models::linear::LinearRegression, search_ctx::Ctx, searcher::SearchQuery, }; -use super::{InvertedIndexResult, ReadGuard, SearchableIndex}; +use super::InvertedIndexResult; -pub struct InnerLocalSearcher { - index: I, +pub struct InnerLocalSearcher { + index: Arc>, linear_regression: Option>, dual_encoder: Option>, collector_config: CollectorConfig, } -impl InnerLocalSearcher -where - I: SearchableIndex, -{ - pub fn new(index: I) -> Self { +impl InnerLocalSearcher { + pub fn new(index: Arc>) -> Self { Self { index, linear_regression: None, @@ -50,8 +49,8 @@ where } } - pub async fn guard(&self) -> I::ReadGuard { - self.index.read_guard().await + pub async fn guard(&self) -> OwnedRwLockReadGuard { + self.index.clone().read_owned().await } pub fn set_linear_model(&mut self, model: LinearRegression) { @@ -66,19 +65,19 @@ where self.collector_config = config; } - fn parse_query( + fn parse_query( &self, ctx: &Ctx, - guard: &G, + guard: &OwnedRwLockReadGuard, query: &SearchQuery, ) -> Result { Query::parse(ctx, query, guard.inverted_index()) } 
- fn ranker( + fn ranker( &self, query: &Query, - guard: &G, + guard: &OwnedRwLockReadGuard, de_rank_similar: bool, computer: SignalComputer, ) -> Result { @@ -99,10 +98,10 @@ where .with_offset(query.offset())) } - fn search_inverted_index( + fn search_inverted_index( &self, ctx: &Ctx, - guard: &G, + guard: &OwnedRwLockReadGuard, query: &SearchQuery, de_rank_similar: bool, ) -> Result { @@ -112,8 +111,7 @@ where computer.set_region_count( guard - .search_index() - .region_count + .region_count() .lock() .unwrap_or_else(|e| e.into_inner()) .clone(), @@ -149,7 +147,7 @@ where pub fn search_initial( &self, query: &SearchQuery, - guard: &I::ReadGuard, + guard: &OwnedRwLockReadGuard, de_rank_similar: bool, ) -> Result { let query = query.clone(); @@ -168,7 +166,7 @@ where &self, websites: &[inverted_index::WebpagePointer], query: &str, - guard: &I::ReadGuard, + guard: &OwnedRwLockReadGuard, ) -> Result> { let ctx = guard.inverted_index().local_search_ctx(); let query = SearchQuery { @@ -183,7 +181,7 @@ where pub fn search_initial_generic( &self, query: &Q, - guard: &I::ReadGuard, + guard: &OwnedRwLockReadGuard, ) -> Result<::Fruit> { guard.inverted_index().search_initial_generic(query) } @@ -192,7 +190,7 @@ where &self, query: &Q, fruit: ::Fruit, - guard: &I::ReadGuard, + guard: &OwnedRwLockReadGuard, ) -> Result { guard.inverted_index().retrieve_generic(query, fruit) } @@ -200,7 +198,7 @@ where pub fn search_generic( &self, query: Q, - guard: &I::ReadGuard, + guard: &OwnedRwLockReadGuard, ) -> Result { let fruit = self.search_initial_generic(&query, guard)?; Ok(Q::merge_results(vec![ diff --git a/crates/core/src/searcher/local/mod.rs b/crates/core/src/searcher/local/mod.rs index 8b743991..f787ad80 100644 --- a/crates/core/src/searcher/local/mod.rs +++ b/crates/core/src/searcher/local/mod.rs @@ -14,15 +14,13 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -mod guard; -use guard::ReadGuard; +//! 
The local searcher runs the search against a local index. mod inner; use inner::InnerLocalSearcher; -use tokio::sync::{OwnedRwLockReadGuard, RwLock}; +use tokio::sync::RwLock; use std::collections::HashMap; -use std::future::Future; use std::sync::Arc; use itertools::Itertools; @@ -44,29 +42,12 @@ use crate::{inverted_index, Result}; use super::WebsitesResult; use super::{InitialWebsiteResult, SearchQuery}; -pub trait SearchableIndex: Send + Sync + 'static { - type ReadGuard: ReadGuard; - - fn read_guard(&self) -> impl Future; -} - -impl SearchableIndex for Arc> { - type ReadGuard = OwnedRwLockReadGuard; - - async fn read_guard(&self) -> Self::ReadGuard { - self.clone().read_owned().await - } -} - -pub struct LocalSearcherBuilder { - inner: InnerLocalSearcher, +pub struct LocalSearcherBuilder { + inner: InnerLocalSearcher, } -impl LocalSearcherBuilder -where - I: SearchableIndex, -{ - pub fn new(index: I) -> Self { +impl LocalSearcherBuilder { + pub fn new(index: Arc>) -> Self { Self { inner: InnerLocalSearcher::new(index), } @@ -87,22 +68,19 @@ where self } - pub fn build(self) -> LocalSearcher { + pub fn build(self) -> LocalSearcher { LocalSearcher { inner: Arc::new(self.inner), } } } -pub struct LocalSearcher { - inner: Arc>, +pub struct LocalSearcher { + inner: Arc, } -impl LocalSearcher -where - I: SearchableIndex, -{ - pub fn builder(index: I) -> LocalSearcherBuilder { +impl LocalSearcher { + pub fn builder(index: Arc>) -> LocalSearcherBuilder { LocalSearcherBuilder::new(index) } @@ -203,7 +181,7 @@ where }) } - /// This function is mainly used for tests and benchmarks + /// This function is only used for tests and benchmarks pub fn search_sync(&self, query: &SearchQuery) -> Result { crate::block_on(self.search(query)) } diff --git a/crates/core/src/searcher/mod.rs b/crates/core/src/searcher/mod.rs index 83ac2148..3c31f3f0 100644 --- a/crates/core/src/searcher/mod.rs +++ b/crates/core/src/searcher/mod.rs @@ -14,6 +14,12 @@ // You should have received a copy 
of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! Searchers are responsible for executing search queries against an index. +//! There are two types of searchers: +//! - [`local::LocalSearcher`] which runs the search on the local machine. +//! - [`distributed::DistributedSearcher`] which runs the search on a remote cluster. Each node +//! will run a local searcher and then the results are merged on the coordinator node. + pub mod api; pub mod distributed; pub mod local;