BM25Retriever.search:v0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import weave
import bm25s
@weave.op()
def search(self, query, k=5):
"""
Searches the indexed data for the given query using BM25.
Args:
query (str): The search query.
k (int): The number of top results to return. Default is 5.
Returns:
list: A list of dictionaries containing the source, text, and score of the top-k results.
"""
query_tokens = bm25s.tokenize(query, show_progress=False)
# Get top-k results as a tuple of (documents, scores), both indexed as (n_queries, k).
# Because corpus= is passed, each result is the corpus entry itself (a dict), not a doc id.
results, scores = self.index.retrieve(
query_tokens, corpus=self.data, k=k, show_progress=False
)
output = []
for idx in range(results.shape[1]):
output.append(
{
"source": results[0, idx]["metadata"]["source"],
"text": results[0, idx]["cleaned_content"],
"score": scores[0, idx],