Evaluation.evaluate:v0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import weave
from typing import Union
from typing import Callable
from weave.flow.model import Model
from rich import print
@weave.op()
async def evaluate(self, model: Union[Callable, Model]) -> dict:
    """Evaluate ``model`` and return the resulting summary.

    Awaits ``self.get_eval_results(model)``, passes its result to
    ``self.summarize``, prints the summary, and returns it.

    Args:
        model: The callable or ``Model`` forwarded to
            ``self.get_eval_results``.

    Returns:
        Whatever ``self.summarize`` produces (annotated as ``dict``).
    """
    # NOTE(design): this two-step pattern is an unfortunate workaround that
    # exposes a gap in the data model. A user should be able to hand a plain
    # list of rows (`eval_rows`) to the summarizer and let Weave pick the
    # right storage format under the hood (here, `Table`) and serialize it
    # accordingly; today it is just a huge list of dicts. Requiring the user
    # to build a `weave.Table` at all is a leaky abstraction, and having to
    # build `EvaluationResults` purely so tracing and the UI work is also
    # bad. In the near term this at least stops summarization from breaking
    # on big datasets, but it is not the correct long-term solution.
    collected_results = await self.get_eval_results(model)
    report = await self.summarize(collected_results)

    print("Evaluation summary", report)
    return report