Evaluation.evaluate:v0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import weave
from typing import Union
from typing import Callable
from weave.flow.model import Model
from rich import print
from weave.trace.weave_client import Call
from datetime import datetime
from weave.flow.util import make_memorable_name
def default_evaluation_display_name(call: Call) -> str:
    """Build the default display name for an evaluation call.

    The result has the form ``eval-<YYYY-MM-DD>-<name>``, where the date is
    taken from ``datetime.now()`` and ``<name>`` is whatever
    ``make_memorable_name()`` returns.

    Args:
        call: The call being named. It is part of the display-name callback
            signature but is not consulted when building the name.

    Returns:
        The generated display name.
    """
    # Take the date stamp first, then the memorable suffix.
    today_stamp = datetime.now().strftime("%Y-%m-%d")
    memorable_suffix = make_memorable_name()
    return f"eval-{today_stamp}-{memorable_suffix}"
@weave.op(call_display_name=default_evaluation_display_name)
async def evaluate(self, model: Union[Callable, Model], verbose: bool = True) -> dict:
# The need for this pattern is quite unfortunate and highlights a gap in our
# data model. As a user, I just want to pass a list of data `eval_rows` to
# summarize. Under the hood, Weave should choose the appropriate storage
# format (in this case `Table`) and serialize it that way. Right now, it is
# just a huge list of dicts. The fact that "as a user" I need to construct
# `weave.Table` at all is a leaky abstraction. Moreover, the need to
# construct `EvaluationResults` just so that tracing and the UI works is
# also bad. In the near-term, this will at least solve the problem of
# breaking summarization with big datasets, but this is not the correct
# long-term solution.
eval_results = await self.get_eval_results(model, verbose)
summary = await self.summarize(eval_results)