context-relevance-scorer Workspace

Evaluation.evaluate:v0
Name
Version
Last updated
10 months ago
Calls:
import weave
from typing import Union
from typing import Callable
from weave.flow.model import Model
from rich import print
from weave.trace.weave_client import Call
from datetime import datetime
from weave.flow.util import make_memorable_name
def default_evaluation_display_name(call: Call) -> str:
    """Build the default display name for an evaluation call.

    The name has the form ``eval-<YYYY-MM-DD>-<memorable name>``: the date
    comes from ``datetime.now()`` and the trailing part from
    ``make_memorable_name()``.

    Args:
        call: The call being named. It is not consulted here; the parameter
            exists because this function is handed to ``weave.op`` as its
            ``call_display_name`` callback.

    Returns:
        The generated display name.
    """
    # The datetime format spec inside the f-string is equivalent to calling
    # ``strftime("%Y-%m-%d")`` on the timestamp.
    return f"eval-{datetime.now():%Y-%m-%d}-{make_memorable_name()}"
@weave.op(call_display_name=default_evaluation_display_name)
async def evaluate(self, model: Union[Callable, Model], verbose: bool = True) -> dict:
    # The need for this pattern is quite unfortunate and highlights a gap in our
    # data model. As a user, I just want to pass a list of data `eval_rows` to
    # summarize. Under the hood, Weave should choose the appropriate storage
    # format (in this case `Table`) and serialize it that way. Right now, it is
    # just a huge list of dicts. The fact that "as a user" I need to construct
    # `weave.Table` at all is a leaky abstraction. Moreover, the need to
    # construct `EvaluationResults` just so that tracing and the UI works is
    # also bad. In the near-term, this will at least solve the problem of
    # breaking summarization with big datasets, but this is not the correct
    # long-term solution.
    eval_results = await self.get_eval_results(model, verbose)
    summary = await self.summarize(eval_results)