Skip to content

Commit 542c56c

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add pd.DataFrame as input for dataset in create_evaluation_run in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 825658094
1 parent 4216790 commit 542c56c

2 files changed

Lines changed: 193 additions & 147 deletions

File tree

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 177 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
#
1515
# pylint: disable=protected-access,bad-continuation,missing-function-docstring
1616

17+
import pandas as pd
18+
1719
from tests.unit.vertexai.genai.replays import pytest_helper
1820
from vertexai import types
1921
from google.genai import types as genai_types
@@ -46,72 +48,116 @@
4648
)
4749
),
4850
)
51+
# Shared two-row evaluation dataset for the DataFrame-based checks in this
# module. Each row holds a prompt, a reference, a candidate response, and the
# intermediate conversation events (one "user" turn followed by one "model"
# turn).
INPUT_DF = pd.DataFrame(
    {
        "prompt": ["prompt1", "prompt2"],
        "reference": ["reference1", "reference2"],
        "response": ["response1", "response2"],
        "intermediate_events": [
            # Events for row 0.
            [
                {
                    "content": {
                        "parts": [
                            {"text": "first user input"},
                        ],
                        "role": "user",
                    },
                },
                {
                    "content": {
                        "parts": [
                            {"text": "first model response"},
                        ],
                        "role": "model",
                    },
                },
            ],
            # Events for row 1.
            [
                {
                    "content": {
                        "parts": [
                            {"text": "second user input"},
                        ],
                        "role": "user",
                    },
                },
                {
                    "content": {
                        "parts": [
                            {"text": "second model response"},
                        ],
                        "role": "model",
                    },
                },
            ],
        ],
    }
)
4996

5097

51-
# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
52-
# def test_create_eval_run_data_source_evaluation_set(client):
53-
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
54-
# client._api_client._http_options.api_version = "v1beta1"
55-
# tool = genai_types.Tool(
56-
# function_declarations=[
57-
# genai_types.FunctionDeclaration(
58-
# name="get_weather",
59-
# description="Get weather in a location",
60-
# parameters={
61-
# "type": "object",
62-
# "properties": {"location": {"type": "string"}},
63-
# },
64-
# )
65-
# ]
66-
# )
67-
# evaluation_run = client.evals.create_evaluation_run(
68-
# name="test4",
69-
# display_name="test4",
70-
# dataset=types.EvaluationRunDataSource(
71-
# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
72-
# ),
73-
# dest=GCS_DEST,
74-
# metrics=[
75-
# UNIVERSAL_AR_METRIC,
76-
# types.RubricMetric.FINAL_RESPONSE_QUALITY,
77-
# LLM_METRIC
78-
# ],
79-
# agent_info=types.AgentInfo(
80-
# agent="project/123/locations/us-central1/reasoningEngines/456",
81-
# name="agent-1",
82-
# instruction="agent-1 instruction",
83-
# tool_declarations=[tool],
84-
# ),
85-
# labels={"label1": "value1"},
86-
# )
87-
# assert isinstance(evaluation_run, types.EvaluationRun)
88-
# assert evaluation_run.display_name == "test4"
89-
# assert evaluation_run.state == types.EvaluationRunState.PENDING
90-
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
91-
# assert evaluation_run.data_source.evaluation_set == (
92-
# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
93-
# )
94-
# assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
95-
# output_config=genai_types.OutputConfig(
96-
# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
97-
# ),
98-
# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
99-
# )
100-
# assert evaluation_run.inference_configs[
101-
# "agent-1"
102-
# ] == types.EvaluationRunInferenceConfig(
103-
# agent_config=types.EvaluationRunAgentConfig(
104-
# developer_instruction=genai_types.Content(
105-
# parts=[genai_types.Part(text="agent-1 instruction")]
106-
# ),
107-
# tools=[tool],
108-
# )
109-
# )
110-
# assert evaluation_run.labels == {
111-
# "vertex-ai-evaluation-agent-engine-id": "456",
112-
# "label1": "value1",
113-
# }
114-
# assert evaluation_run.error is None
98+
def test_create_eval_run_data_source_evaluation_set(client):
    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.

    The run is created from an existing evaluation set, with three metrics and
    an agent description. The returned run is then checked field by field:
    display name, state, data source, evaluation config, inference config,
    labels and error.
    """
    client._api_client._http_options.api_version = "v1beta1"
    # Single source of truth for the evaluation set resource name, which is
    # expected to come back unchanged on the returned run.
    evaluation_set_name = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
    tool = genai_types.Tool(
        function_declarations=[
            genai_types.FunctionDeclaration(
                name="get_weather",
                description="Get weather in a location",
                parameters={
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                },
            )
        ]
    )
    evaluation_run = client.evals.create_evaluation_run(
        name="test4",
        display_name="test4",
        dataset=types.EvaluationRunDataSource(evaluation_set=evaluation_set_name),
        dest=GCS_DEST,
        metrics=[
            UNIVERSAL_AR_METRIC,
            types.RubricMetric.FINAL_RESPONSE_QUALITY,
            LLM_METRIC,
        ],
        agent_info=types.evals.AgentInfo(
            agent="project/123/locations/us-central1/reasoningEngines/456",
            name="agent-1",
            instruction="agent-1 instruction",
            tool_declarations=[tool],
        ),
        labels={"label1": "value1"},
    )
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test4"
    assert evaluation_run.state == types.EvaluationRunState.PENDING
    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
    assert evaluation_run.data_source.evaluation_set == (evaluation_set_name)
    # The rubric metric passed as types.RubricMetric.FINAL_RESPONSE_QUALITY is
    # expected back as FINAL_RESPONSE_QUALITY_METRIC in the stored config.
    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
    )
    # The inference config is keyed by the agent name given in agent_info.
    assert evaluation_run.inference_configs[
        "agent-1"
    ] == types.EvaluationRunInferenceConfig(
        agent_config=types.EvaluationRunAgentConfig(
            developer_instruction=genai_types.Content(
                parts=[genai_types.Part(text="agent-1 instruction")]
            ),
            tools=[tool],
        )
    )
    # Besides the caller's label, an agent-engine-id label is expected
    # (presumably the trailing segment of the agent resource name; confirm
    # against the SDK implementation).
    assert evaluation_run.labels == {
        "vertex-ai-evaluation-agent-engine-id": "456",
        "label1": "value1",
    }
    assert evaluation_run.error is None
115161

116162

117163
def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -160,101 +206,89 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
160206
assert evaluation_run.error is None
161207

162208

163-
# Test fails in replay mode because of the timestamp issue
209+
# Test fails in replay mode because of UUID generation mismatch.
164210
# def test_create_eval_run_data_source_evaluation_dataset(client):
165211
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
166-
# input_df = pd.DataFrame(
167-
# {
168-
# "prompt": ["prompt1", "prompt2"],
169-
# "reference": ["reference1", "reference2"],
170-
# "response": ["response1", "response2"],
171-
# "intermediate_events": [
172-
# [
173-
# {
174-
# "content": {
175-
# "parts": [
176-
# {"text": "first user input"},
177-
# ],
178-
# "role": "user",
179-
# },
180-
# },
181-
# {
182-
# "content": {
183-
# "parts": [
184-
# {"text": "first model response"},
185-
# ],
186-
# "role": "model",
187-
# },
188-
# },
189-
# ],
190-
# [
191-
# {
192-
# "content": {
193-
# "parts": [
194-
# {"text": "second user input"},
195-
# ],
196-
# "role": "user",
197-
# },
198-
# },
199-
# {
200-
# "content": {
201-
# "parts": [
202-
# {"text": "second model response"},
203-
# ],
204-
# "role": "model",
205-
# },
206-
# },
207-
# ],
208-
# ],
209-
# }
210-
# )
211212
# evaluation_run = client.evals.create_evaluation_run(
212213
# name="test6",
213214
# display_name="test6",
214215
# dataset=types.EvaluationDataset(
215216
# candidate_name="candidate_1",
216-
# eval_dataset_df=input_df,
217+
# eval_dataset_df=INPUT_DF,
217218
# ),
218-
# dest="gs://lakeyk-limited-bucket/eval_run_output",
219+
# dest=GCS_DEST,
219220
# )
220221
# assert isinstance(evaluation_run, types.EvaluationRun)
221222
# assert evaluation_run.display_name == "test6"
222223
# assert evaluation_run.state == types.EvaluationRunState.PENDING
223-
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
224-
# # Check evaluation set
225-
# assert evaluation_run.data_source.evaluation_set
226-
# eval_set = client.evals.get_evaluation_set(
227-
# name=evaluation_run.data_source.evaluation_set
224+
# check_evaluation_run_data_source(client, evaluation_run)
225+
# assert evaluation_run.error is None
226+
227+
228+
# Test fails in replay mode because of UUID generation mismatch.
229+
# def test_create_eval_run_data_source_pandas_dataframe(client):
230+
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
231+
# evaluation_run = client.evals.create_evaluation_run(
232+
# dataset=INPUT_DF,
233+
# dest=GCS_DEST,
234+
# )
235+
# assert isinstance(evaluation_run, types.EvaluationRun)
236+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
237+
# check_evaluation_run_data_source(client, evaluation_run)
238+
# assert evaluation_run.error is None
239+
240+
# Test fails in replay mode because of UUID generation mismatch.
241+
# def test_create_eval_run_data_source_evaluation_dataset_dict(client):
242+
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with an EvaluationDataset dict."""
243+
# eval_dataset_dict = {
244+
# "candidate_name": "candidate_1",
245+
# "eval_dataset_df": INPUT_DF,
246+
# }
247+
# evaluation_run = client.evals.create_evaluation_run(
248+
# dataset=eval_dataset_dict,
249+
# dest=GCS_DEST,
228250
# )
229-
# assert len(eval_set.evaluation_items) == 2
230-
# # Check evaluation items
231-
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
232-
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
233-
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
234-
# assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"]
235-
# assert (
236-
# eval_item.evaluation_request.candidate_responses[0].text
237-
# == input_df.iloc[i]["response"]
238-
# )
239-
# assert (
240-
# eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
241-
# == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
242-
# )
243-
# assert (
244-
# eval_item.evaluation_request.candidate_responses[0].events[0].role
245-
# == input_df.iloc[i]["intermediate_events"][0]["content"]["role"]
246-
# )
247-
# assert (
248-
# eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
249-
# == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
250-
# )
251-
# assert (
252-
# eval_item.evaluation_request.candidate_responses[0].events[1].role
253-
# == input_df.iloc[i]["intermediate_events"][1]["content"]["role"]
254-
# )
251+
# assert isinstance(evaluation_run, types.EvaluationRun)
252+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
253+
# check_evaluation_run_data_source(client, evaluation_run)
255254
# assert evaluation_run.error is None
256255

257256

257+
def check_evaluation_run_data_source(client, evaluation_run, expected_df=None):
    """Asserts that the run's evaluation set mirrors the rows of a DataFrame.

    Fetches the evaluation set referenced by `evaluation_run.data_source` and
    each of its evaluation items, then compares every item against the row at
    the same position in `expected_df`.

    Args:
        client: Client whose `evals` module is used to fetch the evaluation
            set and its items.
        evaluation_run: The EvaluationRun returned by create_evaluation_run().
        expected_df: DataFrame with "prompt", "response" and
            "intermediate_events" columns to compare against. Defaults to the
            module-level INPUT_DF.

    Raises:
        AssertionError: If any part of the stored data differs from the
            expected DataFrame.
    """
    if expected_df is None:
        expected_df = INPUT_DF
    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
    # Check evaluation set
    evaluation_set_name = evaluation_run.data_source.evaluation_set
    assert evaluation_set_name
    eval_set = client.evals.get_evaluation_set(name=evaluation_set_name)
    # One evaluation item is expected per DataFrame row (2 for INPUT_DF).
    assert len(eval_set.evaluation_items) == len(expected_df)
    # Check evaluation items
    for i, eval_item_name in enumerate(eval_set.evaluation_items):
        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
        expected_row = expected_df.iloc[i]
        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
        request = eval_item.evaluation_request
        assert request.prompt.text == expected_row["prompt"]
        candidate = request.candidate_responses[0]
        assert candidate.text == expected_row["response"]
        # Compare each expected event, in order, with the stored event at the
        # same index: first its text part, then its role.
        for event_idx, expected_event in enumerate(
            expected_row["intermediate_events"]
        ):
            actual_event = candidate.events[event_idx]
            expected_content = expected_event["content"]
            assert actual_event.parts[0].text == expected_content["parts"][0]["text"]
            assert actual_event.role == expected_content["role"]
290+
291+
258292
pytest_plugins = ("pytest_asyncio",)
259293

260294

0 commit comments

Comments
 (0)