|
14 | 14 | #
15 | 15 | # pylint: disable=protected-access,bad-continuation,missing-function-docstring |
16 | 16 |
|
| 17 | +import pandas as pd |
| 18 | + |
17 | 19 | from tests.unit.vertexai.genai.replays import pytest_helper |
18 | 20 | from vertexai import types |
19 | 21 | from google.genai import types as genai_types |
|
46 | 48 | ) |
47 | 49 | ), |
48 | 50 | ) |
| 51 | +INPUT_DF = pd.DataFrame( |
| 52 | + { |
| 53 | + "prompt": ["prompt1", "prompt2"], |
| 54 | + "reference": ["reference1", "reference2"], |
| 55 | + "response": ["response1", "response2"], |
| 56 | + "intermediate_events": [ |
| 57 | + [ |
| 58 | + { |
| 59 | + "content": { |
| 60 | + "parts": [ |
| 61 | + {"text": "first user input"}, |
| 62 | + ], |
| 63 | + "role": "user", |
| 64 | + }, |
| 65 | + }, |
| 66 | + { |
| 67 | + "content": { |
| 68 | + "parts": [ |
| 69 | + {"text": "first model response"}, |
| 70 | + ], |
| 71 | + "role": "model", |
| 72 | + }, |
| 73 | + }, |
| 74 | + ], |
| 75 | + [ |
| 76 | + { |
| 77 | + "content": { |
| 78 | + "parts": [ |
| 79 | + {"text": "second user input"}, |
| 80 | + ], |
| 81 | + "role": "user", |
| 82 | + }, |
| 83 | + }, |
| 84 | + { |
| 85 | + "content": { |
| 86 | + "parts": [ |
| 87 | + {"text": "second model response"}, |
| 88 | + ], |
| 89 | + "role": "model", |
| 90 | + }, |
| 91 | + }, |
| 92 | + ], |
| 93 | + ], |
| 94 | + } |
| 95 | +) |
49 | 96 |
|
50 | 97 |
|
51 | | -# TODO(b/431231205): Re-enable once Unified Metrics are in prod. |
52 | | -# def test_create_eval_run_data_source_evaluation_set(client): |
53 | | -# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" |
54 | | -# client._api_client._http_options.api_version = "v1beta1" |
55 | | -# tool = genai_types.Tool( |
56 | | -# function_declarations=[ |
57 | | -# genai_types.FunctionDeclaration( |
58 | | -# name="get_weather", |
59 | | -# description="Get weather in a location", |
60 | | -# parameters={ |
61 | | -# "type": "object", |
62 | | -# "properties": {"location": {"type": "string"}}, |
63 | | -# }, |
64 | | -# ) |
65 | | -# ] |
66 | | -# ) |
67 | | -# evaluation_run = client.evals.create_evaluation_run( |
68 | | -# name="test4", |
69 | | -# display_name="test4", |
70 | | -# dataset=types.EvaluationRunDataSource( |
71 | | -# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" |
72 | | -# ), |
73 | | -# dest=GCS_DEST, |
74 | | -# metrics=[ |
75 | | -# UNIVERSAL_AR_METRIC, |
76 | | -# types.RubricMetric.FINAL_RESPONSE_QUALITY, |
77 | | -# LLM_METRIC |
78 | | -# ], |
79 | | -# agent_info=types.AgentInfo( |
80 | | -# agent="project/123/locations/us-central1/reasoningEngines/456", |
81 | | -# name="agent-1", |
82 | | -# instruction="agent-1 instruction", |
83 | | -# tool_declarations=[tool], |
84 | | -# ), |
85 | | -# labels={"label1": "value1"}, |
86 | | -# ) |
87 | | -# assert isinstance(evaluation_run, types.EvaluationRun) |
88 | | -# assert evaluation_run.display_name == "test4" |
89 | | -# assert evaluation_run.state == types.EvaluationRunState.PENDING |
90 | | -# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) |
91 | | -# assert evaluation_run.data_source.evaluation_set == ( |
92 | | -# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" |
93 | | -# ) |
94 | | -# assert evaluation_run.evaluation_config == types.EvaluationRunConfig( |
95 | | -# output_config=genai_types.OutputConfig( |
96 | | -# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) |
97 | | -# ), |
98 | | -# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC], |
99 | | -# ) |
100 | | -# assert evaluation_run.inference_configs[ |
101 | | -# "agent-1" |
102 | | -# ] == types.EvaluationRunInferenceConfig( |
103 | | -# agent_config=types.EvaluationRunAgentConfig( |
104 | | -# developer_instruction=genai_types.Content( |
105 | | -# parts=[genai_types.Part(text="agent-1 instruction")] |
106 | | -# ), |
107 | | -# tools=[tool], |
108 | | -# ) |
109 | | -# ) |
110 | | -# assert evaluation_run.labels == { |
111 | | -# "vertex-ai-evaluation-agent-engine-id": "456", |
112 | | -# "label1": "value1", |
113 | | -# } |
114 | | -# assert evaluation_run.error is None |
| 98 | +def test_create_eval_run_data_source_evaluation_set(client): |
| 99 | + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" |
| 100 | + client._api_client._http_options.api_version = "v1beta1" |
| 101 | + tool = genai_types.Tool( |
| 102 | + function_declarations=[ |
| 103 | + genai_types.FunctionDeclaration( |
| 104 | + name="get_weather", |
| 105 | + description="Get weather in a location", |
| 106 | + parameters={ |
| 107 | + "type": "object", |
| 108 | + "properties": {"location": {"type": "string"}}, |
| 109 | + }, |
| 110 | + ) |
| 111 | + ] |
| 112 | + ) |
| 113 | + evaluation_run = client.evals.create_evaluation_run( |
| 114 | + name="test4", |
| 115 | + display_name="test4", |
| 116 | + dataset=types.EvaluationRunDataSource( |
| 117 | + evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" |
| 118 | + ), |
| 119 | + dest=GCS_DEST, |
| 120 | + metrics=[ |
| 121 | + UNIVERSAL_AR_METRIC, |
| 122 | + types.RubricMetric.FINAL_RESPONSE_QUALITY, |
| 123 | + LLM_METRIC, |
| 124 | + ], |
| 125 | + agent_info=types.evals.AgentInfo( |
| 126 | + agent="project/123/locations/us-central1/reasoningEngines/456", |
| 127 | + name="agent-1", |
| 128 | + instruction="agent-1 instruction", |
| 129 | + tool_declarations=[tool], |
| 130 | + ), |
| 131 | + labels={"label1": "value1"}, |
| 132 | + ) |
| 133 | + assert isinstance(evaluation_run, types.EvaluationRun) |
| 134 | + assert evaluation_run.display_name == "test4" |
| 135 | + assert evaluation_run.state == types.EvaluationRunState.PENDING |
| 136 | + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) |
| 137 | + assert evaluation_run.data_source.evaluation_set == ( |
| 138 | + "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" |
| 139 | + ) |
| 140 | + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( |
| 141 | + output_config=genai_types.OutputConfig( |
| 142 | + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) |
| 143 | + ), |
| 144 | + metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC], |
| 145 | + ) |
| 146 | + assert evaluation_run.inference_configs[ |
| 147 | + "agent-1" |
| 148 | + ] == types.EvaluationRunInferenceConfig( |
| 149 | + agent_config=types.EvaluationRunAgentConfig( |
| 150 | + developer_instruction=genai_types.Content( |
| 151 | + parts=[genai_types.Part(text="agent-1 instruction")] |
| 152 | + ), |
| 153 | + tools=[tool], |
| 154 | + ) |
| 155 | + ) |
| 156 | + assert evaluation_run.labels == { |
| 157 | + "vertex-ai-evaluation-agent-engine-id": "456", |
| 158 | + "label1": "value1", |
| 159 | + } |
| 160 | + assert evaluation_run.error is None |
115 | 161 |
|
116 | 162 |
|
117 | 163 | def test_create_eval_run_data_source_bigquery_request_set(client): |
@@ -160,101 +206,89 @@ def test_create_eval_run_data_source_bigquery_request_set(client): |
160 | 206 | assert evaluation_run.error is None |
161 | 207 |
|
162 | 208 |
|
163 | | -# Test fails in replay mode because of the timestamp issue |
| 209 | +# Test fails in replay mode because of UUID generation mismatch. |
164 | 210 | # def test_create_eval_run_data_source_evaluation_dataset(client): |
165 | 211 | # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" |
166 | | -# input_df = pd.DataFrame( |
167 | | -# { |
168 | | -# "prompt": ["prompt1", "prompt2"], |
169 | | -# "reference": ["reference1", "reference2"], |
170 | | -# "response": ["response1", "response2"], |
171 | | -# "intermediate_events": [ |
172 | | -# [ |
173 | | -# { |
174 | | -# "content": { |
175 | | -# "parts": [ |
176 | | -# {"text": "first user input"}, |
177 | | -# ], |
178 | | -# "role": "user", |
179 | | -# }, |
180 | | -# }, |
181 | | -# { |
182 | | -# "content": { |
183 | | -# "parts": [ |
184 | | -# {"text": "first model response"}, |
185 | | -# ], |
186 | | -# "role": "model", |
187 | | -# }, |
188 | | -# }, |
189 | | -# ], |
190 | | -# [ |
191 | | -# { |
192 | | -# "content": { |
193 | | -# "parts": [ |
194 | | -# {"text": "second user input"}, |
195 | | -# ], |
196 | | -# "role": "user", |
197 | | -# }, |
198 | | -# }, |
199 | | -# { |
200 | | -# "content": { |
201 | | -# "parts": [ |
202 | | -# {"text": "second model response"}, |
203 | | -# ], |
204 | | -# "role": "model", |
205 | | -# }, |
206 | | -# }, |
207 | | -# ], |
208 | | -# ], |
209 | | -# } |
210 | | -# ) |
211 | 212 | # evaluation_run = client.evals.create_evaluation_run( |
212 | 213 | # name="test6", |
213 | 214 | # display_name="test6", |
214 | 215 | # dataset=types.EvaluationDataset( |
215 | 216 | # candidate_name="candidate_1", |
216 | | -# eval_dataset_df=input_df, |
| 217 | +# eval_dataset_df=INPUT_DF, |
217 | 218 | # ), |
218 | | -# dest="gs://lakeyk-limited-bucket/eval_run_output", |
| 219 | +# dest=GCS_DEST, |
219 | 220 | # ) |
220 | 221 | # assert isinstance(evaluation_run, types.EvaluationRun) |
221 | 222 | # assert evaluation_run.display_name == "test6" |
222 | 223 | # assert evaluation_run.state == types.EvaluationRunState.PENDING |
223 | | -# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) |
224 | | -# # Check evaluation set |
225 | | -# assert evaluation_run.data_source.evaluation_set |
226 | | -# eval_set = client.evals.get_evaluation_set( |
227 | | -# name=evaluation_run.data_source.evaluation_set |
| 224 | +# check_evaluation_run_data_source(client, evaluation_run) |
| 225 | +# assert evaluation_run.error is None |
| 226 | + |
| 227 | + |
| 228 | +# Test fails in replay mode because of UUID generation mismatch.
| 229 | +# def test_create_eval_run_data_source_pandas_dataframe(client): |
| 230 | +# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with a Pandas DataFrame."""
| 231 | +# evaluation_run = client.evals.create_evaluation_run( |
| 232 | +# dataset=INPUT_DF, |
| 233 | +# dest=GCS_DEST, |
| 234 | +# ) |
| 235 | +# assert isinstance(evaluation_run, types.EvaluationRun) |
| 236 | +# assert evaluation_run.state == types.EvaluationRunState.PENDING |
| 237 | +# check_evaluation_run_data_source(client, evaluation_run) |
| 238 | +# assert evaluation_run.error is None |
| 239 | + |
| 240 | +# Test fails in replay mode because of UUID generation mismatch. |
| 241 | +# def test_create_eval_run_data_source_evaluation_dataset_dict(client): |
| 242 | +# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with an EvaluationDataset dict."""
| 243 | +# eval_dataset_dict = { |
| 244 | +# "candidate_name": "candidate_1", |
| 245 | +# "eval_dataset_df": INPUT_DF, |
| 246 | +# } |
| 247 | +# evaluation_run = client.evals.create_evaluation_run( |
| 248 | +# dataset=eval_dataset_dict, |
| 249 | +# dest=GCS_DEST, |
228 | 250 | # ) |
229 | | -# assert len(eval_set.evaluation_items) == 2 |
230 | | -# # Check evaluation items |
231 | | -# for i, eval_item_name in enumerate(eval_set.evaluation_items): |
232 | | -# eval_item = client.evals.get_evaluation_item(name=eval_item_name) |
233 | | -# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST |
234 | | -# assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] |
235 | | -# assert ( |
236 | | -# eval_item.evaluation_request.candidate_responses[0].text |
237 | | -# == input_df.iloc[i]["response"] |
238 | | -# ) |
239 | | -# assert ( |
240 | | -# eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text |
241 | | -# == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] |
242 | | -# ) |
243 | | -# assert ( |
244 | | -# eval_item.evaluation_request.candidate_responses[0].events[0].role |
245 | | -# == input_df.iloc[i]["intermediate_events"][0]["content"]["role"] |
246 | | -# ) |
247 | | -# assert ( |
248 | | -# eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text |
249 | | -# == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] |
250 | | -# ) |
251 | | -# assert ( |
252 | | -# eval_item.evaluation_request.candidate_responses[0].events[1].role |
253 | | -# == input_df.iloc[i]["intermediate_events"][1]["content"]["role"] |
254 | | -# ) |
| 251 | +# assert isinstance(evaluation_run, types.EvaluationRun) |
| 252 | +# assert evaluation_run.state == types.EvaluationRunState.PENDING |
| 253 | +# check_evaluation_run_data_source(client, evaluation_run) |
255 | 254 | # assert evaluation_run.error is None |
256 | 255 |
|
257 | 256 |
|
| 257 | +def check_evaluation_run_data_source(client, evaluation_run): |
| 258 | + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) |
| 259 | + # Check evaluation set |
| 260 | + assert evaluation_run.data_source.evaluation_set |
| 261 | + eval_set = client.evals.get_evaluation_set( |
| 262 | + name=evaluation_run.data_source.evaluation_set |
| 263 | + ) |
| 264 | + assert len(eval_set.evaluation_items) == 2 |
| 265 | + # Check evaluation items |
| 266 | + for i, eval_item_name in enumerate(eval_set.evaluation_items): |
| 267 | + eval_item = client.evals.get_evaluation_item(name=eval_item_name) |
| 268 | + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST |
| 269 | + assert eval_item.evaluation_request.prompt.text == INPUT_DF.iloc[i]["prompt"] |
| 270 | + assert ( |
| 271 | + eval_item.evaluation_request.candidate_responses[0].text |
| 272 | + == INPUT_DF.iloc[i]["response"] |
| 273 | + ) |
| 274 | + assert ( |
| 275 | + eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text |
| 276 | + == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] |
| 277 | + ) |
| 278 | + assert ( |
| 279 | + eval_item.evaluation_request.candidate_responses[0].events[0].role |
| 280 | + == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["role"] |
| 281 | + ) |
| 282 | + assert ( |
| 283 | + eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text |
| 284 | + == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] |
| 285 | + ) |
| 286 | + assert ( |
| 287 | + eval_item.evaluation_request.candidate_responses[0].events[1].role |
| 288 | + == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["role"] |
| 289 | + ) |
| 290 | + |
| 291 | + |
258 | 292 | pytest_plugins = ("pytest_asyncio",) |
259 | 293 |
|
260 | 294 |
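The replay-mode notes above attribute the three disabled tests to nondeterministic UUIDs minted while the DataFrame-backed dataset is uploaded. A minimal sketch of one way such a test could be made replay-safe, assuming the upload path ultimately calls Python's uuid.uuid4 (the patch target and the usage shown are assumptions for illustration, not the SDK's confirmed internals):

import itertools
import unittest.mock
import uuid

# Deterministic UUID factory so newly issued requests match the recorded replay.
_uuid_counter = itertools.count(1)


def _deterministic_uuid4():
    return uuid.UUID(int=next(_uuid_counter))


# Hypothetical usage inside a replay test:
# with unittest.mock.patch("uuid.uuid4", side_effect=_deterministic_uuid4):
#     evaluation_run = client.evals.create_evaluation_run(
#         dataset=INPUT_DF,
#         dest=GCS_DEST,
#     )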
|
|