2828from bigframes import series , session
2929from bigframes .core import convert
3030from bigframes .core .logging import log_adapter
31+ import bigframes .core .sql .literals
3132from bigframes .ml import core as ml_core
3233from bigframes .operations import ai_ops , output_schemas
3334
@@ -394,9 +395,11 @@ def generate_embedding(
394395 data : Union [dataframe .DataFrame , series .Series , pd .DataFrame , pd .Series ],
395396 * ,
396397 output_dimensionality : Optional [int ] = None ,
398+ task_type : Optional [str ] = None ,
397399 start_second : Optional [float ] = None ,
398400 end_second : Optional [float ] = None ,
399401 interval_seconds : Optional [float ] = None ,
402+ trial_id : Optional [int ] = None ,
400403) -> dataframe .DataFrame :
401404 """
402405 Creates embeddings that describe an entity—for example, a piece of text or an image.
@@ -414,32 +417,49 @@ def generate_embedding(
414417 Args:
415418 model_name (str):
416419 The name of a remote model over a Vertex AI multimodalembedding@001 model.
417- data (DataFrame or Series):
418- The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column.
419- If a DataFrame is provided, it must contain a 'content' column, or you must rename the column you wish to embed to 'content'.
420+ data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
421+ The data to generate embeddings for. If a Series is provided, it is
422+ treated as the 'content' column. If a DataFrame is provided, it
423+ must contain a 'content' column, or you must rename the column you
424+ wish to embed to 'content'.
420425 output_dimensionality (int, optional):
421- The number of dimensions to use when generating embeddings. Valid values are 128, 256, 512, and 1408. The default value is 1408.
426+ An INT64 value that specifies the number of dimensions to use when
427+ generating embeddings. For example, if you specify 256 AS
428+ output_dimensionality, then the embedding output column contains a
429+ 256-dimensional embedding for each input value. To find the
430+ supported range of output dimensions, read about the available
431+ `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
432+ task_type (str, optional):
433+ A STRING literal that specifies the intended downstream application to
434+ help the model produce better quality embeddings. For a list of
435+ supported task types and how to choose which one to use, see `Choose an
436+ embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
422437 start_second (float, optional):
423438 The second in the video at which to start the embedding. The default value is 0.
424439 end_second (float, optional):
425440 The second in the video at which to end the embedding. The default value is 120.
426441 interval_seconds (float, optional):
427442 The interval to use when creating embeddings. The default value is 16.
443+ trial_id (int, optional):
444+ An INT64 value that identifies the hyperparameter tuning trial that
445+ you want the function to evaluate. The function uses the optimal
446+ trial by default. Only specify this argument if you ran
447+ hyperparameter tuning when creating the model.
428448
429449 Returns:
430- bigframes.dataframe.DataFrame:
431- A new DataFrame with the generated embeddings. It contains the input table columns and the following columns:
432- * "embedding": an ARRAY<FLOAT64> value that contains the generated embedding vector.
433- * "status": a STRING value that contains the API response status for the corresponding row.
434- * "video_start_sec": for video content, an INT64 value that contains the starting second.
435- * "video_end_sec": for video content, an INT64 value that contains the ending second.
450+ bigframes.pandas.DataFrame:
451+ A new DataFrame with the generated embeddings. See the `SQL
452+ reference for AI.GENERATE_EMBEDDING
453+ <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
454+ for details.
436455 """
437456 if isinstance (data , (pd .DataFrame , pd .Series )):
438457 data = bpd .read_pandas (data )
439458
440459 if isinstance (data , series .Series ):
441- # Rename series to 'content' and convert to DataFrame
442- data_df = data .rename ("content" ).to_frame ()
460+ data = data .copy ()
461+ data .name = "content"
462+ data_df = data .to_frame ()
443463 elif isinstance (data , dataframe .DataFrame ):
444464 data_df = data
445465 else :
@@ -448,25 +468,27 @@ def generate_embedding(
448468 # We need to get the SQL for the input data to pass as a subquery to the TVF
449469 source_sql = data_df .sql
450470
451- struct_fields = []
471+ struct_fields = {}
452472 if output_dimensionality is not None :
453- struct_fields .append (f"{ output_dimensionality } AS output_dimensionality" )
473+ struct_fields ["OUTPUT_DIMENSIONALITY" ] = output_dimensionality
474+ if task_type is not None :
475+ struct_fields ["TASK_TYPE" ] = task_type
454476 if start_second is not None :
455- struct_fields . append ( f" { start_second } AS start_second" )
477+ struct_fields [ "START_SECOND" ] = start_second
456478 if end_second is not None :
457- struct_fields . append ( f" { end_second } AS end_second" )
479+ struct_fields [ "END_SECOND" ] = end_second
458480 if interval_seconds is not None :
459- struct_fields . append ( f" { interval_seconds } AS interval_seconds" )
460-
461- struct_args = ", " . join ( struct_fields )
481+ struct_fields [ "INTERVAL_SECONDS" ] = interval_seconds
482+ if trial_id is not None :
483+ struct_fields [ "TRIAL_ID" ] = trial_id
462484
463485 # Construct the TVF query
464486 query = f"""
465487 SELECT *
466488 FROM AI.GENERATE_EMBEDDING(
467489 MODEL `{ model_name } `,
468490 ({ source_sql } ),
469- STRUCT( { struct_args } )
491+ { bigframes . core . sql . literals . struct_literal ( struct_fields ) } )
470492 )
471493 """
472494
0 commit comments