|
14 | 14 |
|
15 | 15 | from __future__ import annotations |
16 | 16 |
|
17 | | -from typing import cast, Mapping, Optional, Union |
| 17 | +from typing import cast, List, Mapping, Optional, Union |
18 | 18 |
|
19 | 19 | import bigframes_vendored.constants |
20 | 20 | import google.cloud.bigquery |
@@ -393,3 +393,190 @@ def global_explain( |
393 | 393 | return bpd.read_gbq_query(sql) |
394 | 394 | else: |
395 | 395 | return session.read_gbq_query(sql) |
| 396 | + |
| 397 | + |
@log_adapter.method_logger(custom_base_name="bigquery_ml")
def transform(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Union[pd.DataFrame, dataframe.DataFrame, str],
) -> dataframe.DataFrame:
    """
    Transforms input data using a BigQuery ML model.

    See the `BigQuery ML TRANSFORM function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-transform>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator or str):
            The model to use for transformation.
        input_ (Union[bigframes.pandas.DataFrame, str]):
            The DataFrame or query to use for transformation.

    Returns:
        bigframes.pandas.DataFrame:
            The transformed data.
    """
    # Imported lazily to avoid a circular import at module load time.
    import bigframes.pandas as bpd

    resolved_model, bound_session = _get_model_name_and_session(model, input_)

    query = bigframes.core.sql.ml.transform(
        model_name=resolved_model,
        table=_to_sql(input_),
    )

    # When the input carried no session of its own, fall back to the global
    # pandas-style entry point; otherwise stay on the input's session.
    if bound_session is not None:
        return bound_session.read_gbq_query(query)
    return bpd.read_gbq_query(query)
| 434 | + |
| 435 | + |
@log_adapter.method_logger(custom_base_name="bigquery_ml")
def generate_text(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Union[pd.DataFrame, dataframe.DataFrame, str],
    *,
    temperature: Optional[float] = None,
    max_output_tokens: Optional[int] = None,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    flatten_json_output: Optional[bool] = None,
    stop_sequences: Optional[List[str]] = None,
    ground_with_google_search: Optional[bool] = None,
    request_type: Optional[str] = None,
) -> dataframe.DataFrame:
    """
    Generates text using a BigQuery ML model.

    See the `BigQuery ML GENERATE_TEXT function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-text>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator or str):
            The model to use for text generation.
        input_ (Union[bigframes.pandas.DataFrame, str]):
            The DataFrame or query to use for text generation.
        temperature (float, optional):
            A FLOAT64 value that controls the degree of randomness in token
            selection. The value must be in the range ``[0.0, 1.0]``. A lower
            temperature works well for prompts that expect a more
            deterministic and less open-ended or creative response, while a
            higher temperature can lead to more diverse or creative results.
            A temperature of ``0`` is deterministic, meaning that the highest
            probability response is always selected.
        max_output_tokens (int, optional):
            An INT64 value that sets the maximum number of tokens in the
            generated text.
        top_k (int, optional):
            An INT64 value that changes how the model selects tokens for
            output. A ``top_k`` of ``1`` means the next selected token is the
            most probable among all tokens in the model's vocabulary. A
            ``top_k`` of ``3`` means that the next token is selected from
            among the three most probable tokens by using temperature. The
            default value is ``40``.
        top_p (float, optional):
            A FLOAT64 value that changes how the model selects tokens for
            output. Tokens are selected from most probable to least probable
            until the sum of their probabilities equals the ``top_p`` value.
            For example, if tokens A, B, and C have a probability of 0.3, 0.2,
            and 0.1 and the ``top_p`` value is ``0.5``, then the model will
            select either A or B as the next token by using temperature. The
            default value is ``0.95``.
        flatten_json_output (bool, optional):
            A BOOL value that determines the content of the generated JSON column.
        stop_sequences (List[str], optional):
            An ARRAY<STRING> value that contains the stop sequences for the model.
        ground_with_google_search (bool, optional):
            A BOOL value that determines whether to ground the model with Google Search.
        request_type (str, optional):
            A STRING value that contains the request type for the model.

    Returns:
        bigframes.pandas.DataFrame:
            The generated text.
    """
    # Imported lazily to avoid a circular import at module load time.
    import bigframes.pandas as bpd

    model_name, session = _get_model_name_and_session(model, input_)
    table_sql = _to_sql(input_)

    # All tuning knobs are forwarded as-is; the SQL builder is responsible
    # for omitting any that are None.
    sql = bigframes.core.sql.ml.generate_text(
        model_name=model_name,
        table=table_sql,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        top_k=top_k,
        top_p=top_p,
        flatten_json_output=flatten_json_output,
        stop_sequences=stop_sequences,
        ground_with_google_search=ground_with_google_search,
        request_type=request_type,
    )

    # Prefer the session attached to the input; only fall back to the global
    # entry point when none was found.
    if session is None:
        return bpd.read_gbq_query(sql)
    else:
        return session.read_gbq_query(sql)
| 523 | + |
| 524 | + |
@log_adapter.method_logger(custom_base_name="bigquery_ml")
def generate_embedding(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Union[pd.DataFrame, dataframe.DataFrame, str],
    *,
    flatten_json_output: Optional[bool] = None,
    task_type: Optional[str] = None,
    output_dimensionality: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Generates text embedding using a BigQuery ML model.

    See the `BigQuery ML GENERATE_EMBEDDING function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator or str):
            The model to use for text embedding.
        input_ (Union[bigframes.pandas.DataFrame, str]):
            The DataFrame or query to use for text embedding.
        flatten_json_output (bool, optional):
            A BOOL value that determines the content of the generated JSON column.
        task_type (str, optional):
            A STRING value that specifies the intended downstream application task.
            Supported values are:
            - `RETRIEVAL_QUERY`
            - `RETRIEVAL_DOCUMENT`
            - `SEMANTIC_SIMILARITY`
            - `CLASSIFICATION`
            - `CLUSTERING`
            - `QUESTION_ANSWERING`
            - `FACT_VERIFICATION`
            - `CODE_RETRIEVAL_QUERY`
        output_dimensionality (int, optional):
            An INT64 value that specifies the size of the output embedding.

    Returns:
        bigframes.pandas.DataFrame:
            The generated text embedding.
    """
    # Imported lazily to avoid a circular import at module load time.
    import bigframes.pandas as bpd

    resolved_model, bound_session = _get_model_name_and_session(model, input_)

    query = bigframes.core.sql.ml.generate_embedding(
        model_name=resolved_model,
        table=_to_sql(input_),
        flatten_json_output=flatten_json_output,
        task_type=task_type,
        output_dimensionality=output_dimensionality,
    )

    # Run on the input's session when one exists; otherwise use the global
    # pandas-style entry point.
    if bound_session is not None:
        return bound_session.read_gbq_query(query)
    return bpd.read_gbq_query(query)
0 commit comments