Commit e8629d2

Merge branch 'main' into generate-embedding-impl-11924477578091076513
2 parents fa54cb2 + 4b0f13b

6 files changed: +356 −0 lines changed

bigframes/bigquery/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -43,6 +43,7 @@
     st_regionstats,
     st_simplify,
 )
+from bigframes.bigquery._operations.io import load_data
 from bigframes.bigquery._operations.json import (
     json_extract,
     json_extract_array,
@@ -107,6 +108,8 @@
     struct,
     # table ops
     create_external_table,
+    # io ops
+    load_data,
 ]

 _module = sys.modules[__name__]
@@ -160,6 +163,8 @@
     "struct",
     # table ops
     "create_external_table",
+    # io ops
+    "load_data",
     # Modules / SQL namespaces
     "ai",
     "ml",
bigframes/bigquery/_operations/io.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Mapping, Optional, Union

import pandas as pd

from bigframes.bigquery._operations.table import _get_table_metadata
import bigframes.core.logging.log_adapter as log_adapter
import bigframes.core.sql.io
import bigframes.session


@log_adapter.method_logger(custom_base_name="bigquery_io")
def load_data(
    table_name: str,
    *,
    write_disposition: str = "INTO",
    columns: Optional[Mapping[str, str]] = None,
    partition_by: Optional[list[str]] = None,
    cluster_by: Optional[list[str]] = None,
    table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None,
    from_files_options: Mapping[str, Union[str, int, float, bool, list]],
    with_partition_columns: Optional[Mapping[str, str]] = None,
    connection_name: Optional[str] = None,
    session: Optional[bigframes.session.Session] = None,
) -> pd.Series:
    """Loads data into a BigQuery table.

    See the `BigQuery LOAD DATA DDL syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements>`_
    for additional reference.

    Args:
        table_name (str):
            The name of the table in BigQuery.
        write_disposition (str, default "INTO"):
            Whether to replace the table if it already exists ("OVERWRITE")
            or append to it ("INTO").
        columns (Mapping[str, str], optional):
            The table's schema, given as a mapping from column name to
            BigQuery data type.
        partition_by (list[str], optional):
            A list of partition expressions to partition the table by. See
            https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#partition_expression.
        cluster_by (list[str], optional):
            A list of columns to cluster the table by.
        table_options (Mapping[str, Union[str, int, float, bool, list]], optional):
            Table options, rendered into the statement's OPTIONS clause.
        from_files_options (Mapping[str, Union[str, int, float, bool, list]]):
            Options describing the source files, rendered into the FROM FILES
            clause (for example, format and uris).
        with_partition_columns (Mapping[str, str], optional):
            The table's partition columns, rendered into the WITH PARTITION
            COLUMNS clause.
        connection_name (str, optional):
            The connection to use for the table, rendered into the WITH
            CONNECTION clause.
        session (bigframes.session.Session, optional):
            The session to use. If not provided, the default session is used.

    Returns:
        pandas.Series:
            A Series with object dtype containing the table metadata.
            Reference the `BigQuery Table REST API reference
            <https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table>`_
            for available fields.
    """
    import bigframes.pandas as bpd

    sql = bigframes.core.sql.io.load_data_ddl(
        table_name=table_name,
        write_disposition=write_disposition,
        columns=columns,
        partition_by=partition_by,
        cluster_by=cluster_by,
        table_options=table_options,
        from_files_options=from_files_options,
        with_partition_columns=with_partition_columns,
        connection_name=connection_name,
    )

    if session is None:
        bpd.read_gbq_query(sql)
        session = bpd.get_global_session()
    else:
        session.read_gbq_query(sql)

    return _get_table_metadata(bqclient=session.bqclient, table_name=table_name)
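
For reference, here is a sketch of how load_data could be called end to end. The project, dataset, and bucket names are hypothetical, and the snippet assumes a configured BigQuery DataFrames session with permission to create the target table:

import bigframes.bigquery as bbq

# Hypothetical identifiers; substitute your own project, dataset, and URIs.
metadata = bbq.load_data(
    "my-project.my_dataset.events",
    write_disposition="OVERWRITE",  # replace the table if it already exists
    columns={"event_ts": "TIMESTAMP", "payload": "STRING"},
    partition_by=["DATE(event_ts)"],
    cluster_by=["payload"],
    from_files_options={
        "format": "CSV",
        "uris": ["gs://my-bucket/events/*.csv"],
        "skip_leading_rows": 1,
    },
)

# The returned Series mirrors the BigQuery Table REST resource; "numRows" is
# one plausible field name, shown here as an assumption rather than a contract.
print(metadata.get("numRows"))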

bigframes/core/sql/io.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Mapping, Optional, Union


def load_data_ddl(
    table_name: str,
    *,
    write_disposition: str = "INTO",
    columns: Optional[Mapping[str, str]] = None,
    partition_by: Optional[list[str]] = None,
    cluster_by: Optional[list[str]] = None,
    table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None,
    from_files_options: Mapping[str, Union[str, int, float, bool, list]],
    with_partition_columns: Optional[Mapping[str, str]] = None,
    connection_name: Optional[str] = None,
) -> str:
    """Generates the LOAD DATA DDL statement."""
    statement = ["LOAD DATA"]
    statement.append(write_disposition)
    statement.append(table_name)

    if columns:
        column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()])
        statement.append(f"({column_defs})")

    if partition_by:
        statement.append(f"PARTITION BY {', '.join(partition_by)}")

    if cluster_by:
        statement.append(f"CLUSTER BY {', '.join(cluster_by)}")

    if table_options:
        opts = []
        for key, value in table_options.items():
            if isinstance(value, str):
                value_sql = repr(value)
                opts.append(f"{key} = {value_sql}")
            elif isinstance(value, bool):
                opts.append(f"{key} = {str(value).upper()}")
            elif isinstance(value, list):
                list_str = ", ".join([repr(v) for v in value])
                opts.append(f"{key} = [{list_str}]")
            else:
                opts.append(f"{key} = {value}")
        options_str = ", ".join(opts)
        statement.append(f"OPTIONS ({options_str})")

    opts = []
    for key, value in from_files_options.items():
        if isinstance(value, str):
            value_sql = repr(value)
            opts.append(f"{key} = {value_sql}")
        elif isinstance(value, bool):
            opts.append(f"{key} = {str(value).upper()}")
        elif isinstance(value, list):
            list_str = ", ".join([repr(v) for v in value])
            opts.append(f"{key} = [{list_str}]")
        else:
            opts.append(f"{key} = {value}")
    options_str = ", ".join(opts)
    statement.append(f"FROM FILES ({options_str})")

    if with_partition_columns:
        part_defs = ", ".join(
            [f"{name} {typ}" for name, typ in with_partition_columns.items()]
        )
        statement.append(f"WITH PARTITION COLUMNS ({part_defs})")

    if connection_name:
        statement.append(f"WITH CONNECTION `{connection_name}`")

    return " ".join(statement)
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.bigquery as bbq


def test_load_data(session, dataset_id):
    table_name = f"{dataset_id}.test_load_data"
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

    # Load the CSV file into a new BigQuery table.
    table = bbq.load_data(
        table_name,
        columns={
            "name": "STRING",
            "post_abbr": "STRING",
        },
        from_files_options={"format": "CSV", "uris": [uri], "skip_leading_rows": 1},
        session=session,
    )
    assert table is not None

    # Read the table back to verify the load succeeded.
    import bigframes.pandas as bpd

    bf_df = bpd.read_gbq(table_name)
    pd_df = bf_df.to_pandas()
    assert len(pd_df) > 0
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest import mock

import pytest

import bigframes.bigquery._operations.io
import bigframes.core.sql.io
import bigframes.session


@pytest.fixture
def mock_session():
    return mock.create_autospec(spec=bigframes.session.Session)


@mock.patch("bigframes.bigquery._operations.io._get_table_metadata")
def test_load_data(get_table_metadata_mock, mock_session):
    bigframes.bigquery._operations.io.load_data(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
        session=mock_session,
    )
    mock_session.read_gbq_query.assert_called_once()
    generated_sql = mock_session.read_gbq_query.call_args[0][0]
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert generated_sql == expected
    get_table_metadata_mock.assert_called_once()
tests/unit/core/sql/test_io.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.core.sql.io


def test_load_data_ddl():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert sql == expected


def test_load_data_ddl_overwrite():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        write_disposition="OVERWRITE",
        columns={"col1": "INT64", "col2": "STRING"},
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA OVERWRITE my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert sql == expected


def test_load_data_ddl_with_partition_columns():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        with_partition_columns={"part1": "DATE", "part2": "STRING"},
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH PARTITION COLUMNS (part1 DATE, part2 STRING)"
    assert sql == expected


def test_load_data_ddl_connection():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        connection_name="my-connection",
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH CONNECTION `my-connection`"
    assert sql == expected


def test_load_data_ddl_partition_by():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        partition_by=["date_col"],
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) PARTITION BY date_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert sql == expected


def test_load_data_ddl_cluster_by():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        cluster_by=["cluster_col"],
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) CLUSTER BY cluster_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert sql == expected


def test_load_data_ddl_table_options():
    sql = bigframes.core.sql.io.load_data_ddl(
        "my-project.my_dataset.my_table",
        columns={"col1": "INT64", "col2": "STRING"},
        table_options={"description": "my table"},
        from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]},
    )
    expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (description = 'my table') FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])"
    assert sql == expected