
Commit 5229ec5

Merge branch 'main' into main_chelsealin_add
2 parents 3074a1f + a298a02 commit 5229ec5

10 files changed: +341 -56 lines changed

bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@

 from bigframes.core import window_spec
 import bigframes.core.compile.sqlglot.aggregations.op_registration as reg
-from bigframes.core.compile.sqlglot.aggregations.utils import apply_window_if_present
+from bigframes.core.compile.sqlglot.aggregations.windows import apply_window_if_present
 from bigframes.operations import aggregations as agg_ops

 NULLARY_OP_REGISTRATION = reg.OpRegistration()

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@

 from bigframes.core import window_spec
 import bigframes.core.compile.sqlglot.aggregations.op_registration as reg
-from bigframes.core.compile.sqlglot.aggregations.utils import apply_window_if_present
+from bigframes.core.compile.sqlglot.aggregations.windows import apply_window_if_present
 import bigframes.core.compile.sqlglot.expressions.typed_expr as typed_expr
 import bigframes.core.compile.sqlglot.sqlglot_ir as ir
 from bigframes.operations import aggregations as agg_ops

bigframes/core/compile/sqlglot/aggregations/utils.py

Lines changed: 0 additions & 29 deletions
This file was deleted.
bigframes/core/compile/sqlglot/aggregations/windows.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import typing
+
+import sqlglot.expressions as sge
+
+from bigframes.core import utils, window_spec
+import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
+import bigframes.core.ordering as ordering_spec
+
+
+def apply_window_if_present(
+    value: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    if window is None:
+        return value
+
+    if window.is_row_bounded and not window.ordering:
+        raise ValueError("No ordering provided for ordered analytic function")
+    elif (
+        not window.is_row_bounded
+        and not window.is_range_bounded
+        and not window.ordering
+    ):
+        # Unbound grouping window.
+        order_by = None
+    elif window.is_range_bounded:
+        # Note that, when the window is range-bounded, we only need one ordering key.
+        # There are two reasons:
+        # 1. Manipulating null positions requires more than one ordering key, which
+        #    is forbidden by SQL window syntax for range rolling.
+        # 2. Pandas does not allow range rolling on timeseries with nulls.
+        order_by = get_window_order_by((window.ordering[0],), override_null_order=False)
+    else:
+        order_by = get_window_order_by(window.ordering, override_null_order=True)
+
+    order = sge.Order(expressions=order_by) if order_by else None
+
+    group_by = (
+        [scalar_compiler.compile_scalar_expression(key) for key in window.grouping_keys]
+        if window.grouping_keys
+        else None
+    )
+
+    # This is the key change. Don't create a spec for the default window frame
+    # if there's no ordering. This avoids generating an `ORDER BY NULL` clause.
+    if not window.bounds and not order:
+        return sge.Window(this=value, partition_by=group_by)
+
+    kind = (
+        "ROWS" if isinstance(window.bounds, window_spec.RowsWindowBounds) else "RANGE"
+    )
+
+    start: typing.Union[int, float, None] = None
+    end: typing.Union[int, float, None] = None
+    if isinstance(window.bounds, window_spec.RangeWindowBounds):
+        if window.bounds.start is not None:
+            start = utils.timedelta_to_micros(window.bounds.start)
+        if window.bounds.end is not None:
+            end = utils.timedelta_to_micros(window.bounds.end)
+    elif window.bounds:
+        start = window.bounds.start
+        end = window.bounds.end
+
+    start_value, start_side = _get_window_bounds(start, is_preceding=True)
+    end_value, end_side = _get_window_bounds(end, is_preceding=False)
+
+    spec = sge.WindowSpec(
+        kind=kind,
+        start=start_value,
+        start_side=start_side,
+        end=end_value,
+        end_side=end_side,
+        over="OVER",
+    )
+
+    return sge.Window(this=value, partition_by=group_by, order=order, spec=spec)
+
+
+def get_window_order_by(
+    ordering: typing.Tuple[ordering_spec.OrderingExpression, ...],
+    override_null_order: bool = False,
+) -> typing.Optional[tuple[sge.Ordered, ...]]:
+    """Returns the SQL order by clause for a window specification."""
+    if not ordering:
+        return None
+
+    order_by = []
+    for ordering_spec_item in ordering:
+        expr = scalar_compiler.compile_scalar_expression(
+            ordering_spec_item.scalar_expression
+        )
+        desc = not ordering_spec_item.direction.is_ascending
+        nulls_first = not ordering_spec_item.na_last
+
+        if override_null_order:
+            # Bigquery SQL considers NULLS to be "smallest" values, but we need
+            # to override in these cases.
+            is_null_expr = sge.Is(this=expr, expression=sge.Null())
+            if nulls_first and desc:
+                order_by.append(
+                    sge.Ordered(
+                        this=is_null_expr,
+                        desc=desc,
+                        nulls_first=nulls_first,
+                    )
+                )
+            elif not nulls_first and not desc:
+                order_by.append(
+                    sge.Ordered(
+                        this=is_null_expr,
+                        desc=desc,
+                        nulls_first=nulls_first,
+                    )
+                )
+
+        order_by.append(
+            sge.Ordered(
+                this=expr,
+                desc=desc,
+                nulls_first=nulls_first,
+            )
+        )
+    return tuple(order_by)
+
+
+def _get_window_bounds(
+    value, is_preceding: bool
+) -> tuple[typing.Union[str, sge.Expression], typing.Optional[str]]:
+    """Compiles a single boundary value into its SQL components."""
+    if value is None:
+        side = "PRECEDING" if is_preceding else "FOLLOWING"
+        return "UNBOUNDED", side
+
+    if value == 0:
+        return "CURRENT ROW", None
+
+    side = "PRECEDING" if value < 0 else "FOLLOWING"
+    return sge.convert(abs(value)), side
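
A hedged sketch (not part of the commit) of the kind of expression the new module assembles, built with the same public sqlglot constructors the file itself uses; the columns `x`, `grp`, and `ts` are invented for illustration:

import sqlglot.expressions as sge

# Running sum per group, ordered by ts, framed the way apply_window_if_present
# frames a rows-bounded window from UNBOUNDED PRECEDING to CURRENT ROW.
value = sge.func("SUM", sge.column("x"))
window = sge.Window(
    this=value,
    partition_by=[sge.column("grp")],
    order=sge.Order(expressions=[sge.Ordered(this=sge.column("ts"), desc=False)]),
    spec=sge.WindowSpec(
        kind="ROWS",
        start="UNBOUNDED",
        start_side="PRECEDING",
        end="CURRENT ROW",
        end_side=None,
        over="OVER",
    ),
)
print(window.sql(dialect="bigquery"))
# Roughly: SUM(x) OVER (PARTITION BY grp ORDER BY ts
#   ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)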

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 9 additions & 10 deletions
@@ -23,6 +23,7 @@
 from bigframes.core import expression, guid, identifiers, nodes, pyarrow_utils, rewrite
 from bigframes.core.compile import configs
 import bigframes.core.compile.sqlglot.aggregate_compiler as aggregate_compiler
+from bigframes.core.compile.sqlglot.aggregations import windows
 from bigframes.core.compile.sqlglot.expressions import typed_expr
 import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
 import bigframes.core.compile.sqlglot.sqlglot_ir as ir
@@ -272,18 +273,16 @@ def compile_random_sample(
     def compile_aggregate(
         self, node: nodes.AggregateNode, child: ir.SQLGlotIR
     ) -> ir.SQLGlotIR:
-        ordering_cols = tuple(
-            sge.Ordered(
-                this=scalar_compiler.compile_scalar_expression(
-                    ordering.scalar_expression
-                ),
-                desc=ordering.direction.is_ascending is False,
-                nulls_first=ordering.na_last is False,
-            )
-            for ordering in node.order_by
+        ordering_cols = windows.get_window_order_by(
+            node.order_by, override_null_order=True
         )
         aggregations: tuple[tuple[str, sge.Expression], ...] = tuple(
-            (id.sql, aggregate_compiler.compile_aggregate(agg, order_by=ordering_cols))
+            (
+                id.sql,
+                aggregate_compiler.compile_aggregate(
+                    agg, order_by=ordering_cols if ordering_cols else ()
+                ),
+            )
             for agg, id in node.aggregations
         )
         by_cols: tuple[sge.Expression, ...] = tuple(
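
Context for the shared helper: BigQuery sorts NULLs as the smallest values, so `get_window_order_by` with `override_null_order=True` prepends an `IS NULL` key whenever the requested null placement disagrees with that default (nulls first under DESC, or nulls last under ASC). A hedged sketch (not from the commit; the column `x` is invented) of the ORDER BY it would emit for a descending, nulls-first key:

import sqlglot.expressions as sge

col = sge.column("x")
# The extra IS NULL key forces NULLs to sort first even though plain
# DESC would leave them last under BigQuery's defaults.
order = sge.Order(
    expressions=[
        sge.Ordered(
            this=sge.Is(this=col, expression=sge.Null()),
            desc=True,
            nulls_first=True,
        ),
        sge.Ordered(this=col, desc=True, nulls_first=True),
    ]
)
print(order.sql(dialect="bigquery"))
# Roughly: ORDER BY x IS NULL DESC NULLS FIRST, x DESC NULLS FIRST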

samples/snippets/conftest.py

Lines changed: 18 additions & 7 deletions
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Iterator
+from typing import Generator, Iterator

-from google.cloud import bigquery
+from google.cloud import bigquery, storage
 import pytest
 import test_utils.prefixer

@@ -42,11 +42,27 @@ def bigquery_client() -> bigquery.Client:
     return bigquery_client


+@pytest.fixture(scope="session")
+def storage_client(project_id: str) -> storage.Client:
+    return storage.Client(project=project_id)
+
+
 @pytest.fixture(scope="session")
 def project_id(bigquery_client: bigquery.Client) -> str:
     return bigquery_client.project


+@pytest.fixture(scope="session")
+def gcs_bucket(storage_client: storage.Client) -> Generator[str, None, None]:
+    bucket_name = "bigframes_blob_test_with_data_wipeout"
+
+    yield bucket_name
+
+    bucket = storage_client.get_bucket(bucket_name)
+    for blob in bucket.list_blobs():
+        blob.delete()
+
+
 @pytest.fixture(autouse=True)
 def reset_session() -> None:
     """An autouse fixture ensuring each sample runs in a fresh session.
@@ -78,11 +94,6 @@ def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator
     bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)


-@pytest.fixture(scope="session")
-def gcs_dst_bucket() -> str:
-    return "gs://bigframes_blob_test"
-
-
 @pytest.fixture
 def random_model_id(
     bigquery_client: bigquery.Client, project_id: str, dataset_id: str

samples/snippets/multimodal_test.py

Lines changed: 2 additions & 2 deletions
@@ -13,9 +13,9 @@
 # limitations under the License.


-def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
+def test_multimodal_dataframe(gcs_bucket: str) -> None:
     # destination folder must be in a GCS bucket that the BQ connection service account (default or user provided) has write access to.
-    dst_bucket = gcs_dst_bucket
+    dst_bucket = f"gs://{gcs_bucket}"
     # [START bigquery_dataframes_multimodal_dataframe_create]
     import bigframes

samples/snippets/sessions_and_io_test.py

Lines changed: 11 additions & 1 deletion
@@ -13,10 +13,11 @@
 # limitations under the License.


-def test_sessions_and_io(project_id: str, dataset_id: str) -> None:
+def test_sessions_and_io(project_id: str, dataset_id: str, gcs_bucket: str) -> None:
     YOUR_PROJECT_ID = project_id
     YOUR_DATASET_ID = dataset_id
     YOUR_LOCATION = "us"
+    YOUR_BUCKET = gcs_bucket

     # [START bigquery_dataframes_create_and_use_session_instance]
     import bigframes
@@ -139,6 +140,15 @@ def test_sessions_and_io(project_id: str, dataset_id: str) -> None:
     # [END bigquery_dataframes_read_data_from_csv]
     assert df is not None

+    # [START bigquery_dataframes_write_data_to_csv]
+    import bigframes.pandas as bpd
+
+    df = bpd.DataFrame({"my_col": [1, 2, 3]})
+    # Write a dataframe to a CSV file in GCS
+    df.to_csv(f"gs://{YOUR_BUCKET}/myfile*.csv")
+    # [END bigquery_dataframes_write_data_to_csv]
+    assert df is not None
+
     # [START bigquery_dataframes_read_data_from_bigquery_table]
     import bigframes.pandas as bpd

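Note on the new snippet: the `*` in `myfile*.csv` is a wildcard in the GCS object name; it lets BigQuery shard the exported CSV across multiple objects, which is generally required when the result is too large for a single file.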
tests/system/small/blob/test_properties.py

Lines changed: 5 additions & 5 deletions
@@ -40,7 +40,7 @@ def test_blob_authorizer(images_mm_df: bpd.DataFrame, bq_connection: str):

 def test_blob_version(images_mm_df: bpd.DataFrame):
     actual = images_mm_df["blob_col"].blob.version().to_pandas()
-    expected = pd.Series(["1739574332294150", "1739574332271343"], name="version")
+    expected = pd.Series(["1753907851152593", "1753907851111538"], name="version")

     pd.testing.assert_series_equal(
         actual, expected, check_dtype=False, check_index_type=False
@@ -55,13 +55,13 @@ def test_blob_metadata(images_mm_df: bpd.DataFrame):
             '{"content_type":"image/jpeg",'
             '"md5_hash":"e130ad042261a1883cd2cc06831cf748",'
             '"size":338390,'
-            '"updated":1739574332000000}'
+            '"updated":1753907851000000}'
         ),
         (
             '{"content_type":"image/jpeg",'
             '"md5_hash":"e2ae3191ff2b809fd0935f01a537c650",'
             '"size":43333,'
-            '"updated":1739574332000000}'
+            '"updated":1753907851000000}'
         ),
     ],
     name="metadata",
@@ -105,8 +105,8 @@ def test_blob_updated(images_mm_df: bpd.DataFrame):
     actual = images_mm_df["blob_col"].blob.updated().to_pandas()
     expected = pd.Series(
         [
-            pd.Timestamp("2025-02-14 23:05:32", tz="UTC"),
-            pd.Timestamp("2025-02-14 23:05:32", tz="UTC"),
+            pd.Timestamp("2025-07-30 20:37:31", tz="UTC"),
+            pd.Timestamp("2025-07-30 20:37:31", tz="UTC"),
         ],
         name="updated",
     )
