
Commit 8fe7c9d

Merge remote-tracking branch 'github/main' into grouped_value_counts
2 parents 952e446 + cd954ac commit 8fe7c9d

File tree

7 files changed (+293 / -16 lines)

bigframes/core/compile/compiled.py

Lines changed: 1 addition & 1 deletion

@@ -459,7 +459,7 @@ def project_window_op(
             for column in inputs:
                 clauses.append((column.isnull(), ibis_types.null()))
         if window_spec.min_periods and len(inputs) > 0:
-            if expression.op.skips_nulls:
+            if not expression.op.nulls_count_for_min_values:
                 # Most operations do not count NULL values towards min_periods
                 per_col_does_count = (column.notnull() for column in inputs)
                 # All inputs must be non-null for observation to count
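
Context for the change above: FirstNonNullOp and LastNonNullOp report skips_nulls=False because they must observe null rows, yet those null rows should still not count towards min_periods. The old check conflated the two concerns; the new nulls_count_for_min_values property separates them. A minimal stand-alone sketch of the counting rule, with hypothetical names standing in for the real ibis expressions:

    from typing import List, Optional

    def rows_counting_toward_min_periods(
        nulls_count_for_min_values: bool, values: List[Optional[int]]
    ) -> int:
        """How many rows in `values` count as observations for min_periods."""
        if not nulls_count_for_min_values:
            # Most ops, now including first()/last(): only non-null rows count.
            return sum(v is not None for v in values)
        # Ops that genuinely treat null rows as observations.
        return len(values)

    # first()/last() inspect the null row but do not count it.
    assert rows_counting_toward_min_periods(False, [None, 1, 2]) == 2
    assert rows_counting_toward_min_periods(True, [None, 1, 2]) == 3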

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 42 additions & 0 deletions

@@ -263,6 +263,48 @@ def kurt(

     kurtosis = kurt

+    @validations.requires_ordering()
+    def first(self, numeric_only: bool = False, min_count: int = -1) -> df.DataFrame:
+        window_spec = window_specs.unbound(
+            grouping_keys=tuple(self._by_col_ids),
+            min_periods=min_count if min_count >= 0 else 0,
+        )
+        target_cols, index = self._aggregated_columns(numeric_only)
+        block, firsts_ids = self._block.multi_apply_window_op(
+            target_cols,
+            agg_ops.FirstNonNullOp(),
+            window_spec=window_spec,
+        )
+        block, _ = block.aggregate(
+            self._by_col_ids,
+            tuple(
+                aggs.agg(firsts_id, agg_ops.AnyValueOp()) for firsts_id in firsts_ids
+            ),
+            dropna=self._dropna,
+            column_labels=index,
+        )
+        return df.DataFrame(block)
+
+    @validations.requires_ordering()
+    def last(self, numeric_only: bool = False, min_count: int = -1) -> df.DataFrame:
+        window_spec = window_specs.unbound(
+            grouping_keys=tuple(self._by_col_ids),
+            min_periods=min_count if min_count >= 0 else 0,
+        )
+        target_cols, index = self._aggregated_columns(numeric_only)
+        block, lasts_ids = self._block.multi_apply_window_op(
+            target_cols,
+            agg_ops.LastNonNullOp(),
+            window_spec=window_spec,
+        )
+        block, _ = block.aggregate(
+            self._by_col_ids,
+            tuple(aggs.agg(lasts_id, agg_ops.AnyValueOp()) for lasts_id in lasts_ids),
+            dropna=self._dropna,
+            column_labels=index,
+        )
+        return df.DataFrame(block)
+
     def all(self) -> df.DataFrame:
         return self._aggregate_all(agg_ops.all_op)
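
For orientation, a minimal usage sketch of the new DataFrameGroupBy.first/last (the data and expected outputs are illustrative, assuming a configured BigQuery DataFrames session):

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"key": ["a", "a", "b", "b"], "val": [None, 1.0, 2.0, None]})
    grouped = df.groupby("key")

    # First/last non-null value per group, in the frame's ordering.
    print(grouped.first().to_pandas())  # expected: a -> 1.0, b -> 2.0
    print(grouped.last().to_pandas())   # expected: a -> 1.0, b -> 2.0

    # min_count=2 demands at least two non-null observations per group;
    # each group here has only one, so both results should be <NA>.
    print(grouped.first(min_count=2).to_pandas())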

bigframes/core/groupby/series_group_by.py

Lines changed: 50 additions & 1 deletion

@@ -36,6 +36,7 @@
 import bigframes.core.window as windows
 import bigframes.core.window_spec as window_specs
 import bigframes.dataframe as df
+import bigframes.dtypes
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series

@@ -162,6 +163,54 @@ def kurt(self, *args, **kwargs) -> series.Series:

     kurtosis = kurt

+    @validations.requires_ordering()
+    def first(self, numeric_only: bool = False, min_count: int = -1) -> series.Series:
+        if numeric_only and not bigframes.dtypes.is_numeric(
+            self._block.expr.get_column_type(self._value_column)
+        ):
+            raise TypeError(
+                f"Cannot use 'numeric_only' with non-numeric column {self._value_name}."
+            )
+        window_spec = window_specs.unbound(
+            grouping_keys=tuple(self._by_col_ids),
+            min_periods=min_count if min_count >= 0 else 0,
+        )
+        block, firsts_id = self._block.apply_window_op(
+            self._value_column,
+            agg_ops.FirstNonNullOp(),
+            window_spec=window_spec,
+        )
+        block, _ = block.aggregate(
+            self._by_col_ids,
+            (aggs.agg(firsts_id, agg_ops.AnyValueOp()),),
+            dropna=self._dropna,
+        )
+        return series.Series(block.with_column_labels([self._value_name]))
+
+    @validations.requires_ordering()
+    def last(self, numeric_only: bool = False, min_count: int = -1) -> series.Series:
+        if numeric_only and not bigframes.dtypes.is_numeric(
+            self._block.expr.get_column_type(self._value_column)
+        ):
+            raise TypeError(
+                f"Cannot use 'numeric_only' with non-numeric column {self._value_name}."
+            )
+        window_spec = window_specs.unbound(
+            grouping_keys=tuple(self._by_col_ids),
+            min_periods=min_count if min_count >= 0 else 0,
+        )
+        block, lasts_id = self._block.apply_window_op(
+            self._value_column,
+            agg_ops.LastNonNullOp(),
+            window_spec=window_spec,
+        )
+        block, _ = block.aggregate(
+            self._by_col_ids,
+            (aggs.agg(lasts_id, agg_ops.AnyValueOp()),),
+            dropna=self._dropna,
+        )
+        return series.Series(block.with_column_labels([self._value_name]))
+
     def prod(self, *args) -> series.Series:
         return self._aggregate(agg_ops.product_op)

@@ -338,7 +387,7 @@ def _apply_window_op(
         discard_name=False,
         window: typing.Optional[window_specs.WindowSpec] = None,
         never_skip_nulls: bool = False,
-    ):
+    ) -> series.Series:
         """Apply window op to groupby. Defaults to grouped cumulative window."""
         window_spec = window or window_specs.cumulative_rows(
             grouping_keys=tuple(self._by_col_ids)
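
Design note: both methods run a grouped, unbounded analytic window (FirstNonNullOp/LastNonNullOp) and then collapse each group with AnyValueOp — after the window phase every row in a group carries the same value, so any one row answers the question. A hedged pandas emulation of that two-phase plan, with illustrative data:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [None, 1.0, 2.0]})

    # Window phase: broadcast the group's first non-null value onto every row.
    firsts = df.groupby("key")["val"].transform("first")

    # ANY_VALUE phase: rows within a group now agree, so take any one per group.
    result = firsts.groupby(df["key"]).agg(lambda s: s.iloc[0])
    print(result)  # expected: a -> 1.0, b -> 2.0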

bigframes/functions/_function_client.py

Lines changed: 15 additions & 14 deletions

@@ -366,8 +366,8 @@ def generate_cloud_function_code(
     def create_cloud_function(
         self,
         def_,
-        cf_name,
         *,
+        random_name,
         input_types: Tuple[str],
         output_type: str,
         package_requirements=None,

@@ -428,9 +428,9 @@ def create_cloud_function(
         create_function_request.parent = (
             self.get_cloud_function_fully_qualified_parent()
         )
-        create_function_request.function_id = cf_name
+        create_function_request.function_id = random_name
         function = functions_v2.Function()
-        function.name = self.get_cloud_function_fully_qualified_name(cf_name)
+        function.name = self.get_cloud_function_fully_qualified_name(random_name)
         function.build_config = functions_v2.BuildConfig()
         function.build_config.runtime = python_version
         function.build_config.entry_point = entry_point

@@ -497,24 +497,25 @@ def create_cloud_function(
             # Cleanup
             os.remove(archive_path)
         except google.api_core.exceptions.AlreadyExists:
-            # If a cloud function with the same name already exists, let's
-            # update it
-            update_function_request = functions_v2.UpdateFunctionRequest()
-            update_function_request.function = function
-            operation = self._cloud_functions_client.update_function(
-                request=update_function_request
-            )
-            operation.result()
+            # b/437124912: The most likely scenario is that
+            # `create_function` had a retry due to a network issue. The
+            # retried request then fails because the first call actually
+            # succeeded, but we didn't get the successful response back.
+            #
+            # Since the function name was randomly chosen to avoid
+            # conflicts, we know the AlreadyExists error can only happen
+            # because we created it. This error is safe to ignore.
+            pass

         # Fetch the endpoint of the just created function
-        endpoint = self.get_cloud_function_endpoint(cf_name)
+        endpoint = self.get_cloud_function_endpoint(random_name)
         if not endpoint:
             raise bf_formatting.create_exception_with_feedback_link(
                 ValueError, "Couldn't fetch the http endpoint."
             )

         logger.info(
-            f"Successfully created cloud function {cf_name} with uri ({endpoint})"
+            f"Successfully created cloud function {random_name} with uri ({endpoint})"
         )
         return endpoint

@@ -571,7 +572,7 @@ def provision_bq_remote_function(
         if not cf_endpoint:
             cf_endpoint = self.create_cloud_function(
                 def_,
-                cloud_function_name,
+                random_name=cloud_function_name,
                 input_types=input_types,
                 output_type=output_type,
                 package_requirements=package_requirements,
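
The pattern behind this rename: the caller generates a fresh random name for each logical create, so an AlreadyExists error can only come from our own earlier attempt whose success response was lost, and swallowing it is safe. A minimal sketch of the idea (client and make_request are hypothetical stand-ins, not BigQuery DataFrames APIs):

    import uuid

    import google.api_core.exceptions

    def create_with_random_name(client, make_request):
        # Fresh random name per logical create: collisions with other callers
        # are effectively impossible, so AlreadyExists implies our own retry.
        name = f"bigframes-{uuid.uuid4().hex[:8]}"
        try:
            operation = client.create_function(request=make_request(name))
            operation.result()
        except google.api_core.exceptions.AlreadyExists:
            pass  # the first attempt succeeded; the retried request raced it
        return name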

bigframes/operations/aggregations.py

Lines changed: 13 additions & 0 deletions

@@ -33,6 +33,11 @@ def skips_nulls(self):
         """Whether the window op skips null rows."""
         return True

+    @property
+    def nulls_count_for_min_values(self) -> bool:
+        """Whether null values count towards min_periods."""
+        return not self.skips_nulls
+
     @property
     def implicitly_inherits_order(self):
         """

@@ -480,6 +485,10 @@ class FirstNonNullOp(UnaryWindowOp):
     def skips_nulls(self):
         return False

+    @property
+    def nulls_count_for_min_values(self) -> bool:
+        return False
+

 @dataclasses.dataclass(frozen=True)
 class LastOp(UnaryWindowOp):

@@ -492,6 +501,10 @@ class LastNonNullOp(UnaryWindowOp):
     def skips_nulls(self):
         return False

+    @property
+    def nulls_count_for_min_values(self) -> bool:
+        return False
+

 @dataclasses.dataclass(frozen=True)
 class ShiftOp(UnaryWindowOp):
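
The base-class default (not self.skips_nulls) preserves the old coupling for every other op; only the first/last ops override it so that null rows are neither skipped nor counted towards min_periods. A stand-alone sketch of that hierarchy, using stand-in class names:

    import dataclasses

    @dataclasses.dataclass(frozen=True)
    class WindowOp:
        @property
        def skips_nulls(self) -> bool:
            return True

        @property
        def nulls_count_for_min_values(self) -> bool:
            # Default: tied to skips_nulls, matching prior behavior.
            return not self.skips_nulls

    @dataclasses.dataclass(frozen=True)
    class FirstNonNull(WindowOp):
        @property
        def skips_nulls(self) -> bool:
            return False  # must see null rows to find the first non-null

        @property
        def nulls_count_for_min_values(self) -> bool:
            return False  # ...but null rows still don't satisfy min_periods

    op = FirstNonNull()
    assert op.skips_nulls is False and op.nulls_count_for_min_values is False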

tests/system/small/test_groupby.py

Lines changed: 98 additions & 0 deletions

@@ -622,6 +622,59 @@ def test_dataframe_groupby_value_counts(
     pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)


+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (False, 4),
+        (True, 0),
+    ],
+)
+def test_dataframe_groupby_first(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    # min_count seems to not work properly on older pandas
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # bytes, dates not handling min_count properly in pandas
+    bf_result = (
+        scalars_df_index.drop(columns=["bytes_col", "date_col"])
+        .groupby(scalars_df_index.int64_col % 2)
+        .first(numeric_only=numeric_only, min_count=min_count)
+    ).to_pandas()
+    pd_result = (
+        scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
+        .groupby(scalars_pandas_df_index.int64_col % 2)
+        .first(numeric_only=numeric_only, min_count=min_count)
+    )
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (True, 2),
+        (False, -1),
+    ],
+)
+def test_dataframe_groupby_last(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    bf_result = (
+        scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
+            numeric_only=numeric_only, min_count=min_count
+        )
+    ).to_pandas()
+    pd_result = scalars_pandas_df_index.groupby(
+        scalars_pandas_df_index.int64_col % 2
+    ).last(numeric_only=numeric_only, min_count=min_count)
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
 # ==============
 # Series.groupby
 # ==============

@@ -841,3 +894,48 @@ def test_series_groupby_value_counts(
         normalize=normalize, ascending=ascending, dropna=dropna
     )
     pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (True, 2),
+        (False, -1),
+    ],
+)
+def test_series_groupby_first(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    bf_result = (
+        scalars_df_index.groupby("string_col")["int64_col"].first(
+            numeric_only=numeric_only, min_count=min_count
+        )
+    ).to_pandas()
+    pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].first(
+        numeric_only=numeric_only, min_count=min_count
+    )
+    pd.testing.assert_series_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (False, 4),
+        (True, 0),
+    ],
+)
+def test_series_groupby_last(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    bf_result = (
+        scalars_df_index.groupby("string_col")["int64_col"].last(
+            numeric_only=numeric_only, min_count=min_count
+        )
+    ).to_pandas()
+    pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].last(
+        numeric_only=numeric_only, min_count=min_count
+    )
+    pd.testing.assert_series_equal(pd_result, bf_result)
