Skip to content

Commit 523c80e

Browse files
committed
Merge remote-tracking branch 'origin/main' into tswast-geo
2 parents 9907c2b + 190f32e commit 523c80e

File tree

5 files changed

+21
-47
lines changed

5 files changed

+21
-47
lines changed

bigframes/core/compile/polars/compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,7 @@ def _aggregate(
646646
def compile_explode(self, node: nodes.ExplodeNode):
647647
assert node.offsets_col is None
648648
df = self.compile_node(node.child)
649-
cols = [pl.col(col.id.sql) for col in node.column_ids]
649+
cols = [col.id.sql for col in node.column_ids]
650650
return df.explode(cols)
651651

652652
@compile_node.register

bigframes/core/indexes/base.py

Lines changed: 13 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,12 @@
2727
import pandas
2828

2929
from bigframes import dtypes
30-
from bigframes.core.array_value import ArrayValue
3130
import bigframes.core.block_transforms as block_ops
3231
import bigframes.core.blocks as blocks
3332
import bigframes.core.expression as ex
34-
import bigframes.core.identifiers as ids
35-
import bigframes.core.nodes as nodes
3633
import bigframes.core.ordering as order
3734
import bigframes.core.utils as utils
3835
import bigframes.core.validations as validations
39-
import bigframes.core.window_spec as window_spec
4036
import bigframes.dtypes
4137
import bigframes.formatting_helpers as formatter
4238
import bigframes.operations as ops
@@ -272,37 +268,20 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
272268
# Get the index column from the block
273269
index_column = self._block.index_columns[0]
274270

275-
# Apply row numbering to the original data
276-
row_number_column_id = ids.ColumnId.unique()
277-
window_node = nodes.WindowOpNode(
278-
child=self._block._expr.node,
279-
expression=ex.NullaryAggregation(agg_ops.RowNumberOp()),
280-
window_spec=window_spec.unbound(),
281-
output_name=row_number_column_id,
282-
never_skip_nulls=True,
283-
)
284-
285-
windowed_array = ArrayValue(window_node)
286-
windowed_block = blocks.Block(
287-
windowed_array,
288-
index_columns=self._block.index_columns,
289-
column_labels=self._block.column_labels.insert(
290-
len(self._block.column_labels), None
291-
),
292-
index_labels=self._block._index_labels,
271+
# Use promote_offsets to get row numbers (similar to argmax/argmin implementation)
272+
block_with_offsets, offsets_id = self._block.promote_offsets(
273+
"temp_get_loc_offsets_"
293274
)
294275

295276
# Create expression to find matching positions
296277
match_expr = ops.eq_op.as_expr(ex.deref(index_column), ex.const(key))
297-
windowed_block, match_col_id = windowed_block.project_expr(match_expr)
278+
block_with_offsets, match_col_id = block_with_offsets.project_expr(match_expr)
298279

299280
# Filter to only rows where the key matches
300-
filtered_block = windowed_block.filter_by_id(match_col_id)
281+
filtered_block = block_with_offsets.filter_by_id(match_col_id)
301282

302-
# Check if key exists at all by counting on the filtered block
303-
count_agg = ex.UnaryAggregation(
304-
agg_ops.count_op, ex.deref(row_number_column_id.name)
305-
)
283+
# Check if key exists at all by counting
284+
count_agg = ex.UnaryAggregation(agg_ops.count_op, ex.deref(offsets_id))
306285
count_result = filtered_block._expr.aggregate([(count_agg, "count")])
307286
count_scalar = self._block.session._executor.execute(
308287
count_result
@@ -313,9 +292,7 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
313292

314293
# If only one match, return integer position
315294
if count_scalar == 1:
316-
min_agg = ex.UnaryAggregation(
317-
agg_ops.min_op, ex.deref(row_number_column_id.name)
318-
)
295+
min_agg = ex.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id))
319296
position_result = filtered_block._expr.aggregate([(min_agg, "position")])
320297
position_scalar = self._block.session._executor.execute(
321298
position_result
@@ -325,32 +302,24 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
325302
# Handle multiple matches based on index monotonicity
326303
is_monotonic = self.is_monotonic_increasing or self.is_monotonic_decreasing
327304
if is_monotonic:
328-
return self._get_monotonic_slice(filtered_block, row_number_column_id)
305+
return self._get_monotonic_slice(filtered_block, offsets_id)
329306
else:
330307
# Return boolean mask for non-monotonic duplicates
331-
mask_block = windowed_block.select_columns([match_col_id])
332-
# Reset the index to use positional integers instead of original index values
308+
mask_block = block_with_offsets.select_columns([match_col_id])
333309
mask_block = mask_block.reset_index(drop=True)
334-
# Ensure correct dtype and name to match pandas behavior
335310
result_series = bigframes.series.Series(mask_block)
336311
return result_series.astype("boolean")
337312

338-
def _get_monotonic_slice(
339-
self, filtered_block, row_number_column_id: "ids.ColumnId"
340-
) -> slice:
313+
def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice:
341314
"""Helper method to get a slice for monotonic duplicates with an optimized query."""
342315
# Combine min and max aggregations into a single query for efficiency
343316
min_max_aggs = [
344317
(
345-
ex.UnaryAggregation(
346-
agg_ops.min_op, ex.deref(row_number_column_id.name)
347-
),
318+
ex.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id)),
348319
"min_pos",
349320
),
350321
(
351-
ex.UnaryAggregation(
352-
agg_ops.max_op, ex.deref(row_number_column_id.name)
353-
),
322+
ex.UnaryAggregation(agg_ops.max_op, ex.deref(offsets_id)),
354323
"max_pos",
355324
),
356325
]

tests/system/small/ml/test_llm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ def __eq__(self, other):
251251
return self.equals(other)
252252

253253

254+
@pytest.mark.skip("b/436340035 test failed")
254255
@pytest.mark.parametrize(
255256
(
256257
"model_class",
@@ -393,6 +394,7 @@ def test_text_generator_retry_success(
393394
)
394395

395396

397+
@pytest.mark.skip("b/436340035 test failed")
396398
@pytest.mark.parametrize(
397399
(
398400
"model_class",
@@ -509,6 +511,7 @@ def test_text_generator_retry_no_progress(session, model_class, options, bq_conn
509511
)
510512

511513

514+
@pytest.mark.skip("b/436340035 test failed")
512515
def test_text_embedding_generator_retry_success(session, bq_connection):
513516
# Requests.
514517
df0 = EqCmpAllDataFrame(
@@ -790,13 +793,14 @@ def test_gemini_preview_model_warnings(model_name):
790793
llm.GeminiTextGenerator(model_name=model_name)
791794

792795

796+
# b/436340035 temporarily disable the test to unblock presubmit
793797
@pytest.mark.parametrize(
794798
"model_class",
795799
[
796800
llm.TextEmbeddingGenerator,
797801
llm.MultimodalEmbeddingGenerator,
798802
llm.GeminiTextGenerator,
799-
llm.Claude3TextGenerator,
803+
# llm.Claude3TextGenerator,
800804
],
801805
)
802806
def test_text_embedding_generator_no_default_model_warning(model_class):

tests/unit/test_dataframe_polars.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,6 +1198,7 @@ def test_df_fillna(scalars_dfs, col, fill_value):
11981198
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
11991199

12001200

1201+
@pytest.mark.skip("b/436316698 unit test failed for python 3.12")
12011202
def test_df_ffill(scalars_dfs):
12021203
scalars_df, scalars_pandas_df = scalars_dfs
12031204
bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()

third_party/bigframes_vendored/pandas/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ def get_loc(
767767
1 True
768768
2 False
769769
3 True
770-
Name: nan, dtype: boolean
770+
dtype: boolean
771771
772772
Args:
773773
key: Label to get the location for.

0 commit comments

Comments
 (0)