Skip to content

Commit fc2b3eb

Browse files
authored
Merge branch 'main' into udf-packages
2 parents 865d198 + 0bd5e1b commit fc2b3eb

File tree

33 files changed

+1046
-149
lines changed

33 files changed

+1046
-149
lines changed

bigframes/_config/display_options.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@
2626
class DisplayOptions:
2727
__doc__ = vendored_pandas_config.display_options_doc
2828

29+
# Options borrowed from pandas.
2930
max_columns: int = 20
30-
max_rows: int = 25
31+
max_rows: int = 10
32+
precision: int = 6
33+
34+
# Options unique to BigQuery DataFrames.
3135
progress_bar: Optional[str] = "auto"
3236
repr_mode: Literal["head", "deferred", "anywidget"] = "head"
3337

@@ -52,6 +56,8 @@ def pandas_repr(display_options: DisplayOptions):
5256
display_options.max_columns,
5357
"display.max_rows",
5458
display_options.max_rows,
59+
"display.precision",
60+
display_options.precision,
5561
"display.show_dimensions",
5662
True,
5763
) as pandas_context:

bigframes/blob/_functions.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
473473
return result_json
474474

475475

476-
pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"])
476+
pdf_extract_def = FunctionDef(
477+
pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
478+
)
477479

478480

479481
# Extracts text from a PDF url and chunks it simultaneously
@@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
527529
return result_json
528530

529531

530-
pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"])
532+
pdf_chunk_def = FunctionDef(
533+
pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
534+
)

bigframes/core/blocks.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -387,25 +387,39 @@ def reversed(self) -> Block:
387387
index_labels=self.index.names,
388388
)
389389

390-
def reset_index(self, drop: bool = True) -> Block:
390+
def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
391391
"""Reset the index of the block, promoting the old index to a value column.
392392
393393
Arguments:
394+
level: the label or index level of the index levels to remove.
394395
name: this is the column id for the new value id derived from the old index
395396
396397
Returns:
397398
A new Block because dropping index columns can break references
398399
from Index classes that point to this block.
399400
"""
401+
if level:
402+
# preserve original order, not user provided order
403+
level_ids: Sequence[str] = [
404+
id for id in self.index_columns if id in self.index.resolve_level(level)
405+
]
406+
else:
407+
level_ids = self.index_columns
408+
400409
expr = self._expr
401-
if (
410+
if set(self.index_columns) > set(level_ids):
411+
new_index_cols = [col for col in self.index_columns if col not in level_ids]
412+
new_index_labels = [self.col_id_to_index_name[id] for id in new_index_cols]
413+
elif (
402414
self.session._default_index_type
403415
== bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64
404416
):
405417
expr, new_index_col_id = expr.promote_offsets()
406418
new_index_cols = [new_index_col_id]
419+
new_index_labels = [None]
407420
elif self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL:
408421
new_index_cols = []
422+
new_index_labels = []
409423
else:
410424
raise ValueError(
411425
f"Unrecognized default index kind: {self.session._default_index_type}"
@@ -415,22 +429,23 @@ def reset_index(self, drop: bool = True) -> Block:
415429
# Even though the index might be part of the ordering, keep that
416430
# ordering expression as reset_index shouldn't change the row
417431
# order.
418-
expr = expr.drop_columns(self.index_columns)
432+
expr = expr.drop_columns(level_ids)
419433
return Block(
420434
expr,
421435
index_columns=new_index_cols,
436+
index_labels=new_index_labels,
422437
column_labels=self.column_labels,
423438
)
424439
else:
425440
# Add index names to column index
426-
index_labels = self.index.names
427441
column_labels_modified = self.column_labels
428-
for level, label in enumerate(index_labels):
442+
for position, level_id in enumerate(level_ids):
443+
label = self.col_id_to_index_name[level_id]
429444
if label is None:
430-
if "index" not in self.column_labels and len(index_labels) <= 1:
445+
if "index" not in self.column_labels and self.index.nlevels <= 1:
431446
label = "index"
432447
else:
433-
label = f"level_{level}"
448+
label = f"level_{self.index_columns.index(level_id)}"
434449

435450
if label in self.column_labels:
436451
raise ValueError(f"cannot insert {label}, already exists")
@@ -439,11 +454,12 @@ def reset_index(self, drop: bool = True) -> Block:
439454
label = tuple(label if i == 0 else "" for i in range(nlevels))
440455
# Create index copy with label inserted
441456
# See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html
442-
column_labels_modified = column_labels_modified.insert(level, label)
457+
column_labels_modified = column_labels_modified.insert(position, label)
443458

444459
return Block(
445-
expr,
460+
expr.select_columns((*new_index_cols, *level_ids, *self.value_columns)),
446461
index_columns=new_index_cols,
462+
index_labels=new_index_labels,
447463
column_labels=column_labels_modified,
448464
)
449465

bigframes/core/compile/polars/compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def compile_op(self, op: ops.ScalarOp, *args: pl.Expr) -> pl.Expr:
168168

169169
@compile_op.register(gen_ops.InvertOp)
170170
def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
171-
return ~input
171+
return input.not_()
172172

173173
@compile_op.register(num_ops.AbsOp)
174174
def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:

bigframes/core/compile/polars/lowering.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,18 @@
1414

1515
import dataclasses
1616

17+
import numpy as np
18+
1719
from bigframes import dtypes
1820
from bigframes.core import bigframe_node, expression
1921
from bigframes.core.rewrite import op_lowering
20-
from bigframes.operations import comparison_ops, datetime_ops, json_ops, numeric_ops
22+
from bigframes.operations import (
23+
comparison_ops,
24+
datetime_ops,
25+
generic_ops,
26+
json_ops,
27+
numeric_ops,
28+
)
2129
import bigframes.operations as ops
2230

2331
# TODO: Would be more precise to actually have separate op set for polars ops (where they diverge from the original ops)
@@ -288,6 +296,26 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression:
288296
return _lower_cast(expr.op, expr.inputs[0])
289297

290298

299+
def invert_bytes(byte_string):
    """Return ``byte_string`` with every bit flipped.

    The input buffer is viewed as a sequence of unsigned 8-bit integers, each
    of which is bitwise-inverted; the result is serialized back to ``bytes``.
    """
    as_uint8 = np.frombuffer(byte_string, dtype=np.uint8)
    return np.invert(as_uint8).tobytes()
302+
303+
304+
class LowerInvertOp(op_lowering.OpLoweringRule):
    """Lowering rule for ``InvertOp``.

    An inversion whose operand is BYTES-typed is rewritten into a ``PyUdfOp``
    that wraps ``invert_bytes``; any other operand type is passed through
    unchanged.
    """

    @property
    def op(self) -> type[ops.ScalarOp]:
        """The op class this rule applies to."""
        return generic_ops.InvertOp

    def lower(self, expr: expression.OpExpression) -> expression.Expression:
        """Return the lowered form of ``expr``.

        Args:
            expr: An op expression whose op is an ``InvertOp``.

        Returns:
            A ``PyUdfOp`` expression over the first input when the operand's
            output type is ``dtypes.BYTES_DTYPE``; otherwise ``expr`` itself.
        """
        assert isinstance(expr.op, generic_ops.InvertOp)
        operand = expr.children[0]
        is_bytes_operand = operand.output_type == dtypes.BYTES_DTYPE
        if not is_bytes_operand:
            # Non-bytes inversions need no rewriting here.
            return expr

        bytes_invert_udf = generic_ops.PyUdfOp(invert_bytes, dtypes.BYTES_DTYPE)
        return bytes_invert_udf.as_expr(expr.inputs[0])
317+
318+
291319
def _coerce_comparables(
292320
expr1: expression.Expression,
293321
expr2: expression.Expression,
@@ -385,6 +413,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression):
385413
LowerFloorDivRule(),
386414
LowerModRule(),
387415
LowerAsTypeRule(),
416+
LowerInvertOp(),
388417
)
389418

390419

bigframes/core/compile/polars/operations/generic_ops.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,14 @@ def isnull_op_impl(
4545
input: pl.Expr,
4646
) -> pl.Expr:
4747
return input.is_null()
48+
49+
50+
@polars_compiler.register_op(generic_ops.PyUdfOp)
def py_udf_op_impl(
    compiler: polars_compiler.PolarsExpressionCompiler,
    op: generic_ops.PyUdfOp,  # type: ignore
    input: pl.Expr,
) -> pl.Expr:
    """Compile a ``PyUdfOp`` by applying its callable element-wise.

    Args:
        compiler: The expression compiler (unused by this implementation).
        op: The UDF op carrying the callable (``op.fn``) and its declared
            output type.
        input: The polars expression the callable is applied to.

    Returns:
        ``input.map_elements(op.fn, ...)`` with the return dtype looked up in
        ``polars_compiler._DTYPE_MAPPING`` from the op's declared output type.
    """
    polars_return_dtype = polars_compiler._DTYPE_MAPPING[op._output_type]
    return input.map_elements(op.fn, return_dtype=polars_return_dtype)

bigframes/dataframe.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import re
2424
import sys
2525
import textwrap
26+
import traceback
2627
import typing
2728
from typing import (
2829
Callable,
@@ -814,7 +815,9 @@ def _repr_html_(self) -> str:
814815
except (AttributeError, ValueError, ImportError):
815816
# Fallback if anywidget is not available
816817
warnings.warn(
817-
"Anywidget mode is not available. Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. Falling back to deferred mode."
818+
"Anywidget mode is not available. "
819+
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. "
820+
f"Falling back to deferred mode. Error: {traceback.format_exc()}"
818821
)
819822
return formatter.repr_query_job(self._compute_dry_run())
820823

@@ -2312,9 +2315,39 @@ def _assign_series_join_on_index(
23122315

23132316
return DataFrame(block.with_index_labels(self._block.index.names))
23142317

2315-
def reset_index(self, *, drop: bool = False) -> DataFrame:
2316-
block = self._block.reset_index(drop)
2317-
return DataFrame(block)
2318+
@overload  # type: ignore[override]
def reset_index(
    self,
    level: blocks.LevelsType = ...,
    drop: bool = ...,
    inplace: Literal[False] = ...,
) -> DataFrame:
    ...


@overload
def reset_index(
    self,
    level: blocks.LevelsType = ...,
    drop: bool = ...,
    inplace: Literal[True] = ...,
) -> None:
    ...


@overload
def reset_index(
    self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ...
) -> Optional[DataFrame]:
    ...


def reset_index(
    self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False
) -> Optional[DataFrame]:
    # Delegate the actual index reset to the underlying block, forwarding
    # ``level`` and ``drop`` positionally.
    reset_block = self._block.reset_index(level, drop)

    if not inplace:
        # Default path: leave this object untouched and hand back a new
        # DataFrame wrapping the reset block.
        return DataFrame(reset_block)

    # In-place path: swap this object's block and return None.
    self._set_block(reset_block)
    return None
23182351

23192352
def set_index(
23202353
self,

bigframes/display/anywidget.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import pandas as pd
2424

2525
import bigframes
26+
import bigframes.display.html
2627

2728
# anywidget and traitlets are optional dependencies. We don't want the import of this
2829
# module to fail if they aren't installed, though. Instead, we try to limit the surface that
@@ -201,12 +202,9 @@ def _set_table_html(self):
201202
page_data = cached_data.iloc[start:end]
202203

203204
# Generate HTML table
204-
self.table_html = page_data.to_html(
205-
index=False,
206-
max_rows=None,
205+
self.table_html = bigframes.display.html.render_html(
206+
dataframe=page_data,
207207
table_id=f"table-{self._table_id}",
208-
classes="table table-striped table-hover",
209-
escape=False,
210208
)
211209

212210
@traitlets.observe("page")

bigframes/display/html.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""HTML rendering for DataFrames and other objects."""
16+
17+
from __future__ import annotations
18+
19+
import html
20+
21+
import pandas as pd
22+
import pandas.api.types
23+
24+
from bigframes._config import options
25+
26+
27+
def _is_dtype_numeric(dtype) -> bool:
    """Check if a dtype is numeric for alignment purposes."""
    return pandas.api.types.is_numeric_dtype(dtype)


def render_html(
    *,
    dataframe: pd.DataFrame,
    table_id: str,
    precision: int | None = None,
) -> str:
    """Render a pandas DataFrame to HTML with specific styling.

    Args:
        dataframe: The data to render. Only the columns are rendered; the
            index is not emitted.
        table_id: Value for the ``id`` attribute of the ``<table>`` element.
            It is HTML-escaped before being written.
        precision: Number of digits after the decimal point used for
            ``float`` cells. When ``None`` (the default), the value of
            ``options.display.precision`` is used.

    Returns:
        The HTML for the table as a single newline-joined string.
    """
    if precision is None:
        precision = options.display.precision

    classes = "dataframe table table-striped table-hover"
    table_html = [
        f'<table border="1" class="{classes}" id="{html.escape(table_id)}">'
    ]

    # Render table head
    table_html.append(" <thead>")
    table_html.append(' <tr style="text-align: left;">')
    for col in dataframe.columns:
        table_html.append(
            f' <th style="text-align: left;"><div style="resize: horizontal; overflow: auto; box-sizing: border-box; width: 100%; height: 100%; padding: 0.5em;">{html.escape(str(col))}</div></th>'
        )
    table_html.append(" </tr>")
    table_html.append(" </thead>")

    # Alignment depends only on the column dtype, so compute it once per
    # column (by position, which also works with duplicate column labels)
    # instead of once per cell.
    alignments = [
        "right" if _is_dtype_numeric(dtype) else "left"
        for dtype in dataframe.dtypes
    ]

    # Render table body
    table_html.append(" <tbody>")
    # itertuples() keeps each column's own value type. Selecting a row with
    # .iloc[i] would coerce the row to one common dtype, so integers sitting
    # next to float columns would be rendered as floats.
    for row in dataframe.itertuples(index=False, name=None):
        table_html.append(" <tr>")
        for align, value in zip(alignments, row):
            table_html.append(
                f' <td style="text-align: {align}; padding: 0.5em;">'
            )

            # TODO(b/438181139): Consider semi-exploding ARRAY/STRUCT columns
            # into multiple rows/columns like the BQ UI does.
            if pandas.api.types.is_scalar(value) and pd.isna(value):
                table_html.append(' <em style="color: gray;">&lt;NA&gt;</em>')
            elif isinstance(value, float):
                formatted_value = f"{value:.{precision}f}"
                table_html.append(f" {html.escape(formatted_value)}")
            else:
                table_html.append(f" {html.escape(str(value))}")
            table_html.append(" </td>")
        table_html.append(" </tr>")
    table_html.append(" </tbody>")
    table_html.append("</table>")

    return "\n".join(table_html)

bigframes/operations/generic_ops.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,3 +446,15 @@ class SqlScalarOp(base_ops.NaryOp):
446446

447447
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
448448
return self._output_type
449+
450+
451+
@dataclasses.dataclass(frozen=True)
class PyUdfOp(base_ops.NaryOp):
    """Op that represents a local UDF: a Python callable plus its output type.

    The output type is supplied by whoever constructs the op and is reported
    as-is; it is not derived from the input types.
    """

    name: typing.ClassVar[str] = "py_udf"
    # The Python callable this op stands for.
    fn: typing.Callable
    # The type returned by ``output_type`` for any inputs.
    _output_type: dtypes.ExpressionType

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the declared output type, ignoring ``input_types``."""
        return self._output_type

0 commit comments

Comments
 (0)