2222import queue
2323import threading
2424import typing
25- from typing import Any , Iterator , Optional , Sequence , Tuple
25+ from typing import Any , Iterator , List , Literal , Optional , Sequence , Tuple , Union
2626
2727from google .cloud import bigquery_storage_v1
2828import google .cloud .bigquery as bq
2929import google .cloud .bigquery_storage_v1 .types as bq_storage_types
3030from google .protobuf import timestamp_pb2
3131import pyarrow as pa
3232
33+ import bigframes .constants
3334from bigframes .core import pyarrow_utils
3435import bigframes .core .schema
3536
3637if typing .TYPE_CHECKING :
3738 import bigframes .core .ordering as orderings
3839
3940
41+ def _resolve_standard_gcp_region (bq_region : str ):
42+ """
43+ Resolve bq regions to standardized
44+ """
45+ if bq_region .casefold () == "US" :
46+ return "us-central1"
47+ elif bq_region .casefold () == "EU" :
48+ return "europe-west4"
49+ return bq_region
50+
51+
def is_irc_table(table_id: str):
    """Whether a table id should be resolved through the Iceberg REST catalog.

    A 4-part id (project.catalog.namespace.table) is treated as a REST-catalog
    table; anything else is not.
    """
    # Four dot-separated parts <=> exactly three dots.
    return table_id.count(".") == 3
57+
58+
def is_compatible(
    data_region: Union[GcsRegion, BigQueryRegion], session_location: str
) -> bool:
    """Whether data in ``data_region`` is usable from a session at ``session_location``.

    Based on
    https://docs.cloud.google.com/bigquery/docs/locations#storage-location-considerations
    """
    if isinstance(data_region, BigQueryRegion):
        # Native BigQuery data: locations must match exactly.
        return data_region.name == session_location
    assert isinstance(data_region, GcsRegion)
    # TODO(b/463675088): Multi-regions don't yet support rest catalog tables
    if session_location in bigframes.constants.BIGQUERY_MULTIREGIONS:
        return False
    session_region = _resolve_standard_gcp_region(session_location)
    return session_region in data_region.included
71+
72+
def get_default_bq_region(data_region: Union[GcsRegion, BigQueryRegion]) -> str:
    """Pick a default BigQuery location for a session reading ``data_region``."""
    if isinstance(data_region, GcsRegion):
        # TODO: consider tracking and preferring the primary replica here.
        return data_region.included[0]
    if isinstance(data_region, BigQueryRegion):
        return data_region.name
79+
80+
@dataclasses.dataclass(frozen=True)
class BigQueryRegion:
    # A BigQuery location string (e.g. "us-central1", or a multi-region such
    # as "US"); compared directly against session locations in is_compatible.
    name: str
84+
85+
@dataclasses.dataclass(frozen=True)
class GcsRegion:
    # GCS region names; these may be multi-region names, so they should not be
    # compared with non-GCS locations.
    storage_regions: tuple[str, ...]
    # All included standard, specific regions (e.g. "us-east1"); comparable to
    # BigQuery regions (except the non-standard "US"/"EU" and omni regions).
    included: tuple[str, ...]
92+
93+
# The line between metadata and core fields is fuzzy: metadata fields are
# mostly optional or unreliable, while core identity/schema fields live on the
# table dataclasses themselves.
@dataclasses.dataclass(frozen=True)
class TableMetadata:
    # Where the data physically lives; used for location compatibility checks.
    location: Union[BigQueryRegion, GcsRegion]
    # Bug fix: the BigQuery API reports "MATERIALIZED_VIEW" (the previous
    # "MATERIALIZE_VIEW" literal was a typo and disagreed with the
    # is_physically_stored check). Annotation-only change, runtime-compatible.
    type: Literal["TABLE", "EXTERNAL", "VIEW", "MATERIALIZED_VIEW", "SNAPSHOT"]
    # Size metadata might be stale; don't use where strict correctness is needed.
    numBytes: Optional[int] = None
    numRows: Optional[int] = None
    created_time: Optional[datetime.datetime] = None
    modified_time: Optional[datetime.datetime] = None
104+
105+
@dataclasses.dataclass(frozen=True)
class GbqNativeTable:
    """A native BigQuery table/view/snapshot, identified by its
    project.dataset.table path, with its physical schema and metadata."""

    project_id: str = dataclasses.field()
    dataset_id: str = dataclasses.field()
    table_id: str = dataclasses.field()
    # BigQuery schema fields, possibly subset to the requested columns.
    physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
    metadata: TableMetadata = dataclasses.field()
    # NOTE(review): never populated by from_table/from_ref_and_schema below —
    # presumably set elsewhere by callers; confirm.
    partition_col: Optional[str] = None
    cluster_cols: typing.Optional[Tuple[str, ...]] = None
    primary_key: Optional[Tuple[str, ...]] = None

    @staticmethod
    def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqNativeTable:
        """Build a GbqNativeTable from a client-library ``bq.Table``.

        If ``columns`` is non-empty, ``physical_schema`` is restricted to
        those columns.
        """
        # Subsetting fields with columns can reduce cost of row-hash default ordering
        if columns:
            schema = tuple(item for item in table.schema if item.name in columns)
        else:
            schema = tuple(table.schema)

        metadata = TableMetadata(
            numBytes=table.num_bytes,
            numRows=table.num_rows,
            # NOTE(review): table.location is a plain string, but
            # TableMetadata.location declares BigQueryRegion | GcsRegion — the
            # type: ignore hides the mismatch; confirm downstream consumers
            # (is_compatible/get_default_bq_region) handle a bare string.
            location=table.location,  # type: ignore
            type=table.table_type,  # type: ignore
            created_time=table.created,
            modified_time=table.modified,
        )

        return GbqNativeTable(
            project_id=table.project,
            dataset_id=table.dataset_id,
            table_id=table.table_id,
            physical_schema=schema,
            cluster_cols=None
            if table.clustering_fields is None
            else tuple(table.clustering_fields),
            primary_key=tuple(_get_primary_keys(table)),
            metadata=metadata,
        )

    @staticmethod
    def from_ref_and_schema(
        table_ref: bq.TableReference,
        schema: Sequence[bq.SchemaField],
        location: str,
        table_type: Literal["TABLE"] = "TABLE",
        cluster_cols: Optional[Sequence[str]] = None,
    ) -> GbqNativeTable:
        """Build a GbqNativeTable from a table reference plus an explicit
        schema and location, without fetching the table."""
        return GbqNativeTable(
            project_id=table_ref.project,
            dataset_id=table_ref.dataset_id,
            table_id=table_ref.table_id,
            metadata=TableMetadata(location=BigQueryRegion(location), type=table_type),
            physical_schema=tuple(schema),
            cluster_cols=tuple(cluster_cols) if cluster_cols else None,
        )

    @property
    def is_physically_stored(self) -> bool:
        # True for types that own stored data (as opposed to logical views).
        return self.metadata.type in ["TABLE", "MATERIALIZED_VIEW"]

    def get_table_ref(self) -> bq.TableReference:
        """Return the client-library TableReference for this table."""
        return bq.TableReference(
            bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
        )

    def get_full_id(self, quoted: bool = False) -> str:
        """Return the dotted 3-part id, backtick-quoting each part if ``quoted``."""
        if quoted:
            return f"`{self.project_id}`.`{self.dataset_id}`.`{self.table_id}`"
        return f"{self.project_id}.{self.dataset_id}.{self.table_id}"

    @property
    @functools.cache
    def schema_by_id(self):
        # Cached column-name -> SchemaField lookup.
        # NOTE(review): functools.cache on a method keys on ``self`` and keeps
        # every instance alive for the process lifetime (ruff B019); consider
        # functools.cached_property.
        return {col.name: col for col in self.physical_schema}
181+
182+
@dataclasses.dataclass(frozen=True)
class BiglakeIcebergTable:
    """A table resolved through the BigLake Iceberg REST catalog, identified
    by a 4-part project.catalog.namespace.table id."""

    project_id: str = dataclasses.field()
    catalog_id: str = dataclasses.field()
    namespace_id: str = dataclasses.field()
    table_id: str = dataclasses.field()
    physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
    cluster_cols: typing.Optional[Tuple[str, ...]]
    metadata: TableMetadata

    def get_full_id(self, quoted: bool = False) -> str:
        """Return the dotted 4-part id, backtick-quoting each part if ``quoted``."""
        if quoted:
            return f"`{self.project_id}`.`{self.catalog_id}`.`{self.namespace_id}`.`{self.table_id}`"
        return (
            f"{self.project_id}.{self.catalog_id}.{self.namespace_id}.{self.table_id}"
        )

    @property
    @functools.cache
    def schema_by_id(self):
        # Cached column-name -> SchemaField lookup.
        # NOTE(review): functools.cache on a method keys on ``self`` and keeps
        # every instance alive for the process lifetime (ruff B019).
        return {col.name: col for col in self.physical_schema}

    @property
    def partition_col(self) -> Optional[str]:
        # Always None; provided for attribute parity with GbqNativeTable.
        return None

    @property
    def primary_key(self) -> Optional[Tuple[str, ...]]:
        # Always None; provided for attribute parity with GbqNativeTable.
        return None
212+
92213
93214@dataclasses .dataclass (frozen = True )
94215class BigqueryDataSource :
@@ -104,13 +225,13 @@ def __post_init__(self):
104225 self .schema .names
105226 )
106227
107- table : GbqTable
228+ table : Union [ GbqNativeTable , BiglakeIcebergTable ]
108229 schema : bigframes .core .schema .ArraySchema
109230 at_time : typing .Optional [datetime .datetime ] = None
110231 # Added for backwards compatibility, not validated
111232 sql_predicate : typing .Optional [str ] = None
112233 ordering : typing .Optional [orderings .RowOrdering ] = None
113- # Optimization field
234+ # Optimization field, must be correct if set, don't put maybe-stale number here
114235 n_rows : Optional [int ] = None
115236
116237
@@ -188,6 +309,8 @@ def get_arrow_batches(
188309 project_id : str ,
189310 sample_rate : Optional [float ] = None ,
190311) -> ReadResult :
312+ assert isinstance (data .table , GbqNativeTable )
313+
191314 table_mod_options = {}
192315 read_options_dict : dict [str , Any ] = {"selected_fields" : list (columns )}
193316
@@ -245,3 +368,21 @@ def process_batch(pa_batch):
245368 return ReadResult (
246369 batches , session .estimated_row_count , session .estimated_total_bytes_scanned
247370 )
371+
372+
373+ def _get_primary_keys (
374+ table : bq .Table ,
375+ ) -> List [str ]:
376+ """Get primary keys from table if they are set."""
377+
378+ primary_keys : List [str ] = []
379+ if (
380+ (table_constraints := getattr (table , "table_constraints" , None )) is not None
381+ and (primary_key := table_constraints .primary_key ) is not None
382+ # This will be False for either None or empty list.
383+ # We want primary_keys = None if no primary keys are set.
384+ and (columns := primary_key .columns )
385+ ):
386+ primary_keys = columns if columns is not None else []
387+
388+ return primary_keys
0 commit comments