
Commit 008aea3

Merge branch 'main' into b323176126-pandas-gbq

2 parents: 1fab458 + b03a2af

File tree

6 files changed: +184, -81 lines changed


.kokoro/requirements.txt

Lines changed: 32 additions & 28 deletions
@@ -112,34 +112,38 @@ colorlog==6.8.2 \
     # via
     #   gcp-docuploader
     #   nox
-cryptography==43.0.1 \
-    --hash=sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494 \
-    --hash=sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806 \
-    --hash=sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d \
-    --hash=sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062 \
-    --hash=sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2 \
-    --hash=sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4 \
-    --hash=sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1 \
-    --hash=sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85 \
-    --hash=sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84 \
-    --hash=sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042 \
-    --hash=sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d \
-    --hash=sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962 \
-    --hash=sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2 \
-    --hash=sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa \
-    --hash=sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d \
-    --hash=sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365 \
-    --hash=sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96 \
-    --hash=sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47 \
-    --hash=sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d \
-    --hash=sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d \
-    --hash=sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c \
-    --hash=sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb \
-    --hash=sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277 \
-    --hash=sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172 \
-    --hash=sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034 \
-    --hash=sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a \
-    --hash=sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289
+cryptography==44.0.1 \
+    --hash=sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7 \
+    --hash=sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3 \
+    --hash=sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183 \
+    --hash=sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69 \
+    --hash=sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a \
+    --hash=sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62 \
+    --hash=sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911 \
+    --hash=sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7 \
+    --hash=sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a \
+    --hash=sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41 \
+    --hash=sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83 \
+    --hash=sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12 \
+    --hash=sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864 \
+    --hash=sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf \
+    --hash=sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c \
+    --hash=sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2 \
+    --hash=sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b \
+    --hash=sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0 \
+    --hash=sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4 \
+    --hash=sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9 \
+    --hash=sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008 \
+    --hash=sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862 \
+    --hash=sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009 \
+    --hash=sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7 \
+    --hash=sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f \
+    --hash=sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026 \
+    --hash=sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f \
+    --hash=sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd \
+    --hash=sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420 \
+    --hash=sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14 \
+    --hash=sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00
     # via
     #   -r requirements.in
     #   gcp-releasetool
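
Hash-pinned files like this are normally regenerated with pip-tools rather than edited by hand. A hedged sketch of the typical workflow, assuming `.kokoro/requirements.in` is the source list (the `-r requirements.in` comment above suggests this, though the exact invocation used by the repo's tooling may differ):

    pip install pip-tools
    pip-compile --generate-hashes --upgrade-package cryptography .kokoro/requirements.in

`--generate-hashes` emits one `--hash=sha256:...` line per published wheel and sdist, which is why bumping a single version touches so many lines.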

docs/magics.rst

Lines changed: 6 additions & 5 deletions
@@ -6,7 +6,7 @@ in a Jupyter notebook cell.
 
 .. code::
 
-    %load_ext google.cloud.bigquery
+    %load_ext bigquery_magics
 
 This makes the ``%%bigquery`` magic available.
 
@@ -27,8 +27,9 @@ Running a parameterized query:
    :start-after: [START bigquery_jupyter_query_params_scalars]
    :end-before: [END bigquery_jupyter_query_params_scalars]
 
-API Reference
--------------
+BigQuery Magics Reference
+-------------------------
 
-.. automodule:: google.cloud.bigquery.magics.magics
-   :members:
+- `BigQuery Magics Documentation`_
+
+.. _BigQuery Magics Documentation: https://googleapis.dev/python/bigquery-magics/latest
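
For orientation, the docs now point at the separate bigquery-magics package rather than the magic bundled inside google-cloud-bigquery. A minimal notebook sketch of the documented flow (the query, the destination variable ``df``, and the public dataset are illustrative; this assumes the ``bigquery-magics`` package is installed):

    %load_ext bigquery_magics

    %%bigquery df
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 5

The ``%load_ext`` line goes in its own cell; the ``%%bigquery df`` cell then runs the query and binds the result to ``df`` as a pandas DataFrame.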

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 50 additions & 19 deletions
@@ -25,6 +25,7 @@
 from itertools import islice
 import logging
 import queue
+import threading
 import warnings
 from typing import Any, Union, Optional, Callable, Generator, List
 
@@ -134,6 +135,21 @@ def __init__(self):
         # be an atomic operation in the Python language definition (enforced by
         # the global interpreter lock).
         self.done = False
+        # To assist with testing and understanding the behavior of the
+        # download, use this object as shared state to track how many worker
+        # threads have started and have gracefully shutdown.
+        self._started_workers_lock = threading.Lock()
+        self.started_workers = 0
+        self._finished_workers_lock = threading.Lock()
+        self.finished_workers = 0
+
+    def start(self):
+        with self._started_workers_lock:
+            self.started_workers += 1
+
+    def finish(self):
+        with self._finished_workers_lock:
+            self.finished_workers += 1
 
 
 BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
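
In isolation, the new `_DownloadState` counters follow a standard pattern: `self.done = False` stays lock-free because a bare attribute assignment is atomic under the GIL (as the comment above notes), while `+= 1` is a read-modify-write and so must be guarded. A minimal self-contained sketch of the same idea (class and attribute names here are illustrative, not the library's):

    import threading

    class WorkerTally:
        # Shared across threads: reading these ints is safe, but the
        # increments below need the lock because += is not atomic.
        def __init__(self):
            self.done = False  # bare assignment is atomic; no lock required
            self._lock = threading.Lock()
            self.started = 0
            self.finished = 0

        def start(self):
            with self._lock:
                self.started += 1

        def finish(self):
            with self._lock:
                self.finished += 1

The production code uses one lock per counter, which avoids contention between starting and finishing workers; the single shared lock here is the simpler variant.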
@@ -819,25 +835,35 @@ def _bqstorage_page_to_dataframe(column_names, dtypes, page):
 def _download_table_bqstorage_stream(
     download_state, bqstorage_client, session, stream, worker_queue, page_to_item
 ):
-    reader = bqstorage_client.read_rows(stream.name)
+    download_state.start()
+    try:
+        reader = bqstorage_client.read_rows(stream.name)
 
-    # Avoid deprecation warnings for passing in unnecessary read session.
-    # https://github.com/googleapis/python-bigquery-storage/issues/229
-    if _versions_helpers.BQ_STORAGE_VERSIONS.is_read_session_optional:
-        rowstream = reader.rows()
-    else:
-        rowstream = reader.rows(session)
-
-    for page in rowstream.pages:
-        item = page_to_item(page)
-        while True:
-            if download_state.done:
-                return
-            try:
-                worker_queue.put(item, timeout=_PROGRESS_INTERVAL)
-                break
-            except queue.Full:  # pragma: NO COVER
-                continue
+        # Avoid deprecation warnings for passing in unnecessary read session.
+        # https://github.com/googleapis/python-bigquery-storage/issues/229
+        if _versions_helpers.BQ_STORAGE_VERSIONS.is_read_session_optional:
+            rowstream = reader.rows()
+        else:
+            rowstream = reader.rows(session)
+
+        for page in rowstream.pages:
+            item = page_to_item(page)
+
+            # Make sure we set a timeout on put() so that we give the worker
+            # thread opportunities to shutdown gracefully, for example if the
+            # parent thread shuts down or the parent generator object which
+            # collects rows from all workers goes out of scope. See:
+            # https://github.com/googleapis/python-bigquery/issues/2032
+            while True:
+                if download_state.done:
+                    return
+                try:
+                    worker_queue.put(item, timeout=_PROGRESS_INTERVAL)
+                    break
+                except queue.Full:
+                    continue
+    finally:
+        download_state.finish()
 
 
 def _nowait(futures):
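
This hunk is the core of the fix for issue #2032: the worker never blocks indefinitely in `put()`. The timeout bounds each wait, after which the loop re-checks `download_state.done`, and the `try`/`finally` guarantees `finish()` is recorded whether the worker drains its stream, returns early, or raises. A stripped-down sketch of that producer pattern (function and variable names are illustrative):

    import queue

    def producer(items, out_queue, state, timeout=0.1):
        state.start()
        try:
            for item in items:
                while True:
                    if state.done:  # consumer went away; stop producing
                        return
                    try:
                        out_queue.put(item, timeout=timeout)
                        break  # item delivered; move on to the next one
                    except queue.Full:
                        continue  # still full; loop back and re-check done
        finally:
            state.finish()  # runs on normal exit, early return, or error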
@@ -863,6 +889,7 @@ def _download_table_bqstorage(
     page_to_item: Optional[Callable] = None,
     max_queue_size: Any = _MAX_QUEUE_SIZE_DEFAULT,
     max_stream_count: Optional[int] = None,
+    download_state: Optional[_DownloadState] = None,
 ) -> Generator[Any, None, None]:
     """Downloads a BigQuery table using the BigQuery Storage API.
 
@@ -890,6 +917,9 @@
             is True, the requested streams are limited to 1 regardless of the
             `max_stream_count` value. If 0 or None, then the number of
             requested streams will be unbounded. Defaults to None.
+        download_state (Optional[_DownloadState]):
+            A threadsafe state object which can be used to observe the
+            behavior of the worker threads created by this method.
 
     Yields:
         pandas.DataFrame: Pandas DataFrames, one for each chunk of data
@@ -948,7 +978,8 @@
 
     # Use _DownloadState to notify worker threads when to quit.
     # See: https://stackoverflow.com/a/29237343/101923
-    download_state = _DownloadState()
+    if download_state is None:
+        download_state = _DownloadState()
 
     # Create a queue to collect frames as they are created in each thread.
     #
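
The consumer side is what flips `done`: because `_download_table_bqstorage` is a generator, its cleanup code runs when the generator is closed or garbage-collected, even if the caller abandons it mid-iteration. A hedged sketch of that shape (heavily simplified; the real function also manages a thread pool, futures, and progress reporting):

    import queue

    def consume(out_queue, state, workers):
        try:
            while True:
                try:
                    yield out_queue.get(timeout=0.1)
                except queue.Empty:
                    # Stop once every producer has exited and nothing is queued.
                    if all(not w.is_alive() for w in workers) and out_queue.empty():
                        return
        finally:
            # Runs on exhaustion, .close(), or garbage collection of the
            # generator -- this is what unblocks producers stuck in put().
            state.done = True

Allowing an injected `download_state` (the `if download_state is None` change above) is what lets the new unit test observe the started/finished counts from outside.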

samples/tests/test_download_public_data.py

Lines changed: 1 addition & 14 deletions
@@ -12,29 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-
 import pytest
 
 from .. import download_public_data
 
 pytest.importorskip("google.cloud.bigquery_storage_v1")
 
 
-def test_download_public_data(
-    caplog: pytest.LogCaptureFixture, capsys: pytest.CaptureFixture[str]
-) -> None:
-    # Enable debug-level logging to verify the BigQuery Storage API is used.
-    caplog.set_level(logging.DEBUG)
-
+def test_download_public_data(capsys: pytest.CaptureFixture[str]) -> None:
     download_public_data.download_public_data()
     out, _ = capsys.readouterr()
     assert "year" in out
     assert "gender" in out
     assert "name" in out
-
-    assert any(
-        "Started reading table 'bigquery-public-data.usa_names.usa_1910_current' with BQ Storage API session"
-        in message
-        for message in caplog.messages
-    )

samples/tests/test_download_public_data_sandbox.py

Lines changed: 2 additions & 15 deletions
@@ -12,29 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-
 import pytest
 
 from .. import download_public_data_sandbox
 
 pytest.importorskip("google.cloud.bigquery_storage_v1")
 
 
-def test_download_public_data_sandbox(
-    caplog: pytest.LogCaptureFixture, capsys: pytest.CaptureFixture[str]
-) -> None:
-    # Enable debug-level logging to verify the BigQuery Storage API is used.
-    caplog.set_level(logging.DEBUG)
-
+def test_download_public_data_sandbox(capsys: pytest.CaptureFixture[str]) -> None:
     download_public_data_sandbox.download_public_data_sandbox()
-    out, err = capsys.readouterr()
+    out, _ = capsys.readouterr()
     assert "year" in out
     assert "gender" in out
     assert "name" in out
-
-    assert any(
-        # An anonymous table is used because this sample reads from query results.
-        ("Started reading table" in message and "BQ Storage API session" in message)
-        for message in caplog.messages
-    )

tests/unit/test__pandas_helpers.py

Lines changed: 93 additions & 0 deletions
@@ -16,6 +16,7 @@
 import datetime
 import decimal
 import functools
+import gc
 import operator
 import queue
 from typing import Union
 
@@ -1889,6 +1890,98 @@ def fake_download_stream(
     assert queue_used.maxsize == expected_maxsize
 
 
+@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
+def test__download_table_bqstorage_shuts_down_workers(
+    monkeypatch,
+    module_under_test,
+):
+    """Regression test for https://github.com/googleapis/python-bigquery/issues/2032
+
+    Make sure that when the top-level iterator goes out of scope (is deleted),
+    the child threads are also stopped.
+    """
+    from google.cloud.bigquery import dataset
+    from google.cloud.bigquery import table
+    import google.cloud.bigquery_storage_v1.reader
+    import google.cloud.bigquery_storage_v1.types
+
+    monkeypatch.setattr(
+        _versions_helpers.BQ_STORAGE_VERSIONS, "_installed_version", None
+    )
+    monkeypatch.setattr(bigquery_storage, "__version__", "2.5.0")
+
+    # Create a fake stream with a decent number of rows.
+    arrow_schema = pyarrow.schema(
+        [
+            ("int_col", pyarrow.int64()),
+            ("str_col", pyarrow.string()),
+        ]
+    )
+    arrow_rows = pyarrow.record_batch(
+        [
+            pyarrow.array([0, 1, 2], type=pyarrow.int64()),
+            pyarrow.array(["a", "b", "c"], type=pyarrow.string()),
+        ],
+        schema=arrow_schema,
+    )
+    session = google.cloud.bigquery_storage_v1.types.ReadSession()
+    session.data_format = "ARROW"
+    session.arrow_schema = {"serialized_schema": arrow_schema.serialize().to_pybytes()}
+    session.streams = [
+        google.cloud.bigquery_storage_v1.types.ReadStream(name=name)
+        for name in ("stream/s0", "stream/s1", "stream/s2")
+    ]
+    bqstorage_client = mock.create_autospec(
+        bigquery_storage.BigQueryReadClient, instance=True
+    )
+    reader = mock.create_autospec(
+        google.cloud.bigquery_storage_v1.reader.ReadRowsStream, instance=True
+    )
+    reader.__iter__.return_value = [
+        google.cloud.bigquery_storage_v1.types.ReadRowsResponse(
+            arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()},
+            arrow_record_batch={
+                "serialized_record_batch": arrow_rows.serialize().to_pybytes()
+            },
+        )
+        for _ in range(100)
+    ]
+    reader.rows.return_value = google.cloud.bigquery_storage_v1.reader.ReadRowsIterable(
+        reader, read_session=session
+    )
+    bqstorage_client.read_rows.return_value = reader
+    bqstorage_client.create_read_session.return_value = session
+    table_ref = table.TableReference(
+        dataset.DatasetReference("project-x", "dataset-y"),
+        "table-z",
+    )
+    download_state = module_under_test._DownloadState()
+    assert download_state.started_workers == 0
+    assert download_state.finished_workers == 0
+
+    result_gen = module_under_test._download_table_bqstorage(
+        "some-project",
+        table_ref,
+        bqstorage_client,
+        max_queue_size=1,
+        page_to_item=module_under_test._bqstorage_page_to_arrow,
+        download_state=download_state,
+    )
+
+    result_gen_iter = iter(result_gen)
+    next(result_gen_iter)
+    assert download_state.started_workers == 3
+    assert download_state.finished_workers == 0
+
+    # Stop iteration early and simulate the variables going out of scope
+    # to be doubly sure that the worker threads are supposed to be cleaned up.
+    del result_gen, result_gen_iter
+    gc.collect()
+
+    assert download_state.started_workers == 3
+    assert download_state.finished_workers == 3
+
+
 @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
 def test_download_arrow_row_iterator_unknown_field_type(module_under_test):
     fake_page = api_core.page_iterator.Page(
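
To exercise just this regression test locally, an invocation along these lines should work (assuming the repo's test dependencies, including pyarrow and google-cloud-bigquery-storage, are installed):

    pytest tests/unit/test__pandas_helpers.py -k shuts_down_workers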
