Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit b6fcf57

Browse files
committed
feat: retry AI/ML jobs that fail more often
1 parent 827f5d5 commit b6fcf57

5 files changed

Lines changed: 245 additions & 4 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# See https://pre-commit.com/hooks.html for more hooks
1717
repos:
1818
- repo: https://github.com/pre-commit/pre-commit-hooks
19-
rev: v4.0.1
19+
rev: v5.0.0
2020
hooks:
2121
- id: trailing-whitespace
2222
- id: end-of-file-fixer

bigframes/session/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import weakref
4141

4242
import bigframes_vendored.constants as constants
43+
import bigframes_vendored.google_cloud_bigquery.retry as third_party_gcb_retry
4344
import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery # noqa
4445
import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
4546
import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet
@@ -2051,6 +2052,7 @@ def _start_query_ml_ddl(
20512052
project=None,
20522053
timeout=None,
20532054
query_with_job=True,
2055+
job_retry=third_party_gcb_retry.DEFAULT_ML_JOB_RETRY,
20542056
)
20552057
return iterator, query_job
20562058

bigframes/session/_io/bigquery/__init__.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@
2424
import typing
2525
from typing import Dict, Iterable, Literal, Mapping, Optional, overload, Tuple, Union
2626

27+
import bigframes_vendored.google_cloud_bigquery.retry as third_party_gcb_retry
2728
import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
2829
import google.api_core.exceptions
30+
import google.api_core.retry
2931
import google.cloud.bigquery as bigquery
3032

3133
from bigframes.core import log_adapter
@@ -245,8 +247,9 @@ def start_query_with_client(
245247
location: Optional[str],
246248
project: Optional[str],
247249
timeout: Optional[float],
248-
metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,
250+
metrics: Optional[bigframes.session.metrics.ExecutionMetrics],
249251
query_with_job: Literal[True],
252+
job_retry: Optional[google.api_core.retry.Retry],
250253
) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
251254
...
252255

@@ -260,8 +263,9 @@ def start_query_with_client(
260263
location: Optional[str],
261264
project: Optional[str],
262265
timeout: Optional[float],
263-
metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,
266+
metrics: Optional[bigframes.session.metrics.ExecutionMetrics],
264267
query_with_job: Literal[False],
268+
job_retry: Optional[google.api_core.retry.Retry],
265269
) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]:
266270
...
267271

@@ -276,6 +280,11 @@ def start_query_with_client(
276280
timeout: Optional[float] = None,
277281
metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,
278282
query_with_job: bool = True,
283+
# TODO(tswast): We can stop providing our own default once we use a
284+
# google-cloud-bigquery version with
285+
# https://github.com/googleapis/python-bigquery/pull/2256 merged, likely
286+
# version 3.36.0 or later.
287+
job_retry: Optional[google.api_core.retry.Retry] = third_party_gcb_retry.DEFAULT_JOB_RETRY,
279288
) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]:
280289
"""
281290
Starts query job and waits for results.
@@ -292,6 +301,7 @@ def start_query_with_client(
292301
location=location,
293302
project=project,
294303
api_timeout=timeout,
304+
job_retry=job_retry,
295305
)
296306
if metrics is not None:
297307
metrics.count_job_stats(row_iterator=results_iterator)
@@ -303,6 +313,7 @@ def start_query_with_client(
303313
location=location,
304314
project=project,
305315
timeout=timeout,
316+
job_retry=job_retry,
306317
)
307318
except google.api_core.exceptions.Forbidden as ex:
308319
if "Drive credentials" in ex.message:

pyproject.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
[build-system]
2-
requires = ["setuptools"]
2+
requires = ["setuptools", "setuptools_scm"]
33
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
dynamic=['version']
7+
8+
[tool.setuptools_scm]
9+
version_file = "bigframes/version.py"
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
# Original: https://github.com/googleapis/python-bigquery/blob/main/google/cloud/bigquery/retry.py
2+
# Copyright 2018 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
from google.api_core import exceptions
17+
from google.api_core import retry
18+
import google.api_core.future.polling
19+
from google.auth import exceptions as auth_exceptions # type: ignore
20+
import requests.exceptions
21+
22+
23+
_RETRYABLE_REASONS = frozenset(
24+
["rateLimitExceeded", "backendError", "internalError", "badGateway"]
25+
)
26+
27+
_UNSTRUCTURED_RETRYABLE_TYPES = (
28+
ConnectionError,
29+
exceptions.TooManyRequests,
30+
exceptions.InternalServerError,
31+
exceptions.BadGateway,
32+
exceptions.ServiceUnavailable,
33+
requests.exceptions.ChunkedEncodingError,
34+
requests.exceptions.ConnectionError,
35+
requests.exceptions.Timeout,
36+
auth_exceptions.TransportError,
37+
)
38+
39+
_MINUTE_IN_SECONDS = 60.0
40+
_HOUR_IN_SECONDS = 60.0 * _MINUTE_IN_SECONDS
41+
_DEFAULT_RETRY_DEADLINE = 10.0 * _MINUTE_IN_SECONDS
42+
43+
# Ambiguous errors (e.g. internalError, backendError, rateLimitExceeded) retry
44+
# until the full `_DEFAULT_RETRY_DEADLINE`. This is because the
45+
# `jobs.getQueryResults` REST API translates a job failure into an HTTP error.
46+
#
47+
# TODO(https://github.com/googleapis/python-bigquery/issues/1903): Investigate
48+
# if we can fail early for ambiguous errors in `QueryJob.result()`'s call to
49+
# the `jobs.getQueryResult` API.
50+
#
51+
# We need `_DEFAULT_JOB_DEADLINE` to be some multiple of
52+
# `_DEFAULT_RETRY_DEADLINE` to allow for a few retries after the retry
53+
# timeout is reached.
54+
#
55+
# Note: This multiple should actually be a multiple of
56+
# (2 * _DEFAULT_RETRY_DEADLINE). After an ambiguous exception, the first
57+
# call from `job_retry()` refreshes the job state without actually restarting
58+
# the query. The second `job_retry()` actually restarts the query. For a more
59+
# detailed explanation, see the comments where we set `restart_query_job = True`
60+
# in `QueryJob.result()`'s inner `is_job_done()` function.
61+
_DEFAULT_JOB_DEADLINE = 2.0 * (2.0 * _DEFAULT_RETRY_DEADLINE)
62+
63+
64+
def _should_retry(exc):
    """Decide whether a (non-job) API error should be retried.

    Structured BigQuery errors are retried when the first error's
    ``reason`` is retryable (e.g. 'backendError', 'rateLimitExceeded');
    errors without a structured payload are retried by exception type.
    """
    has_structured_errors = hasattr(exc, "errors") and len(exc.errors) > 0
    if not has_structured_errors:
        # Unstructured error returns, e.g. from GFE: fall back to
        # matching on the exception's type.
        return isinstance(exc, _UNSTRUCTURED_RETRYABLE_TYPES)

    return exc.errors[0]["reason"] in _RETRYABLE_REASONS
76+
77+
78+
DEFAULT_RETRY = retry.Retry(predicate=_should_retry, deadline=_DEFAULT_RETRY_DEADLINE)
79+
"""The default retry object.
80+
81+
Any method with a ``retry`` parameter will be retried automatically,
82+
with reasonable defaults. To disable retry, pass ``retry=None``.
83+
To modify the default retry behavior, call a ``with_XXX`` method
84+
on ``DEFAULT_RETRY``. For example, to change the deadline to 30 seconds,
85+
pass ``retry=bigquery.DEFAULT_RETRY.with_deadline(30)``.
86+
"""
87+
88+
89+
def _should_retry_get_job_conflict(exc):
    """Predicate for determining when to retry a jobs.get call after a conflict error.

    Sometimes we get a 404 after a Conflict. In this case, we
    have pretty high confidence that by retrying the 404, we'll
    (hopefully) eventually recover the job.
    https://github.com/googleapis/python-bigquery/issues/2134

    Note: we may be able to extend this to user-specified predicates
    after https://github.com/googleapis/python-api-core/issues/796
    to tweak existing Retry object predicates.
    """
    # A 404 right after a Conflict is likely transient; retry it in
    # addition to everything the base predicate already retries.
    if isinstance(exc, exceptions.NotFound):
        return True
    return _should_retry(exc)
102+
103+
104+
# Pick a deadline smaller than our other deadlines since we want to timeout
105+
# before those expire.
106+
_DEFAULT_GET_JOB_CONFLICT_DEADLINE = _DEFAULT_RETRY_DEADLINE / 3.0
107+
_DEFAULT_GET_JOB_CONFLICT_RETRY = retry.Retry(
108+
predicate=_should_retry_get_job_conflict,
109+
deadline=_DEFAULT_GET_JOB_CONFLICT_DEADLINE,
110+
)
111+
"""Private, may be removed in future."""
112+
113+
114+
# Note: Take care when updating DEFAULT_TIMEOUT to anything but None. We
115+
# briefly had a default timeout, but even setting it at more than twice the
116+
# theoretical server-side default timeout of 2 minutes was not enough for
117+
# complex queries. See:
118+
# https://github.com/googleapis/python-bigquery/issues/970#issuecomment-921934647
119+
DEFAULT_TIMEOUT = None
120+
"""The default API timeout.
121+
122+
This is the time to wait per request. To adjust the total wait time, set a
123+
deadline on the retry object.
124+
"""
125+
126+
job_retry_reasons = (
127+
"rateLimitExceeded",
128+
"backendError",
129+
"internalError",
130+
"jobBackendError",
131+
"jobInternalError",
132+
"jobRateLimitExceeded",
133+
)
134+
135+
136+
def _job_should_retry(exc):
    """Decide whether a failed query job should be retried.

    Retries both API-level retriable errors and job-level error reasons
    listed in ``job_retry_reasons``.
    """
    # Sometimes we have ambiguous errors, such as 'backendError' which could
    # be due to an API problem or a job problem. For these, make sure we retry
    # our is_job_done() function.
    #
    # Note: This won't restart the job unless we know for sure it's because of
    # the job status and set restart_query_job = True in that loop. This means
    # that we might end up calling this predicate twice for the same job
    # but from different paths: (1) from jobs.getQueryResults RetryError and
    # (2) from translating the job error from the body of a jobs.get response.
    #
    # Note: If we start retrying job types other than queries where we don't
    # call the problematic getQueryResults API to check the status, we need
    # to provide a different predicate, as there shouldn't be ambiguous
    # errors in those cases.
    if isinstance(exc, exceptions.RetryError):
        exc = exc.cause

    # Per https://github.com/googleapis/python-bigquery/issues/1929, sometimes
    # retriable errors make their way here. Because of the separate
    # `restart_query_job` logic to make sure we aren't restarting non-failed
    # jobs, it should be safe to continue and not totally fail our attempt at
    # waiting for the query to complete.
    if _should_retry(exc):
        return True

    # Without a structured error payload there is no job-level reason to
    # inspect, so there is nothing left to retry on.
    if not hasattr(exc, "errors") or len(exc.errors) == 0:
        return False

    return exc.errors[0]["reason"] in job_retry_reasons
167+
168+
169+
DEFAULT_JOB_RETRY = retry.Retry(
170+
predicate=_job_should_retry, deadline=_DEFAULT_JOB_DEADLINE
171+
)
172+
"""
173+
The default job retry object.
174+
"""
175+
176+
177+
DEFAULT_ML_JOB_RETRY = retry.Retry(
178+
predicate=_job_should_retry, deadline=_HOUR_IN_SECONDS
179+
)
180+
"""
181+
The default job retry object for AI/ML jobs.
182+
183+
Such jobs can take a long time to fail. See: b/436586523.
184+
"""
185+
186+
187+
def _query_job_insert_should_retry(exc):
    """Decide whether a jobs.insert call for a query should be retried.

    Per https://github.com/googleapis/python-bigquery/issues/2134, sometimes
    we get a 404 error. In this case, if we get this far, assume that the job
    doesn't actually exist and try again. We can't add 404 to the default
    job_retry because that happens for errors like "this table does not
    exist", which probably won't resolve with a retry.
    """
    if isinstance(exc, exceptions.RetryError):
        exc = exc.cause

    if isinstance(exc, exceptions.NotFound):
        not_found_message = exc.message
        if not_found_message is None:
            return False
        # Don't try to retry table/dataset not found, just job not found.
        # The URL contains jobs, so use whitespace to disambiguate.
        return " job" in not_found_message.lower()

    return _job_should_retry(exc)
203+
204+
205+
_DEFAULT_QUERY_JOB_INSERT_RETRY = retry.Retry(
206+
predicate=_query_job_insert_should_retry,
207+
# jobs.insert doesn't wait for the job to complete, so we don't need the
208+
# long _DEFAULT_JOB_DEADLINE for this part.
209+
deadline=_DEFAULT_RETRY_DEADLINE,
210+
)
211+
"""Private, may be removed in future."""
212+
213+
214+
DEFAULT_GET_JOB_TIMEOUT = 128
215+
"""
216+
Default timeout for Client.get_job().
217+
"""
218+
219+
POLLING_DEFAULT_VALUE = google.api_core.future.polling.PollingFuture._DEFAULT_VALUE
220+
"""
221+
Default value defined in google.api_core.future.polling.PollingFuture.
222+
"""

0 commit comments

Comments
 (0)