Skip to content

Commit d1e0c47

Browse files
committed
Extract SPOG org-id from cluster http_path for non-Thrift requests
For all-purpose-compute Thrift connections on SPOG (custom-URL) hosts the http_path is /sql/protocolv1/o/<workspace-id>/<cluster-id> and the workspace ID is encoded in the path itself. PoPP routes the Thrift request correctly off the /o/<wsid>/ segment, so the connection succeeds without an explicit ?o= query parameter.

Other requests on the same connection (telemetry uploads to /telemetry-ext, feature-flag fetches, SEA REST calls) hit different paths that don't carry the workspace ID. Previously _extract_spog_headers only looked at ?o= in the http_path, so the x-databricks-org-id header was never set for cluster URLs without ?o=. On SPOG hosts PoPP then had no workspace context for these requests and redirected them to /login, silently dropping telemetry.

Extend _extract_spog_headers to also extract the workspace ID from the cluster path segment as a fallback when ?o= is absent. Priority order: explicit caller header > ?o= query param > /o/<wsid>/ path segment.

Adds five unit tests covering the new cluster-path extraction, leading slash, query-param-wins priority, explicit-header-wins priority, and a warehouse-path regression guard.

Signed-off-by: Madhavendra Rathore <madhavendra.rathore@databricks.com>
1 parent 0309e7c commit d1e0c47

2 files changed

Lines changed: 89 additions & 17 deletions

File tree

src/databricks/sql/session.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import re
23
from typing import Dict, Tuple, List, Optional, Any, Type
34

45
from databricks.sql.thrift_api.TCLIService import ttypes
@@ -170,37 +171,70 @@ def _create_backend(
170171
}
171172
return databricks_client_class(**common_args)
172173

174+
# All-purpose-compute Thrift http_path:
175+
# [/]sql/protocolv1/o/<workspace-id>/<cluster-id>[/...][?...]
176+
_CLUSTER_PATH_ORG_ID_RE = re.compile(r"(?:^|/)sql/protocolv1/o/(\d+)/[^/?]+")
177+
173178
@staticmethod
174179
def _extract_spog_headers(http_path, existing_headers):
175-
"""Extract ?o=<workspaceId> from http_path and return as a header dict for SPOG routing."""
176-
if not http_path or "?" not in http_path:
180+
"""Extract the workspace ID from http_path for SPOG routing and return it
181+
as an ``x-databricks-org-id`` header dict.
182+
183+
Two sources are inspected, in priority order:
184+
1. ``?o=<workspace-id>`` query parameter in http_path (warehouse paths
185+
typically encode the workspace this way on SPOG).
186+
2. ``/sql/protocolv1/o/<workspace-id>/<cluster-id>`` path segment
187+
(all-purpose compute paths embed the workspace in the path itself).
188+
189+
An explicit ``x-databricks-org-id`` already set by the caller wins over
190+
both. Returns an empty dict when no workspace ID can be determined.
191+
192+
On SPOG (Custom URL) hosts this header is required for non-Thrift
193+
endpoints — telemetry, feature flags, SEA — to be routed to the right
194+
workspace. Without it, PoPP falls back to default routing and
195+
workspace-scoped requests are redirected to ``/login``.
196+
"""
197+
if not http_path:
177198
return {}
178199

179-
from urllib.parse import parse_qs
180-
181-
query_string = http_path.split("?", 1)[1]
182-
params = parse_qs(query_string)
183-
org_id = params.get("o", [None])[0]
184-
if not org_id:
200+
# Caller already set the header — never override.
201+
if any(k == "x-databricks-org-id" for k, _ in existing_headers):
185202
logger.debug(
186-
"SPOG header extraction: http_path has query string but no ?o= param, "
187-
"skipping x-databricks-org-id injection"
203+
"SPOG header extraction: x-databricks-org-id already set by caller, "
204+
"not extracting from http_path"
188205
)
189206
return {}
190207

191-
# Don't override if explicitly set
192-
if any(k == "x-databricks-org-id" for k, _ in existing_headers):
208+
org_id = None
209+
source = None
210+
211+
if "?" in http_path:
212+
from urllib.parse import parse_qs
213+
214+
query_string = http_path.split("?", 1)[1]
215+
params = parse_qs(query_string)
216+
value = params.get("o", [None])[0]
217+
if value:
218+
org_id = value
219+
source = "?o= in http_path"
220+
221+
if org_id is None:
222+
cluster_match = Session._CLUSTER_PATH_ORG_ID_RE.search(http_path)
223+
if cluster_match:
224+
org_id = cluster_match.group(1)
225+
source = "cluster path segment"
226+
227+
if org_id is None:
193228
logger.debug(
194-
"SPOG header extraction: x-databricks-org-id already set by caller, "
195-
"not overriding with ?o=%s from http_path",
196-
org_id,
229+
"SPOG header extraction: no workspace ID found in http_path, "
230+
"skipping x-databricks-org-id injection"
197231
)
198232
return {}
199233

200234
logger.debug(
201-
"SPOG header extraction: injecting x-databricks-org-id=%s "
202-
"(extracted from ?o= in http_path)",
235+
"SPOG header extraction: injecting x-databricks-org-id=%s (extracted from %s)",
203236
org_id,
237+
source,
204238
)
205239
return {"x-databricks-org-id": org_id}
206240

tests/unit/test_session.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,3 +273,41 @@ def test_multiple_query_params(self):
273273
"/sql/1.0/warehouses/abc123?o=12345&extra=val", []
274274
)
275275
assert result == {"x-databricks-org-id": "12345"}
276+
277+
def test_extracts_org_id_from_cluster_path_segment(self):
278+
# All-purpose-compute path embeds workspace ID in /o/<wsid>/<cluster>.
279+
# Without ?o=, the driver must still set x-databricks-org-id so that
280+
# telemetry and other non-Thrift requests route to the right workspace
281+
# on SPOG hosts.
282+
result = Session._extract_spog_headers(
283+
"sql/protocolv1/o/6051921418418893/0528-220959-uzmcn1qt", []
284+
)
285+
assert result == {"x-databricks-org-id": "6051921418418893"}
286+
287+
def test_extracts_org_id_from_cluster_path_with_leading_slash(self):
288+
result = Session._extract_spog_headers(
289+
"/sql/protocolv1/o/6051921418418893/0528-220959-uzmcn1qt", []
290+
)
291+
assert result == {"x-databricks-org-id": "6051921418418893"}
292+
293+
def test_query_param_wins_over_cluster_path_segment(self):
294+
# When both forms are present, ?o= takes precedence.
295+
result = Session._extract_spog_headers(
296+
"sql/protocolv1/o/111/0528-220959-uzmcn1qt?o=222", []
297+
)
298+
assert result == {"x-databricks-org-id": "222"}
299+
300+
def test_explicit_header_wins_over_cluster_path_segment(self):
301+
existing = [("x-databricks-org-id", "from-caller")]
302+
result = Session._extract_spog_headers(
303+
"sql/protocolv1/o/111/0528-220959-uzmcn1qt", existing
304+
)
305+
assert result == {}
306+
307+
def test_warehouse_path_without_query_param_returns_empty(self):
308+
# Regression guard: the new cluster-path regex must not accidentally
309+
# match warehouse paths (which never embed the workspace ID).
310+
result = Session._extract_spog_headers(
311+
"/sql/1.0/warehouses/abc123", []
312+
)
313+
assert result == {}

0 commit comments

Comments
 (0)