Skip to content

Commit 6723350

Browse files
authored
Extract SPOG org-id from cluster http_path for non-Thrift requests (databricks#817)
* Extract SPOG org-id from cluster http_path for non-Thrift requests For all-purpose-compute Thrift connections on SPOG (custom-URL) hosts the http_path is /sql/protocolv1/o/<workspace-id>/<cluster-id> and the workspace ID is encoded in the path itself. PoPP routes the Thrift request correctly off the /o/<wsid>/ segment, so the connection succeeds without an explicit ?o= query parameter. Other requests on the same connection (telemetry uploads to /telemetry-ext, feature-flag fetches, SEA REST calls) hit different paths that don't carry the workspace ID. Previously _extract_spog_headers only looked at ?o= in the http_path, so the x-databricks-org-id header was never set for cluster URLs without ?o=. On SPOG hosts PoPP then had no workspace context for these requests and redirected them to /login, silently dropping telemetry. Extend _extract_spog_headers to also extract the workspace ID from the cluster path segment as a fallback when ?o= is absent. Priority order: explicit caller header > ?o= query param > /o/<wsid>/ path segment. Adds five unit tests covering the new cluster-path extraction, leading slash, query-param-wins priority, explicit-header-wins priority, and a warehouse-path regression guard. Signed-off-by: Madhavendra Rathore <madhavendra.rathore@databricks.com> * Respect mixed-case SPOG org-id headers Signed-off-by: Madhavendra Rathore <madhavendra.rathore@databricks.com> * Validate SPOG org id extraction Signed-off-by: Madhavendra Rathore <madhavendra.rathore@databricks.com> --------- Signed-off-by: Madhavendra Rathore <madhavendra.rathore@databricks.com>
1 parent 13987de commit 6723350

2 files changed

Lines changed: 140 additions & 28 deletions

File tree

src/databricks/sql/session.py

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import re
23
from typing import Dict, Tuple, List, Optional, Any, Type
34

45
from databricks.sql.thrift_api.TCLIService import ttypes
@@ -72,10 +73,10 @@ def __init__(
7273
base_headers = [("User-Agent", self.useragent_header)]
7374
all_headers = (http_headers or []) + base_headers
7475

75-
# Extract ?o=<workspaceId> from http_path for SPOG routing.
76-
# On SPOG hosts, the httpPath contains ?o=<workspaceId> which routes Thrift
77-
# requests via the URL. For SEA, telemetry, and feature flags (which use
78-
# separate endpoints), we inject x-databricks-org-id as an HTTP header.
76+
# Extract workspace context from http_path for SPOG routing.
77+
# On SPOG hosts, the http_path can contain either ?o=<workspaceId> or an
78+
# all-purpose-compute /o/<workspaceId>/ path segment. For SEA, telemetry,
79+
# and feature flags, we inject x-databricks-org-id as an HTTP header.
7980
self._spog_headers = self._extract_spog_headers(http_path, all_headers)
8081
if self._spog_headers:
8182
all_headers = all_headers + list(self._spog_headers.items())
@@ -170,42 +171,76 @@ def _create_backend(
170171
}
171172
return databricks_client_class(**common_args)
172173

174+
# All-purpose-compute Thrift http_path:
175+
# [/]sql/protocolv1/o/<workspace-id>/<cluster-id>[/...][?...]
176+
_ORG_ID_RE = re.compile(r"^[0-9]+$")
177+
_CLUSTER_PATH_ORG_ID_RE = re.compile(r"^/?sql/protocolv1/o/([0-9]+)/[^/?]+")
178+
173179
@staticmethod
174180
def _extract_spog_headers(http_path, existing_headers):
175-
"""Extract ?o=<workspaceId> from http_path and return as a header dict for SPOG routing."""
176-
if not http_path or "?" not in http_path:
181+
"""Extract the workspace ID from http_path for SPOG routing and return it
182+
as an ``x-databricks-org-id`` header dict.
183+
184+
Two sources are inspected, in priority order:
185+
1. ``?o=<workspace-id>`` query parameter in http_path (warehouse paths
186+
typically encode the workspace this way on SPOG).
187+
2. ``/sql/protocolv1/o/<workspace-id>/<cluster-id>`` path segment
188+
(all-purpose compute paths embed the workspace in the path itself).
189+
190+
An explicit ``x-databricks-org-id`` already set by the caller wins over
191+
both. Returns an empty dict when no workspace ID can be determined.
192+
193+
On SPOG (Custom URL) hosts this header is required for non-Thrift
194+
endpoints — telemetry, feature flags, SEA — to be routed to the right
195+
workspace. Without it, PoPP falls back to default routing and
196+
workspace-scoped requests are redirected to ``/login``.
197+
"""
198+
if not http_path:
177199
return {}
178200

179-
from urllib.parse import parse_qs
180-
181-
query_string = http_path.split("?", 1)[1]
182-
params = parse_qs(query_string)
183-
org_id = params.get("o", [None])[0]
184-
if not org_id:
201+
# Caller already set the header; never override. Header names are case-insensitive.
202+
if any(k.lower() == "x-databricks-org-id" for k, _ in existing_headers):
185203
logger.debug(
186-
"SPOG header extraction: http_path has query string but no ?o= param, "
187-
"skipping x-databricks-org-id injection"
204+
"SPOG header extraction: x-databricks-org-id already set by caller, "
205+
"not extracting from http_path"
188206
)
189207
return {}
190208

191-
# Don't override if explicitly set
192-
if any(k == "x-databricks-org-id" for k, _ in existing_headers):
209+
org_id = None
210+
source = None
211+
212+
if "?" in http_path:
213+
from urllib.parse import parse_qs
214+
215+
query_string = http_path.split("?", 1)[1]
216+
params = parse_qs(query_string)
217+
value = params.get("o", [None])[0]
218+
if value and Session._ORG_ID_RE.fullmatch(value):
219+
org_id = value
220+
source = "?o= in http_path"
221+
222+
if org_id is None:
223+
cluster_match = Session._CLUSTER_PATH_ORG_ID_RE.match(http_path)
224+
if cluster_match:
225+
org_id = cluster_match.group(1)
226+
source = "cluster path segment"
227+
228+
if org_id is None:
193229
logger.debug(
194-
"SPOG header extraction: x-databricks-org-id already set by caller, "
195-
"not overriding with ?o=%s from http_path",
196-
org_id,
230+
"SPOG header extraction: no workspace ID found in http_path, "
231+
"skipping x-databricks-org-id injection"
197232
)
198233
return {}
199234

200235
logger.debug(
201-
"SPOG header extraction: injecting x-databricks-org-id=%s "
202-
"(extracted from ?o= in http_path)",
236+
"SPOG header extraction: injecting x-databricks-org-id=%s (extracted from %s)",
203237
org_id,
238+
source,
204239
)
205240
return {"x-databricks-org-id": org_id}
206241

207242
def get_spog_headers(self):
208-
"""Returns SPOG routing headers (x-databricks-org-id) if ?o= was in http_path."""
243+
"""Returns extracted SPOG routing headers (x-databricks-org-id), if any."""
209244
return dict(self._spog_headers)
210245

211246
def open(self):

tests/unit/test_session.py

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,10 @@ def test_socket_timeout_passthrough(self, mock_client_class):
162162

163163
@patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME)
164164
def test_configuration_passthrough(self, mock_client_class):
165-
mock_session_config = {"ANSI_MODE": "FALSE", "QUERY_TAGS": "team:engineering,project:data-pipeline"}
165+
mock_session_config = {
166+
"ANSI_MODE": "FALSE",
167+
"QUERY_TAGS": "team:engineering,project:data-pipeline",
168+
}
166169
databricks.sql.connect(
167170
session_configuration=mock_session_config, **self.DUMMY_CONNECTION_ARGS
168171
)
@@ -218,10 +221,15 @@ def test_query_tags_dict_sets_session_config(self, mock_client_class):
218221
)
219222

220223
call_kwargs = mock_client_class.return_value.open_session.call_args[1]
221-
assert call_kwargs["session_configuration"]["QUERY_TAGS"] == "team:data-eng,project:etl"
224+
assert (
225+
call_kwargs["session_configuration"]["QUERY_TAGS"]
226+
== "team:data-eng,project:etl"
227+
)
222228

223229
@patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME)
224-
def test_query_tags_dict_takes_precedence_over_session_config(self, mock_client_class):
230+
def test_query_tags_dict_takes_precedence_over_session_config(
231+
self, mock_client_class
232+
):
225233
databricks.sql.connect(
226234
query_tags={"team": "new-team"},
227235
session_configuration={"QUERY_TAGS": "team:old-team,other:value"},
@@ -242,9 +250,7 @@ def test_extracts_org_id_from_query_param(self):
242250
assert result == {"x-databricks-org-id": "6051921418418893"}
243251

244252
def test_no_query_param_returns_empty(self):
245-
result = Session._extract_spog_headers(
246-
"/sql/1.0/warehouses/abc123", []
247-
)
253+
result = Session._extract_spog_headers("/sql/1.0/warehouses/abc123", [])
248254
assert result == {}
249255

250256
def test_no_o_param_returns_empty(self):
@@ -268,8 +274,79 @@ def test_explicit_header_takes_precedence(self):
268274
)
269275
assert result == {}
270276

277+
def test_explicit_header_takes_precedence_case_insensitively(self):
278+
existing = [("X-Databricks-Org-Id", "explicit-value")]
279+
result = Session._extract_spog_headers(
280+
"/sql/1.0/warehouses/abc123?o=6051921418418893", existing
281+
)
282+
assert result == {}
283+
271284
def test_multiple_query_params(self):
272285
result = Session._extract_spog_headers(
273286
"/sql/1.0/warehouses/abc123?o=12345&extra=val", []
274287
)
275288
assert result == {"x-databricks-org-id": "12345"}
289+
290+
def test_non_numeric_query_param_returns_empty(self):
291+
result = Session._extract_spog_headers(
292+
"/sql/1.0/warehouses/abc123?o=abc123", []
293+
)
294+
assert result == {}
295+
296+
def test_control_char_query_param_returns_empty(self):
297+
result = Session._extract_spog_headers(
298+
"/sql/1.0/warehouses/abc123?o=123%0D%0AX-Injected:%20yes", []
299+
)
300+
assert result == {}
301+
302+
def test_empty_query_param_returns_empty(self):
303+
result = Session._extract_spog_headers(
304+
"/sql/1.0/warehouses/abc123?o=", []
305+
)
306+
assert result == {}
307+
308+
def test_extracts_org_id_from_cluster_path_segment(self):
309+
# All-purpose-compute path embeds workspace ID in /o/<wsid>/<cluster>.
310+
# Without ?o=, the driver must still set x-databricks-org-id so that
311+
# telemetry and other non-Thrift requests route to the right workspace
312+
# on SPOG hosts.
313+
result = Session._extract_spog_headers(
314+
"sql/protocolv1/o/6051921418418893/0528-220959-uzmcn1qt", []
315+
)
316+
assert result == {"x-databricks-org-id": "6051921418418893"}
317+
318+
def test_extracts_org_id_from_cluster_path_with_leading_slash(self):
319+
result = Session._extract_spog_headers(
320+
"/sql/protocolv1/o/6051921418418893/0528-220959-uzmcn1qt", []
321+
)
322+
assert result == {"x-databricks-org-id": "6051921418418893"}
323+
324+
def test_query_param_wins_over_cluster_path_segment(self):
325+
# When both forms are present, ?o= takes precedence.
326+
result = Session._extract_spog_headers(
327+
"sql/protocolv1/o/111/0528-220959-uzmcn1qt?o=222", []
328+
)
329+
assert result == {"x-databricks-org-id": "222"}
330+
331+
def test_explicit_header_wins_over_cluster_path_segment(self):
332+
existing = [("x-databricks-org-id", "from-caller")]
333+
result = Session._extract_spog_headers(
334+
"sql/protocolv1/o/111/0528-220959-uzmcn1qt", existing
335+
)
336+
assert result == {}
337+
338+
def test_nested_cluster_path_prefix_returns_empty(self):
339+
result = Session._extract_spog_headers(
340+
"evil/sql/protocolv1/o/999/0528-220959-uzmcn1qt", []
341+
)
342+
assert result == {}
343+
344+
def test_incomplete_cluster_path_returns_empty(self):
345+
result = Session._extract_spog_headers("sql/protocolv1/o/999/", [])
346+
assert result == {}
347+
348+
def test_warehouse_path_without_query_param_returns_empty(self):
349+
# Regression guard: the new cluster-path regex must not accidentally
350+
# match warehouse paths (which never embed the workspace ID).
351+
result = Session._extract_spog_headers("/sql/1.0/warehouses/abc123", [])
352+
assert result == {}

0 commit comments

Comments
 (0)