Skip to content

Commit 575c5a6

Browse files
committed
feat(gooddata-pipelines): support composite key references and optional WDF
1 parent efc6b2a commit 575c5a6

4 files changed

Lines changed: 324 additions & 31 deletions

File tree

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,49 @@ def _date_ref_from_field(
154154
],
155155
)
156156

157+
@staticmethod
def _build_parent_reference_sources(
    definition: CustomDatasetDefinition,
) -> list[CatalogDeclarativeReferenceSource]:
    """Translate a dataset definition into its parent reference sources.

    The composite ``parent_dataset_references`` list wins whenever it holds
    at least one entry: every entry becomes one source, in list order.
    Otherwise the three legacy single-column fields are used, but only when
    all of them are set. In every other case an empty list is returned and
    no error is raised here (an empty source list is presumably rejected
    later by the GoodData API -- confirm against the API behaviour).

    Args:
        definition: Dataset definition carrying either reference form.

    Returns:
        One ``CatalogDeclarativeReferenceSource`` per join column; possibly
        empty.
    """
    composite_refs = definition.parent_dataset_references
    if composite_refs:
        sources = []
        for entry in composite_refs:
            # Every source targets an attribute on the parent dataset.
            grain_target = CatalogGrainIdentifier(
                id=entry.attribute_id,
                type=CustomFieldType.ATTRIBUTE.value,
            )
            sources.append(
                CatalogDeclarativeReferenceSource(
                    column=entry.source_column,
                    data_type=entry.data_type.value,
                    target=grain_target,
                )
            )
        return sources

    # Guard clause: an incomplete legacy triple cannot form a source.
    if (
        definition.dataset_reference_source_column is None
        or definition.dataset_reference_source_column_data_type is None
        or definition.parent_dataset_reference_attribute_id is None
    ):
        return []

    legacy_source = CatalogDeclarativeReferenceSource(
        column=definition.dataset_reference_source_column,
        data_type=definition.dataset_reference_source_column_data_type.value,
        target=CatalogGrainIdentifier(
            id=definition.parent_dataset_reference_attribute_id,
            type=CustomFieldType.ATTRIBUTE.value,
        ),
    )
    return [legacy_source]
199+
157200
@staticmethod
158201
def _get_sources(
159202
dataset: CustomDataset,
@@ -253,6 +296,41 @@ def datasets_to_ldm(
253296
# Get the data source info
254297
dataset_source_table_id, dataset_sql = self._get_sources(dataset)
255298

299+
# Build the parent reference source list. The composite-friendly
300+
# `parent_dataset_references` list takes precedence when set and
301+
# non-empty; otherwise fall back to the legacy single-column fields.
302+
parent_reference_sources = self._build_parent_reference_sources(
303+
dataset.definition
304+
)
305+
306+
# Workspace data filter fields are optional and must be set together
307+
# (validated on the input model). Emit columns/references only when
308+
# both are provided.
309+
wdf_columns: list[CatalogDeclarativeWorkspaceDataFilterColumn] = []
310+
wdf_references: list[
311+
CatalogDeclarativeWorkspaceDataFilterReferences
312+
] = []
313+
if (
314+
dataset.definition.workspace_data_filter_id is not None
315+
and dataset.definition.workspace_data_filter_column_name
316+
is not None
317+
):
318+
wdf_columns.append(
319+
CatalogDeclarativeWorkspaceDataFilterColumn(
320+
name=dataset.definition.workspace_data_filter_column_name,
321+
data_type=ColumnDataType.STRING.value,
322+
)
323+
)
324+
wdf_references.append(
325+
CatalogDeclarativeWorkspaceDataFilterReferences(
326+
filter_id=CatalogDatasetWorkspaceDataFilterIdentifier(
327+
id=dataset.definition.workspace_data_filter_id
328+
),
329+
filter_column=dataset.definition.workspace_data_filter_column_name,
330+
filter_column_data_type=ColumnDataType.STRING.value,
331+
)
332+
)
333+
256334
# Construct the declarative dataset object and append it to the list.
257335
declarative_datasets.append(
258336
CatalogDeclarativeDataset(
@@ -265,16 +343,7 @@ def datasets_to_ldm(
265343
id=dataset.definition.parent_dataset_reference,
266344
),
267345
multivalue=True,
268-
sources=[
269-
CatalogDeclarativeReferenceSource(
270-
column=dataset.definition.dataset_reference_source_column,
271-
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
272-
target=CatalogGrainIdentifier(
273-
id=dataset.definition.parent_dataset_reference_attribute_id,
274-
type=CustomFieldType.ATTRIBUTE.value,
275-
),
276-
)
277-
],
346+
sources=parent_reference_sources,
278347
),
279348
]
280349
+ date_references,
@@ -283,21 +352,8 @@ def datasets_to_ldm(
283352
facts=facts,
284353
data_source_table_id=dataset_source_table_id,
285354
sql=dataset_sql,
286-
workspace_data_filter_columns=[
287-
CatalogDeclarativeWorkspaceDataFilterColumn(
288-
name=dataset.definition.workspace_data_filter_column_name,
289-
data_type=ColumnDataType.STRING.value,
290-
)
291-
],
292-
workspace_data_filter_references=[
293-
CatalogDeclarativeWorkspaceDataFilterReferences(
294-
filter_id=CatalogDatasetWorkspaceDataFilterIdentifier(
295-
id=dataset.definition.workspace_data_filter_id
296-
),
297-
filter_column=dataset.definition.workspace_data_filter_column_name,
298-
filter_column_data_type=ColumnDataType.STRING.value,
299-
)
300-
],
355+
workspace_data_filter_columns=wdf_columns or None,
356+
workspace_data_filter_references=wdf_references or None,
301357
tags=_effective_dataset_tags(dataset.definition),
302358
)
303359
)

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py

Lines changed: 101 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,43 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition":
6161
return self
6262

6363

64+
class ParentDatasetReference(BaseModel):
    """Single join column linking this dataset to its parent dataset.

    Multi-column (composite) foreign keys are expressed as several of these
    entries in ``CustomDatasetDefinition.parent_dataset_references``; every
    entry pairs one source column of the new dataset with one attribute of
    the parent dataset.
    """

    # No defaults are supplied, so all three fields are required.
    attribute_id: str = Field(description="Attribute ID on the parent dataset that this column joins to.")
    source_column: str = Field(description="Column name on this dataset used to join to the parent.")
    data_type: ColumnDataType = Field(description="Data type of the source column.")
82+
6483
class CustomDatasetDefinition(BaseModel):
65-
"""Input model for custom dataset definition."""
84+
"""Input model for custom dataset definition.
85+
86+
The reference to the parent dataset can be expressed in two ways:
87+
88+
* The legacy single-column form via ``parent_dataset_reference_attribute_id``,
89+
``dataset_reference_source_column`` and ``dataset_reference_source_column_data_type``.
90+
All three must be provided together.
91+
* The composite-friendly form via ``parent_dataset_references``: a list of
92+
``ParentDatasetReference`` entries, one per join column.
93+
94+
When ``parent_dataset_references`` is set and non-empty, it takes precedence
95+
and the legacy fields are ignored. Otherwise the legacy fields are used.
96+
97+
Workspace data filter fields are optional. Both must be set together or
98+
both left unset; when set, a single-column WDF binding is emitted on the
99+
declarative dataset.
100+
"""
66101

67102
workspace_id: str
68103
dataset_id: str
@@ -71,11 +106,36 @@ class CustomDatasetDefinition(BaseModel):
71106
dataset_source_table: str | None
72107
dataset_source_sql: str | None
73108
parent_dataset_reference: str
74-
parent_dataset_reference_attribute_id: str
75-
dataset_reference_source_column: str
76-
dataset_reference_source_column_data_type: ColumnDataType
77-
workspace_data_filter_id: str
78-
workspace_data_filter_column_name: str
109+
parent_dataset_reference_attribute_id: str | None = Field(
110+
default=None,
111+
deprecated=(
112+
"Use `parent_dataset_references` for richer (composite-key) joins. "
113+
"This field will be removed in a future release."
114+
),
115+
)
116+
dataset_reference_source_column: str | None = Field(
117+
default=None,
118+
deprecated=(
119+
"Use `parent_dataset_references` for richer (composite-key) joins. "
120+
"This field will be removed in a future release."
121+
),
122+
)
123+
dataset_reference_source_column_data_type: ColumnDataType | None = Field(
124+
default=None,
125+
deprecated=(
126+
"Use `parent_dataset_references` for richer (composite-key) joins. "
127+
"This field will be removed in a future release."
128+
),
129+
)
130+
parent_dataset_references: list[ParentDatasetReference] | None = Field(
131+
default=None,
132+
description=(
133+
"Composite-key reference to the parent dataset. When provided and "
134+
"non-empty, supersedes the legacy single-column reference fields."
135+
),
136+
)
137+
workspace_data_filter_id: str | None = None
138+
workspace_data_filter_column_name: str | None = None
79139
dataset_description: str | None = Field(
80140
default=None,
81141
description="Declarative description on the custom dataset.",
@@ -98,6 +158,41 @@ def check_source(self) -> "CustomDatasetDefinition":
98158
)
99159
return self
100160

161+
@model_validator(mode="after")
def check_reference_form_exclusive(self) -> "CustomDatasetDefinition":
    """Validate that one coherent parent-reference form is used.

    Two rules are enforced:

    * The legacy single-column fields and ``parent_dataset_references``
      must not be mixed. Forcing callers to pick one form prevents silent
      precedence surprises, where the new list would quietly win and the
      legacy values would be ignored.
    * The legacy fields are all-or-nothing. A partially filled triple can
      never produce a reference source, so it is rejected here instead of
      being dropped silently later on.

    Returns:
        The validated model instance, unchanged.

    Raises:
        ValueError: If both forms are set, or if only some of the legacy
            fields are set.
    """
    import warnings

    # Reading a field declared with ``deprecated=`` emits a
    # DeprecationWarning, even from inside a validator. Suppress it for
    # these internal reads so callers that never touch the legacy fields
    # do not get a warning on every model construction.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        legacy_values = {
            "parent_dataset_reference_attribute_id": self.parent_dataset_reference_attribute_id,
            "dataset_reference_source_column": self.dataset_reference_source_column,
            "dataset_reference_source_column_data_type": self.dataset_reference_source_column_data_type,
        }

    missing_legacy = [
        name for name, value in legacy_values.items() if value is None
    ]
    has_legacy = len(missing_legacy) < len(legacy_values)
    has_new = bool(self.parent_dataset_references)

    if has_new and has_legacy:
        raise ValueError(
            "Set either `parent_dataset_references` or the legacy single-column "
            "fields (`parent_dataset_reference_attribute_id`, "
            "`dataset_reference_source_column`, "
            "`dataset_reference_source_column_data_type`), not both."
        )
    if has_legacy and missing_legacy:
        raise ValueError(
            "The legacy single-column reference fields must be provided "
            "together; missing: "
            + ", ".join(f"`{name}`" for name in missing_legacy)
        )
    return self
183+
184+
@model_validator(mode="after")
def check_wdf_pair(self) -> "CustomDatasetDefinition":
    """Ensure the workspace data filter id and column name come as a pair.

    Returns:
        The validated model instance, unchanged.

    Raises:
        ValueError: If exactly one of the two fields is set.
    """
    wdf_fields = (
        self.workspace_data_filter_id,
        self.workspace_data_filter_column_name,
    )
    provided_count = sum(value is not None for value in wdf_fields)
    # Zero (both omitted) and two (both set) are the only valid counts.
    if provided_count == 1:
        raise ValueError(
            "workspace_data_filter_id and workspace_data_filter_column_name "
            "must both be set or both be omitted"
        )
    return self
195+
101196

102197
class CustomDataset(BaseModel):
103198
"""Custom dataset with its definition and custom fields."""

packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,50 @@ def test_datasets_to_ldm(mock_custom_dataset):
129129
assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1"
130130
assert len(ldm.date_instances) == 1
131131
assert ldm.date_instances[0].id == "date1"
132+
133+
134+
def test_datasets_to_ldm_parent_dataset_references_composite():
    """A two-entry `parent_dataset_references` list yields two sources, in order."""
    from gooddata_pipelines.ldm_extension.models.custom_data_object import (
        CustomDatasetDefinition,
        ParentDatasetReference,
    )

    # (parent attribute id, source column, column data type) per join column.
    join_columns = [
        ("parent_pk1", "src_col1", ColumnDataType.STRING),
        ("parent_pk2", "src_col2", ColumnDataType.INT),
    ]
    composite_definition = CustomDatasetDefinition(
        workspace_id="workspace1",
        dataset_id="ds_composite",
        dataset_name="Composite Dataset",
        dataset_source_table="table1",
        dataset_datasource_id="ds_source",
        dataset_source_sql=None,
        parent_dataset_reference="parent_ds",
        parent_dataset_references=[
            ParentDatasetReference(
                attribute_id=attribute_id,
                source_column=source_column,
                data_type=data_type,
            )
            for attribute_id, source_column, data_type in join_columns
        ],
    )
    custom_dataset = CustomDataset(
        definition=composite_definition, custom_fields=[]
    )

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm(
        {"ds_composite": custom_dataset}
    )

    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 2
    assert [source.column for source in sources] == ["src_col1", "src_col2"]
168+
169+
170+
def test_datasets_to_ldm_legacy_reference_fallback(mock_dataset_definition):
    """Without `parent_dataset_references`, the legacy fields give one source."""
    mock_dataset_definition.parent_dataset_references = None
    legacy_dataset = CustomDataset(
        definition=mock_dataset_definition, custom_fields=[]
    )

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm(
        {"ds1": legacy_dataset}
    )

    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 1
    assert sources[0].column == "ref_col"

0 commit comments

Comments
 (0)