Skip to content

Commit e74ab13

Browse files
committed
feat(gooddata-pipelines): support composite key references and optional WDF
1 parent efc6b2a commit e74ab13

6 files changed

Lines changed: 176 additions & 46 deletions

File tree

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,54 @@ def datasets_to_ldm(
253253
# Get the data source info
254254
dataset_source_table_id, dataset_sql = self._get_sources(dataset)
255255

256+
# Build one reference source per column for composite-key joins to
257+
# the parent dataset. The three list fields are validated to share
258+
# the same length on the input model.
259+
parent_reference_sources = [
260+
CatalogDeclarativeReferenceSource(
261+
column=column,
262+
data_type=data_type.value,
263+
target=CatalogGrainIdentifier(
264+
id=attribute_id,
265+
type=CustomFieldType.ATTRIBUTE.value,
266+
),
267+
)
268+
for column, data_type, attribute_id in zip(
269+
dataset.definition.dataset_reference_source_columns,
270+
dataset.definition.dataset_reference_source_column_data_types,
271+
dataset.definition.parent_dataset_reference_attribute_ids,
272+
strict=True,
273+
)
274+
]
275+
276+
# Workspace data filter fields are optional and must be set together
277+
# (validated on the input model). Emit columns/references only when
278+
# both are provided.
279+
wdf_columns: list[CatalogDeclarativeWorkspaceDataFilterColumn] = []
280+
wdf_references: list[
281+
CatalogDeclarativeWorkspaceDataFilterReferences
282+
] = []
283+
if (
284+
dataset.definition.workspace_data_filter_id is not None
285+
and dataset.definition.workspace_data_filter_column_name
286+
is not None
287+
):
288+
wdf_columns.append(
289+
CatalogDeclarativeWorkspaceDataFilterColumn(
290+
name=dataset.definition.workspace_data_filter_column_name,
291+
data_type=ColumnDataType.STRING.value,
292+
)
293+
)
294+
wdf_references.append(
295+
CatalogDeclarativeWorkspaceDataFilterReferences(
296+
filter_id=CatalogDatasetWorkspaceDataFilterIdentifier(
297+
id=dataset.definition.workspace_data_filter_id
298+
),
299+
filter_column=dataset.definition.workspace_data_filter_column_name,
300+
filter_column_data_type=ColumnDataType.STRING.value,
301+
)
302+
)
303+
256304
# Construct the declarative dataset object and append it to the list.
257305
declarative_datasets.append(
258306
CatalogDeclarativeDataset(
@@ -265,16 +313,7 @@ def datasets_to_ldm(
265313
id=dataset.definition.parent_dataset_reference,
266314
),
267315
multivalue=True,
268-
sources=[
269-
CatalogDeclarativeReferenceSource(
270-
column=dataset.definition.dataset_reference_source_column,
271-
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
272-
target=CatalogGrainIdentifier(
273-
id=dataset.definition.parent_dataset_reference_attribute_id,
274-
type=CustomFieldType.ATTRIBUTE.value,
275-
),
276-
)
277-
],
316+
sources=parent_reference_sources,
278317
),
279318
]
280319
+ date_references,
@@ -283,21 +322,8 @@ def datasets_to_ldm(
283322
facts=facts,
284323
data_source_table_id=dataset_source_table_id,
285324
sql=dataset_sql,
286-
workspace_data_filter_columns=[
287-
CatalogDeclarativeWorkspaceDataFilterColumn(
288-
name=dataset.definition.workspace_data_filter_column_name,
289-
data_type=ColumnDataType.STRING.value,
290-
)
291-
],
292-
workspace_data_filter_references=[
293-
CatalogDeclarativeWorkspaceDataFilterReferences(
294-
filter_id=CatalogDatasetWorkspaceDataFilterIdentifier(
295-
id=dataset.definition.workspace_data_filter_id
296-
),
297-
filter_column=dataset.definition.workspace_data_filter_column_name,
298-
filter_column_data_type=ColumnDataType.STRING.value,
299-
)
300-
],
325+
workspace_data_filter_columns=wdf_columns or None,
326+
workspace_data_filter_references=wdf_references or None,
301327
tags=_effective_dataset_tags(dataset.definition),
302328
)
303329
)

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,18 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition":
6262

6363

6464
class CustomDatasetDefinition(BaseModel):
65-
"""Input model for custom dataset definition."""
65+
"""Input model for custom dataset definition.
66+
67+
Reference fields are lists to support composite-key joins to the parent
68+
dataset. ``parent_dataset_reference_attribute_ids``, ``dataset_reference_source_columns``,
69+
and ``dataset_reference_source_column_data_types`` must all have the same
70+
(non-zero) length and are zipped position-by-position to produce one
71+
declarative reference source per column.
72+
73+
Workspace data filter fields are optional. Both must be set together or
74+
both left unset; when set, a single-column WDF binding is emitted on the
75+
declarative dataset.
76+
"""
6677

6778
workspace_id: str
6879
dataset_id: str
@@ -71,11 +82,11 @@ class CustomDatasetDefinition(BaseModel):
7182
dataset_source_table: str | None
7283
dataset_source_sql: str | None
7384
parent_dataset_reference: str
74-
parent_dataset_reference_attribute_id: str
75-
dataset_reference_source_column: str
76-
dataset_reference_source_column_data_type: ColumnDataType
77-
workspace_data_filter_id: str
78-
workspace_data_filter_column_name: str
85+
parent_dataset_reference_attribute_ids: list[str]
86+
dataset_reference_source_columns: list[str]
87+
dataset_reference_source_column_data_types: list[ColumnDataType]
88+
workspace_data_filter_id: str | None = None
89+
workspace_data_filter_column_name: str | None = None
7990
dataset_description: str | None = Field(
8091
default=None,
8192
description="Declarative description on the custom dataset.",
@@ -98,6 +109,36 @@ def check_source(self) -> "CustomDatasetDefinition":
98109
)
99110
return self
100111

112+
@model_validator(mode="after")
def check_reference_lists(self) -> "CustomDatasetDefinition":
    """Validate that the three parallel reference lists line up.

    ``parent_dataset_reference_attribute_ids``,
    ``dataset_reference_source_columns`` and
    ``dataset_reference_source_column_data_types`` are consumed in lockstep
    (one reference source per position), so they must be non-empty and of
    equal length.

    Returns:
        The validated model instance, unchanged.

    Raises:
        ValueError: If ``dataset_reference_source_columns`` is empty, or if
            the three lists do not all have the same length.
    """
    lengths = {
        "parent_dataset_reference_attribute_ids": len(
            self.parent_dataset_reference_attribute_ids
        ),
        "dataset_reference_source_columns": len(
            self.dataset_reference_source_columns
        ),
        "dataset_reference_source_column_data_types": len(
            self.dataset_reference_source_column_data_types
        ),
    }
    if lengths["dataset_reference_source_columns"] == 0:
        raise ValueError(
            "dataset_reference_source_columns must contain at least one column"
        )
    if len(set(lengths.values())) != 1:
        # Report the actual lengths so the caller can see which list is off.
        found = ", ".join(f"{name}={count}" for name, count in lengths.items())
        raise ValueError(
            "parent_dataset_reference_attribute_ids, dataset_reference_source_columns, "
            "and dataset_reference_source_column_data_types must have the same length "
            f"(got {found})"
        )
    return self
129+
130+
@model_validator(mode="after")
def check_wdf_pair(self) -> "CustomDatasetDefinition":
    """Ensure the workspace data filter id and column name come as a pair.

    Returns:
        The validated model instance, unchanged.

    Raises:
        ValueError: If exactly one of ``workspace_data_filter_id`` and
            ``workspace_data_filter_column_name`` is ``None``.
    """
    id_missing = self.workspace_data_filter_id is None
    column_missing = self.workspace_data_filter_column_name is None
    # Valid states: both present, or both absent.
    if id_missing == column_missing:
        return self
    raise ValueError(
        "workspace_data_filter_id and workspace_data_filter_column_name "
        "must both be set or both be omitted"
    )
141+
101142

102143
class CustomDataset(BaseModel):
103144
"""Custom dataset with its definition and custom fields."""

packages/gooddata-pipelines/tests/test_ldm_extension/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ def mock_dataset_definition():
5959
dataset_datasource_id="ds_source",
6060
dataset_source_sql=None,
6161
parent_dataset_reference="parent_ds",
62-
parent_dataset_reference_attribute_id="parent_attr",
63-
dataset_reference_source_column="ref_col",
64-
dataset_reference_source_column_data_type=ColumnDataType.STRING,
62+
parent_dataset_reference_attribute_ids=["parent_attr"],
63+
dataset_reference_source_columns=["ref_col"],
64+
dataset_reference_source_column_data_types=[ColumnDataType.STRING],
6565
workspace_data_filter_id="wdf1",
6666
workspace_data_filter_column_name="col1",
6767
)

packages/gooddata-pipelines/tests/test_ldm_extension/test_input_validator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ def valid_dataset_definitions():
2525
dataset_source_table="table1",
2626
dataset_source_sql=None,
2727
parent_dataset_reference="parent1",
28-
parent_dataset_reference_attribute_id="parent1.id",
29-
dataset_reference_source_column="id",
30-
dataset_reference_source_column_data_type=ColumnDataType.STRING,
28+
parent_dataset_reference_attribute_ids=["parent1.id"],
29+
dataset_reference_source_columns=["id"],
30+
dataset_reference_source_column_data_types=[ColumnDataType.STRING],
3131
workspace_data_filter_id="wdf1",
3232
workspace_data_filter_column_name="id",
3333
),
@@ -39,9 +39,9 @@ def valid_dataset_definitions():
3939
dataset_source_table="table2",
4040
dataset_source_sql=None,
4141
parent_dataset_reference="parent2",
42-
parent_dataset_reference_attribute_id="parent2.id",
43-
dataset_reference_source_column="id",
44-
dataset_reference_source_column_data_type=ColumnDataType.INT,
42+
parent_dataset_reference_attribute_ids=["parent2.id"],
43+
dataset_reference_source_columns=["id"],
44+
dataset_reference_source_column_data_types=[ColumnDataType.INT],
4545
workspace_data_filter_id="wdf2",
4646
workspace_data_filter_column_name="id",
4747
),

packages/gooddata-pipelines/tests/test_ldm_extension/test_merge_ldm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,9 @@ def test_merge_removes_managed_dataset_not_in_input():
6767
dataset_source_table="table1",
6868
dataset_source_sql=None,
6969
parent_dataset_reference="parent_ds",
70-
parent_dataset_reference_attribute_id="parent_attr",
71-
dataset_reference_source_column="ref_col",
72-
dataset_reference_source_column_data_type=ColumnDataType.STRING,
70+
parent_dataset_reference_attribute_ids=["parent_attr"],
71+
dataset_reference_source_columns=["ref_col"],
72+
dataset_reference_source_column_data_types=[ColumnDataType.STRING],
7373
workspace_data_filter_id="wdf1",
7474
workspace_data_filter_column_name="col1",
7575
)

packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ def make_valid_dataset_def(**kwargs):
3434
"dataset_source_table": "table1",
3535
"dataset_source_sql": None,
3636
"parent_dataset_reference": "parent_ds",
37-
"parent_dataset_reference_attribute_id": "parent_attr",
38-
"dataset_reference_source_column": "src_col",
39-
"dataset_reference_source_column_data_type": ColumnDataType.STRING,
37+
"parent_dataset_reference_attribute_ids": ["parent_attr"],
38+
"dataset_reference_source_columns": ["src_col"],
39+
"dataset_reference_source_column_data_types": [ColumnDataType.STRING],
4040
"workspace_data_filter_id": "wdf1",
4141
"workspace_data_filter_column_name": "col1",
4242
}
@@ -100,3 +100,66 @@ def test_custom_dataset_model():
100100
assert dataset.definition.dataset_id == "ds1"
101101
assert len(dataset.custom_fields) == 1
102102
assert dataset.custom_fields[0].custom_field_id == "cf1"
103+
104+
105+
def test_custom_dataset_definition_composite_reference():
    """Two-column reference: equally long parallel lists pass validation."""
    overrides = {
        "parent_dataset_reference_attribute_ids": ["parent_pk1", "parent_pk2"],
        "dataset_reference_source_columns": ["src_col1", "src_col2"],
        "dataset_reference_source_column_data_types": [
            ColumnDataType.STRING,
            ColumnDataType.INT,
        ],
    }
    definition = CustomDatasetDefinition(**make_valid_dataset_def(**overrides))
    assert len(definition.dataset_reference_source_columns) == 2
117+
118+
119+
def test_custom_dataset_definition_mismatched_reference_lengths_raises():
    """Reference lists of differing lengths are rejected by the model."""
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_ids=["a", "b"],
        dataset_reference_source_columns=["c"],
        dataset_reference_source_column_data_types=[ColumnDataType.STRING],
    )
    # `match` searches the stringified exception for the expected fragment.
    with pytest.raises(ValidationError, match="must have the same length"):
        CustomDatasetDefinition(**payload)
128+
129+
130+
def test_custom_dataset_definition_empty_reference_columns_raises():
    """Empty reference lists are rejected: at least one column is required."""
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_ids=[],
        dataset_reference_source_columns=[],
        dataset_reference_source_column_data_types=[],
    )
    # `match` searches the stringified exception for the expected fragment.
    with pytest.raises(ValidationError, match="at least one column"):
        CustomDatasetDefinition(**payload)
139+
140+
141+
def test_custom_dataset_definition_wdf_optional_both_none():
    """Omitting both workspace data filter fields is a valid configuration."""
    payload = make_valid_dataset_def(
        workspace_data_filter_id=None,
        workspace_data_filter_column_name=None,
    )
    definition = CustomDatasetDefinition(**payload)
    assert definition.workspace_data_filter_id is None
    assert definition.workspace_data_filter_column_name is None
148+
149+
150+
def test_custom_dataset_definition_wdf_only_id_raises():
    """A filter id without a filter column name is rejected."""
    payload = make_valid_dataset_def(
        workspace_data_filter_id="wdf1",
        workspace_data_filter_column_name=None,
    )
    # `match` searches the stringified exception for the expected fragment.
    with pytest.raises(ValidationError, match="both be set or both be omitted"):
        CustomDatasetDefinition(**payload)
157+
158+
159+
def test_custom_dataset_definition_wdf_only_column_raises():
    """A filter column name without a filter id is rejected."""
    payload = make_valid_dataset_def(
        workspace_data_filter_id=None,
        workspace_data_filter_column_name="col1",
    )
    # `match` searches the stringified exception for the expected fragment.
    with pytest.raises(ValidationError, match="both be set or both be omitted"):
        CustomDatasetDefinition(**payload)

0 commit comments

Comments
 (0)