Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions cdisc_rules_engine/utilities/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
DataServiceFactory,
DummyDataService,
)
from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError
from cdisc_rules_engine.utilities.utils import (
search_in_list_of_dicts,
)
Expand Down Expand Up @@ -331,7 +332,7 @@ def _merge_supp_with_multiple_idvars(
validation_keys.append(idvar_for_qnam)
grouped = qnam_check.groupby(validation_keys).size()
if (grouped > 1).any():
raise ValueError(
raise PreprocessingError(
f"Multiple records with the same QNAM '{qnam}' match a single parent record"
)
return result_dataset
Expand All @@ -351,14 +352,20 @@ def process_supp(supp_dataset):
columns_to_drop = [
col for col in ["QNAM", "QVAL", "QLABEL"] if col in supp_dataset.columns
]
if "RDOMAIN" in supp_dataset.columns and supp_dataset["RDOMAIN"][0] == "DM":
excluded_columns = list(supp_dataset["QNAM"].unique()) + columns_to_drop
group_cols = [c for c in supp_dataset.columns if c not in excluded_columns]
supp_dataset = PandasDataset(
supp_dataset.data.groupby(group_cols, dropna=False, as_index=False).agg(
lambda x: (x.dropna().iloc[0] if not x.dropna().empty else pd.NA)
)
excluded_columns = list(supp_dataset["QNAM"].unique()) + columns_to_drop
group_cols = [c for c in supp_dataset.columns if c not in excluded_columns]
grouped = supp_dataset.data.groupby(
group_cols + ["QNAM"], dropna=False, as_index=False
).size()
if (grouped["size"] > 1).any():
raise PreprocessingError(
"Multiple records with the same QNAM match a single parent record"
)
supp_dataset = PandasDataset(
supp_dataset.data.groupby(group_cols, dropna=False, as_index=False).agg(
lambda x: (x.dropna().iloc[0] if not x.dropna().empty else pd.NA)
Comment thread
gerrycampion marked this conversation as resolved.
)
)
if columns_to_drop:
supp_dataset = supp_dataset.drop(labels=columns_to_drop, axis=1)
return supp_dataset
Expand All @@ -377,7 +384,7 @@ def _validate_qnam(
continue
grouped = qnam_check.groupby(common_keys).size()
if (grouped > 1).any():
raise ValueError(
raise PreprocessingError(
f"Multiple records with the same QNAM '{qnam}' match a single parent record"
)

Expand All @@ -397,7 +404,7 @@ def _validate_qnam_dask(
problem_groups = grouped_counts[grouped_counts > 1]
problem_groups_computed = problem_groups.compute()
if len(problem_groups_computed) > 0:
raise ValueError(
raise PreprocessingError(
f"Multiple records with the same QNAM '{qnam}' match a single parent record. "
)

Expand Down
1 change: 0 additions & 1 deletion tests/unit/test_dataset_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from cdisc_rules_engine.models.library_metadata_container import (
LibraryMetadataContainer,
)

from cdisc_rules_engine.models.dataset import PandasDataset


Expand Down
29 changes: 27 additions & 2 deletions tests/unit/test_merge_supp_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from cdisc_rules_engine.utilities.data_processor import DataProcessor
import pandas as pd
import pandas.testing as pdt
from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError


@pytest.fixture
Expand Down Expand Up @@ -55,6 +56,30 @@ def test_process_supp():
assert "QLABEL" not in processed_dataset.data.columns, "'QVAL' should be dropped."


def test_data_processor_suppae_multiple_qnams():
    """Check that process_supp collapses two SUPPAE rows with different QNAMs.

    Both input rows share the same STUDYID, RDOMAIN, USUBJID and empty
    IDVAR/IDVARVAL values and differ only in their qualifier fields. The
    processed dataset is expected to contain a single row in which each
    QNAM appears as its own column holding the corresponding QVAL.
    """
    # Row-oriented fixture; key order matches the intended column order.
    supp_rows = [
        {
            "STUDYID": "CDISCPILOT01",
            "RDOMAIN": "AE",
            "USUBJID": "CDISC008",
            "IDVAR": "",
            "IDVARVAL": "",
            "QNAM": "AESPID",
            "QLABEL": "Sponsor ID",
            "QVAL": "SP001",
            "QORIG": "CRF",
            "QEVAL": "",
        },
        {
            "STUDYID": "CDISCPILOT01",
            "RDOMAIN": "AE",
            "USUBJID": "CDISC008",
            "IDVAR": "",
            "IDVARVAL": "",
            "QNAM": "AEREL2",
            "QLABEL": "Relationship 2",
            "QVAL": "POSSIBLE",
            "QORIG": "CRF",
            "QEVAL": "",
        },
    ]
    supp_input = PandasDataset(pd.DataFrame(supp_rows))
    input_row_count = supp_input.data.shape[0]
    assert input_row_count == 2

    processed = DataProcessor().process_supp(supp_input).data

    # The two qualifier rows must end up on one record.
    output_row_count = processed.shape[0]
    assert output_row_count == 1

    # Each QNAM becomes a column carrying its QVAL.
    expected_values = {"AESPID": "SP001", "AEREL2": "POSSIBLE"}
    assert set(expected_values) <= set(processed.columns)
    for qnam, qval in expected_values.items():
        assert processed.loc[0, qnam] == qval


@patch.object(LocalDataService, "check_filepath", return_value=False)
@patch.object(LocalDataService, "_async_get_datasets")
def test_merge_pivot_supp_dataset(
Expand Down Expand Up @@ -258,7 +283,7 @@ def test_merge_supp_dataset_multi_idvar_aggregation(

@patch.object(LocalDataService, "check_filepath", return_value=False)
@patch.object(LocalDataService, "_async_get_datasets")
def test_merge_supp_dataset_multi_idvar_same_qnam_validation_error(
def test_merge_supp_dataset_same_qnam_validation_error(
mock_async_get_datasets, data_service
):
parent_dataset = PandasDataset(
Expand Down Expand Up @@ -292,7 +317,7 @@ def test_merge_supp_dataset_multi_idvar_same_qnam_validation_error(

mock_async_get_datasets.return_value = [parent_dataset, supp_dataset]

with pytest.raises(ValueError, match="Multiple records with the same QNAM"):
with pytest.raises(PreprocessingError, match="Multiple records with the same QNAM"):
DataProcessor.merge_pivot_supp_dataset(
data_service.dataset_implementation, parent_dataset, supp_dataset
)
Loading