Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 9 additions & 23 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
)
from cdisc_rules_engine.enums.dataset_title_case import DatasetTitleCase
from cdisc_rules_engine.constants import NULL_FLAVORS
from cdisc_rules_engine.utilities.utils import dates_overlap, parse_date
from cdisc_rules_engine.utilities.utils import (
dates_overlap,
parse_date,
custom_str_conversion,
)
import numpy as np
import dask.dataframe as dd
import pandas as pd
Expand Down Expand Up @@ -98,24 +102,6 @@ def _regex_str_conversion(self, x):
return f"{x:.0f}" if x.is_integer() else str(x).strip()
return x

def _custom_str_conversion(self, x):
    """Normalize a scalar to a canonical string for type-insensitive comparison.

    Used to normalize numeric representations, i.e. treat 200.00 as 200 for
    comparisons.

    Args:
        x: The value to normalize (str, int, float, or a missing value).

    Returns:
        - str input that parses as a number: the integer spelling when the
          value is integral, otherwise ``str(float(x))``.
        - any other str input: the input with surrounding whitespace removed.
        - int / float input: its string form (integral floats lose the
          trailing ``.0``).
        - missing values (per ``pd.notna``) and any other type: ``x`` unchanged.
    """
    if pd.notna(x):
        if isinstance(x, str):
            # Parse as an exact integer first. Going through float() rounds
            # integers above 2**53, which made e.g. "9007199254740993" and the
            # int 9007199254740993 normalize to different strings.
            try:
                return str(int(x))
            except (ValueError, TypeError):
                pass
            try:
                float_val = float(x)
            except (ValueError, TypeError):
                # Not numeric at all: only trim whitespace.
                return x.strip()
            if float_val.is_integer():
                return str(int(float_val))
            return str(float_val)
        elif isinstance(x, int):
            return str(x)
        elif isinstance(x, float):
            return f"{x:.0f}" if x.is_integer() else str(x)
    return x

def convert_string_data_to_lower(self, data):
if self.value.is_series(data):
data = data.str.lower()
Expand Down Expand Up @@ -239,8 +225,8 @@ def _check_equality(
if round_values:
target_val, comparison_val = apply_rounding(target_val, comparison_val)
if type_insensitive:
target_val = self._custom_str_conversion(target_val)
comparison_val = self._custom_str_conversion(comparison_val)
target_val = custom_str_conversion(target_val)
comparison_val = custom_str_conversion(comparison_val)
if case_insensitive:
target_val = target_val.lower() if target_val else None
comparison_val = comparison_val.lower() if comparison_val else None
Expand Down Expand Up @@ -286,8 +272,8 @@ def _check_inequality(
if round_values:
target_val, comparison_val = apply_rounding(target_val, comparison_val)
if type_insensitive:
target_val = self._custom_str_conversion(target_val)
comparison_val = self._custom_str_conversion(comparison_val)
target_val = custom_str_conversion(target_val)
comparison_val = custom_str_conversion(comparison_val)
if case_insensitive:
target_val = target_val.lower() if target_val else None
comparison_val = comparison_val.lower() if comparison_val else None
Expand Down
21 changes: 18 additions & 3 deletions cdisc_rules_engine/utilities/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError
from cdisc_rules_engine.utilities.utils import (
search_in_list_of_dicts,
custom_str_conversion,
)
from cdisc_rules_engine.utilities.sdtm_utilities import add_variable_wildcards

Expand Down Expand Up @@ -213,6 +214,8 @@ def merge_pivot_supp_dataset(
if len(unique_idvar_values) == 1:
right_dataset = DataProcessor.process_supp(right_dataset)
dynamic_key = right_dataset["IDVAR"].iloc[0]
temp_key = f"{dynamic_key}__norm"

is_blank: bool = pd.isna(dynamic_key) or str(dynamic_key).strip() == ""
# Determine the common keys present in both datasets
common_keys = [
Expand All @@ -221,11 +224,19 @@ def merge_pivot_supp_dataset(
if key in left_dataset.columns and key in right_dataset.columns
]
if not is_blank:
left_dataset[temp_key] = left_dataset[dynamic_key]

common_keys.append(dynamic_key)
current_supp = right_dataset.rename(columns={"IDVARVAL": dynamic_key})
current_supp = current_supp.drop(columns=["IDVAR"])
left_dataset[dynamic_key] = left_dataset[dynamic_key].astype(str)
current_supp[dynamic_key] = current_supp[dynamic_key].astype(str)

if pd.api.types.is_numeric_dtype(left_dataset[dynamic_key]):
left_dataset[dynamic_key] = left_dataset[dynamic_key].apply(
custom_str_conversion
)
current_supp[dynamic_key] = current_supp[dynamic_key].apply(
custom_str_conversion
)
else:
columns_to_drop = [
col for col in ["IDVAR", "IDVARVAL"] if col in right_dataset.columns
Expand All @@ -242,7 +253,7 @@ def merge_pivot_supp_dataset(
DataProcessor._validate_qnam_dask(left_dataset, qnam_list, common_keys)
else:
left_dataset = PandasDataset(
pd.merge(
pd.merge( # noqa
left_dataset.data,
current_supp.data,
how="left",
Expand All @@ -251,6 +262,9 @@ def merge_pivot_supp_dataset(
)
)
DataProcessor._validate_qnam(left_dataset.data, qnam_list, common_keys)
if not is_blank:
left_dataset[dynamic_key] = left_dataset[temp_key]
left_dataset = left_dataset.drop(columns=[temp_key])
else:
if dataset_implementation == DaskDataset:
left_dataset = PandasDataset(left_dataset.data.compute())
Expand All @@ -263,6 +277,7 @@ def merge_pivot_supp_dataset(
left_dataset = DataProcessor._merge_supp_with_multiple_idvars(
left_dataset, right_dataset, static_keys, qnam_list
)

return left_dataset

@staticmethod
Expand Down
19 changes: 19 additions & 0 deletions cdisc_rules_engine/utilities/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,3 +468,22 @@ def load_json_with_optional_encoding(path: str, encoding: str | None = None) ->
tried_msg = ", ".join(enc for enc, _ in tried)

raise ValueError(f"Unable to load JSON file '{path}'. Tried encodings: {tried_msg}")


def custom_str_conversion(x):
    """Normalize a scalar to a canonical string so numeric spellings compare equal.

    Used to normalize numeric representations, i.e. treat 200.00 as 200 for
    comparisons: ``"200.00"``, ``"200"``, ``200`` and ``200.0`` all become
    ``"200"``.

    Args:
        x: The value to normalize (str, int, float, or a missing value).

    Returns:
        - str input that parses as a number: the integer spelling when the
          value is integral, otherwise ``str(float(x))``.
        - any other str input: the input with surrounding whitespace removed.
        - int / float input: its string form (integral floats lose the
          trailing ``.0``).
        - missing values (per ``pd.notna``) and any other type: ``x`` unchanged.
    """
    if not pd.notna(x):
        return x

    if isinstance(x, str):
        # Exact integer parse first: float() cannot represent every integer
        # above 2**53, so "9007199254740993" used to come back as
        # "9007199254740992" and stop matching the equivalent int value.
        try:
            return str(int(x))
        except (ValueError, TypeError):
            pass
        try:
            float_val = float(x)
        except (ValueError, TypeError):
            # Not numeric at all: only trim whitespace.
            return x.strip()
        # Covers spellings int() rejects, such as "200.00" or "1e3".
        if float_val.is_integer():
            return str(int(float_val))
        return str(float_val)

    if isinstance(x, int):
        return str(x)

    if isinstance(x, float):
        return f"{x:.0f}" if x.is_integer() else str(x)

    return x
93 changes: 93 additions & 0 deletions tests/unit/test_merge_supp_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,99 @@ def dummy_func(dataset_name, **kwargs):
" length of the merged dataset should match the parent dataset."


@patch.object(LocalDataService, "check_filepath", return_value=False)
@patch.object(LocalDataService, "_async_get_datasets")
@pytest.mark.parametrize(
    "a_parent, id_var_val, expected_dataset",
    [
        # Integral floats in the parent must match the string IDVARVAL
        # spellings "1", "2", "3" in the supp dataset.
        (
            [1.0, 2.0, 3.0],
            ["1", "2", "3"],
            pd.DataFrame(
                {
                    "STUDYID": [1, 2, 3],
                    "USUBJID": [101, 102, 103],
                    "APID": [201, 202, 203],
                    "POOLID": [301, 302, 303],
                    "SPDEVID": [401, 402, 403],
                    "A": [1.0, 2.0, 3.0],
                    "X": [10, pd.NA, pd.NA],
                    "Y": [pd.NA, 20, pd.NA],
                    "Z": [pd.NA, pd.NA, 30],
                }
            ),
        ),
        # Non-integral floats match only on equal values: 3.3 vs "3" must
        # not merge, so Z stays empty for every row.
        (
            [1.1, 2.2, 3.3],
            ["1.1", "2.2", "3"],
            pd.DataFrame(
                {
                    "STUDYID": [1, 2, 3],
                    "USUBJID": [101, 102, 103],
                    "APID": [201, 202, 203],
                    "POOLID": [301, 302, 303],
                    "SPDEVID": [401, 402, 403],
                    "A": [1.1, 2.2, 3.3],
                    "X": [10, pd.NA, pd.NA],
                    "Y": [pd.NA, 20, pd.NA],
                    "Z": [pd.NA, pd.NA, pd.NA],
                }
            ),
        ),
    ],
)
def test_merge_supp_str_float(
    mock_async_get_datasets,
    mock_check_filepath,
    data_service,
    a_parent,
    id_var_val,
    expected_dataset,
):
    """Merge a supp dataset whose IDVARVAL is a string onto a float parent column.

    The expected frames keep the parent's original float values in column "A"
    and add one column per QNAM ("X", "Y", "Z").
    """
    # Setup example datasets
    parent_dataset = PandasDataset(
        pd.DataFrame(
            {
                "STUDYID": [1, 2, 3],
                "USUBJID": [101, 102, 103],
                "APID": [201, 202, 203],
                "POOLID": [301, 302, 303],
                "SPDEVID": [401, 402, 403],
                "A": a_parent,
            }
        )
    )

    supp_dataset = PandasDataset(
        pd.DataFrame(
            {
                "STUDYID": [1, 2, 3],
                "USUBJID": [101, 102, 103],
                "APID": [201, 202, 203],
                "POOLID": [301, 302, 303],
                "SPDEVID": [401, 402, 403],
                "IDVAR": ["A", "A", "A"],
                "IDVARVAL": id_var_val,
                "QNAM": ["X", "Y", "Z"],
                "QVAL": [10, 20, 30],
                "QLABEL": ["Label1", "Label2", "Label3"],
            }
        )
    )

    mock_async_get_datasets.return_value = [parent_dataset, supp_dataset]

    merged_dataset = DataProcessor.merge_pivot_supp_dataset(
        data_service.dataset_implementation, parent_dataset, supp_dataset
    )
    expected_dataset = PandasDataset(expected_dataset)
    pdt.assert_frame_equal(
        merged_dataset.data, expected_dataset.data, check_dtype=False
    )
    # The message is parenthesized so the whole sentence belongs to the assert;
    # previously only "The" was attached and the rest was a no-op string
    # expression on the following line.
    assert len(merged_dataset.data) == len(parent_dataset.data), (
        "The length of the merged dataset should match the parent dataset."
    )


@patch.object(LocalDataService, "check_filepath", return_value=False)
@patch.object(LocalDataService, "_async_get_datasets")
def test_merge_supp_dataset_multi_idvar(mock_async_get_datasets, data_service):
Expand Down
3 changes: 3 additions & 0 deletions tests/unit/test_utilities/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ def test_merge_pivot_supp_dataset_single_idvar(dataset_implementation):
assert "AESPID" in merged_df.columns, "AESPID column should be created from QNAM"
assert "QNAM" not in merged_df.columns, "QNAM should be dropped after pivot"
assert "QVAL" not in merged_df.columns, "QVAL should be dropped after pivot"

# we keep original "AESEQ" column which has int64 type
result_data["AESEQ"] = result_data["AESEQ"].astype(str)
assert result_data[result_data["AESEQ"] == "1"]["AESPID"].values[0] == "SP001"
assert result_data[result_data["AESEQ"] == "2"]["AESPID"].values[0] == "SP002"
assert result_data[result_data["AESEQ"] == "3"]["AESPID"].values[0] == "SP003"
Expand Down
Loading