Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5fcd3de
error handling and removed if conditional
SFJohnson24 Mar 16, 2026
7772a52
added test
SFJohnson24 Mar 16, 2026
ff69c53
merge main
SFJohnson24 Mar 20, 2026
8b6c074
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Mar 20, 2026
9ac0214
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Mar 31, 2026
88f68db
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Apr 6, 2026
13af981
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Apr 8, 2026
b2c4e7f
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Apr 8, 2026
4a5fa2c
tables to datasets.csv
SFJohnson24 Apr 9, 2026
0680c7a
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engine
SFJohnson24 Apr 9, 2026
bee1ebf
Merge branch 'main' into variables.csv
SFJohnson24 Apr 9, 2026
acfdb15
Merge branch 'main' into variables.csv
SFJohnson24 Apr 17, 2026
f1d8b32
Merge branch 'main' into variables.csv
SFJohnson24 Apr 17, 2026
cca4560
data_processor main
SFJohnson24 Apr 17, 2026
e13940c
Merge branch 'variables.csv' of https://github.com/cdisc-org/cdisc-ru…
SFJohnson24 Apr 17, 2026
02232f8
Merge branch 'main' into variables.csv
SFJohnson24 Apr 21, 2026
d3197b0
metadata filenames
SFJohnson24 Apr 21, 2026
0343ce9
regression test
SFJohnson24 Apr 21, 2026
b5db496
Merge branch 'main' into variables.csv
SFJohnson24 Apr 22, 2026
24a1029
Merge branch 'main' into variables.csv
SFJohnson24 Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,9 @@ This will show the list of validation options.
-jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times
-e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc.
-ft, --filetype TEXT File extension to filter datasets. Has higher priority than --dataset-path parameter.
-vcp, --variables-csv-path Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders.
Not required if variables.txt exists in all -dp directories.
-tcp, --tables-csv-path Path to tables.csv. Required when multiple dataset paths are provided and refer to different folders.
-vcp, --variables-csv-path Path to _variables.csv. Used when multiple dataset paths are provided and refer to different folders.
Not required if _variables.csv exists in all -dp directories.
-dcp, --datasets-csv-path Path to _datasets.csv. Required when multiple dataset paths are provided and refer to different folders.
--help Show this message and exit.
```

Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/models/validation_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@
"max_errors_per_rule",
"encoding",
"variables_csv_path",
"tables_csv_path",
"datasets_csv_path",
],
)
22 changes: 11 additions & 11 deletions cdisc_rules_engine/services/csv_metadata_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(
file_name: str,
encoding: str = DEFAULT_ENCODING,
variables_csv_path: str = None,
tables_csv_path: str = None,
datasets_csv_path: str = None,
**kwargs,
):
self.file_path = file_path
Expand All @@ -23,12 +23,12 @@ def __init__(
self.variables_csv_path = (
Path(variables_csv_path)
if variables_csv_path
else Path(self.file_path).parent / "variables.csv"
else Path(self.file_path).parent / "_variables.csv"
)
self.tables_csv_path = (
Path(tables_csv_path)
if tables_csv_path
else Path(self.file_path).parent / "tables.csv"
self.datasets_csv_path = (
Path(datasets_csv_path)
if datasets_csv_path
else Path(self.file_path).parent / "_datasets.csv"
)

def read(self) -> dict:
Expand Down Expand Up @@ -111,11 +111,11 @@ def __get_variable_metadata(
def __dataset_label(self) -> dict:
logger = logging.getLogger("validator")

if not self.tables_csv_path.exists():
if not self.datasets_csv_path.exists():
return {}

try:
tables_df = pd.read_csv(self.tables_csv_path, encoding=self.encoding)
datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
except (UnicodeDecodeError, UnicodeError) as e:
logger.error(
f"\n Error reading CSV from: {self.file_path}"
Expand All @@ -127,15 +127,15 @@ def __dataset_label(self) -> dict:
logger.error("Error reading CSV file %s. %s", self.file_path, e)
return {}

if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
return {}

tables_df["dataset"] = tables_df["Filename"].apply(
datasets_df["dataset"] = datasets_df["Filename"].apply(
lambda x: Path(str(x)).stem.lower()
)

current_dataset = Path(self.file_name).stem.lower()
match = tables_df[tables_df["dataset"] == current_dataset]
match = datasets_df[datasets_df["dataset"] == current_dataset]

if match.empty:
return {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(
max_dataset_size: int = 0,
encoding: str = None,
variables_csv_path: str = None,
tables_csv_path=None,
datasets_csv_path=None,
):
if config.getValue("DATA_SERVICE_TYPE"):
self.data_service_name = config.getValue("DATA_SERVICE_TYPE")
Expand All @@ -56,7 +56,7 @@ def __init__(
self.max_dataset_size = max_dataset_size
self.encoding = encoding
self.variables_csv_path = variables_csv_path
self.tables_csv_path = tables_csv_path
self.datasets_csv_path = datasets_csv_path
self.dataset_size_threshold = self.config.get_dataset_size_threshold()

def get_data_service(
Expand Down Expand Up @@ -103,7 +103,7 @@ def get_data_service(
dataset_implementation=self.get_dataset_implementation(),
encoding=self.encoding,
variables_csv_path=self.variables_csv_path,
tables_csv_path=self.tables_csv_path,
datasets_csv_path=self.datasets_csv_path,
)

def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(
self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", [])
self.encoding: str = kwargs.get("encoding")
self.variables_csv_path: str = kwargs.get("variables_csv_path")
self.tables_csv_path: str = kwargs.get("tables_csv_path")
self.datasets_csv_path: str = kwargs.get("datasets_csv_path")

@classmethod
def get_instance(
Expand Down Expand Up @@ -215,7 +215,7 @@ def read_metadata(
file_name,
encoding=self.encoding,
variables_csv_path=self.variables_csv_path,
tables_csv_path=self.tables_csv_path,
datasets_csv_path=self.datasets_csv_path,
).read()
return {
"file_metadata": file_metadata,
Expand Down Expand Up @@ -252,7 +252,7 @@ def get_datasets(self) -> List[dict]:
dataset_metadata = self.get_raw_dataset_metadata(
dataset_name=dataset_path,
variables_csv_path=self.variables_csv_path,
tables_csv_path=self.tables_csv_path,
datasets_csv_path=self.datasets_csv_path,
)
datasets.append(dataset_metadata)
except InvalidDatasetFormat:
Expand Down
44 changes: 23 additions & 21 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,38 +110,40 @@ def _validate_csv_data_paths(
dataset_paths: list[str], encoding: str = DEFAULT_ENCODING
) -> list[str]:
"""
Filters dataset paths based on tables.csv content.
Filters dataset paths based on _datasets.csv content.

Raises InvalidCSVFile error if there are no proper tables.csv files in provided path.
Raises InvalidCSVFile error if there are no proper _datasets.csv files in provided path.

Keeps only datasets listed in tables.csv (Filename column).
Always excludes tables.csv and variables.csv from result.
Keeps only datasets listed in _datasets.csv (Filename column).
Always excludes _datasets.csv and _variables.csv from result.
"""
import pandas as pd

paths = [Path(p) for p in dataset_paths]

tables_path = list({p for p in paths if p.name.lower() == "tables.csv"})
if len(tables_path) > 1:
raise InvalidCSVFile("There is more than one tables.csv file in provided path.")
elif len(tables_path) == 0:
raise InvalidCSVFile("There is no tables.csv file in provided path.")
datasets_path = list({p for p in paths if p.name.lower() == "_datasets.csv"})
if len(datasets_path) > 1:
raise InvalidCSVFile(
"There is more than one _datasets.csv file in provided path."
)
elif len(datasets_path) == 0:
raise InvalidCSVFile("There is no _datasets.csv file in provided path.")
else:
tables_path = tables_path[0]
datasets_path = datasets_path[0]

dataset_files = [
p for p in paths if p.name.lower() not in ("tables.csv", "variables.csv")
p for p in paths if p.name.lower() not in ("_datasets.csv", "_variables.csv")
]

tables_df = pd.read_csv(tables_path, encoding=encoding)
datasets_df = pd.read_csv(datasets_path, encoding=encoding)

if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
raise InvalidCSVFile(
"Metadata files is malformed. One of [Filename, Label] columns is missing."
)

allowed_datasets = {
Path(str(name)).stem.lower() for name in tables_df["Filename"].dropna()
Path(str(name)).stem.lower() for name in datasets_df["Filename"].dropna()
}

filtered = {
Expand Down Expand Up @@ -235,7 +237,7 @@ def _validate_dataset_paths(
[
str(p)
for p in dp_path.parent.glob("*")
if p.is_file() and p.name in {"tables.csv", "variables.csv"}
if p.is_file() and p.name in {"datasets.csv", "variables.csv"}
]
)
try:
Expand Down Expand Up @@ -536,13 +538,13 @@ def load_custom_dotenv_from_data_options(ctx, param, value):
"-vcp",
"--variables-csv-path",
required=False,
help="Path to variables.csv",
help="Path to _variables.csv",
)
@click.option(
"-tcp",
"--tables-csv-path",
"-dcp",
"--datasets-csv-path",
required=False,
help="Path to tables.csv",
help="Path to _datasets.csv",
)
@click.pass_context
def validate( # noqa
Expand Down Expand Up @@ -583,7 +585,7 @@ def validate( # noqa
max_errors_per_rule: tuple[int, bool],
encoding: str,
variables_csv_path: str,
tables_csv_path: str,
datasets_csv_path: str,
):
"""
Validate data using CDISC Rules Engine
Expand Down Expand Up @@ -692,7 +694,7 @@ def validate( # noqa
max_errors_per_rule,
encoding,
variables_csv_path,
tables_csv_path,
datasets_csv_path,
)
)

Expand Down
8 changes: 6 additions & 2 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@ CDISC_LIBRARY_API_KEY=your_api_key_here
DATASET_SIZE_THRESHOLD=10485760 # max dataset size in bytes to force dask implementation
MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report. Defaults to 10000.
MAX_ERRORS_PER_RULE = (10, True) # Tuple of (maximum number of errors to report per rule during a validation run, per-dataset flag). The second, boolean value is the per-dataset flag described in the README. The value shown is an example.
DEFINE_XML
DATA_DIR
DEFINE_XML = define.xml path
CT = controlled terminology package
PRODUCT= standard
VERSION= version, denoted with a dash, e.g. 3-4
SUBSTANDARD= TIG substandard
USE_CASE= TIG use case
2 changes: 1 addition & 1 deletion scripts/run_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def run_validation(args: Validation_args):
library_metadata=library_metadata,
encoding=args.encoding,
variables_csv_path=args.variables_csv_path,
tables_csv_path=args.tables_csv_path,
datasets_csv_path=args.datasets_csv_path,
).get_data_service(args.dataset_paths)
# install dictionaries if needed
dictionary_versions = fill_cache_with_dictionaries(
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/CoreIssue1558/datasets/_datasets.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Filename,Label
pp,Pharmacokinetics Parameters
dm,Demographics
lb,Some Description
13 changes: 13 additions & 0 deletions tests/resources/CoreIssue1558/datasets/_variables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
dataset,variable,label,type,length
dm,STUDYID,Study Identifier,Char,200
dm,DOMAIN,Domain Abbreviation,Char,2
dm,USUBJID,Unique Subject Identifier,Char,200
dm,SUBJID,Subject Identifier for the Study,Char,40
dm,RFSTDTC,Subject Reference Start Date/Time,Char,20
pp,STUDYID,Study Identifier,Char,200
pp,DOMAIN,Domain Abbreviation,Char,2
pp,USUBJID,Unique Subject Identifier,Char,200
pp,PPSEQ,Sequence Number,Num,8
pp,PPGRPID,Group ID,Char,40
pp,PPTESTCD,Parameter Short Name,Char,8
pp,PPTEST,Parameter Name,Char,40
4 changes: 0 additions & 4 deletions tests/resources/CoreIssue1558/datasets/tables.csv

This file was deleted.

13 changes: 0 additions & 13 deletions tests/resources/CoreIssue1558/datasets/variables.csv

This file was deleted.

Loading
Loading