cdisc-org · SFJohnson24 · Apr 23, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 20, 2026
diff --git a/README.md b/README.md
@@ -245,9 +245,9 @@ This will show the list of validation options.
   -jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times
   -e, --encoding TEXT             File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc.
   -ft, --filetype TEXT            File extension to filter datasets. Has higher priority than --dataset-path parameter.
-  -vcp, --variables-csv-path      Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders.
-                                    Not required if variables.txt exists in all -dp directories.
-  -tcp, --tables-csv-path         Path to tables.csv. Required when multiple dataset paths are provided and refer to different folders.
+  -vcp, --variables-csv-path      Path to _variables.csv. Used when multiple dataset paths are provided and refer to different folders.
+                                  Not required if _variables.csv exists in all -dp directories.
+  -dcp, --datasets-csv-path       Path to _datasets.csv. Required when multiple dataset paths are provided and refer to different folders.
   --help                          Show this message and exit.
 ```
 

diff --git a/cdisc_rules_engine/models/validation_args.py b/cdisc_rules_engine/models/validation_args.py
@@ -29,6 +29,6 @@
         "max_errors_per_rule",
         "encoding",
         "variables_csv_path",
-        "tables_csv_path",
+        "datasets_csv_path",
     ],
 )
diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py
@@ -14,7 +14,7 @@ def __init__(
         file_name: str,
         encoding: str = DEFAULT_ENCODING,
         variables_csv_path: str = None,
-        tables_csv_path: str = None,
+        datasets_csv_path: str = None,
         **kwargs,
     ):
         self.file_path = file_path
@@ -23,12 +23,12 @@ def __init__(
         self.variables_csv_path = (
             Path(variables_csv_path)
             if variables_csv_path
-            else Path(self.file_path).parent / "variables.csv"
+            else Path(self.file_path).parent / "_variables.csv"
         )
-        self.tables_csv_path = (
-            Path(tables_csv_path)
-            if tables_csv_path
-            else Path(self.file_path).parent / "tables.csv"
+        self.datasets_csv_path = (
+            Path(datasets_csv_path)
+            if datasets_csv_path
+            else Path(self.file_path).parent / "_datasets.csv"
         )
 
     def read(self) -> dict:
@@ -111,11 +111,11 @@ def __get_variable_metadata(
     def __dataset_label(self) -> dict:
         logger = logging.getLogger("validator")
 
-        if not self.tables_csv_path.exists():
+        if not self.datasets_csv_path.exists():
             return {}
 
         try:
-            tables_df = pd.read_csv(self.tables_csv_path, encoding=self.encoding)
+            datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
         except (UnicodeDecodeError, UnicodeError) as e:
             logger.error(
                 f"\n  Error reading CSV from: {self.file_path}"
@@ -127,15 +127,15 @@ def __dataset_label(self) -> dict:
             logger.error("Error reading CSV file %s. %s", self.file_path, e)
             return {}
 
-        if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
+        if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
             return {}
 
-        tables_df["dataset"] = tables_df["Filename"].apply(
+        datasets_df["dataset"] = datasets_df["Filename"].apply(
             lambda x: Path(str(x)).stem.lower()
         )
 
         current_dataset = Path(self.file_name).stem.lower()
-        match = tables_df[tables_df["dataset"] == current_dataset]
+        match = datasets_df[datasets_df["dataset"] == current_dataset]
 
         if match.empty:
             return {}

diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py
@@ -39,7 +39,7 @@ def __init__(
         max_dataset_size: int = 0,
         encoding: str = None,
         variables_csv_path: str = None,
-        tables_csv_path=None,
+        datasets_csv_path=None,
     ):
         if config.getValue("DATA_SERVICE_TYPE"):
             self.data_service_name = config.getValue("DATA_SERVICE_TYPE")
@@ -56,7 +56,7 @@ def __init__(
         self.max_dataset_size = max_dataset_size
         self.encoding = encoding
         self.variables_csv_path = variables_csv_path
-        self.tables_csv_path = tables_csv_path
+        self.datasets_csv_path = datasets_csv_path
         self.dataset_size_threshold = self.config.get_dataset_size_threshold()
 
     def get_data_service(
@@ -103,7 +103,7 @@ def get_data_service(
                 dataset_implementation=self.get_dataset_implementation(),
                 encoding=self.encoding,
                 variables_csv_path=self.variables_csv_path,
-                tables_csv_path=self.tables_csv_path,
+                datasets_csv_path=self.datasets_csv_path,
             )
 
     def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface:

diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py
@@ -49,7 +49,7 @@ def __init__(
         self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", [])
         self.encoding: str = kwargs.get("encoding")
         self.variables_csv_path: str = kwargs.get("variables_csv_path")
-        self.tables_csv_path: str = kwargs.get("tables_csv_path")
+        self.datasets_csv_path: str = kwargs.get("datasets_csv_path")
 
     @classmethod
     def get_instance(
@@ -215,7 +215,7 @@ def read_metadata(
             file_name,
             encoding=self.encoding,
             variables_csv_path=self.variables_csv_path,
-            tables_csv_path=self.tables_csv_path,
+            datasets_csv_path=self.datasets_csv_path,
         ).read()
         return {
             "file_metadata": file_metadata,
@@ -252,7 +252,7 @@ def get_datasets(self) -> List[dict]:
                 dataset_metadata = self.get_raw_dataset_metadata(
                     dataset_name=dataset_path,
                     variables_csv_path=self.variables_csv_path,
-                    tables_csv_path=self.tables_csv_path,
+                    datasets_csv_path=self.datasets_csv_path,
                 )
                 datasets.append(dataset_metadata)
             except InvalidDatasetFormat:

diff --git a/core.py b/core.py
@@ -110,38 +110,40 @@ def _validate_csv_data_paths(
     dataset_paths: list[str], encoding: str = DEFAULT_ENCODING
 ) -> list[str]:
     """
-    Filters dataset paths based on tables.csv content.
+    Filters dataset paths based on _datasets.csv content.
 
-    Raises InvalidCSVFile error if there are no proper tables.csv files in provided path.
+    Raises InvalidCSVFile error if there are no proper _datasets.csv files in provided path.
 
-    Keeps only datasets listed in tables.csv (Filename column).
-    Always excludes tables.csv and variables.csv from result.
+    Keeps only datasets listed in _datasets.csv (Filename column).
+    Always excludes _datasets.csv and _variables.csv from result.
     """
     import pandas as pd
 
     paths = [Path(p) for p in dataset_paths]
 
-    tables_path = list({p for p in paths if p.name.lower() == "tables.csv"})
-    if len(tables_path) > 1:
-        raise InvalidCSVFile("There is more than one tables.csv file in provided path.")
-    elif len(tables_path) == 0:
-        raise InvalidCSVFile("There is no tables.csv file in provided path.")
+    datasets_path = list({p for p in paths if p.name.lower() == "_datasets.csv"})
+    if len(datasets_path) > 1:
+        raise InvalidCSVFile(
+            "There is more than one _datasets.csv file in provided path."
+        )
+    elif len(datasets_path) == 0:
+        raise InvalidCSVFile("There is no _datasets.csv file in provided path.")
     else:
-        tables_path = tables_path[0]
+        datasets_path = datasets_path[0]
 
     dataset_files = [
-        p for p in paths if p.name.lower() not in ("tables.csv", "variables.csv")
+        p for p in paths if p.name.lower() not in ("_datasets.csv", "_variables.csv")
     ]
 
-    tables_df = pd.read_csv(tables_path, encoding=encoding)
+    datasets_df = pd.read_csv(datasets_path, encoding=encoding)
 
-    if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
+    if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns:
         raise InvalidCSVFile(
             "Metadata files is malformed. One of [Filename, Label] columns is missing."
         )
 
     allowed_datasets = {
-        Path(str(name)).stem.lower() for name in tables_df["Filename"].dropna()
+        Path(str(name)).stem.lower() for name in datasets_df["Filename"].dropna()
     }
 
     filtered = {
@@ -235,7 +237,7 @@ def _validate_dataset_paths(
                 [
                     str(p)
                     for p in dp_path.parent.glob("*")
-                    if p.is_file() and p.name in {"tables.csv", "variables.csv"}
+                    if p.is_file() and p.name in {"_datasets.csv", "_variables.csv"}
                 ]
             )
         try:
@@ -536,13 +538,13 @@ def load_custom_dotenv_from_data_options(ctx, param, value):
     "-vcp",
     "--variables-csv-path",
     required=False,
-    help="Path to variables.csv",
+    help="Path to _variables.csv",
 )
 @click.option(
-    "-tcp",
-    "--tables-csv-path",
+    "-dcp",
+    "--datasets-csv-path",
     required=False,
-    help="Path to tables.csv",
+    help="Path to _datasets.csv",
 )
 @click.pass_context
 def validate(  # noqa
@@ -583,7 +585,7 @@ def validate(  # noqa
     max_errors_per_rule: tuple[int, bool],
     encoding: str,
     variables_csv_path: str,
-    tables_csv_path: str,
+    datasets_csv_path: str,
 ):
     """
     Validate data using CDISC Rules Engine
@@ -692,7 +694,7 @@ def validate(  # noqa
             max_errors_per_rule,
             encoding,
             variables_csv_path,
-            tables_csv_path,
+            datasets_csv_path,
         )
     )
 

diff --git a/env.example b/env.example
@@ -2,5 +2,9 @@ CDISC_LIBRARY_API_KEY=your_api_key_here
 DATASET_SIZE_THRESHOLD=10485760  # max dataset size in bytes to force dask implementation
 MAX_REPORT_ROWS = 10  # integer for maximum number of issues per excel sheet (plus headers) in result report.  Defaults to 10000.
 MAX_ERRORS_PER_RULE = (10, True)  # Tuple for maximum number of errors to report per rule during a validation run. Also has a per dataset flag described as second bool value in readme. example value 
-DEFINE_XML
-DATA_DIR
+DEFINE_XML = define.xml path
+CT = controlled terminology package
+PRODUCT = standard
+VERSION = version, denoted with a dash e.g. 3-4
+SUBSTANDARD = TIG substandard
+USE_CASE = TIG use case
diff --git a/scripts/run_validation.py b/scripts/run_validation.py
@@ -176,7 +176,7 @@ def run_validation(args: Validation_args):
             library_metadata=library_metadata,
             encoding=args.encoding,
             variables_csv_path=args.variables_csv_path,
-            tables_csv_path=args.tables_csv_path,
+            datasets_csv_path=args.datasets_csv_path,
         ).get_data_service(args.dataset_paths)
         # install dictionaries if needed
         dictionary_versions = fill_cache_with_dictionaries(

diff --git a/tests/resources/CoreIssue1558/datasets/_datasets.csv b/tests/resources/CoreIssue1558/datasets/_datasets.csv
@@ -0,0 +1,4 @@
+Filename,Label
+pp,Pharmacokinetics Parameters
+dm,Demographics
+lb,Some Description
diff --git a/tests/resources/CoreIssue1558/datasets/_variables.csv b/tests/resources/CoreIssue1558/datasets/_variables.csv
@@ -0,0 +1,13 @@
+dataset,variable,label,type,length
+dm,STUDYID,Study Identifier,Char,200
+dm,DOMAIN,Domain Abbreviation,Char,2
+dm,USUBJID,Unique Subject Identifier,Char,200
+dm,SUBJID,Subject Identifier for the Study,Char,40
+dm,RFSTDTC,Subject Reference Start Date/Time,Char,20
+pp,STUDYID,Study Identifier,Char,200
+pp,DOMAIN,Domain Abbreviation,Char,2
+pp,USUBJID,Unique Subject Identifier,Char,200
+pp,PPSEQ,Sequence Number,Num,8
+pp,PPGRPID,Group ID,Char,40
+pp,PPTESTCD,Parameter Short Name,Char,8
+pp,PPTEST,Parameter Name,Char,40
diff --git a/tests/resources/CoreIssue1558/datasets/tables.csv b/tests/resources/CoreIssue1558/datasets/tables.csv
diff --git a/tests/resources/CoreIssue1558/datasets/variables.csv b/tests/resources/CoreIssue1558/datasets/variables.csv