Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add testing for this new meta variable covering edge cases?

Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
import pandas as pd


class VariablesMetadataDatasetBuilder(BaseDatasetBuilder):
Expand All @@ -12,7 +13,128 @@ def build(self):
variable_size
variable_data_type
variable_format
variable_max_size (if needed by the rule)
"""
return self.data_service.get_variables_metadata(
# Get basic variable metadata
variables_metadata = self.data_service.get_variables_metadata(
dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True
)

# Check if the rule requires variable_max_size
if self.rule and self._needs_variable_max_size():
variables_metadata = self._add_variable_max_size(variables_metadata)

return variables_metadata

def _needs_variable_max_size(self):
    """
    Tell whether the current rule refers to variable_max_size anywhere.

    The rule's output_variables are inspected first, then its operations
    (operator field), and finally its conditions; evaluation stops at the
    first source that reports a match.
    """
    if self._check_output_variables_for_variable_max_size():
        return True
    if self._check_operations_for_variable_max_size():
        return True
    rule_conditions = self.rule.get("conditions")
    return self._check_conditions_for_variable_max_size(rule_conditions)

def _check_output_variables_for_variable_max_size(self):
"""Check if output_variables contains variable_max_size."""
output_variables = self.rule.get("output_variables")
if not output_variables:
return False

if isinstance(output_variables, list):
return "variable_max_size" in output_variables
elif isinstance(output_variables, dict):
return "variable_max_size" in output_variables.values()

return False

def _check_operations_for_variable_max_size(self):
"""Check if operations contains variable_max_size operator."""
operations = self.rule.get("operations")
if not operations:
return False

if isinstance(operations, list):
return any(
isinstance(op, dict) and op.get("operator") == "variable_max_size"
for op in operations
)
elif isinstance(operations, dict):
return operations.get("operator") == "variable_max_size"

return False

def _check_conditions_for_variable_max_size(self, conditions):
    """
    Recursively check conditions for variable_max_size references.

    Accepted shapes:
    - a list: every item is checked in turn;
    - any object with a callable ``values`` attribute, i.e. a plain dict
      or (presumably) a ConditionComposite whose ``values()`` yields the
      individual condition dicts -- TODO confirm against that class;
    - a scalar such as a string or None.

    Returns True as soon as one reference is found, False otherwise.
    """
    if isinstance(conditions, list):
        return any(
            self._check_conditions_for_variable_max_size(item)
            for item in conditions
        )

    # A dict exposes .values() as well, so a single branch serves both
    # dicts and ConditionComposite-like objects. A dedicated dict branch
    # placed after a hasattr(..., "values") test could never be reached.
    values = getattr(conditions, "values", None)
    if callable(values):
        return any(self._contains_variable_max_size(value) for value in values())

    # Scalars (e.g. a bare "variable_max_size" string sitting in a list)
    # previously fell through every branch and were never matched.
    return self._contains_variable_max_size(conditions)

def _contains_variable_max_size(self, data):
"""
Check if data contains 'variable_max_size' reference.
Handles strings, dictionaries, and lists.
"""
if data == "variable_max_size":
return True
elif isinstance(data, dict):
for key, value in data.items():
if value == "variable_max_size":
return True
# Recursively check nested structures
if isinstance(value, (dict, list)):
if self._check_conditions_for_variable_max_size(value):
return True
elif isinstance(data, list):
for item in data:
if self._contains_variable_max_size(item):
return True

return False

def _add_variable_max_size(self, variables_metadata):
    """
    Attach a variable_max_size column to the variables metadata.

    For each name under "variable_name", the value is the greatest string
    length among that variable's non-null data values. It is 0 when the
    dataset has no column of that name or the column has no non-null data.
    The metadata object is modified in place and also returned.
    """
    contents = self.data_service.get_dataset(dataset_name=self.dataset_path)

    longest = {}
    for name in variables_metadata.data["variable_name"]:
        if name not in contents.data.columns:
            longest[name] = 0
            continue
        # Null values are ignored; the rest are measured as strings.
        lengths = contents.data[name].dropna().astype(str).str.len()
        observed = lengths.max()
        # max() of an empty selection is NaN, which is reported as 0.
        longest[name] = 0 if pd.isna(observed) else observed

    name_column = variables_metadata.data["variable_name"]
    variables_metadata.data["variable_max_size"] = name_column.map(longest)

    return variables_metadata
4 changes: 4 additions & 0 deletions resources/schema/rule-merged/MetaVariables.json
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@
"const": "variable_label",
"markdownDescription": "\nVariable long label\n"
},
{
"const": "variable_max_size",
"markdownDescription": "\nMaximum length of actual data values in the variable\n"
},
{
"const": "variable_name",
"markdownDescription": "\nVariable short name\n"
Expand Down
2 changes: 1 addition & 1 deletion resources/schema/rule-merged/Rule_Type.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
{
"const": "Variable Metadata Check",
"title": "Content metadata at variable level",
"markdownDescription": "\n#### Columns\n\n- `variable_name`\n- `variable_order_number`\n- `variable_label`\n- `variable_size`\n- `variable_data_type`\n- `variable_format`\n\n#### Rule Macro\n\nChecks variable-level metadata sourced from the submission dataset contents.\n\n#### Example\n\n```yaml\n- name: variable_label\n operator: longer_than\n value: 40\n```\n"
"markdownDescription": "\n#### Columns\n\n- `variable_name`\n- `variable_order_number`\n- `variable_label`\n- `variable_size`\n- `variable_data_type`\n- `variable_format`\n- `variable_max_size` (if needed by the rule)\n\n#### Rule Macro\n\nChecks variable-level metadata sourced from the submission dataset contents.\n\n#### Example\n\n```yaml\n- name: variable_label\n operator: longer_than\n value: 40\n```\n"
},
{
"const": "Variable Metadata Check against Define XML",
Expand Down
1 change: 1 addition & 0 deletions resources/schema/rule/MetaVariables.json
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@
{ "const": "variable_has_empty_values" },
{ "const": "variable_is_empty" },
{ "const": "variable_label" },
{ "const": "variable_max_size" },
{ "const": "variable_name" },
{
"const": "variable_order_number"
Expand Down
4 changes: 4 additions & 0 deletions resources/schema/rule/MetaVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ True/False value indicating whether a variable is completely empty

Variable long label

## variable_max_size

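Maximum length, in characters, of the variable's non-null data values after conversion to string (0 when the variable has no non-null values or is absent from the dataset)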

## variable_name

Variable short name
Expand Down
1 change: 1 addition & 0 deletions resources/schema/rule/Rule_Type.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ Pairs record-level data values from the submission datasets with dataset metadat
- `variable_size`
- `variable_data_type`
- `variable_format`
- `variable_max_size` (if needed by the rule)

#### Rule Macro

Expand Down
Loading
Loading