juaml · synchon · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/docs/changes/newsfragments/498.feature b/docs/changes/newsfragments/498.feature
@@ -0,0 +1 @@
+Introduce :func:`.generate_yaml` to generate feature YAML from metadata by `Synchon Mandal`_
diff --git a/docs/links.inc b/docs/links.inc
@@ -13,6 +13,7 @@
 .. _`INM-7`: https://www.fz-juelich.de/inm/inm-7/EN/Home/home_node.html
 .. _`julearn`: https://juaml.github.io/julearn
 .. _`junifer-data`: https://github.com/juaml/junifer-data-client
+.. _`julio`: https://github.com/juaml/julio
 
 .. _`pandas`: https://pandas.pydata.org
 .. _`pandas.DataFrame` : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

diff --git a/docs/using/generate_yaml.rst b/docs/using/generate_yaml.rst
@@ -0,0 +1,56 @@
+.. include:: ../links.inc
+
+.. _generate_yaml:
+
+Generating YAML from metadata
+=============================
+
+``junifer`` stores the pipeline metadata for a run along with the extracted feature data.
+So, the metadata for all the "elements" processed with a pipeline is unique. The metadata
+contains all the necessary information to recreate the configuration used for the processing.
+
+To regenerate the processing YAML from the metadata, use :func:`.generate_yaml`.
+The only requirement is the metadata itself, which can be extracted by following the initial steps of
+:ref:`analysing results <analysing_extracted_features>`.
+
+
+Configuration for ``julio``
+---------------------------
+
+When generating a registry with `julio`_, we can configure the YAML generation process. For now, only
+DataGrabbers can be configured, through the ``_dump_exclude`` class variable, like so:
+
+   .. code-block:: python
+
+     from typing import ClassVar
+
+     from junifer.api.decorators import register_datagrabber
+     from junifer.datagrabber import PatternDataladDataGrabber
+
+
+     @register_datagrabber
+     class MyDataGrabber(PatternDataladDataGrabber):
+
+         _dump_exclude: ClassVar[set[str]] = {
+             "patterns",
+             "replacements",
+             "confounds_format",
+             "partial_pattern_ok",
+             "uri",
+             "rootdir",
+             "datadir",
+             "datalad_id",
+             "datalad_dirty",
+             "datalad_commit_id",
+         }
+
+
+The above can be considered a standard setup for a custom DataGrabber inheriting from :class:`.PatternDataladDataGrabber`.
+
+
+.. admonition:: Tip
+
+   - For DataGrabbers inheriting from :class:`.BaseDataGrabber` custom setup is possible but not required.
+   - For DataGrabbers inheriting from :class:`.PatternDataGrabber` no extra setup should be required.
+   - For :class:`.PatternDataladDataGrabber`\s specified via the YAML, customisation is not possible
+     and is usually not required. If such a need arises, creating a custom DataGrabber is the only way.
diff --git a/docs/using/index.rst b/docs/using/index.rst
@@ -20,6 +20,7 @@ to interact with HPC and HTC systems.
    queueing
    configuring
    dumping
+   generate_yaml
 
 
 .. _using_components:

diff --git a/junifer/api/__init__.pyi b/junifer/api/__init__.pyi
@@ -1,4 +1,19 @@
-__all__ = ["decorators", "collect", "queue", "run", "reset", "list_elements"]
+__all__ = [
+    "decorators",
+    "collect",
+    "queue",
+    "run",
+    "reset",
+    "list_elements",
+    "generate_yaml",
+]
 
 from . import decorators
-from .functions import collect, list_elements, reset, run, queue
+from .functions import (
+    collect,
+    generate_yaml,
+    list_elements,
+    reset,
+    run,
+    queue,
+)
diff --git a/junifer/api/functions.py b/junifer/api/functions.py
@@ -6,9 +6,12 @@
 # License: AGPL
 
 import atexit
+import datetime as dt
+import io
 import os
 import shutil
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
 
 from ..api.queue_context import GnuParallelLocalAdapter, HTCondorAdapter
 from ..datagrabber import BaseDataGrabber
@@ -30,7 +33,18 @@
 from ..utils import logger, raise_error, warn_with_log, yaml
 
 
-__all__ = ["collect", "list_elements", "queue", "reset", "run"]
+if TYPE_CHECKING:
+    from ruamel.yaml.comments import CommentedMap
+
+
+__all__ = [
+    "collect",
+    "generate_yaml",
+    "list_elements",
+    "queue",
+    "reset",
+    "run",
+]
 
 
 def _get_datagrabber(datagrabber_config: dict) -> DataGrabberLike:
@@ -463,3 +477,110 @@ def list_elements(
         elements_to_list.append(str_element)
 
     return "\n".join(elements_to_list)
+
+
+def generate_yaml(meta: dict) -> "CommentedMap":
+    """Generate the feature YAML from metadata.
+
+    Parameters
+    ----------
+    meta : dict
+        Feature metadata as dictionary.
+
+    Returns
+    -------
+    ruamel.yaml.comments.CommentedMap
+        Feature YAML.
+
+    """
+    y: dict[str, Any] = {}
+    y["workdir"] = ""
+    # Add "with" section if present
+    if "with" in meta:
+        y["with"] = meta["with"].copy()
+    # Set datagrabber
+    meta_dg = meta["datagrabber"].copy()
+    a = meta_dg.pop("class")
+    dg = PipelineComponentRegistry().get_class(step="datagrabber", name=a)
+    dg_model = dg.model_construct(**meta_dg)
+    y["datagrabber"] = {
+        "kind": a,
+        **dg_model.model_dump(
+            mode="json",
+            exclude=dg_model._dump_exclude
+            if hasattr(dg_model, "_dump_exclude")
+            else {},
+            exclude_defaults=True,
+            exclude_none=True,
+        ),
+    }
+    # Set preprocessor(s)
+    if "preprocess" in meta:
+        y["preprocess"] = []
+        meta_p = meta["preprocess"].copy()
+        if not isinstance(meta_p, list):
+            meta_p = [meta_p]
+        for mp in meta_p:
+            b = mp.pop("class")
+            p = PipelineComponentRegistry().get_class(
+                step="preprocessing", name=b
+            )
+            p_model = p.model_construct(**mp)
+            y["preprocess"].append(
+                {
+                    "kind": b,
+                    **p_model.model_dump(
+                        mode="json",
+                        exclude={"required_data_types"},
+                        exclude_defaults=True,
+                        exclude_none=True,
+                    ),
+                }
+            )
+    # Set marker
+    meta_m = meta["marker"].copy()
+    c = meta_m.pop("class")
+    m = PipelineComponentRegistry().get_class(step="marker", name=c)
+    m_model = m.model_construct(**meta_m)
+    y["markers"] = []
+    y["markers"].append(
+        {
+            "kind": c,
+            **m_model.model_dump(
+                mode="json",
+                exclude_defaults=True,
+                exclude_none=True,
+            ),
+        }
+    )
+    # Set storage
+    y["storage"] = {
+        "kind": "HDF5FeatureStorage",
+        "uri": "",
+    }
+    # Set queue
+    if "queue" in meta:
+        y["queue"] = meta["queue"].copy()
+    else:
+        y["queue"] = {
+            "jobname": meta["name"],
+            "kind": "",
+        }
+    # Dump and load yaml to format
+    f = io.StringIO()
+    yaml.dump(y, stream=f)
+    f.seek(0)
+    d = yaml.load(f)
+    # Add preamble
+    pre = (
+        "Auto-generated by junifer on "
+        f"{dt.datetime.now(tz=dt.UTC).strftime('%Y-%m-%d %H:%M:%S')} UTC\n\n"
+    )
+    if "dependencies" in meta:
+        for k, v in meta["dependencies"].items():
+            pre += f"{k}=={v}\n"
+    d.yaml_set_start_comment(pre)
+    # Add newline between sections
+    for s in d.keys():
+        d.yaml_set_comment_before_after_key(s, before="\n")
+    return d
diff --git a/junifer/configs/juseless/datagrabbers/aomic_id1000_vbm.py b/junifer/configs/juseless/datagrabbers/aomic_id1000_vbm.py
@@ -4,7 +4,7 @@
 #          Synchon Mandal <s.mandal@fz-juelich.de>
 # License: AGPL
 
-from typing import Literal
+from typing import ClassVar, Literal
 
 from pydantic import AnyUrl
 
@@ -31,6 +31,19 @@ class JuselessDataladAOMICID1000VBM(PatternDataladDataGrabber):
 
     """
 
+    _dump_exclude: ClassVar[set[str]] = {
+        "patterns",
+        "replacements",
+        "confounds_format",
+        "partial_pattern_ok",
+        "uri",
+        "rootdir",
+        "datadir",
+        "datalad_id",
+        "datalad_dirty",
+        "datalad_commit_id",
+    }
+
     uri: AnyUrl = AnyUrl("https://gin.g-node.org/felixh/ds003097_ReproVBM")
     types: list[Literal[DataType.VBM_GM]] = [DataType.VBM_GM]  # noqa: RUF012
     patterns: DataGrabberPatterns = {  # noqa: RUF012

diff --git a/junifer/configs/juseless/datagrabbers/camcan_vbm.py b/junifer/configs/juseless/datagrabbers/camcan_vbm.py
@@ -5,7 +5,7 @@
 #          Synchon Mandal <s.mandal@fz-juelich.de>
 # License: AGPL
 
-from typing import Literal
+from typing import ClassVar, Literal
 
 from pydantic import AnyUrl
 
@@ -32,6 +32,19 @@ class JuselessDataladCamCANVBM(PatternDataladDataGrabber):
 
     """
 
+    _dump_exclude: ClassVar[set[str]] = {
+        "patterns",
+        "replacements",
+        "confounds_format",
+        "partial_pattern_ok",
+        "uri",
+        "rootdir",
+        "datadir",
+        "datalad_id",
+        "datalad_dirty",
+        "datalad_commit_id",
+    }
+
     uri: AnyUrl = AnyUrl(
         "ria+http://cat_12.5.ds.inm7.de#a139b26a-8406-11ea-8f94-a0369f287950"
     )

diff --git a/junifer/configs/juseless/datagrabbers/ixi_vbm.py b/junifer/configs/juseless/datagrabbers/ixi_vbm.py
@@ -48,6 +48,19 @@ class JuselessDataladIXIVBM(PatternDataladDataGrabber):
 
     """
 
+    _dump_exclude: ClassVar[set[str]] = {
+        "patterns",
+        "replacements",
+        "confounds_format",
+        "partial_pattern_ok",
+        "uri",
+        "rootdir",
+        "datadir",
+        "datalad_id",
+        "datalad_dirty",
+        "datalad_commit_id",
+    }
+
     uri: AnyUrl = AnyUrl(
         "ria+http://cat_12.5.ds.inm7.de#b7107c52-8408-11ea-89c6-a0369f287950"
     )

diff --git a/junifer/configs/juseless/datagrabbers/ukb_vbm.py b/junifer/configs/juseless/datagrabbers/ukb_vbm.py
@@ -6,7 +6,7 @@
 # License: AGPL
 
 from pathlib import Path
-from typing import Literal
+from typing import ClassVar, Literal
 
 from pydantic import AnyUrl
 
@@ -33,6 +33,19 @@ class JuselessDataladUKBVBM(PatternDataladDataGrabber):
 
     """
 
+    _dump_exclude: ClassVar[set[str]] = {
+        "patterns",
+        "replacements",
+        "confounds_format",
+        "partial_pattern_ok",
+        "uri",
+        "rootdir",
+        "datadir",
+        "datalad_id",
+        "datalad_dirty",
+        "datalad_commit_id",
+    }
+
     uri: AnyUrl = AnyUrl("ria+http://ukb.ds.inm7.de#~cat_m0wp1")
     rootdir: Path = Path("m0wp1")
     types: list[Literal[DataType.VBM_GM]] = [DataType.VBM_GM]  # noqa: RUF012

diff --git a/junifer/datagrabber/aomic/id1000.py b/junifer/datagrabber/aomic/id1000.py
@@ -7,7 +7,7 @@
 #          Synchon Mandal <s.mandal@fz-juelich.de>
 # License: AGPL
 
-from typing import Annotated, Literal
+from typing import Annotated, ClassVar, Literal
 
 from pydantic import AnyUrl, BeforeValidator
 
@@ -52,6 +52,19 @@ class DataladAOMICID1000(PatternDataladDataGrabber):
 
     """
 
+    _dump_exclude: ClassVar[set[str]] = {
+        "patterns",
+        "replacements",
+        "confounds_format",
+        "partial_pattern_ok",
+        "uri",
+        "rootdir",
+        "datadir",
+        "datalad_id",
+        "datalad_dirty",
+        "datalad_commit_id",
+    }
+
     uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds003097.git")
     types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [  # noqa: RUF012
         DataType.BOLD,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Introduce :func:`.generate_yaml` to generate feature YAML from metadata by `Synchon Mandal`_