Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changes/newsfragments/498.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Introduce :func:`.generate_yaml` to generate feature YAML from metadata by `Synchon Mandal`_
1 change: 1 addition & 0 deletions docs/links.inc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
.. _`INM-7`: https://www.fz-juelich.de/inm/inm-7/EN/Home/home_node.html
.. _`julearn`: https://juaml.github.io/julearn
.. _`junifer-data`: https://github.com/juaml/junifer-data-client
.. _`julio`: https://github.com/juaml/julio

.. _`pandas`: https://pandas.pydata.org
.. _`pandas.DataFrame` : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
Expand Down
56 changes: 56 additions & 0 deletions docs/using/generate_yaml.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
.. include:: ../links.inc

.. _generate_yaml:

Generating YAML from metadata
=============================

``junifer`` stores the pipeline metadata for a run along with the extracted feature data.
As a result, all the "elements" processed with a given pipeline share a single, unique metadata record. The metadata
contains all the information necessary to recreate the configuration used for the processing.

To generate the processing YAML from this metadata, use :func:`.generate_yaml`.
The only requirement is the metadata itself, which can be extracted by following the initial steps of
:ref:`analysing results <analysing_extracted_features>`.


Configuration for ``julio``
---------------------------

When generating a registry with `julio`_, the YAML generation process can be configured. For now, only
DataGrabbers can be configured, through the ``_dump_exclude`` class variable, like so:

.. code-block:: python

from typing import ClassVar

from junifer.api.decorators import register_datagrabber
from junifer.datagrabber import PatternDataladDataGrabber


@register_datagrabber
class MyDataGrabber(PatternDataladDataGrabber):

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}


The above can be considered a standard setup for a custom DataGrabber inheriting from :class:`.PatternDataladDataGrabber`.


.. admonition:: Tip

- For DataGrabbers inheriting from :class:`.BaseDataGrabber`, custom setup is possible but not required.
- For DataGrabbers inheriting from :class:`.PatternDataGrabber`, no extra setup should be required.
- For :class:`.PatternDataladDataGrabber`\s specified via the YAML, customisation is not possible
and is usually not required. If such a need arises, creating a custom DataGrabber is the only way.
1 change: 1 addition & 0 deletions docs/using/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ to interact with HPC and HTC systems.
queueing
configuring
dumping
generate_yaml


.. _using_components:
Expand Down
19 changes: 17 additions & 2 deletions junifer/api/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,4 +1,19 @@
__all__ = ["decorators", "collect", "queue", "run", "reset", "list_elements"]
__all__ = [
"decorators",
"collect",
"queue",
"run",
"reset",
"list_elements",
"generate_yaml",
]

from . import decorators
from .functions import collect, list_elements, reset, run, queue
from .functions import (
collect,
generate_yaml,
list_elements,
reset,
run,
queue,
)
123 changes: 122 additions & 1 deletion junifer/api/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
# License: AGPL

import atexit
import datetime as dt
import io
import os
import shutil
from pathlib import Path
from typing import TYPE_CHECKING, Any

from ..api.queue_context import GnuParallelLocalAdapter, HTCondorAdapter
from ..datagrabber import BaseDataGrabber
Expand All @@ -30,7 +33,18 @@
from ..utils import logger, raise_error, warn_with_log, yaml


__all__ = ["collect", "list_elements", "queue", "reset", "run"]
if TYPE_CHECKING:
from ruamel.yaml.comments import CommentedMap


__all__ = [
"collect",
"generate_yaml",
"list_elements",
"queue",
"reset",
"run",
]


def _get_datagrabber(datagrabber_config: dict) -> DataGrabberLike:
Expand Down Expand Up @@ -463,3 +477,110 @@ def list_elements(
elements_to_list.append(str_element)

return "\n".join(elements_to_list)


def _construct_component(step: str, component_meta: dict) -> tuple[str, Any]:
    """Look up a registered pipeline component and rebuild it from metadata.

    Parameters
    ----------
    step : str
        Pipeline step under which the component is registered
        (as used here: "datagrabber", "preprocessing" or "marker").
    component_meta : dict
        Metadata of a single component. Must contain the ``"class"`` key
        holding the registered name; the remaining items are passed as
        keyword arguments to the class' ``model_construct``.

    Returns
    -------
    str
        The registered name of the component (value of ``"class"``).
    object
        The object returned by the class' ``model_construct``.

    """
    # Work on a private copy so that popping "class" never alters the
    # caller's metadata (which may be reused after this call).
    params = dict(component_meta)
    kind = params.pop("class")
    component_cls = PipelineComponentRegistry().get_class(step=step, name=kind)
    return kind, component_cls.model_construct(**params)


def generate_yaml(meta: dict) -> "CommentedMap":
    """Generate the feature YAML from metadata.

    The passed metadata is not modified.

    Parameters
    ----------
    meta : dict
        Feature metadata as dictionary. The ``"datagrabber"`` and
        ``"marker"`` keys are required; ``"with"``, ``"preprocess"``,
        ``"queue"`` and ``"dependencies"`` are used if present. If
        ``"queue"`` is absent, ``"name"`` is required as it is used as the
        queue's job name.

    Returns
    -------
    ruamel.yaml.comments.CommentedMap
        Feature YAML.

    """
    config: dict[str, Any] = {}
    # Placeholder to be filled in by the user
    config["workdir"] = ""
    # Add "with" section if present
    if "with" in meta:
        config["with"] = meta["with"].copy()
    # Set datagrabber
    dg_kind, dg_model = _construct_component(
        step="datagrabber", component_meta=meta["datagrabber"]
    )
    config["datagrabber"] = {
        "kind": dg_kind,
        **dg_model.model_dump(
            mode="json",
            # DataGrabbers can opt out of dumping certain fields via the
            # `_dump_exclude` class variable; nothing is excluded otherwise
            exclude=getattr(dg_model, "_dump_exclude", {}),
            exclude_defaults=True,
            exclude_none=True,
        ),
    }
    # Set preprocessor(s)
    if "preprocess" in meta:
        meta_preprocess = meta["preprocess"]
        # A single preprocessor may be given as a bare mapping
        if not isinstance(meta_preprocess, list):
            meta_preprocess = [meta_preprocess]
        config["preprocess"] = []
        for preprocessor_meta in meta_preprocess:
            p_kind, p_model = _construct_component(
                step="preprocessing", component_meta=preprocessor_meta
            )
            config["preprocess"].append(
                {
                    "kind": p_kind,
                    **p_model.model_dump(
                        mode="json",
                        exclude={"required_data_types"},
                        exclude_defaults=True,
                        exclude_none=True,
                    ),
                }
            )
    # Set marker
    m_kind, m_model = _construct_component(
        step="marker", component_meta=meta["marker"]
    )
    config["markers"] = [
        {
            "kind": m_kind,
            **m_model.model_dump(
                mode="json",
                exclude_defaults=True,
                exclude_none=True,
            ),
        }
    ]
    # Set storage; the URI is a placeholder to be filled in by the user
    config["storage"] = {
        "kind": "HDF5FeatureStorage",
        "uri": "",
    }
    # Set queue
    if "queue" in meta:
        config["queue"] = meta["queue"].copy()
    else:
        config["queue"] = {
            "jobname": meta["name"],
            "kind": "",
        }
    # Dump and load yaml to format
    buffer = io.StringIO()
    yaml.dump(config, stream=buffer)
    buffer.seek(0)
    document = yaml.load(buffer)
    # Add preamble; `dt.timezone.utc` is used instead of `dt.UTC` as the
    # latter is only available on Python >= 3.11
    timestamp = dt.datetime.now(tz=dt.timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    preamble = f"Auto-generated by junifer on {timestamp} UTC\n\n"
    if "dependencies" in meta:
        preamble += "".join(
            f"{package}=={version}\n"
            for package, version in meta["dependencies"].items()
        )
    document.yaml_set_start_comment(preamble)
    # Add newline between sections
    for section in document:
        document.yaml_set_comment_before_after_key(section, before="\n")
    return document
15 changes: 14 additions & 1 deletion junifer/configs/juseless/datagrabbers/aomic_id1000_vbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Synchon Mandal <s.mandal@fz-juelich.de>
# License: AGPL

from typing import Literal
from typing import ClassVar, Literal

from pydantic import AnyUrl

Expand All @@ -31,6 +31,19 @@ class JuselessDataladAOMICID1000VBM(PatternDataladDataGrabber):

"""

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}

uri: AnyUrl = AnyUrl("https://gin.g-node.org/felixh/ds003097_ReproVBM")
types: list[Literal[DataType.VBM_GM]] = [DataType.VBM_GM] # noqa: RUF012
patterns: DataGrabberPatterns = { # noqa: RUF012
Expand Down
15 changes: 14 additions & 1 deletion junifer/configs/juseless/datagrabbers/camcan_vbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Synchon Mandal <s.mandal@fz-juelich.de>
# License: AGPL

from typing import Literal
from typing import ClassVar, Literal

from pydantic import AnyUrl

Expand All @@ -32,6 +32,19 @@ class JuselessDataladCamCANVBM(PatternDataladDataGrabber):

"""

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}

uri: AnyUrl = AnyUrl(
"ria+http://cat_12.5.ds.inm7.de#a139b26a-8406-11ea-8f94-a0369f287950"
)
Expand Down
13 changes: 13 additions & 0 deletions junifer/configs/juseless/datagrabbers/ixi_vbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ class JuselessDataladIXIVBM(PatternDataladDataGrabber):

"""

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}

uri: AnyUrl = AnyUrl(
"ria+http://cat_12.5.ds.inm7.de#b7107c52-8408-11ea-89c6-a0369f287950"
)
Expand Down
15 changes: 14 additions & 1 deletion junifer/configs/juseless/datagrabbers/ukb_vbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# License: AGPL

from pathlib import Path
from typing import Literal
from typing import ClassVar, Literal

from pydantic import AnyUrl

Expand All @@ -33,6 +33,19 @@ class JuselessDataladUKBVBM(PatternDataladDataGrabber):

"""

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}

uri: AnyUrl = AnyUrl("ria+http://ukb.ds.inm7.de#~cat_m0wp1")
rootdir: Path = Path("m0wp1")
types: list[Literal[DataType.VBM_GM]] = [DataType.VBM_GM] # noqa: RUF012
Expand Down
15 changes: 14 additions & 1 deletion junifer/datagrabber/aomic/id1000.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Synchon Mandal <s.mandal@fz-juelich.de>
# License: AGPL

from typing import Annotated, Literal
from typing import Annotated, ClassVar, Literal

from pydantic import AnyUrl, BeforeValidator

Expand Down Expand Up @@ -52,6 +52,19 @@ class DataladAOMICID1000(PatternDataladDataGrabber):

"""

_dump_exclude: ClassVar[set[str]] = {
"patterns",
"replacements",
"confounds_format",
"partial_pattern_ok",
"uri",
"rootdir",
"datadir",
"datalad_id",
"datalad_dirty",
"datalad_commit_id",
}

uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds003097.git")
types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [ # noqa: RUF012
DataType.BOLD,
Expand Down
Loading
Loading