Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions packages/ref/src/cmip_ref/datasets/obs4mips.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,48 @@ def extract_attr_with_regex(
return None


def parse_citation_metadata(info: dict[str, Any | None], ds: xr.Dataset) -> dict[str, Any | None]:
    """
    Copy citation-related global attributes of a dataset into ``info``.

    The ``tracking_id``, ``doi`` and ``references`` attributes are read from
    ``ds.attrs``. When no ``doi`` attribute is present, the first DOI-like
    token found in ``references`` (if any) is used instead.

    Parameters
    ----------
    info
        Metadata dictionary to update. It is modified in place.
    ds
        Dataset whose ``attrs`` mapping is inspected.

    Returns
    -------
    :
        The same ``info`` object. ``doi`` and ``references`` are always set
        (``None`` when unavailable); ``tracking_id`` is only set when the
        attribute exists, so a value already stored in ``info`` is kept.
    """
    # A dataset-like object without usable attrs is treated as having none.
    attrs = getattr(ds, "attrs", None) or {}

    if "tracking_id" in attrs:
        info["tracking_id"] = attrs["tracking_id"]

    info["doi"] = attrs.get("doi")
    info["references"] = attrs.get("references")

    references = info["references"]
    # Only free text can be searched; any other attribute type is skipped
    # rather than letting the regex call raise a TypeError.
    if info["doi"] is None and isinstance(references, str):
        # The dot after the "10" prefix is escaped so that arbitrary digit
        # runs such as "100012345/xyz" are not mistaken for a DOI.
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
        match = re.search(doi_pattern, references, re.IGNORECASE)
        if match:
            # The suffix character class also matches sentence punctuation,
            # so trim what trails the DOI in running text.
            doi = match.group(0).rstrip(".,;:")
            # Drop a closing bracket that belongs to the surrounding text,
            # e.g. "(doi:10.1234/abc)", but keep balanced ones.
            while doi.endswith(")") and doi.count("(") < doi.count(")"):
                doi = doi[:-1].rstrip(".,;:")
            info["doi"] = doi
    return info


def parse_uncertainty(info: dict[str, Any | None], ds: xr.Dataset) -> dict[str, Any | None]:
    """
    Record whether a dataset declares auxiliary (uncertainty) data.

    Reads the ``has_auxdata`` and ``aux_variable_id`` global attributes from
    ``ds.attrs`` and stores them in ``info``.

    Parameters
    ----------
    info
        Metadata dictionary to update. It is modified in place.
    ds
        Dataset whose ``attrs`` mapping is inspected.

    Returns
    -------
    :
        The same ``info`` object with ``has_auxdata`` and ``aux_variable_id``
        set. ``has_auxdata`` is the raw attribute value, or the string
        ``"False"`` when the attribute is missing or when the flag is set
        but no ``aux_variable_id`` attribute accompanies it.
    """
    attrs = ds.attrs

    flag = attrs["has_auxdata"] if "has_auxdata" in attrs else "False"
    info["has_auxdata"] = flag
    info["aux_variable_id"] = None

    # The flag is handled as text elsewhere (the default above is the string
    # "False"), so compare its textual form instead of testing identity with
    # the bool singleton, which a string such as "True" can never satisfy.
    if str(flag).strip().lower() == "true":
        if "aux_variable_id" in attrs:
            info["aux_variable_id"] = attrs["aux_variable_id"]
        else:
            # A flag without the variable it refers to is unusable.
            info["has_auxdata"] = "False"
    return info


def parse_obs4mips(file: str) -> dict[str, Any | None]:
"""Parser for obs4mips"""
keys = sorted(
Expand Down Expand Up @@ -92,7 +134,8 @@ def parse_obs4mips(file: str) -> dict[str, Any | None]:
start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
except (KeyError, AttributeError, ValueError):
...

info = parse_citation_metadata(info, ds)
info = parse_uncertainty(info, ds)
info["vertical_levels"] = vertical_levels
info["start_time"] = start_time
info["end_time"] = end_time
Expand All @@ -111,7 +154,7 @@ def parse_obs4mips(file: str) -> dict[str, Any | None]:

except (TypeError, AttributeError) as err:
if (len(err.args)) == 1:
logger.warning(str(err.args[0]))
logger.warning(str(file + " " + err.args[0]))
else:
logger.warning(str(err.args))
return {"INVALID_ASSET": file, "TRACEBACK": traceback_message}
Expand Down Expand Up @@ -145,6 +188,9 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
"units",
"vertical_levels",
"source_version_number",
"doi",
"references",
"tracking_id",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably track the tracking id for CMIP6 datasets as well

slug_column,
)

Expand Down Expand Up @@ -178,6 +224,10 @@ def pretty_subset(self, data_catalog: pd.DataFrame) -> pd.DataFrame:
"variable_id",
"grid_label",
"source_version_number",
"doi",
"tracking_id",
"has_auxdata",
"aux_variable_id",
]
]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""add doi and tracking_id to metadata tracking

Revision ID: c058c4ccf383
Revises: c5de99c14533
Create Date: 2025-04-04 16:01:26.205824

"""

from collections.abc import Sequence
from typing import Union

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it revises (see "Revision ID" / "Revises" in the module docstring).
revision: str = "c058c4ccf383"
down_revision: Union[str, None] = "c5de99c14533"
# No branch labels or dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """
    Apply this revision.

    Creates a named foreign key from ``metric_execution_result`` to
    ``metric_execution_group`` and adds the nullable ``doi``, ``references``
    and ``tracking_id`` columns to ``obs4mips_dataset``, whose
    ``variant_label`` column is also made nullable.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # The constraint is named explicitly so that downgrade() can drop it by name.
    # NOTE(review): the name is 75 characters long; some backends cap identifier
    # length (PostgreSQL: 63) - confirm this is fine for the supported databases.
    fk_name = "fk_metric_execution_result_metric_execution_group_id_metric_execution_group"
    with op.batch_alter_table("metric_execution_result", schema=None) as result_table:
        result_table.create_foreign_key(
            fk_name,
            "metric_execution_group",
            ["metric_execution_group_id"],
            ["id"],
        )

    with op.batch_alter_table("obs4mips_dataset", schema=None) as dataset_table:
        for column_name in ("doi", "references", "tracking_id"):
            dataset_table.add_column(sa.Column(column_name, sa.String(), nullable=True))
        dataset_table.alter_column("variant_label", existing_type=sa.VARCHAR(), nullable=True)

    # ### end Alembic commands ###


def downgrade() -> None:
    """
    Revert this revision.

    Makes ``obs4mips_dataset.variant_label`` non-nullable again, drops the
    ``tracking_id``, ``references`` and ``doi`` columns, then removes the
    named foreign key from ``metric_execution_result``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("obs4mips_dataset", schema=None) as dataset_table:
        # NOTE(review): presumably this fails if rows with a NULL variant_label
        # were stored after the upgrade - confirm before downgrading real data.
        dataset_table.alter_column("variant_label", existing_type=sa.VARCHAR(), nullable=False)
        # Columns are dropped in the reverse of the order upgrade() adds them.
        for column_name in ("tracking_id", "references", "doi"):
            dataset_table.drop_column(column_name)

    # Must be the exact name given to the constraint in upgrade().
    fk_name = "fk_metric_execution_result_metric_execution_group_id_metric_execution_group"
    with op.batch_alter_table("metric_execution_result", schema=None) as result_table:
        result_table.drop_constraint(fk_name, type_="foreignkey")

    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""add uncertainty information attributes

Revision ID: dc2393b21436
Revises: c058c4ccf383
Create Date: 2025-04-04 16:24:14.505451

"""

from collections.abc import Sequence
from typing import Union

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it revises (see "Revision ID" / "Revises" in the module docstring).
revision: str = "dc2393b21436"
down_revision: Union[str, None] = "c058c4ccf383"
# No branch labels or dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """
    Apply this revision.

    Adds the nullable string columns ``has_auxdata`` and ``aux_variable_id``
    to the ``obs4mips_dataset`` table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("obs4mips_dataset", schema=None) as dataset_table:
        for column_name in ("has_auxdata", "aux_variable_id"):
            dataset_table.add_column(sa.Column(column_name, sa.String(), nullable=True))

    # ### end Alembic commands ###


def downgrade() -> None:
    """
    Revert this revision.

    Drops the ``aux_variable_id`` and ``has_auxdata`` columns from the
    ``obs4mips_dataset`` table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("obs4mips_dataset", schema=None) as dataset_table:
        # Columns are dropped in the reverse of the order upgrade() adds them.
        for column_name in ("aux_variable_id", "has_auxdata"):
            dataset_table.drop_column(column_name)

    # ### end Alembic commands ###
9 changes: 7 additions & 2 deletions packages/ref/src/cmip_ref/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,15 @@ class Obs4MIPsDataset(Dataset):
source_type: Mapped[str] = mapped_column()
units: Mapped[str] = mapped_column()
variable_id: Mapped[str] = mapped_column()
variant_label: Mapped[str] = mapped_column()
variant_label: Mapped[str] = mapped_column(nullable=True)
vertical_levels: Mapped[int] = mapped_column()
source_version_number: Mapped[str] = mapped_column()

doi: Mapped[str] = mapped_column(nullable=True)
references: Mapped[str] = mapped_column(nullable=True)
tracking_id: Mapped[str] = mapped_column(nullable=True)
instance_id: Mapped[str] = mapped_column()
has_auxdata: Mapped[str] = mapped_column(nullable=True)
aux_variable_id: Mapped[str] = mapped_column(nullable=True)
"""
Unique identifier for the dataset.
"""
Expand All @@ -182,6 +186,7 @@ class Obs4MIPsFile(Base):
dataset_id: Mapped[int] = mapped_column(
ForeignKey("obs4mips_dataset.id", ondelete="CASCADE"), nullable=False
)

"""
Foreign key to the dataset table
"""
Expand Down
Loading