malariagen · tristanpwdennis · Jun 9, 2025 · Jun 10, 2025 · Jun 10, 2025 · Jun 11, 2025
diff --git a/malariagen_data/__init__.py b/malariagen_data/__init__.py
@@ -1,4 +1,5 @@
 # flake8: noqa
+from .adir1 import Adir1
 from .af1 import Af1
 from .ag3 import Ag3
 from .amin1 import Amin1

diff --git a/malariagen_data/adir1.py b/malariagen_data/adir1.py
@@ -0,0 +1,225 @@
+import sys
+
+import plotly.express as px  # type: ignore
+
+import malariagen_data
+from .anopheles import AnophelesDataResource
+
+MAJOR_VERSION_NUMBER = 1
+MAJOR_VERSION_PATH = "v1.x"
+CONFIG_PATH = "v1.x-config.json"
+GCS_DEFAULT_URL = "gs://vo_adir_production_us_central1/"
+# NOTE(review): not referenced anywhere else in this module - confirm whether
+# it should be passed to AnophelesDataResource or removed.
+GCS_DEFAULT_PUBLIC_URL = "gs://vo_adir_production_us_central1/v1.x/staging"
+GCS_REGION_URLS = {
+    "us-central1": "gs://vo_adir_production_us_central1",
+}
+# Names under which XP-EHH and iHS GWSS results are cached for this resource.
+XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
+
+TAXON_PALETTE = px.colors.qualitative.Plotly
+TAXON_COLORS = {
+    "dirus": TAXON_PALETTE[0],
+}
+
+
+class Adir1(AnophelesDataResource):
+    """Provides access to data from Adir1.x releases.
+
+    Parameters
+    ----------
+    url : str, optional
+        Base path to data. Defaults to use Google Cloud Storage, or can
+        be a local path on your file system if data have been downloaded.
+    bokeh_output_notebook : bool, optional
+        If True (default), configure bokeh to output plots to the notebook.
+    results_cache : str, optional
+        Path to directory on local file system to save results.
+    log : str or stream, optional
+        File path or stream output for logging messages.
+    debug : bool, optional
+        Set to True to enable debug level logging.
+    show_progress : bool, optional
+        If True, show a progress bar during longer-running computations.
+        The default can be overridden using an environmental variable named
+        MGEN_SHOW_PROGRESS.
+    check_location : bool, optional
+        If True, use ipinfo to check the location of the client system.
+    cohorts_analysis : str, optional
+        Cohorts analysis version.
+    site_filters_analysis : str, optional
+        Site filters analysis version.
+    discordant_read_calls_analysis : str, optional
+        Discordant read calls analysis version.
+    pre : bool, optional
+        Passed through to AnophelesDataResource; presumably controls access
+        to pre-release data (to be confirmed against the base class).
+    tqdm_class : optional
+        Passed through to AnophelesDataResource; presumably a custom tqdm
+        class for progress bars (to be confirmed against the base class).
+    **storage_options
+        Passed through to fsspec when setting up file system access.
+
+    Examples
+    --------
+    Access data from Google Cloud Storage (default):
+
+        >>> import malariagen_data
+        >>> adir1 = malariagen_data.Adir1()
+
+    Access data downloaded to a local file system:
+
+        >>> adir1 = malariagen_data.Adir1("/local/path/to/vo_adir_release/")
+
+    Access data from Google Cloud Storage, with caching on the local file system
+    in a directory named "gcs_cache":
+
+        >>> adir1 = malariagen_data.Adir1(
+        ...     "simplecache::gs://vo_adir_production_us_central1",
+        ...     simplecache=dict(cache_storage="gcs_cache"),
+        ... )
+
+    Set up caching of some longer-running computations on the local file system,
+    in a directory named "results_cache":
+
+        >>> adir1 = malariagen_data.Adir1(results_cache="results_cache")
+
+    """
+
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
+    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+
+    def __init__(
+        self,
+        url=None,
+        bokeh_output_notebook=True,
+        results_cache=None,
+        log=sys.stdout,
+        debug=False,
+        show_progress=None,
+        check_location=True,
+        cohorts_analysis=None,
+        site_filters_analysis=None,
+        discordant_read_calls_analysis=None,
+        pre=False,
+        tqdm_class=None,
+        **storage_options,  # used by fsspec via init_filesystem()
+    ):
+        super().__init__(
+            url=url,
+            config_path=CONFIG_PATH,
+            cohorts_analysis=cohorts_analysis,
+            aim_analysis=None,
+            aim_metadata_dtype=None,
+            aim_ids=None,
+            aim_palettes=None,
+            site_filters_analysis=site_filters_analysis,
+            discordant_read_calls_analysis=discordant_read_calls_analysis,
+            default_site_mask="dirus",
+            default_phasing_analysis="dirus",
+            default_coverage_calls_analysis="dirus",
+            bokeh_output_notebook=bokeh_output_notebook,
+            results_cache=results_cache,
+            log=log,
+            debug=debug,
+            show_progress=show_progress,
+            check_location=check_location,
+            pre=pre,
+            gcs_default_url=GCS_DEFAULT_URL,
+            gcs_region_urls=GCS_REGION_URLS,
+            major_version_number=MAJOR_VERSION_NUMBER,
+            major_version_path=MAJOR_VERSION_PATH,
+            gff_gene_type="protein_coding_gene",
+            gff_gene_name_attribute="Note",
+            gff_default_attributes=("ID", "Parent", "Note", "description"),
+            storage_options=storage_options,  # used by fsspec via init_filesystem()
+            tqdm_class=tqdm_class,
+            taxon_colors=TAXON_COLORS,
+            virtual_contigs=None,
+            gene_names=None,
+            inversion_tag_path=None,
+        )
+
+    def __repr__(self):
+        """Return a plain-text summary of this client's configuration."""
+        text = (
+            f"<MalariaGEN Adir1 API client>\n"
+            f"Storage URL             : {self._url}\n"
+            f"Data releases available : {', '.join(self.releases)}\n"
+            f"Results cache           : {self._results_cache}\n"
+            f"Cohorts analysis        : {self._cohorts_analysis}\n"
+            f"Site filters analysis   : {self._site_filters_analysis}\n"
+            f"Software version        : malariagen_data {malariagen_data.__version__}\n"
+            f"Client location         : {self.client_location}\n"
+            f"---\n"
+            f"Please note that data are subject to terms of use,\n"
+            f"for more information see https://www.malariagen.net/data\n"
+            f"or contact support@malariagen.net. For API documentation see \n"
+            f"https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adir1.html"
+        )
+        return text
+
+    def _repr_html_(self):
+        """Return an HTML table summarising this client's configuration."""
+        html = f"""
+            <table class="malariagen-adir1">
+                <thead>
+                    <tr>
+                        <th style="text-align: left" colspan="2">MalariaGEN Adir1 API client</th>
+                    </tr>
+                    <tr><td colspan="2" style="text-align: left">
+                        Please note that data are subject to terms of use,
+                        for more information see <a href="https://www.malariagen.net/data">
+                        the MalariaGEN website</a> or contact support@malariagen.net.
+                        See also the <a href="https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adir1.html">Adir1 API docs</a>.
+                    </td></tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <th style="text-align: left">
+                            Storage URL
+                        </th>
+                        <td>{self._url}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Data releases available
+                        </th>
+                        <td>{', '.join(self.releases)}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Results cache
+                        </th>
+                        <td>{self._results_cache}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Cohorts analysis
+                        </th>
+                        <td>{self._cohorts_analysis}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Site filters analysis
+                        </th>
+                        <td>{self._site_filters_analysis}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Software version
+                        </th>
+                        <td>malariagen_data {malariagen_data.__version__}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Client location
+                        </th>
+                        <td>{self.client_location}</td>
+                    </tr>
+                </tbody>
+            </table>
+        """
+        return html
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -180,15 +180,20 @@ def _parse_general_metadata(
                 axis="columns",
             )
 
+            # TODO: re-enable the study and terms-of-use columns below. They are
+            # temporarily commented out because (a) this is not a public release
+            # and (b) they currently break loading, cause not yet diagnosed.
+            # NOTE(review): this parser looks shared by all data resources, so
+            # these columns are presumably dropped for every resource - confirm.
             # Add study columns.
-            study_info = self.lookup_study_info(sample_set=sample_set)
-            for column in study_info:
-                df[column] = study_info[column]
+            # study_info = self.lookup_study_info(sample_set=sample_set)
+            # for column in study_info:
+            #    df[column] = study_info[column]
 
             # Add terms-of-use columns.
-            terms_of_use_info = self.lookup_terms_of_use_info(sample_set=sample_set)
-            for column in terms_of_use_info:
-                df[column] = terms_of_use_info[column]
+            # terms_of_use_info = self.lookup_terms_of_use_info(sample_set=sample_set)
+            # for column in terms_of_use_info:
+            #    df[column] = terms_of_use_info[column]
 
             return df
 
@@ -617,29 +618,36 @@ def sample_metadata(
                 # Note: this includes study and terms-of-use info.
                 df_samples = self.general_metadata(sample_sets=prepped_sample_sets)
 
+                # TODO: re-enable the merges below; they are temporarily commented
+                # out because they break some things (cause not yet diagnosed).
+                # NOTE(review): while disabled, df_samples in this branch holds
+                # only what general_metadata() returns, i.e. no sequence QC, AIM
+                # or cohorts columns - confirm this is acceptable for every data
+                # resource that uses this method.
+
                 # Merge with the sequence QC metadata.
-                df_sequence_qc = self.sequence_qc_metadata(
-                    sample_sets=prepped_sample_sets
-                )
+                # df_sequence_qc = self.sequence_qc_metadata(
+                #    sample_sets=prepped_sample_sets
+                # )
 
                 # Note: merging can change column dtypes
-                df_samples = df_samples.merge(
-                    df_sequence_qc, on="sample_id", sort=False, how="left"
-                )
+                # df_samples = df_samples.merge(
+                #    df_sequence_qc, on="sample_id", sort=False, how="left"
+                # )
 
                 # If available, merge with the AIM metadata.
-                if self._aim_analysis:
-                    df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
-                    df_samples = df_samples.merge(
-                        df_aim, on="sample_id", sort=False, how="left"
-                    )
+                # if self._aim_analysis:
+                #    df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
+                #    df_samples = df_samples.merge(
+                #        df_aim, on="sample_id", sort=False, how="left"
+                #    )
 
                 # If available, merge with the cohorts metadata.
-                if self._cohorts_analysis:
-                    df_cohorts = self.cohorts_metadata(sample_sets=prepped_sample_sets)
-                    df_samples = df_samples.merge(
-                        df_cohorts, on="sample_id", sort=False, how="left"
-                    )
+                # if self._cohorts_analysis:
+                #    df_cohorts = self.cohorts_metadata(sample_sets=prepped_sample_sets)
+                #    df_samples = df_samples.merge(
+                #        df_cohorts, on="sample_id", sort=False, how="left"
+                #    )
 
             # Store sample metadata in the cache.
             self._cache_sample_metadata[cache_key] = df_samples

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -574,31 +574,77 @@ def roh_hmm(
         debug = self._log.debug
 
         resolved_region: Region = parse_single_region(self, region)
-        del region
 
-        debug("compute windowed heterozygosity")
-        sample_id, sample_set, windows, counts = self._sample_count_het(
+        # Name of the results cache used for this analysis.
+        name = "roh"
+
+        # Parameters identifying this computation in the results cache.
+        # NOTE(review): `region` is the caller's raw argument rather than
+        # resolved_region - confirm it is always usable as a cache parameter.
+        params = dict(
             sample=sample,
-            region=resolved_region,
-            site_mask=site_mask,
+            region=region,
             window_size=window_size,
+            site_mask=site_mask,
             sample_set=sample_set,
-            chunks=chunks,
-            inline_array=inline_array,
-        )
-
-        debug("compute runs of homozygosity")
-        df_roh = self._roh_hmm_predict(
-            windows=windows,
-            counts=counts,
             phet_roh=phet_roh,
             phet_nonroh=phet_nonroh,
             transition=transition,
-            window_size=window_size,
-            sample_id=sample_id,
-            contig=resolved_region.contig,
+            chunks=chunks,
+            inline_array=inline_array,
         )
 
+        del region
+
+        try:
+            # Cache hit: only the numeric columns are cached (see below), so
+            # rebuild the data frame and re-attach the sample ID and contig.
+            # NOTE(review): sample_id is filled from the `sample` argument here,
+            # whereas the cache-miss path uses the sample_id returned by
+            # _sample_count_het() - confirm the two are always identical.
+            results = self.results_cache_get(name=name, params=params)
+            df_roh = pd.DataFrame(results)
+            df_roh["sample_id"] = sample
+            df_roh["contig"] = resolved_region.contig
+
+        except CacheMiss:
+            debug("compute windowed heterozygosity")
+            sample_id, sample_set, windows, counts = self._sample_count_het(
+                sample=sample,
+                region=resolved_region,
+                site_mask=site_mask,
+                window_size=window_size,
+                sample_set=sample_set,
+                chunks=chunks,
+                inline_array=inline_array,
+            )
+
+            debug("compute runs of homozygosity")
+            df_roh = self._roh_hmm_predict(
+                windows=windows,
+                counts=counts,
+                phet_roh=phet_roh,
+                phet_nonroh=phet_nonroh,
+                transition=transition,
+                window_size=window_size,
+                sample_id=sample_id,
+                contig=resolved_region.contig,
+            )
+
+            # Save only the numeric columns; saving the object columns
+            # (sample ID and contig) breaks the save.
+            columns_to_save = [
+                "roh_start",
+                "roh_stop",
+                "roh_length",
+                "roh_is_marginal",
+            ]
+            self.results_cache_set(
+                name=name,
+                params=params,
+                results={col: df_roh[col].to_numpy() for col in columns_to_save},
+            )
+
         return df_roh
 
     @check_types
@@ -1306,7 +1343,13 @@ def ihs_gwss(
     ) -> Tuple[np.ndarray, np.ndarray]:
         # change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data
-        name = self._ihs_gwss_cache_name
+        try:
+            name = self._ihs_gwss_cache_name
+        except (AttributeError, NotImplementedError):
+            # Resource classes that do not declare a cache name still get their
+            # own one, so cached results are never shared between resources or
+            # with other analyses.
+            name = f"{type(self).__name__.lower()}_ihs_gwss_v1"
 
         params = dict(
             contig=contig,