Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# flake8: noqa
from .adir1 import Adir1
from .af1 import Af1
from .ag3 import Ag3
from .amin1 import Amin1
Expand Down
208 changes: 208 additions & 0 deletions malariagen_data/adir1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import sys

import plotly.express as px # type: ignore

import malariagen_data
from .anopheles import AnophelesDataResource

# Release series served by this client and the matching path / config file
# names inside the storage bucket.
MAJOR_VERSION_NUMBER = 1
MAJOR_VERSION_PATH = "v1.x"
CONFIG_PATH = "v1.x-config.json"

# Storage locations for the Adir1 data.
GCS_DEFAULT_URL = "gs://vo_adir_production_us_central1/"
GCS_DEFAULT_PUBLIC_URL = "gs://vo_adir_production_us_central1/v1.x/staging"
GCS_REGION_URLS = {
    "us-central1": "gs://vo_adir_production_us_central1",
}

# Versioned names identifying cached XP-EHH / iHS GWSS results. Change the
# version suffix to invalidate previously cached data if the behaviour of the
# corresponding computation ever changes.
XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"

# Plot colours, keyed by taxon.
TAXON_PALETTE = px.colors.qualitative.Plotly
TAXON_COLORS = {
    "dirus": TAXON_PALETTE[0],
}


class Adir1(AnophelesDataResource):
    """Provides access to data from Adir1.x releases.

    Parameters
    ----------
    url : str, optional
        Base path to data. Defaults to use Google Cloud Storage, or can
        be a local path on your file system if data have been downloaded.
    bokeh_output_notebook : bool, optional
        If True (default), configure bokeh to output plots to the notebook.
    results_cache : str, optional
        Path to directory on local file system to save results.
    log : str or stream, optional
        File path or stream output for logging messages.
    debug : bool, optional
        Set to True to enable debug level logging.
    show_progress : bool, optional
        If True, show a progress bar during longer-running computations.
        The default can be overridden using an environmental variable
        named MGEN_SHOW_PROGRESS.
    check_location : bool, optional
        If True, use ipinfo to check the location of the client system.
    cohorts_analysis : str, optional
        Cohorts analysis version.
    site_filters_analysis : str, optional
        Site filters analysis version.
    discordant_read_calls_analysis : str, optional
        Discordant read calls analysis version.
    pre : bool, optional
        Forwarded unchanged to the base class; presumably controls access
        to pre-release data (confirm against AnophelesDataResource).
    tqdm_class : optional
        Forwarded unchanged to the base class; presumably the progress bar
        implementation to use (confirm against AnophelesDataResource).
    **storage_options
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

        >>> import malariagen_data
        >>> adir1 = malariagen_data.Adir1()

    Access data downloaded to a local file system:

        >>> adir1 = malariagen_data.Adir1("/local/path/to/vo_adir_release/")

    Access data from Google Cloud Storage, with caching on the local file system
    in a directory named "gcs_cache":

        >>> adir1 = malariagen_data.Adir1(
        ...     "simplecache::gs://vo_adir_production_us_central1",
        ...     simplecache=dict(cache_storage="gcs_cache"),
        ... )

    Set up caching of some longer-running computations on the local file system,
    in a directory named "results_cache":

        >>> adir1 = malariagen_data.Adir1(results_cache="results_cache")

    """

    # Results-cache names for the GWSS computations; defined here so that
    # Adir1 results are stored under their own, versioned cache names.
    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME

    def __init__(
        self,
        url=None,
        bokeh_output_notebook=True,
        results_cache=None,
        log=sys.stdout,
        debug=False,
        show_progress=None,
        check_location=True,
        cohorts_analysis=None,
        site_filters_analysis=None,
        discordant_read_calls_analysis=None,
        pre=False,
        tqdm_class=None,
        **storage_options,  # used by fsspec via init_filesystem()
    ):
        """Configure the base resource with the Adir1-specific settings.

        All user-supplied arguments are forwarded unchanged; everything else
        (paths, version numbers, GFF conventions, taxon colours, defaults for
        the "dirus" analyses) is fixed for this resource.
        """
        super().__init__(
            url=url,
            config_path=CONFIG_PATH,
            cohorts_analysis=cohorts_analysis,
            # No AIM (ancestry-informative marker) settings are supplied for
            # this resource.
            aim_analysis=None,
            aim_metadata_dtype=None,
            aim_ids=None,
            aim_palettes=None,
            site_filters_analysis=site_filters_analysis,
            discordant_read_calls_analysis=discordant_read_calls_analysis,
            default_site_mask="dirus",
            default_phasing_analysis="dirus",
            default_coverage_calls_analysis="dirus",
            bokeh_output_notebook=bokeh_output_notebook,
            results_cache=results_cache,
            log=log,
            debug=debug,
            show_progress=show_progress,
            check_location=check_location,
            pre=pre,
            gcs_default_url=GCS_DEFAULT_URL,
            gcs_region_urls=GCS_REGION_URLS,
            major_version_number=MAJOR_VERSION_NUMBER,
            major_version_path=MAJOR_VERSION_PATH,
            gff_gene_type="protein_coding_gene",
            gff_gene_name_attribute="Note",
            gff_default_attributes=("ID", "Parent", "Note", "description"),
            storage_options=storage_options,  # used by fsspec via init_filesystem()
            tqdm_class=tqdm_class,
            taxon_colors=TAXON_COLORS,
            virtual_contigs=None,
            gene_names=None,
            inversion_tag_path=None,
        )

    def __repr__(self):
        """Return a plain-text summary of the client configuration.

        Reports the same fields as ``_repr_html_`` (including the cohorts
        analysis version) so both representations stay in agreement.
        """
        text = (
            f"<MalariaGEN Adir1 API client>\n"
            f"Storage URL             : {self._url}\n"
            f"Data releases available : {', '.join(self.releases)}\n"
            f"Results cache           : {self._results_cache}\n"
            f"Cohorts analysis        : {self._cohorts_analysis}\n"
            f"Site filters analysis   : {self._site_filters_analysis}\n"
            f"Software version        : malariagen_data {malariagen_data.__version__}\n"
            f"Client location         : {self.client_location}\n"
            f"---\n"
            f"Please note that data are subject to terms of use,\n"
            f"for more information see https://www.malariagen.net/data\n"
            f"or contact support@malariagen.net. For API documentation see \n"
            f"https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adir1.html"
        )
        return text

    def _repr_html_(self):
        """Return an HTML table summarising the client configuration.

        Used by rich front ends (e.g. Jupyter) in place of ``__repr__``.
        """
        html = f"""
            <table class="malariagen-adir1">
                <thead>
                    <tr>
                        <th style="text-align: left" colspan="2">MalariaGEN Adir1 API client</th>
                    </tr>
                    <tr><td colspan="2" style="text-align: left">
                        Please note that data are subject to terms of use,
                        for more information see <a href="https://www.malariagen.net/data">
                        the MalariaGEN website</a> or contact support@malariagen.net.
                        See also the <a href="https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adir1.html">Adir1 API docs</a>.
                    </td></tr>
                </thead>
                <tbody>
                    <tr>
                        <th style="text-align: left">
                            Storage URL
                        </th>
                        <td>{self._url}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data releases available
                        </th>
                        <td>{', '.join(self.releases)}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Results cache
                        </th>
                        <td>{self._results_cache}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Cohorts analysis
                        </th>
                        <td>{self._cohorts_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Site filters analysis
                        </th>
                        <td>{self._site_filters_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Software version
                        </th>
                        <td>malariagen_data {malariagen_data.__version__}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Client location
                        </th>
                        <td>{self.client_location}</td>
                    </tr>
                </tbody>
            </table>
        """
        return html
47 changes: 25 additions & 22 deletions malariagen_data/anoph/sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,16 @@ def _parse_general_metadata(
axis="columns",
)

# TODO: the study and terms-of-use columns below are temporarily disabled because (a) this is not a public release and (b) enabling them currently causes failures whose cause has not yet been investigated.
# Add study columns.
study_info = self.lookup_study_info(sample_set=sample_set)
for column in study_info:
df[column] = study_info[column]
# study_info = self.lookup_study_info(sample_set=sample_set)
# for column in study_info:
# df[column] = study_info[column]

# Add terms-of-use columns.
terms_of_use_info = self.lookup_terms_of_use_info(sample_set=sample_set)
for column in terms_of_use_info:
df[column] = terms_of_use_info[column]
# terms_of_use_info = self.lookup_terms_of_use_info(sample_set=sample_set)
# for column in terms_of_use_info:
# df[column] = terms_of_use_info[column]

return df

Expand Down Expand Up @@ -617,29 +618,31 @@ def sample_metadata(
# Note: this includes study and terms-of-use info.
df_samples = self.general_metadata(sample_sets=prepped_sample_sets)

# TODO: the metadata merges below are temporarily disabled because they break some things; re-enable once fixed.

# Merge with the sequence QC metadata.
df_sequence_qc = self.sequence_qc_metadata(
sample_sets=prepped_sample_sets
)
# df_sequence_qc = self.sequence_qc_metadata(
# sample_sets=prepped_sample_sets
# )

# Note: merging can change column dtypes
df_samples = df_samples.merge(
df_sequence_qc, on="sample_id", sort=False, how="left"
)
# df_samples = df_samples.merge(
# df_sequence_qc, on="sample_id", sort=False, how="left"
# )

# If available, merge with the AIM metadata.
if self._aim_analysis:
df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
df_samples = df_samples.merge(
df_aim, on="sample_id", sort=False, how="left"
)
# if self._aim_analysis:
# df_aim = self.aim_metadata(sample_sets=prepped_sample_sets)
# df_samples = df_samples.merge(
# df_aim, on="sample_id", sort=False, how="left"
# )

# If available, merge with the cohorts metadata.
if self._cohorts_analysis:
df_cohorts = self.cohorts_metadata(sample_sets=prepped_sample_sets)
df_samples = df_samples.merge(
df_cohorts, on="sample_id", sort=False, how="left"
)
# if self._cohorts_analysis:
# df_cohorts = self.cohorts_metadata(sample_sets=prepped_sample_sets)
# df_samples = df_samples.merge(
# df_cohorts, on="sample_id", sort=False, how="left"
# )

# Store sample metadata in the cache.
self._cache_sample_metadata[cache_key] = df_samples
Expand Down
71 changes: 54 additions & 17 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,31 +574,68 @@ def roh_hmm(
debug = self._log.debug

resolved_region: Region = parse_single_region(self, region)
del region

debug("compute windowed heterozygosity")
sample_id, sample_set, windows, counts = self._sample_count_het(
name = "roh"

params = dict(
sample=sample,
region=resolved_region,
site_mask=site_mask,
region=region,
window_size=window_size,
site_mask=site_mask,
sample_set=sample_set,
chunks=chunks,
inline_array=inline_array,
)

debug("compute runs of homozygosity")
df_roh = self._roh_hmm_predict(
windows=windows,
counts=counts,
phet_roh=phet_roh,
phet_nonroh=phet_nonroh,
transition=transition,
window_size=window_size,
sample_id=sample_id,
contig=resolved_region.contig,
chunks=chunks,
inline_array=inline_array,
)

del region

try:
# Load cached numeric data, adding str / obj data again.
results = self.results_cache_get(name=name, params=params)
df_roh = pd.DataFrame(results)
df_roh["sample_id"] = sample
df_roh["contig"] = resolved_region.contig

except CacheMiss:
debug("compute windowed heterozygosity")
sample_id, sample_set, windows, counts = self._sample_count_het(
sample=sample,
region=resolved_region,
site_mask=site_mask,
window_size=window_size,
sample_set=sample_set,
chunks=chunks,
inline_array=inline_array,
)

debug("compute runs of homozygosity")
df_roh = self._roh_hmm_predict(
windows=windows,
counts=counts,
phet_roh=phet_roh,
phet_nonroh=phet_nonroh,
transition=transition,
window_size=window_size,
sample_id=sample_id,
contig=resolved_region.contig,
)

# Specify numeric columns to save (saving object columns - sample ID and contig - breaks the save).
columns_to_save = [
"roh_start",
"roh_stop",
"roh_length",
"roh_is_marginal",
]
self.results_cache_set(
name=name,
params=params,
results={col: df_roh[col].to_numpy() for col in columns_to_save},
)

return df_roh

@check_types
Expand Down Expand Up @@ -1306,7 +1343,7 @@ def ihs_gwss(
) -> Tuple[np.ndarray, np.ndarray]:
# change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data
name = self._ihs_gwss_cache_name
name = "roh"

params = dict(
contig=contig,
Expand Down
Loading
Loading