Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
742 changes: 742 additions & 0 deletions 3.analyze_data/visualize_umaps/compute_plate_umaps.ipynb

Large diffs are not rendered by default.

Comment thread
MattsonCam marked this conversation as resolved.
Comment thread
MattsonCam marked this conversation as resolved.
Comment thread
MattsonCam marked this conversation as resolved.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
177 changes: 177 additions & 0 deletions 3.analyze_data/visualize_umaps/nbconverted/compute_plate_umaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python
# coding: utf-8

# # Sample JUMP plate data to compute UMAP.
# This UMAP data was computed by sampling QC'd and feature-selected cellprofiler profiles.

# In[1]:


import pathlib

import numpy as np
import pandas as pd
import umap


# # Inputs

# In[2]:


git_root_path = pathlib.Path("../../")
big_drive_path = pathlib.Path("/mnt/big_drive").resolve(strict=True)
feature_data_path = big_drive_path / "feature_selected_sc_qc_data"
plate_to_treat_typedf = pd.read_csv(
(git_root_path / "reference_plate_data/barcode_platemap.csv").resolve(strict=True)
)
plate_to_treat_typedf = plate_to_treat_typedf.rename(
columns={
"Assay_Plate_Barcode": "Metadata_Plate",
"Plate_Map_Name": "Metadata_Treatment_Type",
}
)


# # Outputs

# In[3]:


umap_data_path = big_drive_path / "umap_data/feature_selected_sc_qc_data"
umap_data_path.mkdir(parents=True, exist_ok=True)


# ## Process Plate Mappings

# In[4]:


replacements = [
"crispr",
"orf",
"compound",
]

conditions = [
plate_to_treat_typedf["Metadata_Treatment_Type"].str.contains(
k, case=False, na=False
)
for k in replacements
]

plate_to_treat_typedf["Metadata_Treatment_Type"] = np.select(
conditions, replacements, default=plate_to_treat_typedf["Metadata_Treatment_Type"]
)


# # Sample Single Cells
# Sample cells from plate data.

# In[5]:


merge_cols = [
"Metadata_Plate",
"Metadata_Site",
"Metadata_Well",
"Metadata_ObjectNumber",
]

umapdf = []

for plate_path in feature_data_path.iterdir():
plate_name = plate_path.stem.split("_")[0]

print(f"Sampling Plate {plate_name}")
anomaly_path = (
big_drive_path
/ f"sc_anomaly_data/feature_selected_sc_qc_data/{plate_name}_feature_selected_sc_qc"
)

anomalydf = pd.concat(
[pd.read_parquet(path) for path in anomaly_path.iterdir()], axis=0
Comment thread
MattsonCam marked this conversation as resolved.
)

featdf = pd.read_parquet(
big_drive_path
/ f"feature_selected_sc_qc_data/{plate_name}_feature_selected_sc_qc.parquet"
)

result_cols = anomalydf.columns[anomalydf.columns.str.contains("Result")].tolist()

# Include the anomaly data
scdf = pd.merge(
left=anomalydf[result_cols + merge_cols],
right=featdf,
how="inner",
on=merge_cols,
)

# Include the treatment type
scdf = pd.merge(
left=scdf, right=plate_to_treat_typedf, how="inner", on="Metadata_Plate"
)

scdf.loc[
~scdf["Metadata_control_type"].isin(["negcon"]), "Metadata_control_type"
] = "other"

group_sizes = scdf["Metadata_control_type"].value_counts()
large_groups = group_sizes[group_sizes > 250].index
small_groups = group_sizes[group_sizes <= 250].index
sampled_large = (
scdf[scdf["Metadata_control_type"].isin(large_groups)]
.groupby("Metadata_control_type", group_keys=False)
.sample(n=250, random_state=0)
)
small = scdf[scdf["Metadata_control_type"].isin(small_groups)]
scdf = pd.concat([sampled_large, small], axis=0)

umapdf.append(scdf)

umapdf = pd.concat(umapdf, axis=0)
umapdf = umapdf.dropna(axis=1, how="any")
Comment thread
MattsonCam marked this conversation as resolved.

print("Shape of plate data after sampling:", umapdf.shape)
print(umapdf["Metadata_control_type"].unique())


# # Compute UMAP Components
# Drop all feature data not associated with UMAP, result, or metadata data.

# In[6]:


def compute_umap_components(umapdf: pd.DataFrame):
umap_drop_cols = [
col for col in umapdf.columns if "Metadata" in col or "Result" in col
]

umapdf = umapdf.sample(frac=1, random_state=0)
reducer = umap.UMAP(n_components=2, random_state=0)
umap_data = reducer.fit_transform(umapdf.drop(columns=umap_drop_cols))
umapdf = umapdf.copy()
umapdf[["umap_0", "umap_1"]] = umap_data[:, :2]

return umapdf[umap_drop_cols + ["umap_0", "umap_1"]]


umapdf = compute_umap_components(umapdf=umapdf)


# In[7]:


print("\nColumns of final umap:", umapdf.columns.tolist())
print(f"\nShape of final umap: {umapdf.shape}")
print(umapdf["Metadata_control_type"].value_counts())


# # Save UMAP Data

# In[8]:


umapdf.to_parquet(umap_data_path / "umap_feature_selected_sc_qc_data.parquet")

85 changes: 85 additions & 0 deletions 3.analyze_data/visualize_umaps/nbconverted/visualize_plate_umaps.r
Comment thread
MattsonCam marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(arrow))


# Find the root of the git repo
project_root <- normalizePath(
system("git rev-parse --show-toplevel", intern = TRUE),
winslash = "/",
mustWork = TRUE
)

big_drive_dir <- file.path(project_root, "../../../../mnt", "big_drive")
umap_anomaly_path <- file.path(big_drive_dir, "umap_data", "feature_selected_sc_qc_data")
output_fig_dir <- file.path(project_root, "3.analyze_data", "visualize_umaps", "figures")
dir.create(output_fig_dir, recursive = TRUE, showWarnings = FALSE)

# Set directory and file structure
umap_path <- file.path(umap_anomaly_path, "umap_feature_selected_sc_qc_data.parquet")
umap_df <- read_parquet(umap_path)

umap_axisless_theme <- theme_bw(base_size = 16) +
theme(
plot.title = element_text(size = 18, face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12),
axis.title = element_text(size = 14),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.line = element_blank(),
panel.grid = element_blank()
)


umap_figure <-
ggplot(umap_df, aes(x = umap_0, y = umap_1, color = Metadata_Treatment_Type)) +
geom_point(shape = 20, size = 2, alpha = 4) +
scale_color_manual(
Comment thread
MattsonCam marked this conversation as resolved.
name = "Treatment Type",
values = c("compound" = "#FF6347", "orf" = "#4169E1", "crispr" = "#90EE90")
) +
labs(title = "UMAP Labeled with Treatment Type", x = "UMAP1", y = "UMAP2") +
umap_axisless_theme
umap_plot_fig <- file.path(output_fig_dir, "treatment_type_umap.png")

# Save the plot to a file (e.g., in PNG format)
ggsave(umap_plot_fig, umap_figure, width = 10, height = 8, dpi = 500)

umap_figure


# Create UMAP labelled with the anomaly score as gradient
umap_figure <-
ggplot(umap_df, aes(x = umap_0, y = umap_1, color = Result_anomaly_score)) +
geom_point(shape = 20, size = 1, alpha = 0.5) +
scale_color_gradient(
low = "blue", high = "lightgrey",
name = "Anomaly Score"
) +
labs(title = "UMAP Labeled with Anomaly Score", x = "UMAP1", y = "UMAP2") +
umap_axisless_theme

umap_plot_fig <- file.path(output_fig_dir, "anomaly_score_umap.png")

# Save the plot to a file (e.g., in PNG format)
ggsave(umap_plot_fig, umap_figure, width = 10, height = 8, dpi = 500)

umap_figure


umap_figure <-
ggplot(umap_df, aes(x = umap_0, y = umap_1, color = Metadata_control_type)) +
geom_point(shape = 20, size = 2, alpha = 4) +
scale_color_manual(
Comment thread
MattsonCam marked this conversation as resolved.
name = "Control Type",
values = c("other" = "#190744", "negcon" = "#d60101")
) +
labs(title = "UMAP Labeled with Control Type", x = "UMAP1", y = "UMAP2") +
umap_axisless_theme

umap_plot_fig <- file.path(output_fig_dir, "control_type_umap.png")

# Save the plot to a file (e.g., in PNG format)
ggsave(umap_plot_fig, umap_figure, width = 10, height = 8, dpi = 500)

umap_figure
Loading