# Inma Hernandez Lopez
# inmaculada.hernandez-lopez@ukr.de
# LIT - Leibniz-Institute für Immunotherapie
# Marc Elosua Bayés
# marc.elosua@cnag.crg.eu
# CNAG - CRG
# PSEUDOBULK
library(Seurat)
library(tidyverse)
library(devtools)
# install_github("https://github.com/MarcElosua/SPOTlight", ref = "bioc_rcpp")
library(SPOTlight)
library(SingleCellExperiment)
library(here)
library(glue)
source(here("paths.R"))
olig <- "{sc_data}/OE0145-IDH_integrated_oligodendroglioma_NMF_labelled_with_metacell_mapping_and_1p_scores_newlabel.rds" %>%
glue() %>%
here() %>%
readRDS()
# Remove SCT since we don't need it anymore
hvg <- VariableFeatures(olig)
DefaultAssay(olig) <- "RNA"
olig[["SCT"]] <- NULL
labs_ls <- "{sc_data}/labels.rds" %>%
glue() %>%
here() %>%
readRDS()
lab_oligo <- labs_ls$OD %>%
data.frame() %>%
tibble::rownames_to_column("bc") %>%
dplyr::rename("new_annot" = ".")
# Add them to the object
olig@meta.data <- olig@meta.data %>%
tibble::rownames_to_column("bc") %>%
left_join(lab_oligo, by = "bc") %>%
tibble::column_to_rownames("bc")
DimPlot(
olig,
group.by = c("new_annot", "Subclusters", "NMF_labelling"),
raster = FALSE,
ncol = 2)
with(olig@meta.data, table(new_annot, Subclusters))
# Remove excluded cells
olig <- olig[, olig$new_annot != "Excluded"]
# DefaultAssay(olig) <- "RNA"
pseudo_ls <- lapply(unique(olig$orig.ident), function(i) {
tmp <- olig[, olig$orig.ident == i]@assays$RNA@counts
as.matrix(rowSums(tmp))
})
olig_pseudobulk <- Reduce(cbind, pseudo_ls)
# Remove all genes that are 0
olig_pseudobulk <- olig_pseudobulk[rowSums(olig_pseudobulk) > 0, ]
colnames(olig_pseudobulk) <- unique(olig$orig.ident)
ct_proportion <- olig@meta.data %>%
dplyr::count(orig.ident, new_annot) %>%
dplyr::group_by(orig.ident) %>%
dplyr::mutate(freq = n / sum(n)) %>%
data.frame()
table(olig$Subclusters)
table(olig$NMF_labelling)
table(olig$Subclusters, olig$NMF_labelling)
# Load marker genes from MuSiC_inma_olig_pseudobulk
mgs <- "{sc_data}/mgs_olig.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
mgenes <- unique(mgs$gene)
feats <- union(hvg, mgenes)
gc()
sp_deconv <- SPOTlight(
x = olig,
y = olig_pseudobulk,
groups = as.character(olig$new_annot),
mgs = mgs,
gene_id = "gene",
group_id = "cluster",
weight_id = "avg_log2FC",
hvg = hvg
)
"{sc_data}/spotlight_deconv_olig_pseudobulk.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = sp_deconv,
file = .)
library(colorBlindness)
sp_pred <- sp_deconv[["mat"]]
sp_pred_long <- sp_pred %>%
data.frame(check.names = FALSE) %>%
tibble::rownames_to_column("ID") %>%
pivot_longer(
cols = -ID,
names_to = "cell_type",
values_to = "proportion")
# Color palette for these cell types
cpalette <- colorBlindness::paletteMartin[1:ncol(sp_pred)]
names(cpalette) <- colnames(sp_pred)
# Visualization of proportions
sp_pred_long %>%
ggplot() +
geom_col(
aes(x = ID, y = proportion, fill = cell_type),
position = "fill") +
scale_fill_manual(
values = cpalette) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
sp_pred_long %>%
filter(cell_type == "RA") %>%
ggplot() +
geom_col(
aes(x = forcats::fct_reorder(ID, -proportion), y = proportion, fill = cell_type)) +
scale_fill_manual(values = cpalette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ct_proportion$new_annot <- str_replace(ct_proportion$new_annot, "-", ".")
df <- sp_pred_long %>%
left_join(ct_proportion, by = c("ID" = "orig.ident", "cell_type" = "new_annot")) %>%
mutate(freq = if_else(is.na(freq), 0, freq))
"{sc_data}/spotlight_deconv_olig_pseudobulk_long.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = df,
file = .)
ggplot(df) +
geom_point(aes(x = freq, y = proportion, color = ID)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
facet_wrap( ~ cell_type, scales = "free") +
labs(x = "Ground-Truth Proportions", y = "Predicted Proportions") +
scale_color_brewer(palette = "Dark2") +
theme_minimal()
# TCGA
library(Seurat)
library(tidyverse)
library(devtools)
# install_github("https://github.com/MarcElosua/SPOTlight", ref = "bioc_rcpp")
library(SPOTlight)
library(SingleCellExperiment)
library(here)
library(glue)
source(here("paths.R"))
olig <- "{sc_data}/OE0145-IDH_integrated_oligodendroglioma_NMF_labelled_with_metacell_mapping_and_1p_scores_newlabel.rds" %>%
glue() %>%
here() %>%
readRDS()
hvg <- VariableFeatures(olig)
# Remove SCT since we don't need it anymore
DefaultAssay(olig) <- "RNA"
olig[["SCT"]] <- NULL
labs_ls <- "{sc_data}/labels.rds" %>%
glue() %>%
here() %>%
readRDS()
lab_oligo <- labs_ls$OD %>%
data.frame() %>%
tibble::rownames_to_column("bc") %>%
dplyr::rename("new_annot" = ".")
# Add them to the object
olig@meta.data <- olig@meta.data %>%
tibble::rownames_to_column("bc") %>%
left_join(lab_oligo, by = "bc") %>%
tibble::column_to_rownames("bc")
DimPlot(
olig,
group.by = c("new_annot", "Subclusters", "NMF_labelling"),
raster = FALSE,
ncol = 2)
with(olig@meta.data, table(new_annot, Subclusters))
# Remove excluded cells
olig <- olig[, olig$new_annot != "Excluded"]
table(olig$Subclusters)
table(olig$NMF_labelling)
table(olig$Subclusters, olig$NMF_labelling)
# Load marker genes from MuSiC_inma_olig_pseudobulk
mgs <- "{sc_data}/mgs_olig.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
mgenes <- unique(mgs$gene)
feats <- union(hvg, mgenes)
gc()
library(SummarizedExperiment)
olig_bulk <- "{tcga_data}/primary/Oligodendroglioma_bulkRNA_TCGA.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
cdata <- "{tcga_data}/TCGA_metadata.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
# Subset to Oligo
cdata <- cdata %>%
filter(file_name %in% colnames(olig_bulk)) %>%
tibble::rownames_to_column("TCGA_id") %>%
tibble::column_to_rownames("file_name") %>%
DataFrame()
se <- SummarizedExperiment(
assays = list("counts" = as.matrix(olig_bulk)),
colData = cdata[colnames(olig_bulk), ])
# quickly check the millions of fragments that could be mapped by Salmon to the genes
se[["depth_M"]] <- round( colSums(olig_bulk) / 1e6, 1 )
colData(se) %>%
data.frame() %>%
ggplot() +
geom_col(aes(x = forcats::fct_reorder(TCGA_id, -depth_M), y = depth_M, fill = tumor_type)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
library(org.Hs.eg.db)
# Using the org.Hs.eg.db we set up mapping info - if you look at the documentation you
# can also obtain other keytypes
ens_genes <- str_split(string = rownames(olig_bulk), pattern = "\\.", simplify = TRUE)[, 1]
annots <- select(
org.Hs.eg.db,
keys = ens_genes,
columns = "SYMBOL",
keytype = "ENSEMBL")
# Check which Symbols are mapping to multiple ensemble IDs
symb_dup <- annots %>%
dplyr::count(SYMBOL) %>%
filter(n > 1 & ! is.na(SYMBOL)) %>%
pull("SYMBOL")
sym_to_ens_dup <- annots %>% dplyr::filter(SYMBOL %in% symb_dup) %>% pull(ENSEMBL)
##
ens_to_symb_dup <- annots %>%
dplyr::count(ENSEMBL) %>%
filter(n > 1) %>%
pull("ENSEMBL")
# Check which ensemble IDs don't map to any symbol
ens_na <- annots %>% dplyr::filter(is.na(SYMBOL)) %>% pull(ENSEMBL)
ens_rm <- Reduce(union, list(sym_to_ens_dup, ens_to_symb_dup, ens_na))
olig_bulk <- olig_bulk[which(!ens_genes %in% ens_rm), ]
symb <- str_split(string = rownames(olig_bulk), pattern = "\\.", simplify = TRUE)[, 1] %>%
data.frame("ENSEMBL" = .) %>%
left_join(annots, by = "ENSEMBL") %>%
pull(SYMBOL)
# replace ensemble ID with symbol
# ens_check <- symb %>% dplyr::count(ENSEMBL) %>% dplyr::filter(n > 1) %>% pull("ENSEMBL")
rownames(olig_bulk) <- symb
# Remove uninformative genes
olig_bulk <- olig_bulk[rowSums(olig_bulk) > 0, ]
sp_deconv <- SPOTlight(
x = olig,
y = as.matrix(olig_bulk),
groups = as.character(olig$new_annot),
mgs = mgs,
gene_id = "gene",
group_id = "cluster",
weight_id = "avg_log2FC",
hvg = hvg
)
"{sc_data}/spotlight_deconv_olig_TCGA.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = sp_deconv,
file = .)
library(colorBlindness)
cdata$ID <- rownames(cdata)
sp_pred <- sp_deconv[["mat"]]
sp_pred_long <- sp_pred %>%
data.frame(check.names = FALSE) %>%
tibble::rownames_to_column("ID") %>%
pivot_longer(
cols = -ID,
names_to = "cell_type",
values_to = "proportion") %>%
left_join(data.frame(cdata), by = "ID")
# Color palette for these cell types
cpalette <- colorBlindness::paletteMartin[1:ncol(sp_pred)]
names(cpalette) <- colnames(sp_pred)
# Visualization of proportions
sp_pred_long %>%
ggplot() +
geom_col(
aes(x = TCGA_id, y = proportion, fill = cell_type),
position = "fill") +
scale_fill_manual(
values = cpalette) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
x_ord <- sp_pred_long %>%
dplyr::filter(cell_type == "RA") %>%
arrange(desc(proportion)) %>%
pull(TCGA_id)
sp_pred_long %>%
ggplot() +
geom_col(
aes(x = TCGA_id, y = proportion, fill = cell_type),
position = "fill") +
scale_fill_manual(values = cpalette) +
scale_x_discrete(limits = x_ord) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
sp_pred_long %>%
filter(cell_type == "RA") %>%
ggplot() +
geom_col(
aes(x = forcats::fct_reorder(TCGA_id, -proportion), y = proportion, fill = cell_type)) +
scale_fill_manual(values = cpalette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
"{sc_data}/spotlight_deconv_olig_TCGA_RA_proportion.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = sp_pred_long,
file = .)
# CGGA
library(Seurat)
library(tidyverse)
library(devtools)
# install_github("https://github.com/MarcElosua/SPOTlight", ref = "bioc_rcpp")
library(SPOTlight)
library(SingleCellExperiment)
library(here)
library(glue)
source(here("paths.R"))
olig <- "{sc_data}/OE0145-IDH_integrated_oligodendroglioma_NMF_labelled_with_metacell_mapping_and_1p_scores_newlabel.rds" %>%
glue() %>%
here() %>%
readRDS()
hvg <- VariableFeatures(olig)
# Remove SCT since we don't need it anymore
DefaultAssay(olig) <- "RNA"
olig[["SCT"]] <- NULL
# lab_astro <- readRDS("~/Downloads/labels_astro_primary.rds")
labs_ls <- "{sc_data}/labels.rds" %>%
glue() %>%
here() %>%
readRDS()
lab_oligo <- labs_ls$OD %>%
data.frame() %>%
tibble::rownames_to_column("bc") %>%
dplyr::rename("new_annot" = ".")
# Add them to the object
olig@meta.data <- olig@meta.data %>%
tibble::rownames_to_column("bc") %>%
left_join(lab_oligo, by = "bc") %>%
tibble::column_to_rownames("bc")
DimPlot(
olig,
group.by = c("new_annot", "Subclusters", "NMF_labelling"),
raster = FALSE,
ncol = 2)
with(olig@meta.data, table(new_annot, Subclusters))
# Remove excluded cells
olig <- olig[, olig$new_annot != "Excluded"]
table(olig$Subclusters)
table(olig$NMF_labelling)
table(olig$Subclusters, olig$NMF_labelling)
# Load marker genes from MuSiC_inma_olig_pseudobulk
mgs <- "{sc_data}/mgs_olig.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
mgenes <- unique(mgs$gene)
feats <- union(hvg, mgenes)
gc()
library(SummarizedExperiment)
olig_bulk <- "{cgga_data}/primary/Oligodendroglioma_primary_bulkRNA_CGGA.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
cdata <- "{cgga_data}/metadata_CGGA.rds" %>%
glue() %>%
here() %>%
readRDS(file = )
# Subset to Oligo
cdata <- cdata %>%
filter(CGGA_ID %in% colnames(olig_bulk)) %>%
tibble::rownames_to_column("CGGA_id") %>%
tibble::column_to_rownames("CGGA_ID") %>%
DataFrame()
cdata$ID <- rownames(cdata)
cdata$CGGA_id <- NULL
se <- SummarizedExperiment(
assays = list("counts" = as.matrix(olig_bulk)),
colData = cdata[colnames(olig_bulk), ])
# quickly check the millions of fragments that could be mapped by Salmon to the genes
se[["depth_M"]] <- round( colSums(olig_bulk) / 1e6, 1 )
colData(se) %>%
data.frame() %>%
ggplot() +
geom_col(aes(x = forcats::fct_reorder(ID, -depth_M), y = depth_M, fill = PRS_type)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
# Remove uninformative genes
olig_bulk <- olig_bulk[rowSums(olig_bulk) > 0, ]
sp_deconv <- SPOTlight(
x = olig,
y = as.matrix(olig_bulk),
groups = as.character(olig$new_annot),
mgs = mgs,
gene_id = "gene",
group_id = "cluster",
weight_id = "avg_log2FC",
hvg = hvg
)
"{sc_data}/spotlight_deconv_olig_CGGA.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = sp_deconv,
file = .)
library(colorBlindness)
cdata$ID <- rownames(cdata)
sp_pred <- sp_deconv[["mat"]]
sp_pred_long <- sp_pred %>%
data.frame(check.names = FALSE) %>%
tibble::rownames_to_column("ID") %>%
pivot_longer(
cols = -ID,
names_to = "cell_type",
values_to = "proportion") %>%
left_join(data.frame(cdata), by = "ID")
# Color palette for these cell types
cpalette <- colorBlindness::paletteMartin[1:ncol(sp_pred)]
names(cpalette) <- colnames(sp_pred)
# Visualization of proportions
sp_pred_long %>%
ggplot() +
geom_col(
aes(x = ID, y = proportion, fill = cell_type),
position = "fill") +
scale_fill_manual(
values = cpalette) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
x_ord <- sp_pred_long %>%
dplyr::filter(cell_type == "RA") %>%
arrange(desc(proportion)) %>%
pull(ID)
sp_pred_long %>%
ggplot() +
geom_col(
aes(x = ID, y = proportion, fill = cell_type),
position = "fill") +
scale_fill_manual(values = cpalette) +
scale_x_discrete(limits = x_ord) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
sp_pred_long %>%
filter(cell_type == "RA") %>%
ggplot() +
geom_col(
aes(x = forcats::fct_reorder(ID, -proportion), y = proportion, fill = cell_type)) +
scale_fill_manual(values = cpalette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
"{sc_data}/spotlight_deconv_olig_CGGA_RA_proportion.rds" %>%
glue() %>%
here() %>%
saveRDS(
object = sp_pred_long,
file = .)