# Enrique Blanco Carmona# e.blancocarmona@kitz-heidelberg.de# PhD Student – Clinical Bioinformatics# Division of Pediatric Neurooncology (B062)# DKFZ-KiTZ | Germany# ------------------------------------------------------------------# 8 - NMF ANALYSIS ON THE TUMOR BULK# ------------------------------------------------------------------# Notes:# The first half of this code, up to individual NMF signature generation and also the function for# scoring metasignatures has been adapted from Volker Hovestadt's code.# Methodology explained in: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6754173/## The following code is strongly suggested to be run in a cluster, for each sample individually as a job.# It can take easily 100+ GB of RAM for each unique sample in sample$orig.ident.nmf_output_folder<-""# Folder to store the results.sample_index<-1# This will select the sample out of unique(sample$orig.ident). Modify this parameter as you run it on the cluster.tumor_clusters<-c()# Vector with the clusters that form the tumor bulk.sample.tumor<-sample[, sample$New_NMF_labelling%in%tumor_clusters]# Center the genes over malignant cells.sample.tumor<-Seurat::ScaleData(sample.tumor)nmf.list<-BiocGenerics::lapply(unique(sample.tumor$orig.ident)[sample_index],function(samp){message(samp)# Subset the tumor population pertaining to the sample.tum<-sample.tumor[, sample.tumor$orig.ident==samp]# Re-center the genes for this sample.tum<-Seurat::ScaleData(tum)# Extract the count matrix.cts<-tum@assays$SCT@scale.data# Set negative values to 0.cts[cts<0]<-0# Remove rows without expression in this sample.cts<-cts[rowSums(cts)>0, ]# Compute NMF for different K (i.e: signatures).# From 2 to 10 NMF signatures.k<-2:10nrun<-10seed<-777# Let's have some luck :P.# Run NMF.message(sprintf("Computing NMF for sample: %s", samp))nmf<-NMF::nmf(x =cts, rank =k, nrun =nrun, seed =seed, method ="snmf/r", .options ="p4v")# Save the results.save(list =c("cts", "nmf"), file =paste0(nmf_output_folder, "_", samp, "_nmf2to10", ".RData"))})# Generate further directories for the results.nmf_heatmap_summary<-sprintf("%s/signature_summary/", nmf_output_folder)nmf_feature_plots<-sprintf("%s/feature_plots/", nmf_output_folder)nmf_signature_heatmaps<-sprintf("%s/signature_heatmaps/", nmf_output_folder)nmf_go_kegg<-sprintf("%s/signature_GO_KEGG/", nmf_output_folder)nmf_top30<-sprintf("%s/top30_genes_per_signature/", nmf_output_folder)nmf_metaclusters<-sprintf("%s/metaclusters/", nmf_output_folder)dir.create(nmf_heatmap_summary, recursive =T)dir.create(nmf_feature_plots, recursive =T)dir.create(nmf_signature_heatmaps, recursive =T)dir.create(nmf_go_kegg, recursive =T)dir.create(nmf_top30, recursive =T)dir.create(nmf_metaclusters, recursive =T)# Remove MT genes, as being a snRNAseq experiment they are prone to appear later on as meta-signatures.# Retrieve MT genes.mitochondrial_genes<-rownames(tumor_scaled)[grepl("^MT", rownames(tumor_scaled))]# Filter out ribosomal genes.tumor_scaled<-tumor_scaled[!(rownames(tumor_scaled)%in%mitochondrial_genes), ]# Read in the individual NMF signatures.lf<-list.files(nmf_output_folder, pattern ="nmf2to10.R*", full.names =TRUE)nmf.list<-BiocGenerics::lapply(lf, function(x){load(x)nmf})fi.names<-lf<-list.files(nmf_output_folder, pattern ="nmf2to10.R*", full.names =FALSE)fi.names<-BiocGenerics::sapply(fi.names, function(x)strsplit(x, "_nmf")[[1]][1])fi.names<-BiocGenerics::sapply(fi.names, function(x)strsplit(x, "^_")[[1]][2])names(nmf.list)<-fi.namesmessage("Analyzing NMF signatures from k = 2 until k = 10.")times<-0only_one_k<-FALSE# Check if only NMF was computed for only one value of K.for(kinseq(2, 10)){message(sprintf("Using k = %s", k))message("...")times<-times+1# Start a report.nmf_heatmap_summary_out<-sprintf("%s/k_%s/", nmf_heatmap_summary, k)dir.create(nmf_heatmap_summary_out, recursive =T)grDevices::pdf(sprintf("%s/%s_nmf%s.heatmap.pdf", nmf_heatmap_summary, sample_name, k), width =6, height =6)graphics::par(mar =c(4, 4, 4, 4))# Initialize empty lists to store data.list.cor<-list()list.top30<-list()for(iinnames(nmf.list)){if(only_one_k==FALSE){X.nmf<-nmf.list[[i]][["fit"]][[times]]}elseif(only_one_k==TRUE){X.nmf<-nmf.list[[i]]@fit}# Basis (metagenes).X.nmf.basis<-NMF::basis(X.nmf)# Exclude mitochondrial genes.X.nmf.basis<-X.nmf.basis[!(rownames(X.nmf.basis)%in%rownames(X.nmf.basis)[grepl("^MT", rownames(X.nmf.basis))]), ]colnames(X.nmf.basis)<-paste0("k", seq(ncol(X.nmf.basis)))X.nmf.basis.hc1<-stats::hclust(stats::dist(X.nmf.basis), method ="average")# Plot heatmap.heatmap(X.nmf.basis, Rowv =stats::as.dendrogram(X.nmf.basis.hc1), Colv =NA, zlim =c(0, max(X.nmf.basis)), scale ="n", col =grDevices::grey.colors(1000, 1, 0), useRaster =TRUE, labRow =FALSE, main =paste0(i, ", base"))# Coef.X.nmf.coef<-NMF::coef(X.nmf)rownames(X.nmf.coef)<-paste0("k", seq(nrow(X.nmf.coef)))X.nmf.coef.hc1<-stats::hclust(stats::dist(X.nmf.coef), method ="average")X.nmf.coef.hc2<-stats::hclust(stats::dist(t(X.nmf.coef)), method ="average")heatmap(X.nmf.coef[rev(seq(nrow(X.nmf.coef))), ], Rowv =NA, Colv =stats::as.dendrogram(X.nmf.coef.hc2), zlim=c(0, max(X.nmf.coef)), scale="n", col =grDevices::grey.colors(1000, 1, 0), useRaster=FALSE, labCol=FALSE, main=paste0(i, ", coef"))# Top genes, just use basis.X.nmf.basis.cor<-t(X.nmf.basis)list.cor[[i]]<-X.nmf.basis.corX.nmf.basis.top30<-apply(X.nmf.basis.cor, 1, function(x)colnames(X.nmf.basis.cor)[order(x, decreasing =TRUE)[1:30]])list.top30[[i]]<-X.nmf.basis.top30Y<-tumor_scaled[as.vector(X.nmf.basis.top30), intersect(colnames(X.nmf.coef), colnames(tumor_scaled))]#Y<-Y-rowMeans(Y)# re-centerY.hc<-stats::as.dendrogram(stats::hclust(stats::as.dist(1-stats::cor(Y)), method ="ward.D"))#Y.hc <- stats::reorder(Y.hc, wts = colMeans(tumor_scaled[ribosomal_genes, ])-colMeans(tumor_scaled[c(x1, x2), ]), agglo.FUN = mean)#Z <- as.matrix(scaleMinMax(Y , -6, 6))Z<-as.matrix(scale(Y))heatmap(Z[rev(seq(nrow(Z))), ], Rowv =NA, Colv =as.dendrogram(Y.hc), scale ="n", zlim =c(0, 1), useRaster =TRUE, cexRow =0.5, cexCol =0.1, add.expr =graphics::abline(h =head(cumsum(rep(nrow(X.nmf.basis.top30),ncol(X.nmf.basis.top30))), -1)+0.5), main=paste0(i, ", ", paste0(dim(Z), collapse="x")))}grDevices::dev.off()save(list =c("list.top30", "list.cor", "nmf.list"), file =paste0(nmf_top30, "all_samples_nmf", k, "_signatures.RData"))list.top30.cbind<-do.call(cbind, list.top30)colnames(list.top30.cbind)<-unlist(BiocGenerics::lapply(names(list.top30), function(i){paste0(i, ".", seq(ncol(list.top30[[i]])))}))write.table(list.top30.cbind, file =paste0(nmf_top30, "all_samples_nmf", k, "_signatures.txt"), sep ="\t", quote =FALSE)}# Define 2 custom functions.# Define 2 functions.## (supposed) input for "scoreSignature":## X.center - centered expression matrix for all tumor cells## X.mean - mean expression for each gene across all tumor cells## s - signature genes# Function from Volker Hovestadt.scoreSignature<-function(X.center, X.mean, signature, n=100, verbose=F){if(verbose){message("Number of cells: ", ncol(X.center))message("Number of genes: ", nrow(X.center))message("Number of genes in signature: ", length(signature))message("...\n\n")}# Compute the signature score: A score per cell for the top 30 genes in this signature.s.score<-colMeans(# Compute the mean per column of the output so that you have 1 score per gene.do.call(rbind, # Concatenate the output row by row (gene by gene).BiocGenerics::lapply(signature, # Apply this function to each gene in the list.function(gene){## X.mean[gene] - X.mean => Substracts the median expression of the genes to the expression of the queried gene.## abs(X.mean[gene] - X.mean) ==> Have it as absolute values.## sort(abs(X.mean[gene] - X.mean)) ==> sort the values in ascending order.## names(sort(abs(X.mean[gene] - X.mean)) ==> keep only the names of the sorted genes in ascending order with the lowest difference in expression values.## names(sort(abs(X.mean[gene] - X.mean))[2:(n+1)]) Keep only the 2 to 101 genes (100 in totol) as control set (the 1st is the queried gene).g.n<-names(sort(abs(X.mean[gene]-X.mean))[2:(n+1)])## X.center[gene, ] => The expression values of the queried gene in each cell.## X.center[g.n, ] => The mean expression values of the control set in each cell.## X.center[gene, ] - colMeans(X.center[g.n, ]) => Difference in expression between queried gene and the control set per cell.X.center[gene, ]-colMeans(X.center[g.n, ])})# Therefore, this outputs a vector of the difference in expression values between the gene and the control set per cell.)# Therefore, the outputs a matrix where columns are cells and rows are each of the queried genes.# Each row will contain the difference of expression between each queried gene and the control set, per cells.)# Finally, this outputs the mean score of all genes belonging to the signature per cell.# In other words, it is the difference expression values between each gene belonging to the signature# and the control set of genes, which is defined as the top 100 most similar genes in expression values between each queried# gene and the rest of the genes in the centered expression matrix for all tumor cells, averaged for each cell.# s.score = named vector with names = each cell and value = signature score for the cells.return(s.score)}# Adaptation from Volker Hovestadt function.# Pretty much the same contept as scoreSignature but with a twist in the end. Instead of doing colMeans to get the average score for each cell for all of the genes in the signature.# rowMeans to get the averaged score for each gene for all of the cells in the dataset provided.scoreGene<-function(X.center, X.mean, signature, n=100, verbose=F){if(verbose){message("Number of cells: ", ncol(X.center))message("Number of genes: ", nrow(X.center))message("Number of genes in signature: ", length(signature))message("...\n\n")}s.score<-rowMeans(do.call(rbind, lapply(signature, function(gene){g.n<-names(sort(abs(X.mean[gene]-X.mean))[2:(n+1)])X.center[gene, ]-colMeans(X.center[g.n, ])})))return(s.score)}# The following code creates a list of heatmaps that will help you decide for which value of N (number of NMF signatures) is best.# This is decided based on the number and quality of the groups in the correlation plot.# List placeholder for plot outputs.list_plots<-list()list_nmf_signatures_heatmaps<-list()for(kinseq(2,10)){# Read in signatures.print("Reading signatures.")message(paste0("Using k = ", k))nmf.sign<-read.table(paste0(nmf_top30, "all_samples_nmf", k, "_signatures.txt"), sep ="\t")# Calculate the scores for each signature in each cell.X.mean<-rowMeans(tumor_scaled)# Mean expression level of the scale.data dataset of the tumor bulk.sign.scores<-apply(nmf.sign, 2, function(signature){scoreSignature(X.center =tumor_scaled, X.mean =X.mean,signature)})# Correlate/Cluster signatures over all scores.corr<-stats::cor(sign.scores)range<-max(abs(corr))# Compute the heatmap.h<-pheatmap::pheatmap(corr, color =grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(n =7, name ="RdBu")))(100), breaks =seq(-range, range, length.out =100), show_colnames =F, treeheight_col =0)# Store the heatmap.list_nmf_signatures_heatmaps[[paste0("nmf", k)]]<-h}# Modify this values accordinglyind_k<-10# Final k for individual signatures to use.meta_k<-4# Number of meta-clusters to derive from the signatures generated with ind_k.nmf.sign<-read.table(paste0(nmf_top30, "all_samples_nmf", ind_k, "_signatures.txt"), sep ="\t")X.mean<-rowMeans(tumor_scaled)sign.scores<-apply(nmf.sign, 2, function(signature){scoreSignature(X.center =tumor_scaled, X.mean =X.mean, s =signature)})# Correlate/Cluster signatures over all scores.corr<-stats::cor(sign.scores)range<-max(abs(corr))h<-pheatmap::pheatmap(corr, color =grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(n =7, name ="RdBu")))(100), breaks =seq(-range, range, length.out =100), show_colnames =F, treeheight_col =0, cutree_rows =meta_k, cutree_cols =meta_k)# Colors for the metasignature annotation.ann_colors<-colortools::wheel("steelblue", meta_k)names(ann_colors)<-BiocGenerics::sapply(as.character(seq(1, meta_k)), function(x){paste0("Metasignature_", x)})# Colors for the sample annotation.samp_colors<-colortools::wheel("navyblue", length(unique(sample.tumor$orig.ident)))names(samp_colors)<-unique(sample.tumor$orig.ident)# Colors for the grade annotation.subtype_colors<-c("#195A70", "#702F19")names(subtype_colors)<-c("2", "3")# List containing all colors.colors<-list("Metasignature"=ann_colors,"Origin"=samp_colors,"Grade"=subtype_colors)# Workaround to modify the names.metaclust<-stats::cutree(h$tree_row, k =meta_k)metaclust.s<-as.character(BiocGenerics::sapply(metaclust, function(x){paste0("Metasignature_", x)}))names(metaclust.s)<-names(metaclust)metaclust<-metaclust.s# Annotation values for the sample of origin.origin<-BiocGenerics::sapply(colnames(corr), function(x){substring(x, 1, nchar(x)-2)})origin<-BiocGenerics::sapply(origin, function(x){stringr::str_replace_all(x, "[.]", "-")})origin<-BiocGenerics::sapply(origin, function(x){stringr::str_replace_all(x, "-$", "")})# Annotation vlaues for the grade.subgroup<-origingrade_2<-unique(sample.tumor[, sample.tumor$grade=="2"]$orig.ident)grade_3<-unique(sample.tumor[, sample.tumor$grade=="3"]$orig.ident)origin<-factor(origin)subgroup[subgroup%in%grade_2]<-2subgroup[subgroup%in%grade_3]<-3cellwidth<-10cellheight<-10# First visualization of the correlation heatmap with annotations.h.anno<-pheatmap::pheatmap(corr, color =grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(n =7, name ="RdBu")))(100), breaks =seq(-range, range, length.out =100), show_colnames =F, treeheight_col =0, annotation_col =data.frame("Metasignature"=metaclust,"Origin"=origin,"Subtype"=subgroup), annotation_colors =colors, cellwidth =cellwidth, cellheight =cellheight)# Copy to modify the names. This can also be done to subset only given metasignatures if needed (hence, the .small naming).metaclust_small<-metaclust# Recompute the correlation.corr.small<-stats::cor(sign.scores[, names(metaclust_small)])range.small<-max(abs(corr.small))# Rename groups if needed.names_sign<-names(metaclust_small)assign_sign<-as.character(metaclust_small)# Rename the signatures accordingly.assign_sign[assign_sign%in%c("Metasignature_3", "Metasignature_5")]<-"Metasignature_1"# Rename several metasignatures at once.assign_sign[assign_sign=="Metasignature_6"]<-"Metasignature_3"# Rename individual metasignatures at one.# Regenerate the metaclust_small object.names(assign_sign)<-names_signmetaclust_small<-as.factor(assign_sign)# Recompute annotation. Same process as before.ann_colors_small<-colortools::wheel("steelblue", length(levels(metaclust_small)))names(ann_colors_small)<-levels(metaclust_small)origin_small<-BiocGenerics::sapply(colnames(corr.small), function(x){substring(x, 1, nchar(x)-2)})origin_small<-BiocGenerics::sapply(origin_small, function(x){stringr::str_replace_all(x, "[.]", "-")})origin_small<-BiocGenerics::sapply(origin_small, function(x){stringr::str_replace_all(x, "-$", "")})subgroup_small<-origin_smallgrade_2<-unique(sample.tumor[, sample.tumor$grade=="2"]$orig.ident)grade_3<-unique(sample.tumor[, sample.tumor$grade=="3"]$orig.ident)origin_small<-factor(origin_small)subgroup_small[subgroup_small%in%grade_2]<-2subgroup_small[subgroup_small%in%grade_3]<-3samp_colors<-colortools::wheel("navyblue", length(unique(sample.tumor$orig.ident)))names(samp_colors)<-unique(sample.tumor$orig.ident)colors_small<-list("Metasignature"=ann_colors_small,"Origin"=samp_colors,"Subtype"=subtype_colors)# Final visualization with the annotation fixed. It should look pretty neat.h.small<-pheatmap::pheatmap(corr.small, color =grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(n =7, name ="RdBu")))(100), breaks =seq(-range.small, range.small, length.out =100), show_colnames =F, treeheight_col =0,annotation_col =data.frame("Metasignature"=metaclust_small,"Origin"=origin_small,"Subtype"=subgroup_small), annotation_colors =colors_small, cellwidth =10, cellheight =10)# If the previous visualization is good, we can proceed with the metasignature assignment.metaclust<-metaclust_small# Assign the good one.# Compute a metasignature score for each of the metasignatures for each of the cells.avg.sc.cell<-as.data.frame(BiocGenerics::sapply(as.character(levels(metaclust)), function(c){rowMeans(sign.scores[,which(metaclust==c), drop =FALSE])}))# Add the scores as metadata.for(mcincolnames(avg.sc.cell)){scores<-avg.sc.cell[, mc]names(scores)<-rownames(avg.sc.cell)sample.tumor[[mc]]<-scores}# Assign to each cell the maximum score out of the different computed metasignatures.# PLEASE NOTE: while this might give you a hint towards the different populations in the tumor bulk, the assignment might not be perfect.# What if a cells is almost equally scored for several metasignatures? Use this as a first look, but not as a definitive result.# For this, we recommend the use of the top 30 genes per metasignature paired with enrichment scores (shown afterwards).sample.tumor$metaclust<-BiocGenerics::sapply(apply(avg.sc.cell, 1, which.max), function(x){paste0("Metasignature_", x)})sample.tumor$metaclust<-as.character(sample.tumor$metaclust)# Compute the top 30 scoring genes per each metasignaturefinal.sign<-list()for(mcincolnames(avg.sc.cell)){# c here is the FUNCTION to combine VALUES into a VECTOR.## nmf.sign[which(metaclust == mc) => retrieve the signatures (top 30 genes) from nmf.sign that belong to the meta-signature with the scores per cell for those signatures.## do.call(c, nmf.sign[which(metaclust == mc)])) => Generate a vector with the genes in each signature.## unique(do.call(c, nmf.sign[which(metaclust == mc)])) => Avoid repeting genes.## Therefore, sign is a vector of the unique genes present in all of the signatures that form the meta-signature.sign<-unique(do.call(c, nmf.sign[which(metaclust==mc)]))# Compute the gene scores for each gene for all the cells that were assigned to a given metacluster.gene.score<-scoreGene(tumor_scaled[, which(sample.tumor$metaclust==mc)], X.mean, sign)#expr <- rowMeans(Ec[sign, which(tumor.subs$metaclust == x)])top.genes<-sign[order(gene.score, decreasing =T)[1:30]]# Include the list of top.genes in the final signature list.final.sign[[mc]]<-top.genes}# Make sure to save both the metaclust and final.sign object.saveRDS(metaclust, "")# Modify the path accordingly.saveRDS(final.sign, "")# Modify the path accordingly.# With the top 30 scoring genes per MS, one can do multiple downstream analysis to try to infer the properties of the metasignatures.