# Enrique Blanco Carmona# e.blancocarmona@kitz-heidelberg.de# PhD Student – Clinical Bioinformatics# Division of Pediatric Neurooncology (B062)# DKFZ-KiTZ | Germany#' Retrieve statistically enriched cells for a list of genes in gradient-like enrichment cases.#'#' Designed for 10X datasets. The following analysis aims to find a way to statistically select the cells that are truly enriched in a list of marker genes.#' For this, a permutation analysis approach is followed, in which we compute a null distribution by shuffling the expression values#' of the genes queried for enrichment. Therefore, this disrupts the equation for the enrichment scores, since they are, in short,#' the result of the difference in means between the expression values of the genes queried (which we disrupted) and the control#' selected randomly by the function. This is done until we have one million values, which would act as the null distribution.#' Once we have a null distribution and an empirical one, we can assess how extreme our values in the empirical distribution are with#' respect to the values in the null distribution, being the p-value the proportion of values in the null distribution that are higher#' than the given value from the empirical distribution you are querying. These p-values are, then, adjusted for multiple testing, selecting#' a FDR cutoff of 0.05/n, being n the total number of lists of marker genes we are going to query to the same cells and use altogether#' to define a labeling (for instance, if we query two lists for the same tumor bulk, it would be 0.05/2). This is doing to avoid#' over inflation of the alpha error. Special thanks to Martin Sils for double checking the statistical part of the function.#'#' Estimated running time: 15 minutes.#'#' @param sample Seurat object.#' @param input_gene_list Named list of marker genes. Can contain multiple lists, the important point here is that each list has to be named.#' @param compute_enrichment Whether to compute the enrichment scores for the requested list of genes using \link[Seurat]{AddModuleScore}.#' @param FDR_cutoff FDR cutoff to apply (ranging from 0 to 1).#' @param idents.use Identities to include in the analysis.#' @param group.by Variable you want the cells to be colored for in the output DimPlot.#' @param colors.use Vector of named HEX values to color the cells. It has to match the number of unique values in either `Seurat::Idents(sample)` or the group.by variable.#' @param verbose Defaults to TRUE. It will provide different print statements. Progress bars can not be suppressed by this.#' @return A list containing the plots and the surpassed cells, together with the p-value matrix.#' @noRd#' @examples#' \dontrun{#' TBD#' }do_GradientSelection<-function(sample,input_gene_list,assay="SCT",slot="data",group.by=NULL,colors.use=NULL,FDR_cutoff=0.05/length(names(input_gene_list)),verbose=T,nbin=24,ctrl=100,font.size=14,font.type="sans",legend.position="bottom",use_viridis=TRUE,viridis_color_map="G",viridis_direction=1,sequential.palette="YlGnBu",sequential_direction=-1){# Checks for packages.#check_suggests(function_name = "do_GradientSelection")# Check if the sample provided is a Seurat object.#check_Seurat(sample = sample)# Define pipe operator internally.`%>%`<-magrittr::`%>%`# If the user has provided a color list.if(!is.null(colors.use)){#check_colors(colors.use)}if(is.null(group.by)){sample$group.by<-Seurat::Idents(sample)group.by<-"group.by"}list.out<-list()if(isTRUE(verbose)){message(paste("Computing enrichment scores..."))}out<-SCpubr::do_EnrichmentHeatmap(sample =sample, input_gene_list =input_gene_list, flavor ="Seurat", nbin =nbin, ctrl =ctrl, return_object =TRUE, use_viridis =FALSE, sequential_direction =1)list.out[["Enrichment_Heatmap"]]<-out$Heatmapfor(nameinnames(input_gene_list)){if(isTRUE(verbose)){message(paste("Running permutation testing for list:", name))}# Retrieve empirical distribution.test.dist<-out$Object@meta.data[, name, drop =FALSE]vect<-test.dist[, 1]names(vect)<-rownames(test.dist)test.dist<-vectlist.out[[name]][["Data"]][["Empirical_Distribution"]]<-test.dist# Compute Null distributionif(isTRUE(verbose)){message("Computing permutations...")}# We want a null distribution with at least 1.000.000 permutations.wanted_permutations<-1000000number_cells<-ncol(sample)nruns<-trunc(wanted_permutations/number_cells)+1null.dist<-data.frame(row.names =colnames(sample))message("Computing iterations...")for(iinseq_len(nruns)){message(i)# Set seed in every iteration.set.seed(i)genes.query<-input_gene_list[[name]]# First, create a replacement object.sample.null<-sample# Get the normalized data assay (sparse matrix).data.use<-Seurat::GetAssayData(object =sample.null, assay =assay, slot =slot)row.order<-rownames(data.use)# We want to preserve the original order of the matrix.# Subset out the matrix we do not want to reshuffle.data.keep<-data.use[-which(rownames(data.use)%in%genes.query), ]# Remove the input genes from the matrix.# Get the subset for reshuffle.data.to.shuffle<-data.use[which(rownames(data.use)%in%genes.query), ]# For each gene in the list of markers.df.new<-list()for(geneingenes.query){# Get the scores.expression.scores<-data.to.shuffle[gene, ]# Permute the scores for all cells for that given gene.shuffled.scores<-sample(expression.scores, length(expression.scores))# As the cell names get shuffled as well, we have to change them back to the original order.names(shuffled.scores)<-names(expression.scores)df.new[[gene]]<-shuffled.scores}data.to.shuffle<-Matrix::as.matrix(t(sapply(df.new, unlist)))data.use<-rbind(data.keep, data.to.shuffle)# Perform a rowbind of the two matrices (instead of directly modifying the first one, which takes forever to do)data.use<-data.use[row.order, ]# Reorder back the matrix.# Set the new matrix as the Assay data from which the enrichment scores will be computed on.sample.null<-Seurat::SetAssayData(sample.null, assay =assay, slot =slot, new.data =data.use)# Compute enrichment scores, which will be the null distribution of the iteration.sample.null<-SCpubr:::compute_enrichment_scores(sample =sample.null, input_gene_list =input_gene_list[name], assay =assay)# Retrieve teh null distribution.null.dist[[i]]<-sample.null[[]][, name]}# Assign the cell names back to the output matrix.rownames(null.dist)<-colnames(sample)list.out[[name]][["Data"]][["Null_Dist_Iteration_Vectors"]]<-null.dist# Prepare the data for plotting.data.plot<-as.data.frame(null.dist)%>%# Make the matrix into tidiverse accepted object.dplyr::mutate(Empirical =test.dist)%>%# Add the empirical distribution to the matrix of AddModuleScore() iterations.tidyr::pivot_longer(dplyr::everything(), names_to ="Distribution", values_to ="Enrichment")%>%dplyr::mutate(Distribution =ifelse(Distribution=="Empirical", "Empirical", "Null"))# Assign any name in Distribution that is not "Empirical" into "Null". By default, non-labelled df columns are named V1, V2...list.out[[name]][["Data"]][["Distributions"]]<-data.plot# Visualize the density plot of both distributions.p.dist<-data.plot%>%ggplot2::ggplot(ggplot2::aes(x =.data$Enrichment, y =..density.., color =.data$Distribution, fill =.data$Distribution))+ggplot2::geom_density(alpha =0.5)+ggplot2::scale_fill_manual(values =rev(RColorBrewer::brewer.pal(n =11, name ="RdBu")[c(1, 11)]))+ggplot2::scale_color_manual(values =rev(RColorBrewer::brewer.pal(n =11, name ="RdBu")[c(1, 11)]))+ggplot2::xlab("Enrichment scores")+ggplot2::ylab("Density")+ggplot2::labs(title =paste0(name, " | Null vs Empirical distributions"), caption ="P-value is, for a given cell and its empirical enrichment score,\n the proportion of values in the null distribution higher than it.")+ggplot2::guides(fill =ggplot2::guide_legend(title ="Distribution", title.position ="top", title.hjust =0.5))+ggplot2::theme_minimal(base_size =font.size)+ggplot2::theme(axis.title =ggplot2::element_text(color ="black", face ="bold"), axis.line.x =ggplot2::element_line(color ="black"), axis.line.y =ggplot2::element_blank(), axis.text.x =ggplot2::element_text(color ="black", face ="bold"), axis.text.y =ggplot2::element_text(color ="black", face ="bold"), axis.ticks =ggplot2::element_line(color ="black"), panel.grid.major =ggplot2::element_blank(), plot.title.position ="plot", plot.title =ggplot2::element_text(face ="bold", hjust =0), plot.subtitle =ggplot2::element_text(hjust =0), plot.caption =ggplot2::element_text(hjust =1), panel.grid =ggplot2::element_blank(), text =ggplot2::element_text(family =font.type), plot.caption.position ="plot", legend.text =ggplot2::element_text(face ="bold"), legend.position =legend.position, legend.title =ggplot2::element_text(face ="bold"), legend.justification ="center", plot.margin =ggplot2::margin(t =10, r =10, b =10, l =10), plot.background =ggplot2::element_rect(fill ="white", color ="white"), panel.background =ggplot2::element_rect(fill ="white", color ="white"), legend.background =ggplot2::element_rect(fill ="white", color ="white"), strip.text =ggplot2::element_text(color ="black", face ="bold"))list.out[[name]][["Plots"]][["Distirbution_plot"]]<-p.dist# Generate the p-values for each enrichment score in the empirical distribution.num_permutations<-sum(data.plot$Distribution=="Null")null_dist_values<-data.plot%>%# Gather the null distribution.dplyr::filter(Distribution=="Null")%>%# Filter only the values for the NULL.dplyr::select(Enrichment)null_dist_values<-null_dist_values$Enrichmentif(verbose){message("Computing p-values.")}p.value.vector<-c()# Will store all the p-values and will become a column of dist.data.# This might also take 10 minutes, since the vector is of 1 million data points to be suuuuuuper exact on the null distribution side.p.value.vector<-unlist(pbapply::pblapply(test.dist, function(x){greater_values<-sum(null_dist_values>x)p.value<-(greater_values+1)/(num_permutations+1)#https://pubmed.ncbi.nlm.nih.gov/21044043/names(p.value)<-names(x)# Assign the cell name to the p-value.p.value.vector<-c(p.value.vector, p.value)# Add the p-value to the output vector.}))# Generate a reporting matrix for the given permutation test.dist.data<-tidyr::tibble(Cell =names(test.dist), Empirical =test.dist, p.value =p.value.vector)# FDR correction.if(isTRUE(verbose)){message(paste0("Using the FDR cutoff of: ", FDR_cutoff))}dist.data<-dist.data%>%dplyr::arrange(.data$p.value)%>%# Order by ascending p-value.dplyr::mutate(p.value.adj =stats::p.adjust(.data$p.value, method ="BH"))%>%# Adjust for multiple testing and produce q-values.dplyr::mutate(significant =ifelse(.data$p.value.adj<=FDR_cutoff, TRUE, FALSE))list.out[[name]][["Data"]][["PlotData"]]<-dist.datalist.out[[name]][["Data"]][["Null_Distribution"]]<-null_dist_values# Visualizations.# UMAP coloring the cells that surpassed the FDR correction.surpassing_cells<-dist.data%>%dplyr::filter(.data$significant==TRUE)%>%dplyr::pull(.data$Cell)list.out[[name]][["Data"]][["Satatistically_Enriched_Cells"]]<-surpassing_cellsp.umap<-SCpubr::do_FeaturePlot(out$Object, features =name, use_viridis =use_viridis, viridis_color_map =viridis_color_map, viridis_direction =viridis_direction, sequential.palette =sequential.palette, sequential_direction =sequential_direction, legend.position =legend.position, font.size =font.size, font.type =font.type)list.out[[name]][["Plots"]][["Enrichment"]]<-p.umapif(length(surpassing_cells)>0){p.selection<-SCpubr::do_DimPlot(sample, cells.highlight =surpassing_cells, plot.caption =paste("Selected cells for gene list: ",name," | Enriched cells: ",sum(dist.data$p.value.adj<FDR_cutoff),"| FDR used: ",FDR_cutoff," | Enrichment score cutoff:",round(min(dist.data$Empirical[sum(dist.data$p.value.adj<FDR_cutoff)], na.rm =TRUE), 3)," | Number comparisons: ",length(names(input_gene_list))))list.out[[name]][["Plots"]][["Selection"]]<-p.selection}}return(list.out)}