\fontsize{8}{20}
library("knitr") opts_chunk$set(tidy.opts=list(width.cutoff=80),tidy=TRUE,dev="png",dpi=150)
NMF was run using function NMF
from sklearn.decomposition
in Python scikit-learn library. The results were then integrated into an R object, which we read in below. We varied the number of modules (n_components
argument in NMF function) from 5 to 25, and eventually chose to use the result from K=18, because it resulted in a low inconsistency and a high cophenetic coefficient when repeated 10 times with random initial conditions.
load_obj <- function(file.path){ temp.space <- new.env() obj<-load(file.path, temp.space) obj2<-get(obj, temp.space) rm(temp.space) return(obj2) } nmf_res=load_obj("NMF/Results/P2_use/result_tbls.Robj") nmf_K18=nmf_res$`K=18`$rep0
nmf_K18
contains the genes by modules matrix (G) and the modules by cells matrix (C) resulted from running NMF with n_components=18
.
library("Seurat") ALL_C18=new("seurat",raw.data=nmf_K18$C) ALL_C18=Setup(ALL_C18,project="allC18",min.cells = 2, names.field = 1,names.delim = "_",do.logNormalize = F,is.expr = 0.01,min.genes = 1) ident=ALL_C18@ident levels(ident)<-c(levels(ident),"MZoep","mix") ident[grep("wt",ident)]="mix" ident[grep("oep",ident)]="mix" ident[grep("oep_p0",names(ident))]="MZoep" ALL_C18@ident=ident ALL_C18=PCA(ALL_C18,do.print = F,pcs.print = 3,genes.print = 6,pc.genes = rownames(ALL_C18@data)) PCAPlot(ALL_C18,1,2,pt.size = 0.75)
wt_cells=c(grep("wt",ALL_C18@cell.names),grep("zf",ALL_C18@cell.names)) oep_cells=c(grep("oep",ALL_C18@cell.names)) ALL_C18wt=SubsetData(ALL_C18,cells.use = ALL_C18@cell.names[wt_cells]) ALL_C18oep=SubsetData(ALL_C18,cells.use = ALL_C18@cell.names[oep_cells]) batch_modulewt=BatchGene(ALL_C18wt,idents.use=c("zf1","zf2","zf3"),genes.use=rownames(ALL_C18wt@data),auc.cutoff = 0.67) batch_moduleoep=BatchGene(ALL_C18oep,idents.use=c("MZoep","mix"),genes.use=rownames(ALL_C18oep@data),auc.cutoff = 0.67) batch_modules18=union(batch_moduleoep,batch_modulewt) print("Batch modules:") print(batch_modules18)
batch_genes=nmf_K18$top30genes[,paste0("Module.",batch_modules18)] #knitr::kable(batch_genes,caption = "Top 30 genes in batch modules") print(xtable::xtable(batch_genes,caption = "Top 30 genes in batch modules"),type="latex",scalebox=0.55)
ALL_C18=PCA(ALL_C18,do.print = F,pcs.print = 3,genes.print = 6,pc.genes = setdiff(rownames(ALL_C18@data),batch_modules18)) PCAPlot(ALL_C18,1,2,pt.size = 0.75)
Cells are now much less separated by batch in the plot.
This batch effect removed expression matrix will be used later for spatial mapping.
tbl.dir="./NMF/Datasets/" dataset=read.table(paste0(tbl.dir,"ALL_noBatchCorrection_var.txt")) dataset_scl=read.csv("./NMF/Results/P2_use/tables/scaled_data.csv",row.names = 1)
This is done by subtracting the product of the batch matrices (portions of matrix G and C with the batch modules) from the original data matrix.
rmNMFbatch <- function(batch_modules, G, C, dataset_scl, dataset){ #multiply the matrices to calculate batch effect: batch_scl=as.matrix(G[,paste0("X",batch_modules)]) %*% as.matrix(C[batch_modules,]) #subtract it from the dataset used for running NMF batchRM_scl=dataset_scl-batch_scl ##dataset_scl is the nonzero-median scaled dataset #correct for negative values batchRM_scl[batchRM_scl<0]=0 #calculate non-zero median of each gene in the original dataset binaryData=dataset>0 ##dataset is the original log dataset (before median scaling) dataset_2=expm1(dataset) scl_fac=unlist(lapply(rownames(dataset_2), function(x) median(as.numeric(dataset_2[x,binaryData[x,]])))) #calculate non-zero median of the scaled dataset binaryData=dataset_scl>0 dataset_2=expm1(dataset_scl) scl_med=unlist(lapply(rownames(dataset_2), function(x) median(as.numeric(dataset_2[x,binaryData[x,]])))) #calculate non-zero median of the scaled dataset with batch effect subtracted #binaryData=batchRM_scl>0 ## dataset_2=expm1(batchRM_scl) batchRM_scl_med=unlist(lapply(rownames(dataset_2), function(x) median(as.numeric(dataset_2[x,binaryData[x,]])))) med_fac=batchRM_scl_med/scl_med final_scl_fac=scl_fac*med_fac/scl_med #final_scl_fac=scl_fac/scl_med #the medians in the scaled dataset is adjusted to the median of the non-zero medians #transform the batch corrected dataset to its (more or less) original scale ##batchRM_unscl=log1p(sweep(expm1(batchRM_scl),1,scl_fac/median(scl_fac),"*")) batchRM_unscl=log1p(sweep(expm1(batchRM_scl),1,final_scl_fac,"*")) return(batchRM_unscl) } K18_noBatch=rmNMFbatch(batch_modules18,nmf_K18$G,nmf_K18$C,dataset_scl,dataset)
PCA plots are used to visualize transcriptomes in gene space before and after batch effect removal.
data_raw=new("seurat",raw.data=dataset) K18_noBatch=new("seurat",raw.data=K18_noBatch) data_raw=Setup(data_raw,project="pre_batch_rm",min.cells = 1, names.field = 1,names.delim = "_",do.logNormalize = F,is.expr = 0.1,min.genes = 10) K18_noBatch=Setup(K18_noBatch,project="post_batch_rm",min.cells = 1, names.field = 1,names.delim = "_",do.logNormalize = F,is.expr = 0.1,min.genes = 10) ident=data_raw@ident levels(ident)<-c(levels(ident),"MZoep","mix") ident[grep("wt",ident)]="mix" ident[grep("oep",ident)]="mix" ident[grep("oep_p0",names(ident))]="MZoep" data_raw@ident=ident K18_noBatch@ident=ident data_raw=PCA(data_raw,pcs.print = 3,genes.print = 6,pc.genes = rownames(data_raw@data)) PCAPlot(data_raw,1,2,pt.size = 0.75) K18_noBatch=PCA(K18_noBatch,pcs.print = 3,genes.print = 6,pc.genes = rownames(K18_noBatch@data)) PCAPlot(K18_noBatch,1,2,pt.size = 0.75)
save.dir="./Datasets/" write.table(K18_noBatch@data,file=paste0(save.dir,"Batch_corrected_var.txt"),quote=FALSE,sep="\t")
The C matrix (non-batch modules by cells) is scaled such that each row has the same maximum value before clustering. Modules are assigned names according to knowledge about their top ranked genes.
maxScl <- function(df, dir='row', max_value=NULL, log_space=F){ if(dir=='row'){ dir=1 }else if(dir=='col'){ dir=2 }else{ print("dir must be 'row' or 'col'.") return } if(is.null(max_value)){ max_value=median(apply(df,dir,max)) } if(log_space){ df=expm1(df) max_value=expm1(max_value) } df_scl=sweep(df,dir,apply(df,dir,max),"/") df_scl=df_scl*max_value if(log_space){ df_scl=log1p(df_scl) } return(df_scl) } scld_C=maxScl(nmf_K18$C) rownames(scld_C)=c("0","1","2","Marginal","EVL","5","6","7","Marginal Dorsal","Dorsal","10","Apoptotic-like","YSL","13","PGC","Ventral Animal","Dorsal Animal","Ventral")
Davies-Bouldin index is used to determin the optimal number of clusters (as the number that gives the lowest DB index). Clustering result is shown as heatmaps, with color bars on top indicating genotype (dark blue = wt; light blue = MZoep) or cluster membership.
library(gplots) library(RColorBrewer) library(clusterSim) cluster_map <- function(scld_C,genos=NULL,group=c("cluster","geno"),group_colors=NULL,den_cut=0.75,metric=c("cor","dist"),method="complete",DB=F){ if(metric=="cor"){ stds=apply(scld_C,2,sd) zero_var=which(stds==0) if(length(zero_var)>0){ print(paste("removing ",length(zero_var),"zero-variance cell(s) from dataframe:")) print(colnames(scld_C)[zero_var]) scld_C=scld_C[,-zero_var] } hc <- hclust(as.dist(1-cor(as.matrix(scld_C))), method=method) }else if(metric=="dist"){ hc <- hclust(dist(as.matrix(t(scld_C)),method="euclidean"), method=method) } if("cluster"%in%group){ if(den_cut<1){ mycl <- cutree(hc, h=max(hc$height)*den_cut) }else{ mycl <- cutree(hc, k = den_cut) } if(is.null(group_colors)){ mycolhc <- topo.colors(length(unique(mycl))) mycolhc <- mycolhc[as.vector(mycl)] }else{ mycolhc <- group_colors[as.vector(mycl)] } cluster=mycolhc } if("geno"%in%group){ geno_code=vector("numeric",length=dim(scld_C)[2]) geno_code=geno_code*0+1 i=1 for(geno in genos){ geno_code[grep(geno,colnames(scld_C))]=i*2 i=i+1 } if(is.null(group_colors)){ mycolgeno <- topo.colors(2*length(genos)) mycolgeno <- mycolgeno[geno_code] }else{ mycolgeno <- group_colors[geno_code] } geno=mycolgeno } hmcol <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100) if(DB){ DBs=c() for(i in c(3:20)){ mycl <- cutree(hc, k=i) if(metric=='cor'){ DB=index.DB(t(scld_C), mycl,dist(1-cor(as.matrix(scld_C))), centrotypes="centroids") }else if(metric=='dist'){ DB=index.DB(t(scld_C), mycl,dist(as.matrix(t(scld_C)),method="euclidean"), centrotypes="centroids") } DBs=c(DBs,DB$DB) } plot(c(3:20),DBs,type = 'b',main = "Davies-Bouldin Index",xlab = "K") } for(grp in group){ heatmap.2(as.matrix(scld_C), Colv = as.dendrogram(hc), ColSideColors=get(grp), col=hmcol,dendrogram ="column",sepwidth=0,trace='none',labCol = "",cexRow = 0.65) } return(mycl) } cluster_info=cluster_map(as.matrix(scld_C[setdiff(rownames(scld_C),batch_modules18),]),group = c("geno","cluster"),genos = c("oep"),metric="cor",den_cut=10,DB=T)
YSL is intensinally removed in our sample collecting procedure. The YSL module detected is likely resulted from incomplete yolk removal or contamination from YSL.
hist(as.numeric(scld_C['YSL',]),breaks=50, main="", xlab = "YSL module expression in cell") #length(which(scld_C['YSL',]>1)) #=9 C_noYSL=scld_C[,colnames(scld_C)[which(scld_C['YSL',]<0.7)]]
Again, we use Davies-Bouldin index to determin the optimal number of clusters, and show clustering result as heatmaps, with color bars on top indicating genotype (dark blue = wt; light blue = MZoep) or cluster membership.
cluster_info=cluster_map(as.matrix(C_noYSL[setdiff(rownames(C_noYSL),c("YSL",batch_modules18)),]),group = c("geno","cluster"),genos = c("oep"),metric="cor",den_cut=9,DB=T)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.