Nothing
# compare_metrics_loc.R
#
# This function creates the plots to compare various signature summary statistic metrics against each other
# That is, compares the mean, median and first principal component with each other, and also produces PCA vs variance
# plot for the first 10 principal components of the dataset
# @param gene_sigs_list A list of genes representing the gene signature to be tested.
# @param names_sigs The names of the gene signatures (one name per gene signature, in gene_sigs_list)
# @param mRNA_expr_matrix A list of expression matrices
# @param names_datasets The names of the different datasets contained in mRNA_expr_matrix
# @param out_dir A path to the directory where the resulting output files are written
# @param file File representing the log file where errors can be written
# @param showResults Tells if open dialog boxes showing the computed results. Default is FALSE
# @param radar_plot_values A list of values that store computations that will be used in the final summary radarplot
# @keywords compare_metrics_loc
compare_metrics_loc <- function(gene_sigs_list,names_sigs, mRNA_expr_matrix, names_datasets, out_dir = '~',file=NULL,showResults = FALSE,radar_plot_values){
# require(gplots)
if(!dir.exists(file.path(out_dir,'metrics_tables'))){
dir.create(file.path(out_dir,'metrics_tables'))
}
score_cor_mats <- list()
for(k in 1:length(names_sigs)){
#for each signature we will make a separate file comparing the datasets
gene_sig <- gene_sigs_list[[names_sigs[k]]] # load the gene signature
if(is.matrix(gene_sig)){gene_sig = as.vector(gene_sig);}
#set up canvas for plotting
if (showResults){
grDevices::dev.new()
}else{
grDevices::pdf(file.path(out_dir,paste0('sig_compare_metrics_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
#find the max number of characters in the title
max_title_length <- -999
for( i in 1:length(names_datasets)){
if(max_title_length < nchar(paste0(names_datasets[i],' ',names_sigs[k]))){
max_title_length <- nchar(paste0(names_datasets[i],' ',names_sigs[k]))
}
}
#set up the canvas
graphics::par(mfcol = c(4,length(names_datasets)),mar=c(4,4,4,4))
for ( i in 1:length(names_datasets)){
# now we can loop over the datasets for the plot and generate the metrics for every dataset with this signature
data.matrix = mRNA_expr_matrix[[names_datasets[i]]] #load the data
data.matrix[!(is.finite(as.matrix(data.matrix)))] <- NA #ensure that the data is not infintie
inter = intersect(gene_sig,rownames(data.matrix)) #consider only the genes actually present in the data
med_scores <- apply(data.matrix[inter,],2,function(x){stats::median(stats::na.omit(x))}) #compute median
mean_scores <- apply(data.matrix[inter,],2,function(x){mean(stats::na.omit(x))}) #compute mean
pca1_scores <- NULL
tryCatch({
pca1_scores <- stats::prcomp(stats::na.omit(t(data.matrix[inter,])),retx=T) #compute PCA1
output_mat <- pca1_scores$x
if(!is.null(colnames(data.matrix))){
row.names(output_mat) <- colnames(data.matrix)
}
utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('pca_loadings_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
},error=function(e){
pca1_scores <<- NULL
cat(paste0("There was an error when computing PCA1 score for: ",names_datasets[i]," ", names_sigs[k]," ", e,'\n'), file=file)
# print(paste0("error: ", e))
})
# print(paste0("test ",pca1_scores ))
if(length(pca1_scores) > 1){#!is.null(pca1_scores)){
props_of_variances <- pca1_scores$sdev^2/(sum(pca1_scores$sdev^2)) #for the scree plot
pca1_scores <- pca1_scores$x[,1] #takes the actual PCA1 scores
common_score_cols <- intersect(names(med_scores),intersect(names(mean_scores),names(pca1_scores))) #ensures we have the same samples for each plot
}else{
common_score_cols <- intersect(names(med_scores),names(mean_scores))
}
if(length(common_score_cols) > 1){
#the following is the colourmap for the 2D scatter
jet.colors <- grDevices::colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
#plotting commands for the 2D scatterplot for mean-median correlation as follows
graphics::smoothScatter(med_scores[common_score_cols],mean_scores[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='Median vs Mean')
graphics::points(med_scores[common_score_cols],mean_scores[common_score_cols],pch='.') #draw the points on top of it
graphics::par(new=T)#,srt=45)
graphics::plot(stats::density(med_scores[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2) #draw the density plot behind it
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8) #labels for the axes
graphics::mtext(side = 2, line = 2, 'Mean',cex=0.8)
graphics::mtext(side = 1, line = 2, 'Median',cex=0.8)
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
rho <- stats::cor(med_scores[common_score_cols],mean_scores[common_score_cols],method='spearman')
rho_mean_med <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(med_scores[common_score_cols]))
}else{
graphics::plot.new()
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
graphics::title(paste0('\n\nToo many NA values for Mean/Median in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_mean_med <- 0
}
if (length(pca1_scores) > 1){#(!is.null(pca1_scores)){
#plotting for the mean-pca1
graphics::smoothScatter(mean_scores[common_score_cols],pca1_scores[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='Mean vs PCA1')
graphics::points(mean_scores[common_score_cols],pca1_scores[common_score_cols],pch='.')
graphics::par(new=T)
graphics::plot(stats::density(mean_scores[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2)
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8)
rho <- stats::cor(mean_scores[common_score_cols],pca1_scores[common_score_cols],method='spearman')
rho_mean_pca1 <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(mean_scores[common_score_cols]))
graphics::mtext(side = 2, line = 2, 'PCA1',cex=0.8)
graphics::mtext(side = 1, line = 2, 'Mean',cex=0.8)
#plotting for the pca1-median
graphics::smoothScatter(pca1_scores[common_score_cols],med_scores[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='PCA1 vs Median')
graphics::points(pca1_scores[common_score_cols],med_scores[common_score_cols],pch='.')
graphics::par(new=T)
graphics::plot(stats::density(pca1_scores[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2)
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8)
rho <- stats::cor(pca1_scores[common_score_cols],med_scores[common_score_cols],method='spearman')
rho_pca1_med <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(pca1_scores[common_score_cols]))
graphics::mtext(side = 2, line = 2, 'Median',cex=0.8)
graphics::mtext(side = 1, line = 2, 'PCA1',cex=0.8)
}else{
# graphics::par(new=T)
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for PCA1/Mean in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_mean_pca1 <- 0
# graphics::par(new=T)
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for Median/PCA1 in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_pca1_med <- 0
}
# #here we output the table of mean, median and pca1 scores for each sample to a table
# if(length(pca1_scores) > 1){#(!is.null(pca1_scores)){
# output_mat <- cbind(mean_scores[common_score_cols],cbind(med_scores[common_score_cols],pca1_scores[common_score_cols]))
# colnames(output_mat) <- c('Mean_Scores','Median_Scores','PCA1_Scores')
# }else{
# output_mat <- cbind(mean_scores[common_score_cols],med_scores[common_score_cols])
# colnames(output_mat) <- c('Mean_Scores','Median_Scores')
# }
# utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('metrics_table_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
#stores values that will be used in the radarplot
radar_plot_values[[names_sigs[k]]][[names_datasets[i]]]['rho_mean_med'] <- rho_mean_med
radar_plot_values[[names_sigs[k]]][[names_datasets[i]]]['rho_pca1_med'] <- rho_pca1_med
radar_plot_values[[names_sigs[k]]][[names_datasets[i]]]['rho_mean_pca1'] <- rho_mean_pca1
if(length(pca1_scores) > 1){#(!is.null(pca1_scores)){
#draws the scree plot
bars_plot <- props_of_variances[1:min(10,length(props_of_variances))]
output_mat <- as.matrix(props_of_variances)
colnames(output_mat) <- c('Proportion of variance')
row.names(output_mat) <- paste0('PCA ',as.character(1:dim(output_mat)[1]))
utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('pca_vs_var_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
graphics::barplot(bars_plot,main="PCA vs proportion\n of variance") #ylim= c(0,1),
graphics::mtext(side = 1, line = 2, 'PCA',cex=0.8)
radar_plot_values[[names_sigs[k]]][[names_datasets[i]]]['prop_pca1_var']<- bars_plot[1]
}else{
# graphics::par(new=T)
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for PCA1 in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
radar_plot_values[[names_sigs[k]]][[names_datasets[i]]]['prop_pca1_var']<- 0
}
}
#saves file
if(showResults){
grDevices::dev.copy(grDevices::pdf,file.path(out_dir,paste0('sig_compare_metrics_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
if(grDevices::dev.cur()!=1){
g <- grDevices::dev.off() # to reset the graphics pars to defaults
}
}
cat('Metrics compared successfully.\n', file=file) #output to log
#-------------we will also compute the ES scores by 3 different methods here----------------------------
for(k in 1:length(names_sigs)){
#for each signature we will make a separate file comparing the datasets
gene_sig <- gene_sigs_list[[names_sigs[k]]] # load the gene signature
if(is.matrix(gene_sig)){gene_sig = as.vector(gene_sig);}
#set up canvas for plotting
if (showResults){
grDevices::dev.new()
}else{
grDevices::pdf(file.path(out_dir,paste0('sig_compare_ES_metrics_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=7.5)
}
#find the max number of characters in the title
max_title_length <- -999
for( i in 1:length(names_datasets)){
if(max_title_length < nchar(paste0(names_datasets[i],' ',names_sigs[k]))){
max_title_length <- nchar(paste0(names_datasets[i],' ',names_sigs[k]))
}
}
#set up the canvas
graphics::par(mfcol = c(3,length(names_datasets)),mar=c(4,4,4,4))
for ( i in 1:length(names_datasets)){
# now we can loop over the datasets for the plot and generate the metrics for every dataset with this signature
data.matrix = mRNA_expr_matrix[[names_datasets[i]]] #load the data
data.matrix[!(is.finite(as.matrix(data.matrix)))] <- NA #ensure that the data is not infintie
inter = intersect(gene_sig,rownames(data.matrix)) #consider only the genes actually present in the data
med_scores <- apply(data.matrix[inter,],2,function(x){stats::median(stats::na.omit(x))}) #compute median
mean_scores <- apply(data.matrix[inter,],2,function(x){mean(stats::na.omit(x))}) #compute mean
pca1_scores <- NULL
tryCatch({
pca1_scores <- stats::prcomp(stats::na.omit(t(data.matrix[inter,])),retx=T) #compute PCA1
output_mat <- pca1_scores$x
if(!is.null(colnames(data.matrix))){
row.names(output_mat) <- colnames(data.matrix)
}
utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('pca_loadings_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
},error=function(e){
pca1_scores <<- NULL
# print(paste0("error: ", e))
})
#need the matrix to be a matrix not a dataframe
data.matrix.gsva <- data.matrix
if(!is.matrix(data.matrix)){
data.matrix.gsva <- as.matrix.data.frame(data.matrix,rownames.force=T)
}
es.ssGSEA <- suppressWarnings(GSVA::gsva(data.matrix.gsva, list(inter), method="ssgsea", verbose=F, parallel.sz=1))
es.gsva <- suppressWarnings(GSVA::gsva(data.matrix.gsva, list(inter), verbose=F, parallel.sz=1))
es.plage <- suppressWarnings(GSVA::gsva(data.matrix.gsva, list(inter), method="plage", verbose=F, parallel.sz=1))
es.gsva <- es.gsva[1,]
es.ssGSEA <- es.ssGSEA[1,]
es.plage <- es.plage[1,]
common_score_cols <- intersect(names(es.gsva),intersect(names(es.ssGSEA),names(es.plage))) #ensures we have the same samples for each plot
if(length(common_score_cols) > 1){
#the following is the colourmap for the 2D scatter
jet.colors <- grDevices::colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
#plotting commands for the 2D scatterplot for mean-median correlation as follows
graphics::smoothScatter(es.gsva[common_score_cols],es.ssGSEA[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='GSVA vs ssGSEA')
graphics::points(es.gsva[common_score_cols],es.ssGSEA[common_score_cols],pch='.') #draw the points on top of it
graphics::par(new=T)#,srt=45)
graphics::plot(stats::density(es.gsva[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2) #draw the density plot behind it
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8) #labels for the axes
graphics::mtext(side = 2, line = 2, 'ssGSEA',cex=0.8)
graphics::mtext(side = 1, line = 2, 'GSVA',cex=0.8)
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
rho <- stats::cor(es.gsva[common_score_cols],es.ssGSEA[common_score_cols],method='spearman')
rho_ssGSEA_gvsa <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(es.gsva[common_score_cols]))
}else{
graphics::plot.new()
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
graphics::title(paste0('\n\nToo many NA values for ssGSEA/GSVA in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_ssGSEA_gvsa <- 0
}
if(length(common_score_cols) > 1){
#plotting for the ssGSEA-PLAGE
graphics::smoothScatter(es.ssGSEA[common_score_cols],es.plage[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='ssGSEA vs PLAGE')
graphics::points(es.ssGSEA[common_score_cols],es.plage[common_score_cols],pch='.')
graphics::par(new=T)
graphics::plot(stats::density(es.ssGSEA[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2)
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8)
rho <- stats::cor(es.ssGSEA[common_score_cols],es.plage[common_score_cols],method='spearman')
rho_ssGSEA_plage <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(es.ssGSEA[common_score_cols]))
graphics::mtext(side = 2, line = 2, 'PLAGE',cex=0.8)
graphics::mtext(side = 1, line = 2, 'ssGSEA',cex=0.8)
}else{
# graphics::par(new=T)
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for PLAGE/ssGSEA in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_ssGSEA_plage <- 0
}
if(length(common_score_cols) > 1){
graphics::smoothScatter(es.plage[common_score_cols],es.gsva[common_score_cols],colramp = jet.colors,xlab=NA,ylab=NA,main='PLAGE vs GSVA')
graphics::points(es.plage[common_score_cols],es.gsva[common_score_cols],pch='.')
graphics::par(new=T)
graphics::plot(stats::density(es.plage[common_score_cols]), axes=F, xlab=NA, ylab=NA, col='red',main=NA,lwd=2)
graphics::axis(side = 4)
graphics::mtext(side = 4, line = 2, 'Density',cex=0.8)
rho <- stats::cor(es.plage[common_score_cols],es.gsva[common_score_cols],method='spearman')
rho_plage_gsva <- rho
graphics::mtext(paste0('rho = ',format(rho,digits = 2)),side=3,line=0,cex = 0.6,at=max(es.plage[common_score_cols]))
graphics::mtext(side = 2, line = 2, 'GSVA',cex=0.8)
graphics::mtext(side = 1, line = 2, 'PLAGE',cex=0.8)
}else{
# graphics::par(new=T)
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for GSVA/PLAGE in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
rho_plage_gsva <- 0
}
# print(paste0("test ",pca1_scores ))
if(length(pca1_scores) > 1){#!is.null(pca1_scores)){
props_of_variances <- pca1_scores$sdev^2/(sum(pca1_scores$sdev^2)) #for the scree plot
pca1_scores <- pca1_scores$x[,1] #takes the actual PCA1 scores
common_score_cols <- intersect(common_score_cols,intersect(names(med_scores),intersect(names(mean_scores),names(pca1_scores)))) #ensures we have the same samples for each plot
}else{
common_score_cols <- intersect(common_score_cols,intersect(names(med_scores),names(mean_scores)))
}
#here we output the table of mean, median and pca1 scores for each sample to a table
if((length(common_score_cols) > 1) & (!is.null(pca1_scores)) ){
common_score_cols <- intersect(common_score_cols,names(pca1_scores))
output_mat <- cbind(mean_scores[common_score_cols],med_scores[common_score_cols],pca1_scores[common_score_cols],es.ssGSEA[common_score_cols],cbind(es.gsva[common_score_cols],es.plage[common_score_cols]))
colnames(output_mat) <- c('Mean_Scores','Median_Scores','PCA1_Scores','ssGSEA','GSVA','PLAGE')
utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('metrics_table_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
}else if( length(common_score_cols) > 1){
output_mat <- cbind(mean_scores[common_score_cols],med_scores[common_score_cols],es.ssGSEA[common_score_cols],cbind(es.gsva[common_score_cols],es.plage[common_score_cols]))
colnames(output_mat) <- c('Mean_Scores','Median_Scores','ssGSEA','GSVA','PLAGE')
utils::write.table(output_mat,file=file.path(out_dir,'metrics_tables', paste0('metrics_table_',names_sigs[k],'_',names_datasets[i],'.txt')),quote=F,sep='\t')
}
score_cor_mats[[paste0(names_datasets[i],'_',names_sigs[k])]]<- stats::cor(output_mat,method='spearman')
}
#saves file
if(showResults){
grDevices::dev.copy(grDevices::pdf,file.path(out_dir,paste0('sig_compare_ES_metrics_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=7.5)
}
if(grDevices::dev.cur()!=1){
g <- grDevices::dev.off() # to reset the graphics pars to defaults
}
}
cat('ES scores computed successfully.\n', file=file) #output to log
#-----now we need to output the score correlation matrices as heatmaps----
for(k in 1:length(names_sigs)){
for ( i in 1:length(names_datasets)){
#now we compute and output the correlation of scoring metrics in heatmap form
plot_mat <- score_cor_mats[[paste0(names_datasets[i],'_',names_sigs[k])]]
if (showResults){
grDevices::dev.new()
}else{
grDevices::pdf(file.path(out_dir,paste0('scoring_metrics_corr_',names_datasets[i],'_',names_sigs[k],'.pdf')),width=4,height=4)
}
#set up the canvas
graphics::par(mfcol = c(4,length(names_datasets)),mar=c(4,4,4,4))
nice_row_names <- c('Mean','Median','PCA1','GVSA','ssGSEA','PLAGE')
names(nice_row_names) <- c('Mean_Scores','Median_Scores','PCA1_Scores','GSVA','ssGSEA','PLAGE')
row_names.fontsize <- 10
row.names(plot_mat) <- nice_row_names[rownames(plot_mat)]
ans_hmap <- ComplexHeatmap::Heatmap(plot_mat,show_column_dend = F,
show_column_names = F,
name=names_datasets[i],
col= circlize::colorRamp2(c(-1, 0, 1), c("blue", "white", "red")),
heatmap_legend_param = list(title = 'Correlation', color_bar = "continuous",legend_direction='vertical'),
column_title = paste0(names_datasets[i],' ',names_sigs[k],'\nScoring Metric Correlation'),
row_names_gp = grid::gpar(fontsize = row_names.fontsize),
row_title = 'Scoring metrics')#,
ComplexHeatmap::draw(ans_hmap,heatmap_legend_side = "left")
#saves file
if(showResults){
grDevices::dev.copy(grDevices::pdf,file.path(out_dir,paste0('scoring_metrics_corr_',names_datasets[i],'_',names_sigs[k],'.pdf')),width=7,height=3.5)
}
if(grDevices::dev.cur()!=1){
g <- grDevices::dev.off() # to reset the graphics pars to defaults
}
}
}
#-----------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------------------
#next we use the mclust package, and compute the log-likelihoods of the various Gaussian mixture models for the scores
mixture_model.out = file.path(out_dir, "mixture_model_out.txt")
mixture_model.con = file(mixture_model.out, open = "w") #open the output file
mixture_models <- list() #variable to store the mclust results
for(k in 1:length(names_sigs)){
mixture_models[[names_sigs[k]]] <- list()
gene_sig <- gene_sigs_list[[names_sigs[k]]] # load the gene signature
if(is.matrix(gene_sig)){gene_sig = as.vector(gene_sig);}
#set up canvas for plotting
if (showResults){
grDevices::dev.new()
}else{
grDevices::pdf(file.path(out_dir,paste0('sig_gaussian_mixture_model_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
#find the max number of characters in the title
max_title_length <- -999
for( i in 1:length(names_datasets)){
if(max_title_length < nchar(paste0(names_datasets[i],' ',names_sigs[k]))){
max_title_length <- nchar(paste0(names_datasets[i],' ',names_sigs[k]))
}
}
#set up the canvas
graphics::par(mfcol = c(3,length(names_datasets)),mar=c(4,4,4,4))
for ( i in 1:length(names_datasets)){
# now we can loop over the datasets for the plot and generate the metrics for every dataset with this signature
data.matrix = mRNA_expr_matrix[[names_datasets[i]]] #load the data
data.matrix[!(is.finite(as.matrix(data.matrix)))] <- NA #ensure that the data is not infintie
inter = intersect(gene_sig,rownames(data.matrix)) #consider only the genes actually present in the data
med_scores <- apply(data.matrix[inter,],2,function(x){stats::median(stats::na.omit(x))}) #compute median
mean_scores <- apply(data.matrix[inter,],2,function(x){mean(stats::na.omit(x))}) #compute mean
pca1_scores <- NULL
tryCatch({
pca1_scores <- stats::prcomp(stats::na.omit(t(data.matrix[inter,])),retx=T) #compute PCA1
pca1_scores <- pca1_scores$x[,1] #gets the first component of PCA
},error=function(e){
pca1_scores <<- NULL
cat(paste0("There was an error when computing PCA1 score for: ",names_datasets[i]," ", names_sigs[k]," ", e,'\n'), file=file)
})
#the following are the mixture model variables that are going to be the outputs of the mclust function
mixture_models[[names_sigs[k]]][[names_datasets[i]]] <- list()
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']] <- NULL
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']] <- NULL
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']] <- NULL
if(length(med_scores) > 1){
max_clusters <- min(ceiling(sum(!is.na(med_scores)) / 2),10)
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']] <- mclust::Mclust(med_scores,G=1:max_clusters)
mclust::plot.Mclust(x = mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']], what='BIC',main='Median score')
# graphics::title(main='Median score')
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
output_string <- paste0(names_sigs[k],' ',names_datasets[i],', Median score: There ')
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']]$G==1){
output_string <- paste0(output_string,' is one component in the Gaussian mixture model. ')
}else{
output_string <- paste0(output_string,' are ',mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']]$G,' components in the Gaussian mixture model. ')
}
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']]$modelName=='V'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with equal variance (E). ' )
}else if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['median']]$modelName=='E'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with variable variances (V). ' )
}else{
output_string <- paste0(output_string, 'Best model is univariate Gaussian distribution. ' )
}
output_string <- paste0(output_string,'\n')
cat(output_string,file=mixture_model.con)
}else{
graphics::plot.new()
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
graphics::title(paste0('\n\nToo many NA values for Median in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
if(length(mean_scores) > 1){
max_clusters <- min(ceiling(sum(!is.na(mean_scores)) / 2),10)
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']] <- mclust::Mclust(med_scores,G=1:max_clusters)
mclust::plot.Mclust(x = mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']], what='BIC',main='Mean score')
# graphics::title(main='Mean score')
output_string <- paste0(names_sigs[k],' ',names_datasets[i],', Mean score: There ')
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']]$G==1){
output_string <- paste0(output_string,' is one component in the Gaussian mixture model. ')
}else{
output_string <- paste0(output_string,' are ',mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']]$G,' components in the Gaussian mixture model. ')
}
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']]$modelName=='V'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with equal variance (E). ' )
}else if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['mean']]$modelName=='E'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with variable variances (V). ' )
}else{
output_string <- paste0(output_string, 'Best model is univariate Gaussian distribution. ' )
}
output_string <- paste0(output_string,'\n')
cat(output_string,file=mixture_model.con)
}else{
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for Mean in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
if(length(pca1_scores) > 1){
max_clusters <- min(ceiling(sum(!is.na(pca1_scores)) / 2),10)
mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']] <- mclust::Mclust(pca1_scores,G=1:max_clusters)
mclust::plot.Mclust(x = mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']], what='BIC',main='PCA1 score')
# graphics::title(main='PCA1 score')
output_string <- paste0(names_sigs[k],' ',names_datasets[i],', PCA1 score: There ')
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']]$G==1){
output_string <- paste0(output_string,' is one component in the Gaussian mixture model. ')
}else{
output_string <- paste0(output_string,' are ',mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']]$G,' components in the Gaussian mixture model. ')
}
if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']]$modelName=='V'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with equal variance (E). ' )
}else if(mixture_models[[names_sigs[k]]][[names_datasets[i]]][['pca1']]$modelName=='E'){
output_string <- paste0(output_string, 'Best model is Gaussian distributions with variable variances (V). ' )
}else{
output_string <- paste0(output_string, 'Best model is univariate Gaussian distribution. ' )
}
output_string <- paste0(output_string,'\n')
cat(output_string,file=mixture_model.con)
}else{
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for PCA1 in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
}
#saves file
if(showResults){
grDevices::dev.copy(grDevices::pdf,file.path(out_dir,paste0('sig_gaussian_mixture_model_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
if(grDevices::dev.cur()!=1){
g <- grDevices::dev.off() # to reset the graphics pars to defaults
}
}
close(mixture_model.con)
save(mixture_models,file=file.path(out_dir, "mixture_models_raw_out.rda"))
cat('Gaussian mixture models computed successfully.\n', file=file) #output to log
#-------------next thing we will do with these scores is to plot the QQ plots to check for normality--------
for(k in 1:length(names_sigs)){
mixture_models[[names_sigs[k]]] <- list()
gene_sig <- gene_sigs_list[[names_sigs[k]]] # load the gene signature
if(is.matrix(gene_sig)){gene_sig = as.vector(gene_sig);}
#set up canvas for plotting
if (showResults){
grDevices::dev.new()
}else{
grDevices::pdf(file.path(out_dir,paste0('sig_qq_plots_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
#find the max number of characters in the title
max_title_length <- -999
for( i in 1:length(names_datasets)){
if(max_title_length < nchar(paste0(names_datasets[i],' ',names_sigs[k]))){
max_title_length <- nchar(paste0(names_datasets[i],' ',names_sigs[k]))
}
}
#set up the canvas
graphics::par(mfcol = c(3,length(names_datasets)),mar=c(4,4,4,4))
for ( i in 1:length(names_datasets)){
# now we can loop over the datasets for the plot and generate the metrics for every dataset with this signature
data.matrix = mRNA_expr_matrix[[names_datasets[i]]] #load the data
data.matrix[!(is.finite(as.matrix(data.matrix)))] <- NA #ensure that the data is not infintie
inter = intersect(gene_sig,rownames(data.matrix)) #consider only the genes actually present in the data
med_scores <- apply(data.matrix[inter,],2,function(x){stats::median(stats::na.omit(x))}) #compute median
mean_scores <- apply(data.matrix[inter,],2,function(x){mean(stats::na.omit(x))}) #compute mean
pca1_scores <- NULL
tryCatch({
pca1_scores <- stats::prcomp(stats::na.omit(t(data.matrix[inter,])),retx=T) #compute PCA1
pca1_scores <- pca1_scores$x[,1] #gets the first component of PCA
},error=function(e){
pca1_scores <<- NULL
cat(paste0("There was an error when computing PCA1 score for: ",names_datasets[i]," ", names_sigs[k]," ", e,'\n'), file=file)
})
#the following are the mixture model variables that are going to be the outputs of the mclust function
mixture_models[[names_sigs[k]]][[names_datasets[i]]] <- list()
mixture_models[['median']] <- NULL
mixture_models[['mean']] <- NULL
mixture_models[['pca1']] <- NULL
if(length(med_scores) > 1){
stats::qqnorm(med_scores,plot.it=T,main='Median score')
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
}else{
graphics::plot.new()
graphics::mtext(side=3,line=2.5,paste0(names_datasets[i],' ',names_sigs[k]),cex=min(1,3*10/max_title_length)) #title
graphics::title(paste0('\n\nToo many NA values for Median in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
if(length(mean_scores) > 1){
stats::qqnorm(mean_scores,plot.it=T,main='Mean score')
}else{
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for Mean in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
if(length(pca1_scores) > 1){
stats::qqnorm(pca1_scores,plot.it=T,main='PCA1 score')
}else{
graphics::plot.new()
graphics::title(paste0('\n\nToo many NA values for PCA1 in \n',names_datasets[i],' ',names_sigs[k]))#cex=min(1,4*10/max_title_length))
}
}
#saves file
if(showResults){
grDevices::dev.copy(grDevices::pdf,file.path(out_dir,paste0('sig_qq_plots_',names_sigs[k],'.pdf')),width=3*length(names_datasets),height=10)
}
if(grDevices::dev.cur()!=1){
g <- grDevices::dev.off() # to reset the graphics pars to defaults
}
}
cat('QQ plots computed successfully.\n', file=file) #output to log
#-----------------------------------------------------------------------------------------------------------
radar_plot_values #returns the radarplot values
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.