#' signature_generator_paper() Function
#'
#' This function generate PCA plots according to batch
#' @param raw_counts : raw counts matrix
#' @param samples_metadata : metadata where names of raw counts match with metadata
#' @param design_formula : formula used for DESeq2 process
#' @param covid_genes : vector of genes (symbols), 116 genes
#' @param random_genes : vecotr of genes (symbols), 98 genes
#' @param outDIR : out directory
#' @param outputName : file Name for final Rdata file
#' @param fileName : names added for each plots
#' @param heatmap_anno : heatmap annotation
#' @param heatmap_anno_colori : heatmap colors
#' @param colScale : scale_fill_manual
#' @param colScale2 : scale_color_manual
#' @param genesymbol : gene symbol in raw counts, "ensemble" or "symbols"
#' @param logTransform : if needed to process log2(x+1) transformation
#' @param DESeq2required : if needed to process DESeq2 normalisation
#' @param ranking_groups : groups that is used to calculate ranking
#' @param colorvalue : values of colors with ranking groups
#' @param batch : batch = NULL if not, run batch correction with limma packages (Chip variable)
#' @param blindRun : no messages during DESeq2 process (default = TRUE)
#' @keywords signature
#' @return normalised matrix
#' @export
#' @examples
#' signature_generator_paper()
signature_generator_paper <- function(raw_counts, samples_metadata, design_formula, rankingFormula,
covid_genes, random_genes, top25_genes_paper, outDIR, outputName, fileName,
heatmap_anno, heatmap_anno_colori, colScale, colScale2,
genesymbol, logTransform, DESeq2required, ranking_groups, colorvalue, blindRun = TRUE, batch=NULL){
if(file.exists(paste0("./",outDIR))){
cat("The folder already exists \n")
} else {
dir.create(paste0("./",outDIR))
}
# heatmap parameter
clusteringType <- "correlation"
message("*** Normalisation process")
if(DESeq2required==T){
vstcounts <- DESeq2_process(raw_counts, samples_metadata, design_formula, batch, blindRun, outDIR)
}else {
message("** No need to process the raw counts")
vstcounts <- raw_counts
}
message("*** Gene symbol check process")
if(genesymbol == "ensembl"){
message("** Converting Ensemble to Symbol")
symbols <- suppressMessages(AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys = rownames(vstcounts), keytype = "ENSEMBL", column="SYMBOL"))
master_gene_table <- as.data.frame(symbols)
master_gene_table$ensembl <- rownames(master_gene_table)
message("** Loading normalised counts")
#norm_vst <- vstcounts
norm_vst <- as.data.frame(vstcounts)
norm_vst$GeneName <- master_gene_table[which(master_gene_table$ensembl %in% rownames(norm_vst)),"symbols"]
norm_vst <- norm_vst[!is.na(norm_vst$GeneName),]
}
else {
message("** Direct symbol is used for gene name")
norm_vst <- as.data.frame(raw_counts)
norm_vst$GeneName <- rownames(norm_vst)
}
data.table::setDT(norm_vst)
norm_vst <- norm_vst[, lapply(.SD, mean), by = GeneName]
norm_vst <- as.data.frame(norm_vst)
rownames(norm_vst) <- norm_vst$GeneName
norm_vst$GeneName <- NULL
message("*** Log 2 tranformation process")
if (logTransform==T){
log_data <- log2(norm_vst+1)
message("Log2 transformation required : log2(X+1)")
} else {
log_data <- norm_vst
message("Already log2 transformed, no transformation")
}
message("** Calculating Z-score")
zscore_data <- as.data.frame(t(scale(t(log_data))))
zscore_data <- zscore_data[!is.na(zscore_data),]
message("** Extracting the (116) genes of interest")
log_covid_genes <- log_data[which(rownames(log_data) %in% covid_genes),]
zscore_covid_genes <- zscore_data[which(rownames(zscore_data) %in% covid_genes),]
message(paste0(" Dimension of matrix : ",(dim(zscore_covid_genes))))
pheatmap::pheatmap(log_covid_genes,
annotation_col = heatmap_anno,
fontsize_col=16,
clustering_distance_rows = clusteringType,
clustering_distance_cols = clusteringType,
annotation_colors = heatmap_anno_colori,
cluster_rows = TRUE ,
cluster_cols = TRUE,
legend = TRUE, annotation_legend = TRUE,
main = "Log Normalise",
filename = paste0("./",outDIR,"/",fileName,"_Heatmap_log-norm.png"),width = 18,height = 18)
pheatmap::pheatmap(zscore_covid_genes,
annotation_col = heatmap_anno,
fontsize_col=16,
clustering_distance_rows = clusteringType,
clustering_distance_cols = clusteringType,
annotation_colors = heatmap_anno_colori,
cluster_rows = TRUE ,
cluster_cols = TRUE,
legend = TRUE, annotation_legend = TRUE,
main = "Z-score",
filename = paste0("./",outDIR,"/",fileName,"_Heatmap_zscore.png"),width = 18,height = 18)
message("** Calculating the scores")
score_tables_sig_genes <- table_to_score(zscore_covid_genes,samples_metadata,"group")
# replace NA with 0
is.na(score_tables_sig_genes[[1]]) <- 0
pheatmap::pheatmap(score_tables_sig_genes[[1]],
fontsize_col=8, angle_col = 45,
legend = TRUE, annotation_legend = TRUE,
main = "Score byGene",
filename = paste0("./",outDIR,"/",fileName,"_Score_byGene.png"),width = 18,height = 18)
message("** Take only selected groups for gene ranking")
score_table_by_sample <- score_tables_sig_genes[[2]]
score_table_by_sample <- score_table_by_sample[which(score_table_by_sample$groups %in% ranking_groups),]
score_table_by_sample$sample <- rownames(score_table_by_sample)
score_table_by_sample$groups <- factor(score_table_by_sample$groups, levels=scenario_groups)
write.csv(score_table_by_sample,paste0("./",outDIR,"/7_Score_bySample.csv"),row.names = F)
message("** Ranking")
message(paste0(" Formula: ",gsub("gene_ranked\\$","",rankingFormula)))
score_table_by_gene <- score_tables_sig_genes[[1]]
write.csv(score_table_by_gene,paste0("./",outDIR,"/7_Score_byGene.csv"),row.names = T)
# select only ranking_groups
score_table_by_gene <- score_table_by_gene[,which(colnames(score_table_by_gene) %in% ranking_groups)]
gene_ranked <- as.data.frame(score_table_by_gene)
gene_ranked$final_score <- eval(parse(text=rankingFormula))
gene_ranked <- gene_ranked[order(gene_ranked$final_score,decreasing = TRUE),]
gene_ranked$GeneName <- rownames(gene_ranked)
gene_ranked <- gene_ranked[order(gene_ranked$final_score,decreasing = TRUE),]
gene_ranked_4_plotting <- rownames(gene_ranked)
gene_ranked$GeneName <- factor(gene_ranked$GeneName,levels = gene_ranked_4_plotting)
write.csv(gene_ranked,paste0("./",outDIR,"/8_Score_byGene_ranked.csv"),row.names = FALSE)
ranked_genes <- ggplot2::ggplot(data = gene_ranked, aes(x = GeneName, y = final_score)) + geom_bar(stat="identity") +
theme_bw()+
theme(text = element_text(size=18),
axis.text.x = element_text(angle = 40, hjust = 1),
strip.text.x = element_text(size = 8)) +
labs(y=gsub("gene_ranked\\$","",rankingFormula)) +
geom_vline(xintercept = 26, linetype='dashed', color='red', size=2) +
annotate("text",x=c(27),y=min(gene_ranked$final_score),label=c("top 25"),hjust=0, angle = 90)
ggplot2::ggsave(ranked_genes,filename = paste0("./",outDIR,"/4_Score_byGene_ranked.png"),dpi = 300, width = 32,height = 9, bg = "white")
message("*** Plotting ranking in separated groups")
rankingplot_separate_group(gene_ranked, colorvalue, ranking_groups, outDIR, fileName)
message("** Processing Top 25 genes")
top25_genes <- gene_ranked[1:25,"GeneName"]
top25_score <- zscore_data[which(rownames(zscore_data) %in% top25_genes),]
message("** Plotting zscore values of Top 25 genes")
log_top25 <- log_data[which(rownames(log_data) %in% top25_genes),]
top25_score <- table_to_score(top25_score,samples_metadata,"group")
top25_score <- top25_score[[2]]
top25_score$sample <- rownames(top25_score)
top25_score$groups <- factor(top25_score$group, levels=scenario_groups)
write.csv(top25_score,paste0("./",outDIR,"/11_Score_bySample_top25.csv"),row.names = FALSE)
message("** Processing Top 25 genes from MISC paper")
top25_score_paper <- zscore_data[which(rownames(zscore_data) %in% top25_genes_paper),]
top25_score_paper <- table_to_score(top25_score_paper,samples_metadata,"group")
top25_score_paper <- top25_score_paper[[2]]
top25_score_paper$sample <- rownames(top25_score_paper)
top25_score_paper$groups <- factor(top25_score_paper$group, levels=scenario_groups)
write.csv(top25_score_paper,paste0("./",outDIR,"/13_Score_bySample_top25_from_paper.csv"),row.names = FALSE)
message("** Extracting the additional (98) genes")
# random_genes should be a vector
# it excludes the genes that are in the covid_gene list
random_genes <- random_genes[-which(random_genes %in% covid_genes)]
log_random_genes <- log_data[which(rownames(log_data) %in% random_genes),]
zscore_random_genes <- zscore_data[which(rownames(zscore_data) %in% random_genes),]
message(paste0(" Dimension of matrix : ",(dim(zscore_random_genes))))
top25_score_additional <- table_to_score(zscore_random_genes,samples_metadata,"group")
top25_score_additional <- top25_score_additional[[2]]
top25_score_additional$sample <- rownames(top25_score_additional)
top25_score_additional$groups <- factor(top25_score_additional$group, levels=scenario_groups)
write.csv(top25_score_additional,paste0("./",outDIR,"/17_Score_bySample_additionalGenes.csv"),row.names = FALSE)
message("** Saving R object")
save(samples_metadata,score_tables_sig_genes,covid_genes,random_genes,ranking_groups,
gene_ranked,top25_score,top25_score_paper, file=paste0("./Agilent_",outputName,".RData"))
message("** Done")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.