Agilent_distributionplot.R

library(DESeq2)
library(ggplot2)
library(pheatmap)
library(wesanderson)
library(data.table)
library(org.Hs.eg.db)
library(ggridges)
library(ggpubr)
library(patchwork)
setwd("/home/joh/Projet/PED/April2022")

library(future)
options(future.fork.enable = FALSE)
supportsMulticore()

availableCores() #40 cores
availableWorkers()
plan()
plan("multiprocess", workers = 5)

#source("./signature_generator.R")
dataDIR <- "raw_data/agilent_data"

##-------- Load Counts and metadata
samples_counts.1 <- read.delim(paste0("./",dataDIR,"/NGS2021_4678.count.genes.txt"),skip = 1)
samples_counts.1 <- samples_counts.1[,-c(2:6)]
rownames(samples_counts.1) <- samples_counts.1$Geneid
#samples_counts.1$Geneid <- NULL
colnames(samples_counts.1)

samples_counts <- read.delim(paste0("./",dataDIR,"/NGS2022_5083.count.genes.txt"),skip = 1)
samples_counts <- samples_counts[,-c(2:6)]
rownames(samples_counts) <- samples_counts$Geneid
#samples_counts$Geneid <- NULL
colnames(samples_counts)

samples_counts <- merge(samples_counts.1, samples_counts)
colnames(samples_counts)
rownames(samples_counts) <- samples_counts$Geneid
samples_counts$Geneid <- NULL

samples_meta <- read.csv(paste0(dataDIR,"/TF_SampleID_JO.csv"))
#samples_meta$BulkName <- gsub("-","_",samples_meta$BulkName)
#samples_meta$BulkName <- gsub(" ","_",samples_meta$BulkName)
#write.csv(samples_meta, paste0(dataDIR,"/TF_SampleID_JO.csv"))

colnames(samples_counts) %in% samples_meta$BulkName

#---- Load 116 gene signature
covid_genes <- read.csv("./raw_data/MM_list.csv")
random_genes <- covid_genes # save for additional genes

covid_genes <- covid_genes[which(covid_genes$Group == "Myosig"),]
covid_genes <- covid_genes$Signature
covid_genes <- unique(covid_genes)
length(covid_genes)
length(random_genes$Signature)


#---- Load 105 genes
reduced_genes <- read.csv("./raw_data/public_data/MM_list_reduced.csv")
reduced_random_genes <- reduced_genes # save for additional genes

reduced_genes <- reduced_genes[which(reduced_genes$Group == "Myosig"),]
reduced_genes <- reduced_genes$Signature
reduced_genes <- unique(reduced_genes)
length(reduced_genes)
length(reduced_random_genes$Signature)

#--- MISC paper gene
top25_genes_paper <- read.csv("./raw_data/MM_sig_paper.csv")
top25_genes_paper <- top25_genes_paper[1:25,"SignatureMyo"]

#################################################################################
# Distribution
#################################################################################
samples_counts
samples_meta

message("** Selecting highly variable genes in 3 Chips")   
rv <- as.data.frame(rowVars(as.matrix(samples_counts)))
rownames(rv) <- rownames(samples_counts)
colnames(rv) <- "Variance"
rv$geneid <- rownames(rv)

# select the ntop genes by variance
ntop <- 1000
select <- rv[order(rv$Variance, decreasing=TRUE),]
select <- select[1:ntop,]

message("** Converting Ensemble to Symbol")   
symbols <- suppressMessages(mapIds(org.Hs.eg.db, keys = rownames(select), keytype = "ENSEMBL", column="SYMBOL"))
master_gene_table <- as.data.frame(symbols)
master_gene_table$ensembl <- rownames(master_gene_table)
master_gene_table$symbols <- ifelse(is.na(master_gene_table$symbols), master_gene_table$ensembl, master_gene_table$symbols)

message("** Finding tageted genes (214) + adding 100 non targeted highly variable genes")   
length(random_genes$Signature)
master_gene_table$target <- ifelse(master_gene_table$symbols %in% random_genes$Signature, "target","non")
table(master_gene_table$target)

selected_gene_table <- master_gene_table[which(master_gene_table$target=="non"),]
selected.gene <- selected_gene_table[1:100,]$symbols
selected_gene_table <- master_gene_table[which(master_gene_table$symbols %in% c(selected.gene, random_genes$Signature)),]
selected_gene_table$symbols <- factor(selected_gene_table$symbols, levels = c(random_genes$Signature, selected.gene))
nrow(selected_gene_table)

separate_data <- function(counts, samples_metadata, selected_group){
  samples_metadata <- samples_metadata[which(samples_metadata$Chip == selected_group),]
  counts <- counts[,samples_metadata$BulkName]
  message(paste0("Selected chip : ", unique(samples_metadata$Chip)))
  return(counts)
}

generate_normalised_data_byGene <- function(datasetname, counts, samples_metadata, selected_group, 
                                            selected.gene.table, genetype){
  tmp.data <- separate_data(counts, samples_metadata, selected_group)
  
  if(genetype=="ensembl"){
    tmp.data <- as.data.frame(t(tmp.data[selected.gene.table$ensembl,]))
  }else{
    tmp.data <- as.data.frame(t(tmp.data[selected.gene.table$symbols,]))
  }
  
  tmp.data$BulkName <- rownames(tmp.data)
  
  data.plt <- list()
  n <- ncol(tmp.data) -1 
  for(i in 1:n){
    d <- tmp.data[,c(i,ncol(tmp.data))]
    d$Gene <- colnames(d)[1]
    colnames(d) <- c("value","BulkName", "Gene")
    data.plt[[i]] <- d
  }
  
  final.data <- Reduce(rbind, data.plt)
  final.data$Chip <- selected_group
  final.data$dataset <- datasetname
  
  return(final.data)
  
}

generate_data_for_plot <- function(datasetname, counts, samples_metadata, selected_group, 
                                   selected.gene.table, genetype, master_gene_table){
  message("** Generating data for plots")   
  data <- generate_normalised_data_byGene(datasetname, counts, samples_metadata, selected_group, selected.gene.table, genetype)
  data$symbols <- master_gene_table$symbols[match(data$Gene, master_gene_table$ensembl)]
  data$symbols <- factor(data$symbols , levels=rev(levels(selected.gene.table$symbols)))
  
  message("** Define targeted genes")   
  data$target <- ifelse(data$symbols %in% random_genes$Signature, "target", "no")
  data$target <- as.factor(data$target)
  
  return(data)
}

plot_distribution <- function(data, title, saveFile){
  message(paste0("Plotting : ", title))   
  plt <- ggplot(data, aes(value, symbols, fill=target))+
    geom_density_ridges(aes(value, symbols, fill=target)) +
    theme_ridges() +
    ggtitle(title) +
    ylab("Gene")+
    theme(axis.text.y = element_text(size = 10)) 
  
  message("Saved plots")   
  ggsave(plt, filename = saveFile, height = 35, width = 15, bg = "white")
  return(plt)
}

message("** Plotting")   
data.chip1 <- generate_data_for_plot("Agilent", samples_counts, samples_meta, 1, selected_gene_table, "ensembl", master_gene_table)
unique(data.chip1$BulkName)
data.chip2 <- generate_data_for_plot("Agilent", samples_counts, samples_meta, 2, selected_gene_table, "ensembl", master_gene_table)
unique(data.chip2$BulkName)
data.chip3 <- generate_data_for_plot("Agilent", samples_counts, samples_meta, 3, selected_gene_table, "ensembl", master_gene_table)
unique(data.chip3$BulkName)

plot_distribution(data.chip1, "Chip 1", "./check_agilent/distribution_chip1.png")
plot_distribution(data.chip2, "Chip 2", "./check_agilent/distribution_chip2.png")
plot_distribution(data.chip3, "Chip 3", "./check_agilent/distribution_chip3.png")
jyoh1248/MyoSignature documentation built on May 18, 2022, 12:37 a.m.