ECCO: Analysis for the ECCO project

suppressMessages(library(affy))
suppressMessages(library(limma))
suppressMessages(library(gtools))
suppressMessages(library(sva))
suppressMessages(library("org.Hs.eg.db"))
suppressMessages(library(hgu133plus2hsentrezg.db))
suppressMessages(library(hgu133plus2hsentrezgcdf))
suppressMessages(library("genefilter"))
suppressMessages(library(stringr))
suppressMessages(library("gsubfn"))

options(show.error.locations = TRUE)



########################################################
########## Functions not callable by the user ##########
########## Load these before anything else  ############
########################################################


remove.genes.with.low.expression <- function(data, q)
	{
	data.means <- as.data.frame(rowMeans(data))  # RNA-seq data: removing genes with low expression ( < 10th quantile)
	min.exp <- quantile(data.means[,1], q, na.rm=TRUE)
	#data <- data[rownames(data.means)[data.means[,1] > min.exp],]
	data <- data[data.means[,1] > min.exp,]

	if (nrow(data) == 0)
		{stop("[     ERROR]\tNo genes in data\n")}

	return(data)
	}


remove.genes.with.low.CV <- function(data, q)
	{

	cv <- apply(data,1,sd)/apply(data,1,mean)
	cut <- quantile(cv, q)
	filter <- which(cv>cut)

	data <- data[filter,]

	if (nrow(data) == 0)
		{stop("[     ERROR]\tNo genes in data\n")}

	return(data)
	}


plot.CEL.vs.RNAseq <- function(CEL.data, RNAseq.data, file, xlab, ylab)
	{
	cel <- rowMeans(CEL.data)
	rnaseq <- rowMeans(RNAseq.data)

	min <- min(cel, rnaseq)
	max <- max(cel, rnaseq)

	gene.list <- intersect(rownames(CEL.data), rownames(RNAseq.data))

	pdf(file)

	x.list <- vector(, length(gene.list))
	y.list <- vector(, length(gene.list))

	plot(1, type="n", xlim=c(min,max), ylim=c(min,max), main="CEL vs RNA-seq\nComparison of means", xlab=xlab, ylab=ylab)

	for (i in 1:length(gene.list))
		{
		gene <- gene.list[i]

		x <- mean(RNAseq.data[gene,])
		y <- mean(CEL.data[gene,])

		points(x, y, pch=20, cex=0.6, col='blue')

		x.list[i] <- x
		y.list[i] <- y
		}
	
	segments(min,min,max,max, lty="dashed")

	c <- cor.test(x.list, y.list, method="pearson")

	legend("topright", legend=paste("r =", round(c$estimate,2)), cex=1, bty = 'n')

	dev.off()
	}



########################################################
########## File paths ##################################
########################################################

file.RNAseq.expression = "/media/daguilar/BIOINFO2/TNF/RNA-seq/expression/RSEM.genes.counts.results"
celpath = '/media/daguilar/BIOINFO2/TNF/CEL/DVD_merged'
file.info = '/media/daguilar/BIOINFO2/TNF/CEL/bd_BCN_tnf_biopsies_10062016.tsv'

######################################################
########## Read info file about the samples ##########
######################################################

info <- read.csv(file.info, header=TRUE, sep='\t',na.string=c("n.a.","n.a",""))

info <- info[!is.na(info$sample_location) & !is.na(info$cel_file) & !is.na(info$fastq_TMG_ID),]  # removing all rows with n.a. in the 'sample_location', 'cel_file' or 'fastq_TMG_ID' columns

info <- info[info$week == 0 | info$week == 14, ]  # removing all rows where week != 0 or 14

info$CD_endoscopic_response <- as.character(info$CD_endoscopic_response)

info[info$CD_endoscopic_response=="1" & !is.na(info$CD_endoscopic_response),"CD_endoscopic_response"]  <- "no"
info[info$CD_endoscopic_response=="2" & !is.na(info$CD_endoscopic_response),"CD_endoscopic_response"]  <- "no"
info[info$CD_endoscopic_response=="3" & !is.na(info$CD_endoscopic_response),"CD_endoscopic_response"]  <- "yes"

info[info$UC_endoscopic_response=="1" & !is.na(info$UC_endoscopic_response),"UC_endoscopic_response"]  <- "no"
info[info$UC_endoscopic_response=="2" & !is.na(info$UC_endoscopic_response),"UC_endoscopic_response"]  <- "no"
info[info$UC_endoscopic_response=="3" & !is.na(info$UC_endoscopic_response),"UC_endoscopic_response"]  <- "yes"

if (nrow(info)==0)
	{stop("[     ERROR]\tSample information is empty\n", sep="");}

########################################
########## Read RNA-seq files ##########
########################################

RNAseq.data.all <- as.matrix(read.csv(file.RNAseq.expression, header=TRUE, sep='\t'))

if (sum(!(which(colnames(RNAseq.data.all) %in% info$fastq_TMG_ID))) > 0) # If there are samples in RNAseq.data which are not present in the 'info' file
	{stop("[     ERROR]\n");}

if (grepl("counts", file.RNAseq.expression) == TRUE)  # se eliminan los decimales si trabajamos con counts (hay decimales pq RSEM calcula "expected counts")
	{
	RNAseq.data.all <- round(RNAseq.data.all, 0)
	}


############################################################################################
########## Change names to official HGNC symbol in RNA-seq data whenever possible ##########
############################################################################################

write.table(rownames(RNAseq.data.all), "/home/daguilar/scripts/expression/rownames.txt", sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)

rownames.new <- system("perl /home/daguilar/scripts/expression/find_official_gene_names2.pl /home/daguilar/scripts/expression/rownames.txt /home/daguilar/HGNC/hgnc_complete_set.tsv /home/daguilar/references/hg19/genes.gtf 0", intern=TRUE)

if (sum(duplicated(rownames.new)) > 0)
	{stop("[     ERROR] Duplicated gene names: ", paste(rownames.new[duplicated(rownames.new)], collapse=" "),"\n");}

rownames(RNAseq.data.all) <- rownames.new

####################################
########## Read CEL files ##########
####################################

CEL.files.all <- intersect(as.character(info[!is.na(info$cel_file) & info$fastq_TMG_ID %in% colnames(RNAseq.data.all), "cel_file"]), list.files(celpath))
CEL.data.all <- ReadAffy(filenames=CEL.files.all, celfile.path=celpath, cdfname="hgu133plus2hsentrezgcdf")

#######################################################################################################################
########## We keep only the samples (colnames) present in both CEL and RNA-seq data (for comparison purposes) #########
#######################################################################################################################

RNAseq.data.all <- RNAseq.data.all[, as.character(info[info$cel_file %in% colnames(CEL.data.all), "fastq_TMG_ID"])]

if (ncol(RNAseq.data.all) != length(CEL.files.all))
	{stop("[     ERROR]\n");}


###################################
########## Normalization ##########
###################################

CEL.data.rma = rma(CEL.data.all, verbose=FALSE, normalize=TRUE)
CEL.data.all = exprs(CEL.data.rma)

RNAseq.data.all <- voom(RNAseq.data.all, normalize.method="quantile")
RNAseq.data.all <- RNAseq.data.all$E


#####################################################################################################
########## We change the sample (column) names to 'sample_id' in both CEL and RNA-seq data ########## 
#####################################################################################################

for (i in 1:ncol(CEL.data.all))
	{
	colnames(CEL.data.all)[i] <-  as.character(info[info$cel_file==colnames(CEL.data.all)[i], "Sample_id"])
	}

for (i in 1:ncol(RNAseq.data.all))
	{
	colnames(RNAseq.data.all)[i] <-  as.character(info[info$fastq_TMG_ID==colnames(RNAseq.data.all)[i], "Sample_id"])
	}


###################################################
########## Annotation data for CEL files ##########
###################################################

id <- as.character(rownames(CEL.data.all))

symb <- unlist(mget(id,hgu133plus2hsentrezgSYMBOL, ifnotfound=NA))
name <- unlist(mget(id,hgu133plus2hsentrezgGENENAME, ifnotfound=NA))
nameACC <- unlist(mget(id,hgu133plus2hsentrezgACCNUM, ifnotfound=NA))
entrez <- unlist(mget(id,hgu133plus2hsentrezgENTREZID, ifnotfound=NA))

annotation <- data.frame(probe=id,gene=symb,entrez=entrez,definition=name, stringsAsFactors=FALSE)


#############################################################################################################################################
########## Gene names are used as row names in CEL data (instead of probe names) to allow for an easy comparison with RNA-seq data ########## 
#############################################################################################################################################

CEL.data.all <- CEL.data.all[!(rownames(CEL.data.all) %in% (as.character(annotation[which(is.na(annotation$gene)),"probe"]))), ]
rownames(CEL.data.all) <- as.character(annotation[which(annotation$probe %in% rownames(CEL.data.all)),"gene"])

########################################
########## Filtering CEL data ##########
########################################

CEL.data.all <- remove.genes.with.low.expression(CEL.data.all, 0.10)
CEL.data.all <- remove.genes.with.low.CV(CEL.data.all, 0.25)

############################################
########## Filtering RNA-seq data ##########
############################################

RNAseq.data.all <- remove.genes.with.low.expression(RNAseq.data.all, 0.10)
RNAseq.data.all <- remove.genes.with.low.CV(RNAseq.data.all, 0.25)

######################################################################################################################
########## Calculate difference from median (~normalization) and save matrices for visualization with Excel ########## 
######################################################################################################################

CEL.data.median.all <- CEL.data.all-apply(CEL.data.all,1,median)

RNAseq.data.median.all <- RNAseq.data.all-apply(RNAseq.data.all,1,median)

##########################################
########## Plot CEL vs RNAS-seq ##########
##########################################


plot.CEL.vs.RNAseq(CEL.data.median.all, RNAseq.data.median.all, "CEL_vs_RNAseq.median.pdf", "mean of median-normalized expression levels (RNA-seq)", "mean of median-normalized expression levels (CEL)")

plot.CEL.vs.RNAseq(CEL.data.all, RNAseq.data.all, "CEL_vs_RNAseq.pdf", "mean of normalized expression levels (TPM; RNA-seq)", "mean of normalized expression levels (CEL)")