"
╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗
╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝
R helper functions, preceeded by various 'notes to self'
more up-to-date version in https://github.com/ks4471/addR 'adds'
╔═╦╗╔╦╗╔═╦═╦╦╦╦╗╔═╗╔═╦╗╔╦╗╔═╦═╦╦╦╦╗╔═╗╔═╦╦╦╦╗╔═╗╔═╦╗╔═╦╗╔╦╗╔═╦═╦╦╦╦╗╔═╗╔═╗╔═╦╗╔═╦╗╔╔═╦╗╔═╦╗╔╗
╠╗║╚╝║║╠╗║╚╣║║║║║╚╣ ╠╣║║║║║═╣║║╠╗║║╚╣╚╣╔╣╔╣╔╣╔╣║╚╣═╣║╚╣║║║╚╣╔╣╔╣║╚╣═╣║╗║╚╚╣
╚═╩══╩═╩═╩═╩╝╚╩═╩═╝╚═╩══╩═╩═╩═╩╝╚╩═╩═╩╝╚╩═╩═╝╚═╩══╩═╩═╩═╩╝╚╩═╩═╩╝╚╩═╩═╝═╩╝╚╩═╩═╩╝╩═╩═╩═╩╝╚╩═╩
"
#options(stringsAsFactors=F);library(colorout);rm(list=ls());ls()
#options(menu.graphics=FALSE);library(R.helper)
#devtools::install_github("ks471/R.helper")
##
#gsub("[^A-Za-z0-9 _.,!]", "", humpty$title) ## gsub all but alphanumeric AND punctuation ## http://stackoverflow.com/questions/7233447/a-regex-to-match-strings-with-alphanumeric-spaces-and-punctuation
#gsub("[^[:alnum:] ]", "", str) #http://stackoverflow.com/questions/8959243/r-remove-non-alphanumeric-symbols-from-a-string
## useful concept : add a readme.object.in.saved.R.object when saving file create a "method" / "description" of the file for easier handover / re-analysis
## readme.lires=
# "\tlires - differentially expressed genes in the hippocampus compared to 3 other brain regions FC, OC, TC Hardy UKBEC dataset n=102\n
# \tdesm=design matrix file for limma\n
# limma used for this code.file : 003.DifferentiallyExpressedGenes.limma.R |\n| adj.P - calculated based using FDR, CX excluded because it is very differentially expressed compared to the other 3 - all genes were differential with CX"
## save(lires,desm,readme,file="~/Dropbox/CapricaPrime/RU/dtb/secondary/HC.differentially.expressed.v.FC.OC.TC.R")
### saves 'cat' output to file, file separator can be controlled using the "\t" in relevant places
#sink("~/Dropbox/bin/gsea/modules_bonn.gtf")
# for(imod in 1:length(bgen)){
# cat(as.vector(paste(c(names(bgen)[imod],bgen[[names(bgen)[imod]]]),collapse="\t")),"\n",sep="")
# }
#sink()
## http://serverfault.com/questions/25199/using-wget-to-recursively-download-whole-ftp-directories
## Check below wget command to download data from FTP recursively
## wget --user="" --password="" -r -np -nH --cut-dirs=1 --reject "index.html*" ""
## -r : is for recursively download.
## -np : is for no parent ascending.
## -nH : is for disabling creation of directory having name same as URL i.e. abc.xyz.com
## --cut-dirs : is for ignoring no. of parent directories. Value of this option will differ for your command.
## extracting files using cmd / mac os
## https://www.cyberciti.biz/faq/tar-extract-linux/
## tar -xvf file.tar
## tar -xzvf file.tar.gz
## tar -xjvf file.tar.bz2
## -x : Extract a tar ball.
## -v : Verbose output or show progress while extracting files.
## -f : Specify an archive or a tarball filename.
## -j : Decompress and extract the contents of the compressed archive created by bzip2 program (tar.bz2 extension).
## -z : Decompress and extract the contents of the compressed archive created by gzip program (tar.gz extension).
## http://askubuntu.com/questions/25347/what-command-do-i-need-to-unzip-extract-a-tar-gz-file
## tar -xvzf community_images.tar.gz
## To explain a little further, tar collected all the files into one package, community_images.tar. The gzip program applied compression, hence the gz extension. So the command does a couple things:
## f: this must be the last flag of the command, and the tar file must be immediately after. It tells tar the name and path of the compressed file.
## z: tells tar to decompress the archive using gzip
## x: tar can collect files or extract them. x does the latter.
## v: makes tar talk a lot. Verbose output shows you all the files being extracted.
## gunzip file.gz
## gzip -d file.gz
####■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■
## running long jobs using 'screen' ## http://aperiodic.net/screen/quick_reference
# screen -ls ## look up existing screen / ids
# screen -S new_screen_name ## create new screen obj
# ctrl + a + d ## consecutive -> detach current screen
# screen -e new_screen_name ## resume 'new_screen_name' - if it is still running
## useful concept : add a readme.object.in.saved.R.object when saving file create a "method" / "description" of the file for easier handover / re-analysis
## readme.lires=
# "\tlires - differentially expressed genes in the hippocampus compared to 3 other brain regions FC, OC, TC Hardy UKBEC dataset n=102\n
# \tdesm=design matrix file for limma\n
# limma used for this code.file : 003.DifferentiallyExpressedGenes.limma.R |\n| adj.P - calculated based using FDR, CX excluded because it is very differentially expressed compared to the other 3 - all genes were differential with CX"
## save(lires,desm,readme,file="~/Dropbox/CapricaPrime/RU/dtb/secondary/HC.differentially.expressed.v.FC.OC.TC.R")
### saves 'cat' output to file, file separator can be controlled using the "\t" in relevant places
#sink("~/Dropbox/bin/gsea/modules_bonn.gtf")
# for(imod in 1:length(bgen)){
# cat(as.vector(paste(c(names(bgen)[imod],bgen[[names(bgen)[imod]]]),collapse="\t")),"\n",sep="")
# }
#sink()
## easier names for vars used by phyper and fisher.test:
# phyper(success_in_sample, success_in_bkgd, failure_in_bkgd, sample_size, lower.tail=TRUE)
#fisher.test(matrix(c(x, 13-x, 5-x, 34+x), 2, 2), alternative='less');
# Numerical parameters in order:
# (success-in-sample, success-in-left-part, failure-in-sample, failure-in-left-part).
#########################################################################################################
#########################################################################################################
# ----------------------------------------------------------------------------------------------------- #
# Alternate way to calculate PC1 ----------------------------------------------------
# ----------------------------------------------------------------------------------------------------- #
# removing the first Principal component from data and reconstructing full matrix
#SVD=svd(t(scale(t(Ccombi[2:(ncol(Ccombi)-3)]))))
#str(SVD)
#Expr=SVD$u%*%diag(SVD$d)%*%t(SVD$v)
#PC1=SVD$u[,1]%*%(SVD$d[1])%*%t(SVD$v[1,1])
#SVD=svd(t(scale(t(D1))))
#str(SVD)
#PC1=SVD$u[,1]%*%(SVD$d[1])%*%t(SVD$v[1,1])
#for(idat in 1:length(fnames)){
#cat(" ",dat_descr,"===========",which(fnames==fnames[idat]),"of",length(fnames),"\n")
#for(ireg in 1:length(names(list_expr))){
# print(paste("--------------------",names(list_expr)[ireg],"---------------------",ireg,"of",length(names(list_expr))))
# cat(round(j/ncol(Matrix),digits=2),"\r");flush.console()
#specify the name of the object in the R binary file you just loaded || can use this to load R binary files instead of 'load':
#file_path="/User/dy/my_R_data/a_data_set.RData"
#attach(file_path, pos=2, name=choose_a_name, warn.conflict=T)
#'warn.conflicts=T' is the default option
#'pos=2' is also the default; "2" refers to the position in your search path. For instance, position 1 is ".GlobalEnv." To get the entire array of search paths, use search(). So you would access the search path for the new object by search()[2]
#use 'detach' to remove the object
"
╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗╔═╗╔═╦╗╔═╦═╦╦╦╦╗╔═╗╔╗═╦╗╔═╦╗╗╔╦╗
╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝╚═╝╩═╩╝╚═╩══╩═╩═╩═╩╝╩═╩╝╚═╩═╩═╩╝
"
####■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■
# potentially very useful plot - eg factor plot
#par(mar=c(0,0,0,0))
# Set up the plotting area
#plot(NA, xlim=c(0,1), ylim=c(6.5, -0.5),
# xaxt="n", yaxt="n",
# xlab=NA, ylab=NA )
# Draw the lines
#for (i in 0:6) {
# points(c(0.25,1), c(i,i), lty=i, lwd=2, type="l")
#}
# Add labels
#text(0, 0, "0. 'blank'" , adj=c(0,.5))
#text(0, 1, "1. 'solid'" , adj=c(0,.5))
#text(0, 2, "2. 'dashed'" , adj=c(0,.5))
#text(0, 3, "3. 'dotted'" , adj=c(0,.5))
#text(0, 4, "4. 'dotdash'" , adj=c(0,.5))
#text(0, 5, "5. 'longdash'", adj=c(0,.5))
#text(0, 6, "6. 'twodash'" , adj=c(0,.5))
colmix=c(
"#0072B2"
,"#E69F00"
,"#009E73"
,"#56B4E9"
,"#D55E00"
,"#66A61E"
,"#7570B3"
,"#a50f15"
,"#A6761D"
,"#117733"
,"#332288"
,"#b15928"
,"#882255"
,"#999933"
,"#AA4499"
,"#1f78b4"
,"#F0E442"
,"#e31a1c"
,"#6a3d9a"
,"#b2df8a"
,"#08519c"
,"#ff7f00"
,"#fdbf6f"
,"#33a02c"
,"#b15928"
,"#f16913"
,"#238b45"
,"#807dba"
,"#d94801"
,"#41ab5d"
,"#fd8d3c"
,"#4292c6"
)
pastelcolmix=c(
"#cab2d6"
,"#ffff99"
,"#8dd3c7"
,"#ffffb3"
,"#bebada"
,"#fb8072"
,"#80b1d3"
,"#fdb462"
,"#b3de69"
,"#fccde5"
,"#d9d9d9"
,"#bc80bd"
,"#ccebc5"
,"#ffed6f"
,"#a6cee3"
,"#fb9a99"
)
colmixrb=c(colorRampPalette(c(
"#3f007d"
,"#313695"
,"#053061"
,"#08306b"
,"#045a8d"
,"#08519c"
,"#2171b5"
,"#6baed6"
))(8),rev(colorRampPalette(c(
"#67001f"
,"#67000d"
,"#a50f15"
,"#bd0026"
,"#cb181d"
,"#e31a1c"
,"#ef3b2c"
,"#f4a582"
))(8)))
colmixb=c(colorRampPalette(c(
"#3f007d"
,"#313695"
,"#053061"
,"#08306b"
,"#045a8d"
,"#08519c"
,"#2171b5"
,"#6baed6"
))(8))
colmixr=rev(colorRampPalette(c(
"#67000d"
,"#a50f15"
,"#bd0026"
,"#cb181d"
,"#e31a1c"
,"#ef3b2c"
,"#f4a582"
))(7))
colred=c('#67001f','#a50026','#f46d43','#fb9a99')
colblu=c('#2d004b','#053061','#2166ac','#4393c3')
colblus=c('#08306b','#08519c','#2171b5','#4292c6','#6baed6','#9ecae1','#c6dbef','#deebf7','#f7fbff')
colreds=c('#67000d','#a50f15','#cb181d','#ef3b2c','#fb6a4a','#fc9272','#fcbba1','#fee0d2','#fff5f0')
colrbd =c('#320000',"#800026","#bd0026","#e31a1c","#fc4e2a","#fd8d3c","#feb24c","#fed976" # reds
,"#c7e9b4","#7fcdbb","#41b6c4","#1d91c0","#225ea8","#253494","#081d58",'#49006a') # blues # dark purple , -- need a corresponding smth for the darkest red
colrb =c("#800026","#bd0026","#e31a1c","#fc4e2a","#fd8d3c","#feb24c","#fed976" # reds
,'white'
,"#c7e9b4","#7fcdbb","#41b6c4","#1d91c0","#225ea8","#253494","#081d58") # blues # dark purple , -- need a corresponding smth for the darkest red
colbw=c("#ffffff","#f0f0f0","#d9d9d9","#bdbdbd","#969696","#737373","#525252","#252525","#000000")
#--#~~blu=c(
#blue- violet
#--#~~"#fff7fb"
#--#~~#--#~~,"#ece7f2"
#--#~~,"#d0d1e6"
#--#~~,"#a6bddb"
#--#~~,"#74a9cf"
#--#~~,"#3690c0"
#--#~~,"#0570b0"
#--#~~,"#045a8d"
#--#~~,"#023858"
# blue - violet
#--#~~,"#f7fcfd"
#--#~~,"#e0ecf4"
#--#~~,"#bfd3e6"
#--#~~,"#9ebcda"
#--#~~,"#8c96c6"
#--#~~,"#8c6bb1"
#--#~~,"#88419d"
#--#~~,"#810f7c"
#--#~~,"#4d004b"
# violet
#--#~~,"#fcfbfd"
#--#~~,"#efedf5"
#--#~~,"#dadaeb"
#--#~~,"#bcbddc"
#--#~~,"#9e9ac8"
#--#~~,"#807dba"
#--#~~,"#6a51a3"
#--#~~,"#54278f"
#--#~~,"#3f007d"
#--#~~,"#f7fbff"
# blue
#--#~~,"#deebf7"
#--#~~,"#c6dbef"
#--#~~,"#9ecae1"
#--#~~,"#6baed6"
#--#~~,"#4292c6"
#--#~~,"#2171b5"
#--#~~,"#08519c"
#--#~~,"#08306b"
# mix
#--#~~,"#d1e5f0"
#--#~~,"#e0f3f8"
#--#~~,"#92c5de"
#--#~~,"#abd9e9"
#--#~~,"#74add1"
#--#~~,"#4393c3"
#--#~~,"#4575b4"
#--#~~,"#2166ac"
#--#~~,"#053061"
#--#~~,"#313695"
#--#~~)
#--#~~red=c(
# mix
#--#~~"#67001f"
#--#~~,"#a50026"
#--#~~,"#b2182b"
#--#~~,"#d73027"
#--#~~,"#d6604d"
#--#~~,"#f46d43"
#--#~~,"#fdae61"
#--#~~,"#fee090"
#--#~~,"#f4a582"
#--#~~,"#ffffbf"
#--#~~,"#ffffcc"
#--#~~,"#ffeda0"
#--#~~,"#fed976"
#--#~~,"#feb24c"
#--#~~,"#fd8d3c"
#--#~~,"#fc4e2a"
#--#~~,"#e31a1c"
#--#~~,"#bd0026"
#--#~~,"#800026"
#--#~~,"#fff5eb"
#--#~~,"#fee6ce"
#--#~~,"#fdd0a2"
#--#~~,"#fdae6b"
#--#~~,"#fd8d3c"
#--#~~,"#f16913"
#--#~~,"#d94801"
#--#~~,"#a63603"
#--#~~,"#7f2704"
#--#~~,"#fff5f0"
#--#~~,"#fee0d2"
#--#~~,"#fcbba1"
#--#~~,"#fc9272"
#--#~~,"#fb6a4a"
#--#~~,"#ef3b2c"
#--#~~,"#cb181d"
#--#~~,"#a50f15"
#--#~~,"#67000d")
#--#~~tencol=c(
#--#~~"#0a0972"
##--#~~,"#313695"
#--#~~,"#08306b"
#--#~~,"#023858"
##--#~~,"#053061"
##--#~~,"#08519c"
##--#~~,"#045a8d"
#--#~~,"#2166ac"
#--#~~,"#2171b5"
#--#~~,"#4575b4"
#--#~~,"#4393c3"
#--#~~,"#3690c0"
#--#~~,"#4292c6"
#--#~~,"#6baed6"
#--#~~,"#74a9cf"
#--#~~)
#ramp=colorRampPalette(c("#0a0972"#--#~~,"#023858"#--#~~,"#2166ac"#--#~~,"#4292c6"))(10)
#--#~~rampb=colorRampPalette(c(
#--#~~"#3f007d"
#--#~~,"#313695"
#--#~~,"#053061"
#--#~~,"#08306b"
#--#~~,"#045a8d"
#--#~~,"#08519c"
#--#~~,"#2171b5"
#--#~~,"#6baed6"
#--#~~))(10)
#--#~~rampr=colorRampPalette(c(
#--#~~"#67001f"
#--#~~,"#67000d"
#--#~~,"#a50f15"
#--#~~,"#bd0026"
#--#~~,"#cb181d"
#--#~~,"#e31a1c"
#--#~~,"#ef3b2c"
#--#~~,"#f4a582"
#--#~~))(10)
#colmix=blu
#colmix=red
#colmix=tencol
#
#--#~~colmix=c(rampb,rev(rampr))
#--#~~ par(mar=c(0,0,0,0))
#--#~~ plot(NA, xlim=c(0,1), ylim=c(length(colmix), -0.5),
#--#~~ xaxt="n", yaxt="n",
#--#~~ xlab=NA, ylab=NA )
#--#~~ for (i in 1:length(colmix)){
#--#~~ points(c(0.25,1), c(i,i), lwd=15, type="l",col=colmix[i])
#--#~~ text(0, i, paste(i,colmix[i]) , adj=c(0,.5))
#--#~~}
#--#~~ colmix=c(
#--#~~ "white"
#--#~~ ,"#1f78b4"
#--#~~ ,"#E69F00"
#--#~~ ,"#882255"
#--#~~ ,"white"
#--#~~ ,"#56B4E9"
#--#~~ ,"#66A61E"
#--#~~ ,"#bd0026"
#--#~~ ,"white"
#--#~~ ,"#0072B2"
#--#~~ ,"#117733"
#--#~~ ,"#A6761D"
#--#~~ ,"white"
#--#~~ ,"#D55E00"
#--#~~ ,"#009E73"
#--#~~ ,"white"
#--#~~ ,"#AA4499"
#--#~~ ,"#6a3d9a"
#--#~~ ,"white"
#--#~~ ,"#7570B3"
#--#~~ ,"white"
#--#~~ ,"#332288"
#--#~~ ,"#999933"
#--#~~ ,"#b2df8a"
#--#~~ ,"#33a02c"
#--#~~ ,"#ff7f00"
#--#~~ ,"#e31a1c"
#--#~~ ,"#fdbf6f"
#--#~~ ,"#b15928"
#--#~~ ,"#fc4e2a"
#--#~~ ,"#e31a1c"
#--#~~ ,"#800026"
#--#~~ ,"#F0E442"
#--#~~ )
############################################################################################################
##############################============= WGCNA Part 2 =============#############################
##############################----- Clustering Analysis -----###############################
############################################################################################################
#print("■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■")
####=================================================================================================
### Calculate consensus modules
## ------- 1 -------
#peer="PEER."
#peer=""
## ------- 2 -------
#pc1correct_data="pc1correct."
#
#pc1correct_data=""
####=================================================================================================
###-----------------------------------------------------------------------------------------------
# Plot the cut tree ----------------------------------------------------------------------------
#=================================================================================================
# 3) Cluster again? Copy - paste this until there's no change.
#------------------------------------------------------------------------------
####----------------------------------------------------------------------------------------------
## calculate networks
# May have to tinker with options to optimise tree cutting ---------------------------------
# maxBlockSize - increased to cut as single tree (avoid merging)
# deepSplit - reduced to 2 (from 3)
# an important parameter to be selected <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<***҉҉҉҉҉҉҉
#
make.numeric<-function(Matrix,col_factor="",fac_legend=F,verbose=F,help=F,char_as_fac=F){ #char_as_fac=T,
if(help==T){
cat("\tINPUTS:\tMatrix - rows : samples, columns - variables\n")
cat("\tOPTIONS:\tfactors - column numbers to convert to factor instead of numeric | default='none'\n")
cat("\tNOTE:\tcharacter columns can be converted to numeric 'categories' by as.numeric (default)\n\n")
}
# for any correlation analysis like linear modeling non-numeric categories eg 'sex' should always be converted to factor, numeric 1=male, 2=female is considered 2>1 vs 1!=2 if as.factor
# cat("converting",class(Matrix),"dimensions :",dim(Matrix),"\n")
# print(head(Matrix))
#if(char_as_fac==T){
# col_factor=names(which(lapply(Matrix,class)=="character"))
#}
legend=list()
num.mat=matrix(NA,ncol=ncol(Matrix),nrow=nrow(Matrix))
colnames(num.mat)=colnames(Matrix)
rownames(num.mat)=rownames(Matrix)
# if(char_as_fac){
# }
col_numeric=1:ncol(Matrix)
col_numeric=col_numeric[!(col_numeric%in%col_factor)]
#print(col_numeric)
#print(col_factor)
for(inum in col_numeric){
if(verbose==T){
cat("\t\t - column",inum,"coverted as.numeric\n")
}
num.mat[,inum]=as.numeric(Matrix[,inum])
if(class(Matrix[,inum])=="factor" & fac_legend){
descr=list(legend=Matrix[,inum])
descr$values=num.mat[,inum,drop=F]
descr=unique(as.data.frame(descr))
legend[[colnames(Matrix)[inum]]]=descr
}
# print(head(num.mat))
}
if(sum(col_factor=="")!=1){
if(verbose==T){
cat("\n\tNOTE:\tmatrix can only hold 1 kind of object eg numeric, character, etc..\n\t\t\tdata.frame will be used instead\n")
}
num.mat=as.data.frame(num.mat)
for(ifac in col_factor){
if(verbose==T){
cat("\t\t - column",ifac,"coverted as.factor\n")
}
num.mat[,ifac]=as.factor(Matrix[,ifac])
# print(head(num.mat))
}
}
# cat("new",class(num.mat),"dimensions :", dim(num.mat))
if(!fac_legend){
return(num.mat)
}
if(fac_legend){
return(list("numeric"=num.mat,"legend"=legend))
}
if(verbose==T){
cat("\t--------------------- make.numeric - finished ---------------------\n\n")
}
}
### ----------------------------------------------------------------------
## new version below is updated version to handle list objects more easily, and
#Head<-function(Matrix){
# if(is.vector(Matrix)==T & is.list(Matrix)==F){
# print(as.matrix(Matrix[1:min(c(length(Matrix),10))]))
# cat("\n vector length :",length(Matrix),"\n")
# cat("\n Numeric :",is.numeric(Matrix),"\n")
# }
# if(is.list(Matrix)==T & is.data.frame(Matrix)==F){
# print(str(Matrix[1:min(c(length(Matrix),10))]))
# print(as.matrix(Matrix[1:min(c(length(Matrix),10))]))
# Matrix=as.data.frame(Matrix)
# print(Matrix[1:min(c(nrow(Matrix),10)),1:min(c(ncol(Matrix),5))])
# cat("\n list length :",length(Matrix),"\n")
# cat("\n Numeric :",is.numeric(Matrix),"\n")
# }
# if(is.matrix(Matrix)==T | is.data.frame(Matrix)==T){
# print(Matrix[1:min(c(nrow(Matrix),10)),1:min(c(ncol(Matrix),5))])
# cat("\n",class(Matrix),"dimensions :",dim(Matrix),"\n")
# cat("\n Numeric :",is.numeric(Matrix),"\n")
# }
#}
### ----------------------------------------------------------------------
## Updated version to handle list objects more easily
Head<-function(data_obj,nlist=1,ncol=1:5,nrow=1:10){
cat("\n\tobject class : ",class(data_obj),"\n\n")
prev=""
if(class(data_obj)=="list"){
if(length(names(data_obj))<50){
cat("\t\tlist contains",length(names(data_obj)),"objects:\n")
cat("\t\t\t",as.matrix(names(data_obj)),"\n\n",sep=" ")
cat("\t\tlist[[",nlist,"]] contains ",class((data_obj[[1]]))," :",sep="")
prev=paste("list[[",nlist,"]] contains :",sep="")
data_obj=data_obj[[nlist]]
}
if(length(names(data_obj))>50){
cat("\t\tlist contains",length(names(data_obj)),"objects, first 50:\n")
cat("\t\t\t",as.matrix(names(data_obj)[1:min(50,length(names(data_obj)))]),"...\n\n",sep=" ")
cat("\t\tlist[[",nlist,"]] contains ",class((data_obj[[1]]))," :",sep="")
prev=paste("list[[",nlist,"]] contains :",sep="")
data_obj=data_obj[[nlist]]
}
}
if(class(data_obj)=="list"){
cat("",length(names(data_obj)),"objects:\n")
cat("\t\t\t",as.matrix(names(data_obj)),"\n\n",sep=" ")
}
if(class(data_obj)=="data.frame" | class(data_obj)=="matrix"){
cat("\n\n")
print(data_obj[min(1,min(nrow)):min(max(nrow),nrow(data_obj)),min(1,min(ncol)):min(max(ncol),ncol(data_obj)),drop=F])
cat("\n\t",prev,class(data_obj),"dimensions : ",dim(data_obj),"\n")
cat("\t\tis.numeric :",is.numeric(data_obj))
if(is.numeric(data_obj)){
cat('\tmin=',min(data_obj,na.rm=T),'max=',max(data_obj,na.rm=T),'\n')
}
cat('\n')
}
if(class(data_obj)=="vector"){
cat("\n\n")
print(data_obj[nrow])
cat("\n\t",prev,class(data_obj),"length : ",length(data_obj),"\n")
cat("\t\tis.numeric :",is.numeric(data_obj))
if(is.numeric(data_obj)){
cat('\tmin=',min(data_obj,na.rm=T),'max=',max(data_obj,na.rm=T),'\n')
}
cat('\n')
}
if(class(data_obj)!="list" & class(data_obj)!="data.frame" & class(data_obj)!="matrix" & class(data_obj)!="vector"){
cat("\n\n")
str(data_obj)
cat("\n\tis.numeric :",is.numeric(data_obj))
if(is.numeric(data_obj)){
cat('\tmin=',min(data_obj,na.rm=T),'max=',max(data_obj,na.rm=T),'\n')
}
cat('\n')
}
# cat("\t",class(data_obj),"dimensions : ",dim(data_obj),"\n")
}
#
sd.check<-function(dat_mat,check_rows=F,check_cols=T,verbose=T,help=F){
if(help==T){
cat("\nUSE: check that values in rows/columns of matrix/dataframe vary: sd>0 or factors are informative: have >1 level / not ids n.levels=n.rows\n")
}
#start_time=Sys.time()
if(is.matrix(dat_mat)){dat_class=apply(dat_mat,2,class)}
if(is.data.frame(dat_mat)){dat_class=unlist(lapply(dat_mat,class))}
#if(verbose==T){
cat("\n+++++++++++++++++ data qc check +++++++++++++++++\n")
cat("dat_mat contains",ncol(dat_mat),"variables, of which :\n")
print(table(dat_class))
#}
dat_num=dat_mat[,dat_class==("numeric"),drop=F]
dat_fac=dat_mat[,dat_class=="factor",drop=F]
dat_otr=dat_mat[,!(dat_class%in%c("factor","numeric")),drop=F]
rowind=1:nrow(dat_mat)
colind=1:ncol(dat_mat)
if(check_cols==T){
fac_col=apply(dat_fac,2,function(x) sum(table(x)!=0))
num_col=apply(dat_num,2,sd)
numvarcol=names(num_col)[num_col==0]
if(length(numvarcol)){cat("\tcol - values do not vary (sd=0) :\t",paste(numvarcol,collapse=", "),"\n")}
facvarcol=names(fac_col)[fac_col==1]
if(length(facvarcol)){cat("\tcol - contains single value :\t\t",paste(facvarcol,collapse=", "),"\n")}
facvaridc=names(fac_col)[fac_col==nrow(dat_fac)]
if(length(facvaridc)){cat("\tcol - as many levels as rows :\t\t",paste(facvaridc,collapse=", "),"\n")}
colind=!(colnames(dat_mat)%in% c(numvarcol,facvarcol,facvaridc))
# cat("\ttotal n cols removed :\t",length(c(numvarrow,facvarrow,facvarids)),"\n") # for this to work need to a track counter // or some count of n elements for each var
}
if(check_rows==T){
fac_row=apply(dat_fac,1,function(x) sum(table(x)!=0))
num_row=apply(dat_num,1,sd)
numvarrow=names(num_row)[num_row==0]
if(length(numvarrow)){cat("\t",length(numvarrow),"rows - values do not vary (sd=0) , first ",min(20,length(numvarrow)),":\n",paste(numvarrow[1:min(20,length(numvarrow))],collapse=", "),"\n")}
facvarrow=names(fac_row)[fac_row==1]
if(length(facvarrow)){cat("\t",length(facvarrow),"rows - contain single value 'factor', first ",min(20,length(facvarrow)),":\n",paste(facvarrow[1:min(20,length(facvarrow))],collapse=", "),"\n")}
facvaridr=names(fac_row)[fac_row==nrow(dat_fac)]
if(length(facvaridr)){cat("\trow - as many levels as rows :\t\t",paste(facvaridr,collapse=", "),"\n")}
rowind=!(rownames(dat_mat)%in% c(numvarrow,facvarrow,facvaridr))
# cat("\ttotal n rows removed :\t",length(c(numvarrow,facvarrow,facvarids)),"\n") # for this to work need to a track counter // or some count of n elements for each var
}
if(ncol(dat_otr)>0){
cat("\tcol - not factor nor numeric :\t",paste(names(dat_otr),collapse=", "),"\n")
}
cat("\n")
return(invisible(dat_mat[rowind,colind]))
# print(Sys.time()-start_time)
}
psig<-function(dat_mat,p_col,sort=F){
dat_mat$sig=""
dat_mat[dat_mat[,p_col]<0.1,"sig"]="."
dat_mat[dat_mat[,p_col]<0.05,"sig"]="+"
dat_mat[dat_mat[,p_col]<0.01,"sig"]="*"
dat_mat[dat_mat[,p_col]<0.001,"sig"]="**"
dat_mat[dat_mat[,p_col]<0.0001,"sig"]="***"
dat_mat[dat_mat[,p_col]<0.00001,"sig"]="****"
if(sort){
dat_mat=dat_mat[order(dat_mat[,p_col]),]
}
return(dat_mat)
}
# extract p-value form linear model
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f=summary(modelobject)$fstatistic
p=pf(f[1],f[2],f[3],lower.tail=F)
attributes(p)=NULL
return(p)
}
mcols= c( # color pallete optimised for omitting 1, if plotting using all the alternatives for 4:5, 7:8 are more useful
"#0072B2"
,"#E69F00"
,"#009E73"
,"#56B4E9" #,"#7570B3"
,"#D55E00" #,"#882255"
,"#66A61E"
,"#7570B3" #,"#D55E00"
,"#882255" #,"#D55E00" ####,"#CC79A7"
,"#F0E442"
,"#A6761D"
,"#AA4499"#
,"#117733"
,"#332288"
,"#999933")
lplot<-function(x,n,legend=T,xlab="",ylab="",main="",adjust=0,thresh=0.01){
coords=max((adjust+1),1):min(11,(ncol(x)+(adjust)))
print(coords)
par(mar=c(8.1,8.1,4.1,2.1),mgp=c(7,1,0))
pchl=c(21:25,15:20) # col - specifies color of 15:20 and border color in 21:25, bg=color of 21:25, nothing in 15:20
coll=c( # color pallete optimised for omitting 1, if plotting using all the alternatives for 4:5, 7:8 are more useful
"#0072B2"
,"#E69F00"
,"#009E73"
,"#56B4E9" #,"#7570B3"
,"#D55E00" #,"#882255"
,"#66A61E"
,"#7570B3" #,"#D55E00"
,"#882255" #,"#D55E00" ####,"#CC79A7"
,"#F0E442"
,"#A6761D"
,"#AA4499"#
,"#117733"
,"#332288"
,"#999933")
# for testing colors
#x=1:20
#x=cbind(x,x)
#plot(x,main="color scale")
#for(i in 1:length(coll)){
# abline(h=i,col=coll[i],lwd=10)
#}
if(ncol(x)>11){print("11 columns is the current maximum coded for plotting, columns 12 onwards ignored")}
plot(seq(floor(range(x[,(which(x==max(x,na.rm=T),arr=T))[1,2]])[1]),ceiling(range(x[,(which(x==max(x,na.rm=T),arr=T))[1,2]])[2]),length=nrow(x)),type="n",yaxt="n",xaxt="n",ylab=ylab,xlab=xlab,main=main,frame=F) # (which(x==max(x,na.rm=T),arr=T))[2] determiines which column has the highest number, the lowest ==0 (assumed)
abline(h=-log10(thresh),col="red",lty='dashed')
axis(1, at=1:nrow(x), labels=rownames(x),las=2) # las - changes orientation of labels
axis(2, at=0:ceiling(max(x,na.rm=T)),las=2) # las - changes orientation of labels
# axis(2, at=0:floor(max(x,na.rm=T)), labels=c("0","1","2","3","4","5","6","7")[0:(floor(max(x,na.rm=T))+1)],las=2) # las - changes orientation of labels
# axis(2, at=0:floor(max(x,na.rm=T)), labels=c("0","0.1","0.01","0.001","0.0001","0.00001","0.000001")[0:(floor(max(x,na.rm=T))+1)],las=2) # las - changes orientation of labels
# axis(2, at=0:floor(max(x,na.rm=T)), labels=c("100 %","10 %","1 %","0.1 %","0.01 %","0.001 %","0.0001 %")[0:(floor(max(x,na.rm=T))+1)],las=2) # las - changes orientation of labels
for(i in coords){
points(x[,i-(adjust)],pch=pchl[i],bg=coll[i],col=coll[i])
}
if(legend==T){
legend(x="topright",pch=pchl[coords],box.lwd=0,box.col="white",col=coll[coords],pt.bg=coll[coords],legend=colnames(x)[1:min(11,ncol(x))],cex=1)
}}
Heatmap<-function(cor.measures,min=-1,max=1,rowclust=F,colclust=F,ncols=101,dendrogram="none",main="",mode="cor",sig=T,cexrow=0.7,cexcol=0.7,margin=c(5,5),lheight=c(0.1,0.9),lwidth=c(0.7,0.3),lmatrix=rbind(c(4,3),c(1,2))){
library(gplots)
# print("Heatmap( 'matrix to use' , 'min value for legend cols' , 'max ..' , 'clust by rows', 'clust by cols' , 'ncols to use for legend', 'dendrogram=c('none','row','column','both')' )")
# min=-1
# max=1
# default settings for testing plots
#min=-1;max=1;rowclust=F;colclust=F;ncols=101;dendrogram="none";main="";mode="cor"
if(mode=="cor"){
print("plotting correlation based matrix")
# heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=0.7, cexRow=0.7, lheight=lheight,symkey=T,main=main,lmat=lmatrix)#,lwid=c(0.5,0.5))
heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight, symkey=T,main=main,lwid=lwidth,lmat=lmatrix)
# heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#08306b","#08519c","#2171b5","#4292c6","#6baed6","#9ecae1","#c6dbef","#deebf7","#f7fbff","white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight, symkey=T,main=main,lwid=lwidth,lmat=lmatrix)
#"white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"
}
if(mode=="pval"){
if(sig==T){
print("plotting p-value based matrix with cellnotes")
corm=round(cor.measures,digits=3)
cor.match=matrix(as.numeric(NA),nrow=nrow(cor.measures),ncol=ncol(cor.measures))
rownames(cor.match)=rownames(cor.measures)
colnames(cor.match)=colnames(cor.measures)
# Head(cor.match)
for(i in 1:nrow(corm)){
for(j in 1:ncol(corm)){
# cat(i,j,"\n")
if(corm[i,j]>0.05){
cor.match[i,j]=""
# print(">0.05")
}
if(corm[i,j]<=0.01){
cor.match[i,j]=gsub(" ","",paste(rep("*",min(4,floor(-log10(corm[i,j]))-1)),collapse=" ")) # backup square == .
# print("<0.01")
}
if(corm[i,j]<0.05 & corm[i,j]>0.01){
cor.match[i,j]="+"
# print("<0.05")
}
}
}
# heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
# heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
}
if(sig==F){
print("plotting p-value based matrix, no cellnotes")
# heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix)
# heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix)
}
}
#min(na.omit(as.vector(cor.measures))),max(na.omit(as.vector(cor.measures)))
}
Heat<-function(cor.measures,values=F,values.rm='',Rowv=T,Colv=T,mingrey=F,values.cex=1,ncols=101,cexrow=0.7,cexcol=0.7,margin=c(12,12),dendrogram="both",verbose=F,...){
if(class(Rowv)=='logical'&class(Colv)=='logical'){
if(Rowv & Colv){dendrogram="both"}
if(Rowv & !Colv){dendrogram="row"}
if(!Rowv & Colv){dendrogram="column"}
if(!Rowv & !Colv){dendrogram="none"}
}
# can feasibly include a parameter for dendrogram as well, for more plottting flexibility
library(gplots)
min=floor(min(cor.measures))
max=ceiling(max(cor.measures))
if(verbose){cat('\tmin=',min,', max=',max,'\n') }
if(min<0 & max>=0){heat_colors =c(colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(ncols)); symmkey=T;ncols=ncols-1}
if(mingrey & min>=0 & max>=0){heat_colors=c('grey60',colorRampPalette(c("white","#F0E442","darkred"))(ncols));symmkey=F}
if(mingrey & min<=0 & max<=0){heat_colors=c('grey60',colorRampPalette(c("white","#56B4E9","#0072B2"))(ncols));symmkey=F}
if(!mingrey & min==0 & max>=0){heat_colors=c(colorRampPalette(c("white","#F0E442","darkred"))(ncols+1));symmkey=F}
if(!mingrey & min==0 & max<=0){heat_colors=c(colorRampPalette(c("white","#56B4E9","#0072B2"))(ncols+1));symmkey=F}
if(!values){
cor_heat=heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+2)),col=heat_colors,trace="none",dendrogram=dendrogram,Rowv=Rowv,Colv=Colv,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,symkey=symmkey,hclustfun=function(x) hclust(x, method="ward.D2"),...)#,hclustfun=function(x) hclust(x, method="ward.D2"))
}
if(values){
celdat=round(cor.measures,digits=2)
#celdat[celdat==min(celdat)]=""
if(values.rm!=''){
celdat[celdat%in%values.rm]=''
}
cor_heat=heatmap.2((cor.measures),cellnote=celdat,notecex=values.cex,,notecol="black",breaks=seq(min,max,length=(ncols+2)),col=heat_colors,trace="none",dendrogram=dendrogram,Rowv=Rowv,Colv=Colv,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,symkey=symmkey,hclustfun=function(x) hclust(x, method="ward.D2"),...)#,hclustfun=function(x) hclust(x, method="ward.D2"))
}
return(invisible(cor_heat))
}
read.file<-function(file,...){
read.delim(file=file,quote = "",colClasses = "character",check.names=F,comment.char="",...)
}
write.delim<-function(mat,file,row.names=T,col.names=T,missing.value.char="NA",sep="\t",...){
if(col.names==T & row.names==T){
col.names=NA
}
write.table(mat,file,row.names=row.names,col.names=col.names,sep=sep,quote=F,na=missing.value.char,...)
}
write.file<-function(mat,file,row.names=T,col.names=T,missing.value.char="NA",sep="\t",...){
if(col.names==T & row.names==T){
col.names=NA
}
write.table(mat,file,row.names=row.names,col.names=col.names,sep=sep,quote=F,na=missing.value.char,...)
}
DAVID_enrich_list<-function(backg,clusters_list,idType= "ENSEMBL_GENE_ID", query_type="Gene"){
library(RDAVIDWebService)
david=DAVIDWebService$new(email="aida.moreno-moral11@imperial.ac.uk")
cat("\t",length(backg),"background genes -------------------\n")
background = addList(david, backg, idType= "ENSEMBL_GENE_ID", listName="backg", listType="Background")
setCurrentBackgroundPosition(david, 1)
DAVID_clusters=list()
for (i in 1:length(clusters_list)){
cat("\t",length(clusters_list[[i]]),"cluster genes\t\t",i," of ",length(clusters_list),"\n")
result=addList(david, inputIds=clusters_list[[i]], idType=idType, listName=names(clusters_list)[i],listType=query_type)
setAnnotationCategories(david, "GOTERM_BP_ALL")
GOBPchart = getFunctionalAnnotationChart(david, threshold=0.1)
GOBPchart=GOBPchart[GOBPchart$FDR < 5,]
setAnnotationCategories(david, "GOTERM_MF_ALL")
GOMFchart = getFunctionalAnnotationChart(david, threshold=0.1)
GOMFchart=GOMFchart[GOMFchart$FDR < 5,]
setAnnotationCategories(david, "GOTERM_CC_ALL")
GOCCchart = getFunctionalAnnotationChart(david, threshold=0.1)
GOCCchart=GOCCchart[GOCCchart$FDR < 5,]
setAnnotationCategories(david, "KEGG_PATHWAY")
KEGGchart = getFunctionalAnnotationChart(david, threshold=0.1)
KEGGchart=KEGGchart[KEGGchart$FDR < 5,]
setAnnotationCategories(david, "OMIM_DISEASE")
OMIMchart = getFunctionalAnnotationChart(david, threshold=0.1)
OMIMchart=OMIMchart[OMIMchart$FDR < 5,]
# DAVID_clusters[[i]]=list("GOBPchart"=GOBPchart,"GOMFchart"=GOMFchart,"GOCCchart"=GOCCchart,"KEGGchart"=KEGGchart,"OMIMchart"=OMIMchart)
DAVID_clusters[[i]]=list(GOBPchart,GOMFchart,GOCCchart,KEGGchart,OMIMchart)
names(DAVID_clusters[[i]])=c("GOBPchart","GOMFchart","GOCCchart","KEGGchart","OMIMchart")
}
names(DAVID_clusters)=names(clusters_list)
return(DAVID_clusters)
}
DAVID_save_tables<-function(david_out,out_dir=getwd(),dataset_name=""){
cat("\tsaving tables to :\t",paste(out_dir,dataset_name,names(david_out),".terms.txt",sep=""),"\n")
# replace the david_out[[1]] with a higher level loop to support multiple module lists
# - may need to add checks to determine if annotations are present (but works well enough for now)
# - for proper pipeline may be worth editing out the un-necessary columns for easier formatting
write.delim(names(david_out),paste(out_dir,dataset_name,names(david_out),".terms.txt",sep=""),rownm=F,colnm=F)
for(itrm in 1:length(david_out[[1]])){
# spacers for easier overview
write.delim("",paste(out_dir,dataset_name,names(david_out),".terms.txt",sep=""),rownm=F,colnm=F,append=T)
write.delim("",paste(out_dir,dataset_name,names(david_out),".terms.txt",sep=""),rownm=F,colnm=F,append=T)
annot_table=david_out[[1]][[itrm]][,c("Term","X.","FDR","Genes")]
annot_table$Term=gsub(".*~","",annot_table$Term) ## remove trash from term names (should be fine to run both rather than loop KEGG/GO separately, neither have ':' or '~' in their names)
annot_table$Term=gsub(".*:","",annot_table$Term)
annot_table$FDR=annot_table$FDR/100 ## unfathonably the FDR is given as %
annot_table$X.=annot_table$X./100 ## % are easier to handle in Excell as decimals
colnames(annot_table)=c(names(david_out[[1]])[itrm],"Number of Genes (%)","Significance of Enrichment (FDR)","Ensembl Gene ID")
colnames(annot_table)=gsub("GOBPchart","Gene ontology biologial process (BP)",colnames(annot_table))
colnames(annot_table)=gsub("GOMFchart","Gene ontology molecular function (MF)",colnames(annot_table))
colnames(annot_table)=gsub("GOCCchart","Gene ontology cell compartment (CC)",colnames(annot_table))
colnames(annot_table)=gsub("KEGGchart","Gene ontology KEGG pathway",colnames(annot_table))
colnames(annot_table)=gsub("OMIMchart","Gene ontology OMIM term",colnames(annot_table))
write.delim(annot_table,paste(out_dir,dataset_name,names(david_out),".terms.txt",sep=""),append=T)
}
cat("\t•••••• NOTE : Multiple warnings about appending columns to file can be safely ignored ••••••\n")
}
plot_DAVID_enrich<-function(david_list_enrichments,out_path,name_module,module_info=""){
library(gplots)
library(ggplot2)
if(module_info!=""){
dir.create(file.path(out_path, module_info), showWarnings=F) # suppresses warnings generated if folder already exists
out_path=paste(out_path,module_info,sep="")
}
# dir.create(file.path(out_path, "david.annot"), showWarnings=F) # suppresses warnings generated if folder already exists
# dir.create(file.path(paste(out_path,"/david.annot",sep=""), "plots_GO"), showWarnings=F) # suppresses warnings generated if folder already exists
# dir.create(file.path(paste(out_path,"/david.annot",sep=""), "plots_KEGG"), showWarnings=F) # suppresses warnings generated if folder already exists
# dir.create(file.path(paste(out_path,"/david.annot",sep=""), "plots_OMIM"), showWarnings=F) # suppresses warnings generated if folder already exists
dir.create(file.path(out_path, "david_GO"), showWarnings=F) # suppresses warnings generated if folder already exists
dir.create(file.path(out_path, "david_KEGG"), showWarnings=F) # suppresses warnings generated if folder already exists
dir.create(file.path(out_path, "david_OMIM"), showWarnings=F) # suppresses warnings generated if folder already exists
#combine GO terms results in single table
GO_table=do.call(rbind,david_list_enrichments[1:3])
if(nrow(GO_table)>0){
GO_table=GO_table[with(GO_table, order(FDR,decreasing=T)), ]
GO_table$minusLog10BH= -log((GO_table$FDR/100), base=10)
GO_table$Term=gsub("GO:[0-9]+~","",GO_table$Term)
#add here if dim > 0, save pdf, add KEGG and OMIM
if(nrow(GO_table) > 10){number_plot=10}else{number_plot=nrow(GO_table)}
pdf(file=paste(out_path,"/david_GO/",module_info,"GO_functional_enrich_plot_",name_module,module_info,".pdf",sep=""),height=4,width=6)
GOBPplot = ggplot(GO_table[1:number_plot,c("Term","minusLog10BH")], aes(x=reorder(Term, -minusLog10BH), y=minusLog10BH)) + ylab("-Log10 (FDR)") +xlab("") +
geom_bar(colour="black",stat= "identity", width=.4, fill="darkred") + ggtitle(paste("GO ",name_module,module_info,sep="")) +
theme_bw(base_size=12, base_family="") + coord_flip()
print(GOBPplot)
dev.off()
write.delim(GO_table,paste(out_path,"/david_GO/",module_info,"GO_functional_enrich_plot_",name_module,module_info,".pdf",sep=""))
}
if(nrow(david_list_enrichments$KEGGchart)>0){
GO_table=david_list_enrichments$KEGGchart
GO_table=GO_table[with(GO_table, order(FDR,decreasing=T)), ]
GO_table$minusLog10BH= -log((GO_table$FDR/100), base=10)
GO_table$Term=unlist(lapply(strsplit(GO_table$Term,":"),function(x){x=x[2]}))
if(nrow(GO_table) > 10){number_plot=10}else{number_plot=nrow(GO_table)}
pdf(file=paste(out_path,"/david_KEGG/",module_info,"KEGG_functional_enrich_plot_",name_module,".pdf",sep=""),height=4,width=6)
GOBPplot = ggplot(GO_table[1:number_plot,c("Term","minusLog10BH")], aes(x=reorder(Term, -minusLog10BH), y=minusLog10BH)) + ylab("-Log10 (FDR)") +xlab("") +
geom_bar(colour="black",stat= "identity", width=.4, fill="darkgrey") + ggtitle(paste("KEGG ",name_module,module_info,sep="")) +
theme_bw(base_size=12, base_family="") + coord_flip()
print(GOBPplot)
dev.off()
write.delim(GO_table,paste(out_path,"/david_GO/",module_info,"KEGG_enrich_plot_",name_module,module_info,".pdf",sep=""))
}
if(nrow(david_list_enrichments$OMIMchart)>0){
GO_table=david_list_enrichments$OMIMchart
GO_table=GO_table[with(GO_table, order(FDR,decreasing=T)), ]
GO_table$minusLog10BH= -log((GO_table$FDR/100), base=10)
GO_table$Term=unlist(lapply(strsplit(GO_table$Term,":"),function(x){x=x[2]}))
#add here if dim > 0, save pdf, add KEGG and OMIM
if(nrow(GO_table) > 10){number_plot=10}else{number_plot=nrow(GO_table)}
pdf(file=paste(out_path,"/david_OMIM/",module_info,"OMIM_functional_enrich_plot_",name_module,module_info,".pdf",sep=""),height=4,width=6)
GOBPplot = ggplot(GO_table[1:number_plot,c("Term","minusLog10BH")], aes(x=reorder(Term, -minusLog10BH), y=minusLog10BH)) + ylab("-Log10 (FDR)") +xlab("") +
geom_bar(colour="black",stat= "identity", width=.4, fill="darkgrey") + ggtitle(paste("OMIM ",name_module,module_info,sep="")) +
theme_bw(base_size=12, base_family="") + coord_flip()
print(GOBPplot)
dev.off()
write.delim(GO_table,paste(out_path,"/david_GO/",module_info,"OMIM_enrich_plot_",name_module,module_info,".pdf",sep=""))
}
}
heatmap_DAVID_enrich<-function(david_list_enrichments,out_path=getwd(),histH=20,histW=10,terms=c("KEGGchart","GOBPchart","GOMFchart","GOCCchart","OMIMchart"),points_sig=F,module_info=""){
print(" NOTE : requires custom functions 'Heatmap' and 'make.numeric'")
####-----------------------------------------------------------------------------------------------------------
## TERM
for(iterm in 1:length(terms)){
print(paste("============================",terms[iterm],"============================"))
# create empty matrix to simplify the first merge
TERM=as.data.frame(matrix("--",ncol=1,nrow=1))
colnames(TERM)="Term"
names(david_list_enrichments)=gsub("###","",names(david_list_enrichments))
count=0
for(imod in 1:length(david_list_enrichments)){
if(nrow(david_list_enrichments[[names(david_list_enrichments)[imod]]][[terms[iterm]]])>=1){
term=(david_list_enrichments[[names(david_list_enrichments)[imod]]][[terms[iterm]]][c("Term","FDR")])
term$FDR=(term$FDR/100) # David FDR is in % => need to convert
colnames(term)[2]=paste("M",names(david_list_enrichments)[imod],sep="")
TERM=merge(TERM,term,by="Term",all=T)
print(paste("module",names(david_list_enrichments)[imod],nrow(term),terms[iterm],"found"))
}
if(nrow(david_list_enrichments[[names(david_list_enrichments)[imod]]][[terms[iterm]]])<1){
term=as.data.frame(matrix("--",ncol=2,nrow=1))
colnames(term)=c("Term",paste("M",names(david_list_enrichments)[imod],sep=""))
TERM=merge(TERM,term,by="Term",all=T)
print(paste("module",names(david_list_enrichments)[imod],"no",terms[iterm],"found ***"))
}
}
# clean up TERM matrix
rownames(TERM)=TERM$Term
TERM=(TERM[-which(TERM$Term=="--"),-which(colnames(TERM)=="Term")])
# remove Pathway ID numbers
if(terms[iterm] %in% c("GOBPchart","GOMFchart","GOCCchart")){
rownames(TERM)=gsub("GO:.*~","",rownames(TERM))
}
if(terms[iterm]=="KEGGchart"){
rownames(TERM)=gsub(".*:","",rownames(TERM))
}
TERM=TERM[do.call(order, as.data.frame(TERM)),] #could be ..as.data.frame(mat[,index_vec])..
write.table(TERM,file=paste(out_path,"combined.David.Annot.",module_info,terms[iterm],".txt",sep=""),sep="\t",col.names=NA)
TERM[is.na(TERM)]=1 # the Heatmap function does not handle -log10(0) nor NA very well
TERM=make.numeric(TERM)
if(nrow(TERM)>=1){
if(nrow(TERM<100)){
pdf(paste(out_path,module_info,terms[iterm],".pdf",sep=""),height=12,width=20)
Heatmap(make.numeric(TERM),mode="pval",sig=points_sig,margin=c(5,1),cexrow=1.5,cexcol=1) # Heatmap automatically does -log10() of the input matrix
dev.off()
}
if(nrow(TERM)>100){
pdf(paste(out_path,module_info,terms[iterm],".pdf",sep=""),height=histH,width=histW)
itimes=floor(nrow(TERM)/100)
for(iloop in 1:itimes){
if(iloop==1){Heatmap(make.numeric(TERM[1:100,]),mode="pval",sig=points_sig,margin=c(5,1),cexcol=1)}
if(iloop>1 & iloop<itimes){Heatmap(make.numeric(TERM[(iloop*100+1):(iloop*100+100),]),mode="pval",sig=points_sig,margin=c(5,1),cexcol=1)}
if(iloop==itimes){
# print("final loop")
term1=TERM[(iloop*100):((iloop*100+100)-((iloop*100+100)-nrow(TERM))),]
term2=matrix(1,nrow=100-nrow(term1),ncol=ncol(term1))
colnames(term2)=colnames(term1)
term=make.numeric(rbind(term1,term2))
Heatmap(term,mode="pval",sig=points_sig,margin=c(5,1),cexcol=1)
}
}
dev.off()
}
}
}
}
hist.norm<-function(x,normCurv=T,col='lightgray',points=F,density=F,prob=F,...){ #density=F, # need to add some sort of axis scaling is not very compatible
# breaks=max(10,sqrt(length(x)+100))
y=hist(x,prob=prob,...)
if(density){ # requires prob=TRUE for probabilities not counts
if(!prob){warning("\tWARNING: density plot is currently optimised for prob=T\n")}
lines(density(x), col="darkred", lwd=2) # add a density estimate with defaults
lines(density(x, adjust=2), lty="dotted", col="darkgreen", lwd=2) # add another "smoother" density
}
if(points){rug(x,)} # add tick marks below histogram
if(normCurv){
if(prob){warning("\tWARNING: normal curve fit is currently optimised for prob=F\n")}
a=min(x,na.rm=T)
b=max(x,na.rm=T)
xx=seq(a-(b-a)/10,b+(b-a)/10,length=100)
lines(sort(xx),dnorm(sort(xx),median(x,na.rm=T),sd(x,na.rm=T))*sum(y$counts*diff(y$breaks)),col='dodgerblue')
}
return(invisible(y))
# to work need to construct the plot_cols and corresponding plot_lines & add a layout param dependent on legend=T to provide space for it on the left
# legend(x="topright",pch=16,box.lwd=0,box.col="white",col=plot_cols,legend=plot_lines)
}
hist.dens<-function(x,points=F,col='lightgray',density=T,prob=T,...){ #density=F, # need to add some sort of axis scaling is not very compatible
y=hist(x,breaks=max(10,sqrt(length(x)+100)),prob=prob,...)
if(density){ # requires prob=TRUE for probabilities not counts
if(!prob){warning("\tWARNING: density plot is currently optimised for prob=T\n")}
lines(density(x), col="darkred", lwd=2) # add a density estimate with defaults
lines(density(x, adjust=2), lty="dotted", col="darkgreen", lwd=2) # add another "smoother" density
}
if(points){rug(x,)} # add tick marks below histogram
return(invisible(y))
# to work need to construct the plot_cols and corresponding plot_lines & add a layout param dependent on legend=T to provide space for it on the left
# legend(x="topright",pch=16,box.lwd=0,box.col="white",col=plot_cols,legend=plot_lines)
}
#hist(eruptions,,breaks=30, prob=TRUE, col="grey")# prob=TRUE for probabilities not counts
#lines(density(eruptions), col="blue", lwd=2) # add a density estimate with defaults
#lines(density(eruptions, adjust=2), lty="dotted", col="darkgreen", lwd=2)
#### Andree Delahaye-Duriez #### 2014/01/12
###########################
### Fisher's exact test ###
### pipeline ###
###########################
############## to test whether each cluster of a list of clusters
############## has FET enrichment in EE DNMs compared to DNMs in NAFE controls
############## function EE_FET
EE_FET <- function(clusters_list){
### load data needed from the "dataToLoad_forFETinEE" folder
load("~/Dropbox/LONDON-SINGAPORE/TOOLS/FETenrichment_in_EEDNMs_pipeline/input/dataToLoad_forFETinEE/NAFE_ctrlAllDNMs.Rdata") # path to be changed
load("~/Dropbox/LONDON-SINGAPORE/TOOLS/FETenrichment_in_EEDNMs_pipeline/input/dataToLoad_forFETinEE/NAFE_ctrlNsDNMs.Rdata") # path to be changed
load("~/Dropbox/LONDON-SINGAPORE/TOOLS/FETenrichment_in_EEDNMs_pipeline/input/dataToLoad_forFETinEE/EE_allDNMs.Rdata") # path to be changed
load("~/Dropbox/LONDON-SINGAPORE/TOOLS/FETenrichment_in_EEDNMs_pipeline/input/dataToLoad_forFETinEE/EE_nsDNMs.Rdata") # path to be changed
### annotation of genes with ENS gene ID
library(biomaRt)
ensembl=useMart("ensembl")
HUMensembl=useMart('ensembl',dataset='hsapiens_gene_ensembl')
eeGene = EE$Gene
nseeGene = nsEE$Gene
nafeGene =NAFE[,'CCDS_r14']
nsnafeGene =nsNAFE[,'CCDS_r14']
eeENS =getBM(attributes=c('ensembl_gene_id','external_gene_name'), filters='external_gene_name', values=eeGene, mart=HUMensembl)
nseeENS =getBM(attributes=c('ensembl_gene_id','external_gene_name'), filters='external_gene_name', values=nseeGene, mart=HUMensembl)
nafeENS =getBM(attributes=c('ensembl_gene_id','external_gene_name'), filters='external_gene_name', values=nafeGene, mart=HUMensembl)
nsnafeENS =getBM(attributes=c('ensembl_gene_id','external_gene_name'), filters='external_gene_name', values=nsnafeGene, mart=HUMensembl)
## keep in mind that there are sometimes several ENS gene id for only 1 external_gene_name
#length(which(duplicated(eeENS$external_gene_name) == TRUE))
#[1] 29
### create a matrix for results
EE_FET_clusters = matrix(nrow=length(clusters_list), ncol=6)
row.names(EE_FET_clusters) = names (clusters_list)
colnames(EE_FET_clusters) = c("FET p.value all DNMs","OR all DNMs","[95% CI] all DNMs","FET Pvalue nsDNMs","OR nsDNMs","[95% CI] nsDNMs")
### function to fill the matrix of results
for (i in 1:length(clusters_list)){
## function to calculate the number Mc of DNMs in CTRL involving a gene of the cluster i
y = lapply(clusters_list[[i]],FUN=function(x) {nafeENS[which(nafeENS$ensembl_gene_id == x),'external_gene_name']})
Mc = sum(sapply(as.matrix(unique(y)),FUN=function(ym) {length(which(NAFE[,'CCDS_r14'] == ym ))}))
## number NMc of remaining DNMs in CTRL involving a gene not in the cluster i
NMc = nrow(NAFE)-Mc
##function to calculate the number Mee of DNMs in EE involving a gene of the cluster i
z = lapply(clusters_list[[i]],FUN=function(x) {eeENS[which(eeENS$ensembl_gene_id == x),'external_gene_name']})
Mee = sum(sapply(as.matrix(unique(z)),FUN=function(zm) {length(which(EE$Gene == zm ))}))
## number NMee of remaining DNMs in EE involving a gene not in the cluster i
NMee = nrow(EE)-Mee
## function to calculate the number Mnsc of nsDNMs in CTRL NAFE involving a gene of the cluster i
nsy = lapply(clusters_list[[i]],FUN=function(x) {nsnafeENS[which(nsnafeENS$ensembl_gene_id == x),'external_gene_name']})
Mnsc = sum(sapply(as.matrix(unique(nsy)),FUN=function(ym) {length(which(nsNAFE[,'CCDS_r14'] == ym ))}))
## number NMnsc of remaining nsDNMs in CTRL involving a gene not in the cluster i
NMnsc = nrow(nsNAFE)-Mnsc
##function to calculate the number Mnsee of nsDNMs in EE involving a gene of the cluster i
nsz = lapply(clusters_list[[i]],FUN=function(x) {nseeENS[which(nseeENS$ensembl_gene_id == x),'external_gene_name']})
Mnsee = sum(sapply(as.matrix(unique(nsz)),FUN=function(zm) {length(which(nsEE$Gene == zm ))}))
## number NMnsee of remaining nsDNMs in EE involving a gene not in the cluster i
NMnsee = nrow(nsEE)-Mnsee
# contingency matrice for Fisher Exact Test FET all DNMs and ns DNMs
matrALL = matrix(c(Mee,Mc,NMee,NMc), nrow=2)
matrNS = matrix(c(Mnsee,Mnsc,NMnsee,NMnsc), nrow=2)
# FET
FisherMEE = fisher.test(matrALL)
FisherMEEp = FisherMEE$p.value
FisherMEEor = FisherMEE$estimate
FisherMEEci = paste(FisherMEE$conf.int[1],FisherMEE$conf.int[2], sep="-")
FisherMnsEE = fisher.test(matrNS)
FisherMnsEEp = FisherMnsEE$p.value
FisherMnsEEor = FisherMnsEE$estimate
FisherMnsEEci = paste(FisherMnsEE$conf.int[1],FisherMnsEE$conf.int[2], sep="-")
EE_FET_clusters[i,]=c(FisherMEEp,FisherMEEor,FisherMEEci,FisherMnsEEp,FisherMnsEEor,FisherMnsEEci)
}
write.table(EE_FET_clusters, sep='\t', file='EE_FET_clusters.txt', row.names=TRUE, quote=FALSE, col.names=NA)
return(EE_FET_clusters)
}
gwas.enrich<-function(in_gwas,module,bkgrnd,nperm=100000,seed=0,type=1){
cat("\n\tFUNCTION : gwas.enrich function inputs : in_gwas, module, bkgrnd, nperm=100000, seed=0, type=1\n")
cat("\tINPUTS : in_gwas - link to a csv file, ENSG - pval | module - list of modules | bkgrnd - matrix first column contains the ENSG list\n\n")
set.seed(seed)
options(stringsAsFactors=FALSE)
gwas=read.csv(file=in_gwas,sep="\t",header=FALSE)
# module=read.table(file=in_module,sep=" ")
# bkgrnd=read.table(file=bkgrnd,sep=" ")
enrich=as.data.frame(matrix(NA,nrow=length(names(module)),ncol=5))
rownames(enrich)=names(module)
colnames(enrich)=c("n.genes.module","pc.overlap.genes","GWAS.P.value","GWAS.FDR","GWAS.Bonferroni")
for(imod in names(module)){
idx=match(bkgrnd[,1],gwas[,1])
idx=sort(idx)
gwas_final=gwas[idx,]
idy=match(as.matrix(module[[imod]])[,1],gwas_final[,1])
idy=sort(idy)
module_final=gwas_final[idy,]
enrich[imod,"n.genes.module"]=length(module[[imod]])
enrich[imod,"pc.overlap.genes"]=round(length(idy)/length(module[[imod]]),digits=3)
## type 1=z-test (USE THIS ONE)
if(type==1){
cat("\t'z-test' enrichment || ",imod,"\t: ",which(names(module)==imod),"of",length(names(module)),"\n")
module_p=mean(-log10(module_final[,2]),na.rm=TRUE)
}
## type 2=fishers combined p-value
else if(type==2){
cat(" 'fisher's combined p-value' enrichment || ",imod," : ",which(names(module)==imod),"of",length(names(module)),"\n")
df=2*length(module_final[,2])
temp1=log(module_final[,2])
temp2= -2*sum(temp1)
module_p=pchisq(temp2,df,lower.tail=FALSE)
}
## type 3=stouffer combined p-value
else if(type==3){
cat(" 'stouffer combined p-value' enrichment || ",imod," : ",which(names(module)==imod),"of",length(names(module)),"\n")
temp1=qnorm(1-module_final[,2])/sqrt(length(module_final[,2]))
module_p=sum(temp1[!is.infinite(temp1)])
}
output_stat=vector()
for(i in 1:nperm){
rand1=sample(gwas_final[,2],length(module_final[,2]),replace=FALSE)
rand1=rand1[!is.infinite(rand1)]
if(type==1){
output_stat[i]=mean(-log10(rand1),na.rm=TRUE)
}else if(type==2){
df=2*length(rand1)
temp1=log(rand1)
temp2= -2*sum(temp1)
output_stat[i]=pchisq(temp2,df,lower.tail=FALSE)
}else if(type==3){
tempt=qnorm(1-rand1)/sqrt(length(rand1))
output_stat[i]=sum(tempt[!is.infinite(tempt)])
}
cat(round(i/nperm,digits=2),"\r");flush.console()
}
output_stat_sd=sd(output_stat)
output_stat_mean=mean(output_stat)
Z=(module_p - output_stat_mean)/(output_stat_sd)
# print(Z)
# enrich=pnorm(abs(Z),low=FALSE)
enrich[imod,"GWAS.P.value"]=pnorm(module_p,output_stat_mean,output_stat_sd,lower.tail=FALSE)
#enrich[imod,"n.genes.module"]=nrow(module_final)
# testingWTF1=t.test(output_stat,mu=module_p)
# testingWTF2=t.test(output_stat,mu=module_p,alternative="less")
# testingWTF3=t.test(output_stat,mu=module_p,alternative="greater")
}
enrich$GWAS.FDR=p.adjust(enrich$GWAS.P.value,method="fdr")
enrich$GWAS.Bonferroni=p.adjust(enrich$GWAS.P.value,method="bonferroni")
return(enrich)
}
p.adjust.mat<-function(p_mat,method='fdr',single_col=F,single_row=F,verbose=F){
## p.adjust appears to be designed to handle NA values
if(single_row & single_col){
stop('\n\tERROR :\tto perform "p.adjust()" across whole matrix use "single_col=F" & "single_row=F", ie default\n\n')
}
if(!single_col & !single_row){
if(verbose){cat('\n\t"p.adjust()" on whole matrix simultaneously using "method =',method,'"\n\n')}
adj_mat=matrix(p.adjust(unlist(p_mat),method=method),nrow=nrow(p_mat))
rownames(adj_mat)=rownames(p_mat)
colnames(adj_mat)=colnames(p_mat)
return(invisible(adj_mat))
}
if(single_col & ! single_row){
if(verbose){cat('\n\t"p.adjust()" individually on each column of supplied matrix "method =',method,'"\n\n')}
adj_mat=matrix(NA,nrow=nrow(p_mat),ncol=ncol(p_mat))
rownames(adj_mat)=rownames(p_mat)
colnames(adj_mat)=colnames(p_mat)
for(icol in colnames(p_mat)){
adj_mat[,icol]=p.adjust(p_mat[,icol],method=method)
}
return(invisible(adj_mat))
}
if(single_row){
if(verbose){cat('\n\t"p.adjust()" individually on each row of supplied matrix "method =',method,'"\n\n')}
adj_mat=matrix(NA,nrow=nrow(p_mat),ncol=ncol(p_mat))
rownames(adj_mat)=rownames(p_mat)
colnames(adj_mat)=colnames(p_mat)
for(irow in colnames(p_mat)){
adj_mat[irow,]=p.adjust(p_mat[irow,],method=method)
}
return(invisible(adj_mat))
}
}
gwas_module_enrich<-function(in_gwas,module,bkgrnd,nperm=100000,seed=0,type=1){
cat("\tINPUTS : in_gwas - link to a csv file, ENSG - pval | module - list of modules | bkgrnd - matrix first column contains the ENSG list")
set.seed(seed)
options(stringsAsFactors=FALSE)
gwas=read.csv(file=in_gwas,sep=",",header=FALSE)
# module=read.table(file=in_module,sep=" ")
# bkgrnd=read.table(file=bkgrnd,sep=" ")
enrich=as.data.frame(matrix(NA,nrow=length(names(module)),ncol=5))
rownames(enrich)=names(module)
colnames(enrich)=c("n.genes.module","pc.overlap.genes","GWAS.P.value","GWAS.FDR","GWAS.Bonferroni")
for(imod in names(module)){
idx=match(bkgrnd[,1],gwas[,1])
idx=sort(idx)
gwas_final=gwas[idx,]
idy=match(as.matrix(module[[imod]])[,1],gwas_final[,1])
idy=sort(idy)
module_final=gwas_final[idy,]
enrich[imod,"n.genes.module"]=length(module[[imod]])
enrich[imod,"pc.overlap.genes"]=round(length(idy)/length(module[[imod]]),digits=3)
## type 1=z-test (USE THIS ONE)
if(type==1){
cat(" 'z-test' enrichment || ",imod," : ",which(names(module)==imod),"of",length(names(module)),"\n")
module_p=mean(-log10(module_final[,2]),na.rm=TRUE)
}
## type 2=fishers combined p-value
else if(type==2){
cat(" 'fisher's combined p-value' enrichment || ",imod," : ",which(names(module)==imod),"of",length(names(module)),"\n")
df=2*length(module_final[,2])
temp1=log(module_final[,2])
temp2= -2*sum(temp1)
module_p=pchisq(temp2,df,lower.tail=FALSE)
}
## type 3=stouffer combined p-value
else if(type==3){
cat(" 'stouffer combined p-value' enrichment || ",imod," : ",which(names(module)==imod),"of",length(names(module)),"\n")
temp1=qnorm(1-module_final[,2])/sqrt(length(module_final[,2]))
module_p=sum(temp1[!is.infinite(temp1)])
}
output_stat=vector()
for(i in 1:nperm){
rand1=sample(gwas_final[,2],length(module_final[,2]),replace=FALSE)
rand1=rand1[!is.infinite(rand1)]
if(type==1){
output_stat[i]=mean(-log10(rand1),na.rm=TRUE)
}else if(type==2){
df=2*length(rand1)
temp1=log(rand1)
temp2= -2*sum(temp1)
output_stat[i]=pchisq(temp2,df,lower.tail=FALSE)
}else if(type==3){
tempt=qnorm(1-rand1)/sqrt(length(rand1))
output_stat[i]=sum(tempt[!is.infinite(tempt)])
}
cat(round(i/nperm,digits=2),"\r");flush.console()
}
output_stat_sd=sd(output_stat)
output_stat_mean=mean(output_stat)
Z=(module_p - output_stat_mean)/(output_stat_sd)
# print(Z)
# enrich=pnorm(abs(Z),low=FALSE)
enrich[imod,"GWAS.P.value"]=pnorm(module_p,output_stat_mean,output_stat_sd,lower.tail=FALSE)
#enrich[imod,"n.genes.module"]=nrow(module_final)
# testingWTF1=t.test(output_stat,mu=module_p)
# testingWTF2=t.test(output_stat,mu=module_p,alternative="less")
# testingWTF3=t.test(output_stat,mu=module_p,alternative="greater")
}
enrich$GWAS.FDR=p.adjust(enrich$GWAS.P.value,method="fdr")
enrich$GWAS.Bonferroni=p.adjust(enrich$GWAS.P.value,method="bonferroni")
return(enrich)
}
sm<-function(x){
return(as.matrix(sort(x,decreasing=T)))
}
smt<-function(x){
return(as.matrix(sort(table(x),decreasing=T)))
}
wgcnaPickBeta<-function(list_expr,saveTable=F,savePlots=F,outDir=getwd(),datDescr=""){
print(" NOTE : Input data (list_expr) is expected as a list with each entry : rows=genes, columns=samples")
print(" - currently does not return the tables - information is printed // can also be saved as plots and tables")
library('WGCNA')
allowWGCNAThreads()
set.seed(0) # reproducibility
for(ireg in 1:length(names(list_expr))){
run_time=Sys.time()
print(paste("--------------------",names(list_expr)[ireg],"---------------------",ireg,"of",length(names(list_expr))))
thChoice=pickSoftThreshold(as.matrix(t(list_expr[[names(list_expr)[ireg]]])),corFn='bicor')
print(thChoice$power)
if(saveTable==T){
write.table(thChoice$fitIndices[order(thChoice$fitIndices$mean.k.,decreasing=T),],paste(outDir,"tables/WGCNA.PowerTABLE-(beta).",datDescr,".",names(list_expr)[ireg],".Power",thChoice$power,".txt",sep=""),sep="\t",quote=F,row.names=F)
}
if(savePlots==T){
pdf(file=paste(outDir,"plots/WGCNA.PowerPLOT-(beta).",datDescr,".",names(list_expr)[ireg],".Power",thChoice$power,".pdf",sep=""),width=15,height=5)
#sizeGrWindow(9, 5)
par(mfrow=c(1,3))
cex1=0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(thChoice$fitIndices[2:10,"Power"], -sign(thChoice$fitIndices[2:10,"slope"])*thChoice$fitIndices[2:10,"SFT.R.sq"],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main=paste("Scale independence"))
text(thChoice$fitIndices[2:10,"Power"], -sign(thChoice$fitIndices[2:10,"slope"])*thChoice$fitIndices[2:10,"SFT.R.sq"],
labels=thChoice$fitIndices[2:10,"Power"],cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.8,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(thChoice$fitIndices[2:10,"Power"], thChoice$fitIndices[2:10,"mean.k."],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main=paste("Mean connectivity"))
text(thChoice$fitIndices[2:10,"Power"], thChoice$fitIndices[2:10,"mean.k."],
labels=thChoice$fitIndices[2:10,"Power"], cex=cex1,col="red")
plot(thChoice$fitIndices[2:10,"mean.k."],-sign(thChoice$fitIndices[2:10,"slope"])*thChoice$fitIndices[2:10,"SFT.R.sq"],xlim=c(ceiling(max(thChoice$fitIndices[2:10,"mean.k."])),0)
,ylab="Mean Connectivity",xlab="Scale Free Topology Model Fit,signed R^2",type="n",main=paste("Scale independence"))
text(thChoice$fitIndices[2:10,"mean.k."],-sign(thChoice$fitIndices[2:10,"slope"])*thChoice$fitIndices[2:10,"SFT.R.sq"],labels=thChoice$fitIndices[c(2:10),"Power"],cex=1,col="red")
abline(h=0.8,col="red")
dev.off()
}
print(Sys.time()-run_time)
}
# return(pickSoftThreshold) <= needs to be returned as a list of all the objects
}
wgcna.beta<-function(expr_mat,corFn='cor',networkType='signed',corOptions='spearman',...){
cat('\tNOTE\t: use - apply WGCNA pickSoftThreshold() to matrix: samples=columns, genes=rows\n\n')
if(corFn=='cor'){cat('\t\t correlation function: ',corFn,' - ',corOptions,', network type: ',networkType,'\n\n',sep='')}
if(corFn=='bicor'){cat('\t\t correlation function',corFn,'network type:',networkType,'\n\n')}
library('WGCNA')
allowWGCNAThreads()
set.seed(0) # reproducibility
thChoice=pickSoftThreshold(t(expr_mat),...)
cat('\n\tpower sugggested by WGCNA:',thChoice$power,'\n\n',verbose=1)
return(invisible(thChoice))
}
wgcna.mods<-function(expr_mat,softPower=7,signType='signed',mergeCutHei=0.15,dat_descr='',...){
cat('\tUSE:\tperform basic WGCNA clustering analysis for a given matrix, return full outputs as list\n')
library(WGCNA)
allowWGCNAThreads()
t1=Sys.time()
minModSize=min(40, nrow(expr_mat)/2 )
# mergeCutHei=0.15
modules=blockwiseModules(t(expr_mat), # Input data, expect: samples - rows, genes - columns
# Data checking options ----------------------------------------------
checkMissingData=TRUE,
# Options for splitting data into blocks ----------------------------------------------
blocks=NULL,
maxBlockSize=30000,
blockSizePenaltyPower=5,
randomSeed=12345,
# if load TOM from previously saved file ----------------------------------------------
loadTOM=FALSE,
# Network construction arguments: correlation options ----------------------------------------------
corType="bicor", # pearson - default
maxPOutliers=1,
quickCor=0,
pearsonFallback="individual",
cosineCorrelation=FALSE,
# Adjacency function options ----------------------------------------------
power=softPower,
networkType=signType, # options 'signed', 'unsigned', default='signed'
# Topological overlap options ----------------------------------------------
TOMType="signed",
TOMDenom="min",
# Saving or returning TOM ----------------------------------------------
getTOMs=NULL,
saveTOMs=FALSE,
saveTOMFileBase="blockwiseTOM",
# Basic tree cut options ========================================================================
deepSplit=2, # default=2 simplified version of tree-cutting (may be worth looking at tree first and defining proper threshold)
detectCutHeight=0.995, # dendrogram cut height for module detection. See cutreeDynamic for more details
minModuleSize=minModSize, # minimum module size for module detection. See cutreeDynamic for more details
# Advanced tree cut options----------------------------------------------
maxCoreScatter=NULL, # maximum scatter of the core for a branch to be a cluster, given as the fraction of cutHeight relative to the 5th percentile of joining heights. See cutreeDynamic for more details
minGap=NULL,
maxAbsCoreScatter=NULL, minAbsGap=NULL,
minSplitHeight=NULL, minAbsSplitHeight=NULL,
useBranchEigennodeDissim=FALSE,
minBranchEigennodeDissim=mergeCutHei,
stabilityLabels=NULL,
minStabilityDissim=NULL,
pamStage=TRUE, pamRespectsDendro=TRUE,
# Gene reassignment, module trimming, and module "significance" criteria ----------------------------------------------
reassignThreshold=1e-6,
minCoreKME=0.5,
# minCoreKMESize=minModuleSize/3, minModuleSize not found error..?
minKMEtoStay=0.3,
# Module merging options ----------------------------------------------
mergeCutHeight=mergeCutHei,
impute=TRUE,
trapErrors=FALSE,
# Output options ----------------------------------------------
numericLabels=FALSE,
# Options controlling behaviour ----------------------------------------------
nThreads=0,
verbose=1, # options '0', '1' , '2' , '3'
indent=0,...)
collectGarbage()
print(Sys.time()-t1)
## generating mstat is a bit messy..
mstat=as.data.frame(table(modules$colors))
mstat=mstat[order(mstat[,2],decreasing=T),]
msta0=mstat[(mstat[,1]=='grey'),]
msta0$module='M0'
mstat=mstat[!(mstat[,1]=='grey'),]
mstat$module=paste0('M',1:nrow(mstat))
mstat=rbind(mstat,msta0)
colnames(mstat)=c('color','ngenes','module')
mstat$color=as.character(mstat$color)
# mstat$module=paste(mstat$module,mstat$color,sep="_") ## module name contains color
if(dat_descr!=''){mstat$module=paste(mstat$module,dat_descr,sep="_")} ## add info after module name
mes=modules$MEs
rownames(mes)=colnames(expr_mat)
colnames(mes)=gsub('ME','',colnames(mes))
mes=mes[,mstat$color]
module_list=list()
module_expr=list()
for(imod in 1:nrow(mstat)){
module_expr[[mstat$module[imod]]]=expr_mat[modules$colors==mstat$color[imod],]
module_list[[mstat$module[imod]]]=rownames(module_expr[[mstat$module[imod]]])
}
mbg=as.data.frame('bkgrnd')
mbg$length=nrow(expr_mat)
mbg$name='bkgrnd'
colnames(mbg)=colnames(mstat)
mstat=rbind(mstat,mbg)
module_expr[['bkgrnd']]=expr_mat
module_list[['bkgrnd']]=rownames(expr_mat)
cat('\n\tModules are named based on size M1 - biggest, M0 - unclustered, bkgrnd - all input genes, output contains :
\t1.\tmodule_list - list containing names of genes in each module
\t2.\tmodule_expr - expression matrix of all genes in module
\t3.\tMEs - module eigengenes (PC1) for each module
\t4.\tmstat - key used to name modules, includes module size
\t5.\twgcna_out - object containing full output of WGNCA blockwiseModules()
\n')
return(invisible(list(module_list=module_list,module_expr=module_expr,MEs=mes,mstat=mstat,wgcna_out=modules)))
}
wgcna.consensus<-function(list_expr,dat_descr='',corType='spearman',power=6,signType="signed",max_block_n=20000,...){
cat("\tNOTE :\tInput data (list_expr) is expected as a list with each entry : rows=genes, columns=samples\n")
# cat("\tWARNING :\tthe checkMissingData option for WGCNA is disabled, set checkMissingData=T in the code to make robust to missing, alternatively use sd.check or is.missing to determine/remove non-varying or missing values\n")
library(WGCNA)
allowWGCNAThreads()
# minModSize=min(40, nrow(expr_mat)/2 )
for(ilis in 1:length(list_expr)){
list_expr[[names(list_expr)[ilis]]]=list(data=t(list_expr[[names(list_expr)[ilis]]]))
}
t0=Sys.time()
modules=blockwiseConsensusModules(
list_expr ## lists with $data in each set containing expression: rows=samples, cols=genes
# Data checking options ------------------------------------------
,checkMissingData=T ## checks for missing and zero variance in expression..
# Blocking options ------------------------------------------
,blocks=NULL
,maxBlockSize=max_block_n ## ensure no separation is performed // if outdated machine ~4GB RAM, may need to change this to ~5,000, or better yet, consider upgrading
,blockSizePenaltyPower=5
,randomSeed=12345
# TOM precalculation arguments, if available ------------------------------------------
,individualTOMInfo=NULL ## pre-calculated TOM using blockwiseIndividualTOMs
,useIndivTOMSubset=NULL
# Network construction arguments: correlation options ------------------------------------------
# ,corType="pearson"
,maxPOutliers=1 ## used only with bicor: Specifies the maximum percentile of data that can be considered outliers on either side of the median separately
,quickCor=0 ## handling of missing
,pearsonFallback="individual" ## using pearson when mean absolute deviation == zero -> cant perform bicor...
,cosineCorrelation=FALSE
# Adjacency function options ------------------------------------------
# ,power=6 ## moved to function inputs
,networkType=signType ## moved to function inputs ## options: "unsigned"’, ‘"signed"’, ‘"signed hybrid" See ‘adjacency’
,checkPower=TRUE
# Topological overlap options ------------------------------------------
,TOMType=signType ## moved to function inputs ## options: "none"’, ‘"unsigned"’, ‘"signed"’. If ‘"none"’, adjacency will be used for clustering
,TOMDenom="min" ## min - standard, mean - expreimental
# Save individual TOMs? ------------------------------------------
,saveIndividualTOMs=F ## TOM saved to disk
,individualTOMFileNames="individualTOM-Set%s-Block%b.RData"
# Consensus calculation options: network calibration ------------------------------------------
,networkCalibration=c("single quantile","full quantile","none") ## "single", "quantile", "full quantile", "none"
# Simple quantile calibration options ------------------------------------------
,calibrationQuantile=0.95
,sampleForCalibration=TRUE
,sampleForCalibrationFactor=1000
,getNetworkCalibrationSamples=FALSE
# Consensus definition ------------------------------------------
,consensusQuantile=0 ## <<<<<<<<<<<<<<<<
,useMean=FALSE ## <<<<<<<<<<<<<<<< use mean instead of consensusQuantile
,setWeights=NULL ## <<<<<<<<<<<<<<<< for weighted mean
# Saving the consensus TOM ------------------------------------------
,saveConsensusTOMs=FALSE
,consensusTOMFileNames="consensusTOM-block.%b.RData"
# Internal handling of TOMs ------------------------------------------
,useDiskCache=F ## TRUE=slower but more RAM efficient
,chunkSize=NULL
,cacheBase=".blockConsModsCache"
,cacheDir="."
# Alternative consensus TOM input from a previous calculation ------------------------------------------
,consensusTOMInfo=NULL
# Basic tree cut options ------------------------------------------
,deepSplit=2
,detectCutHeight=0.995
,minModuleSize=50 ## default=20
,checkMinModuleSize=TRUE
# Advanced tree cut options ------------------------------------------
,maxCoreScatter=NULL
,minGap=NULL
,maxAbsCoreScatter=NULL
,minAbsGap=NULL
,minSplitHeight=NULL
,minAbsSplitHeight=NULL
,useBranchEigennodeDissim=FALSE
,minBranchEigennodeDissim=mergeCutHeight
,stabilityLabels=NULL
,minStabilityDissim=NULL
,pamStage=TRUE
,pamRespectsDendro=TRUE
# Gene reassignment and trimming from a module, and module "significance" criteria ------------------------------------------
,reassignThresholdPS=1e-4
# ,trimmingConsensusQuantile=consensusQuantile ## breaks if specified here without outside variables
,minCoreKME=0.5
# ,minCoreKMESize=minModuleSize/3
,minKMEtoStay=0.2
# Module eigengene calculation options ------------------------------------------
,impute=TRUE
,trapErrors=FALSE
#Module merging options
,equalizeQuantilesForModuleMerging=FALSE
,quantileSummaryForModuleMerging="mean"
,mergeCutHeight=0.15
# ,mergeConsensusQuantile=consensusQuantile ## breaks if specified here without outside variables
# Output options ------------------------------------------
,numericLabels=FALSE
# General options ------------------------------------------
,nThreads=0
,verbose=2
,indent=1
,...
)
print(Sys.time()-t0)
collectGarbage()
mstat=as.data.frame(table(modules$colors))
mstat=mstat[order(mstat[,2],decreasing=T),]
msta0=mstat[(mstat[,1]=='grey'),]
msta0$module='M0'
mstat=mstat[!(mstat[,1]=='grey'),]
mstat$module=paste0('M',1:nrow(mstat))
mstat=rbind(mstat,msta0)
colnames(mstat)=c('color','ngenes','module')
mstat$color=as.character(mstat$color)
if(dat_descr!=''){mstat$module=paste(mstat$module,dat_descr,sep="_")} ## add info after module name
## module membership ------------------------------------------
module_list=list()
for(imod in 1:nrow(mstat)){
module_list[[mstat$module[imod]]]=colnames(list_expr[[1]]$data)[modules$color==mstat$color[imod]] ## assuming all module gene names are in the same order, if not, modules are probably unreliable anyhow
}
module_list[['bkgrnd']]=colnames(list_expr[[1]]$data)
mes=modules$multiMEs ## comes as a list, one for each list input
names(mes)=names(list_expr)
module_expr=list()
for(ilis in 1:length(list_expr)){
## module eigene output ------------------------------------------
rownames(mes[[names(list_expr)[ilis]]]$data)=rownames(list_expr[[names(list_expr)[ilis]]]$data)
dummy=mes[[names(list_expr)[ilis]]]$data ## § get rid of the $data sub-tag in every list..
mes[[names(list_expr)[ilis]]]=dummy
## re-name MEs to match module names
colnames(mes[[names(list_expr)[ilis]]])=gsub('ME','',colnames(mes[[names(list_expr)[ilis]]]))
mes[[names(list_expr)[ilis]]]=mes[[names(list_expr)[ilis]]][,mstat$color]
colnames(mes[[names(list_expr)[ilis]]])=paste('me',mstat$module,sep='') # change the names to meM1_descr
## module expression ------------------------------------------
for(imod in 1:length(module_list)){
module_expr[[names(list_expr)[ilis]]][[names(module_list)[imod]]]=list_expr[[names(list_expr)[ilis]]]$data[,module_list[[names(module_list)[imod]]]]
}
}
## add bkgrnd info to mstat
mbg=as.data.frame('bkgrnd')
mbg$length=ncol(list_expr[[1]]$data)
mbg$name='bkgrnd'
colnames(mbg)=colnames(mstat)
mstat=rbind(mstat,mbg)
readme='\n\tModules are named based on size M1 - biggest, M0 - unclustered, bkgrnd - all input genes, output contains :
\t1.\tmodule_list - list containing names of genes in each module
\t2.\tmodule_expr - expression matrix of all genes in module / input dataset
\t3.\tMEs - module eigengenes (PC1) for each module / dataset
\t4.\tmstat - key used to name modules, includes module size
\t5.\twgcna_out - object containing full output of WGNCA blockwiseConsensusModules(), can be used for plots etc
\n'
cat(readme)
return(invisible(list(module_list=module_list,module_expr=module_expr,MEs=mes,mstat=mstat,wgcna_out=modules,readme=readme)))
}
wgcnaBuildModules<-function(list_expr,pow_range,minModuleSize=40,mergeHeight=0.15,savePlots=F,outDir=getwd(),datDescr="",signType="signed"){
print(" NOTE : Input data (list_expr) is expected as a list with each entry : rows=genes, columns=samples")
if (savePlots==F){print(" - currently does not return the tables - information is printed // can also be saved as plots and tables")}
library('WGCNA')
allowWGCNAThreads()
set.seed(0) # reproducibility
for(ireg in 1:length(names(list_expr))){
# pamStage=1 # seems irrelevant for ult networks - ie cv, pval 0.5, log transformed etc
print("■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■")
cat(" range of power scan :",pow_range,"\n")
print(paste(names(list_expr)[ireg],ireg,"of",length(names(list_expr))))
system(paste("mkdir ",outDir,"/objects",sep=""))
system(paste("mkdir ",outDir,"/modules",sep=""))
system(paste("mkdir ",outDir,"/plots",sep=""))
#system(paste("mkdir ",outDir,"/plots/pw",softPower,sep=""))
for(ipow in pow_range){
t0=Sys.time()
softPower=ipow
#merge.height.scan=c(0.15,0.2,0.25,0.3)
#for(ihei in mergeHeight){
ihei=mergeHeight
print(softPower)
print(names(list_expr)[ireg])
wgcnaModules=blockwiseModules(t(as.matrix(list_expr[[names(list_expr)[ireg]]])),
maxBlockSize=20000,
power=softPower,
#deepSplit=2, # simplified version of tree-cutting (may be worth looking at tree first and defining proper threshold)
mergeCutHeight=ihei,
#mergeCutHeight=0.25,
#detectCutHeight=0.995, # dendrogram cut height for module detection. See cutreeDynamic for more details
minModuleSize=minModuleSize, # minimum module size for module detection. See cutreeDynamic for more details
# maxCoreScatter # maximum scatter of the core for a branch to be a cluster, given as the fraction of cutHeight relative to the 5th
# percentile of joining heights. See cutreeDynamic for more details
corType="bicor", # pearson - default
#reassignThreshold=0,
networkType=signType,
#TOMtype="signed", #is default
numericLabels=FALSE,
#pamStage=1, pamRespectsDendro=TRUE,
#TOMDenom="min", # min=default/standard, mean - may produce better results but "experimental" atm
#saveTOMs=TRUE,
#saveTOMFileBase="femaleMouseTOM",
verbose=1 # options '1' , '2' , '3'
)
mergedColor<-wgcnaModules$colors
collectGarbage()
print(Sys.time()-t0)
# save modules
#module_ensg_expr=list()
module_ensg_list=list()
module_ensg_list[["M0_grey"]]=rownames(list_expr[[names(list_expr)[ireg]]])[mergedColor=="grey"]
write.table(rownames(list_expr[[names(list_expr)[ireg]]])[mergedColor=="grey"],file=paste(outDir,"/modules/M0_",datDescr,"_power",softPower,".txt",sep=""),quote=F,row.names=F,col.names=F)
n=1
for(imod in unique(mergedColor[-which(mergedColor =="grey")])) {
module_ensg_list[[paste("M",n,"_",imod,sep="")]]= rownames(list_expr[[names(list_expr)[ireg]]])[mergedColor==imod]
write.table(rownames(list_expr[[names(list_expr)[ireg]]])[mergedColor==imod],file=paste(outDir,"/modules/M",n,"_",imod,"_",datDescr,"_power",softPower,".txt",sep=""),quote=F,row.names=F,col.names=F)
n=n+1
}
module_ensg_list[["bkgrnd"]]=rownames(list_expr[[names(list_expr)[ireg]]])
write.table(module_ensg_list[["bkgrnd"]],file=paste(outDir,"/modules/bkgrnd_",datDescr,"_power",softPower,".txt",sep=""),quote=F,row.names=F,col.names=F)
save(wgcnaModules,module_ensg_list,file=paste0(outDir,"/objects/WGCNA_MODULES_",
names(list_expr)[ireg],"_",datDescr,
".merge-height=",ihei,
".genes=",nrow(list_expr[[names(list_expr)[ireg]]]),
".samples=",ncol(list_expr[[names(list_expr)[ireg]]]),
"_pw",softPower,"_bicor.R"))
if(savePlots==T){
pdf(file=paste(outDir,"/plots/pw",softPower,"_WGCNA_dendo_",names(list_expr)[ireg],datDescr,
#peer,
#paste(covarnm$short,collapse="."),
".merge-height=",ihei,
".genes=",nrow(list_expr[[names(list_expr)[ireg]]]),
".samples=",ncol(list_expr[[names(list_expr)[ireg]]]),
"_bicor.pdf",sep=""))
###--------------------------------------------------------------------------------------------------------------
# Plot the cut tree -----------------------------------------------------------------------------------------
#sizeGrWindow(12, 9) # open a graphics window
plotDendroAndColors(wgcnaModules$dendrograms[[1]], wgcnaModules$colors[wgcnaModules$blockGenes[[1]]],
dendroLabels=FALSE, hang=0.03,
addGuide=TRUE, guideHang=0.05,
main=paste("Cluster Dendrogram",names(list_expr)[ireg],"(power",softPower,")"))
#mergedColors=labels2colors(wgcnaModules$colors) # Convert labels to colors for plotting
#plotDendroAndColors(wgcnaModules$dendrograms[[1]], colors[wgcnaModules$blockGenes[[1]]],
# "Module colors",
# dendroLabels=FALSE, hang=0.03,
# addGuide=TRUE, guideHang=0.05) # Plot the dendrogram and the module colors underneath
#quartz(height=7,width=7) # opens a new plot window of specified size
#quartz()
##pdf(file=paste("./Dropbox/Cognition/WGCNA/finalised/plots/WGCNA.PDF.merge-height=",ihei,".pam=",pamStage,".",nrow(D1),"genes.",ncol(list_expr[[names(list_expr)[ireg]]]),"patients.powr",softPower,resid,quanNorm,logTransf,cv.filt,resid,".bicor.MEplot.only.pdf",sep=""))
## looking at the clustering of the MEs of the modules --------------------------------------------------------------------------
#me.curr=(MEcurr[,-which(colnames(MEcurr)=="grey")]) # omit grey cluster
#MEDiss=1-cor(me.curr) # calculate the distances
#MEcurr=wgcnaModules$MEs
#rownames(MEcurr)=colnames(list_expr[[names(list_expr)[ireg]]])
#colnames(MEcurr)=gsub("ME", "", colnames(MEcurr),perl=T) # get rid of the ME_ prefix in colors
# head(MEcurr)
# dim(MEcurr)
#write.table(MEcurr,paste("~/caprica/wgcna/tables/WGCNA.MEcurr.table.",bonn_background_restricted,names(list_expr)[ireg],peer,pc1correct_data,paste(covarnm$short,collapse="."),".merge-height=",ihei,".genes",nrow(list_expr[[names(list_expr)[ireg]]]),".samples",ncol(list_expr[[names(list_expr)[ireg]]]),"powr",softPower,".bicor.txt",sep=""),sep="\t",quote=F,row.names=T,col.names=NA)
#MEDiss=1-cor(MEcurr) # calculate the distances
#METree=hclust(as.dist(MEDiss), method="average") # Cluster module eigengenes
#plot(METree, main=paste("Clustering of ",bonn_background_restricted,names(list_expr)[ireg],"module eigengenes (power",softPower,")"), xlab="", sub="",cex=0.7) # Plot the result
# abline(h=0.3,col="orange")
# abline(h=0.25,col="red")
# abline(h=0.20,col="blue")
# abline(h=0.15,col="green")
#
dev.off()
}
}
}
}
mod.name.col <- function(module_ensg_list){
mcol=module_ensg_list$bkgrnd
for (i in 1:(length(module_ensg_list)-1)) {
mcol[mcol%in%module_ensg_list[[i]]]=names(module_ensg_list[i])
}
return(mcol)
}
peer.correct<-function(expDat,covDat=matrix(),sanity=F,cov_match=T,verbose=T){
# if(verbose){
# cat("\tNOTE : exprDat expected as matrix/data.frame of expression columns=genes rows=samples\n")
# cat("\tNOTE : covDat is optional covariate as a matrix columns=covariates, rows=samples, PEER requires numeric as.numeric(as.factor(cov))\n")
# }
####-----------------------------------------------------------------------------------------------------------
### PEER data correction for covariates and factors
# PEER package installation
# https://github.com/PMBio/peer/wiki/Installation-instructions
# R CMD INSTALL R_peer_source_1.3.tgz
#### PEER DATA CORRECTION ---------------------------------------------------------------------------------
### Applying PEER in R amounts to creating the model and setting its parameters, followed by inference. The model object will then hold the posterior distributions of the parameters.
### PEER can automatically include an additional factor (covariate) to account for the mean expression. For most use cases, including the mean effect is likely to be a good choice. To active mean factors, use
### Run correlation analyses between the inferred variables and batch confounding effects. If several inferred factors correlated with batch effects/confounders, this can be indicative of
### a more complex, nonlinear effect of these known covariates on the mRNA levels. Scatter plots can help understand the nature of these dependencies.
library(peer)
# peercor=list() # covariate adjusted PEER output
# peerfac=list() # list of factors generated by PEER and/or user-provided covariates used by PEER
### sub-select covariate data based on exprDat -> potentially a problem if not in correct setup /&/ missing / non-matching names
model=PEER() ## create model object
PEER_setPhenoMean(model, t(expDat)) ## add observed data
if(nrow(covDat)==1 & ncol(covDat)==1){
cat("\tno covariates selected\n")
}
if(nrow(covDat)>1 & ncol(covDat)>=1){
### add in covariates, can be used to correct the data -> residuals retrieved at the end
### covariates - samples=rows, have to be numeric, :: as.numeric(as.factor())
if(cov_match){
cat('\tcovariate data matched to expression\n')
covDat=covDat[rownames(covDat)%in%colnames(expDat),,drop=F]
}
# covDat
cat("\t\tcovariates selected\t:",paste(colnames(covDat),collapse=", ")," : ", length(colnames(covDat)),"\n")
cat("\t\tcovariates contain\t:", round(sum(rownames(covDat)==colnames(expDat))/nrow(covDat),digits=3)*100,"% matching samples \n")
PEER_setCovariates(model, covDat) ## add covariates to model
}
#dim(PEER_getPhenoMean(model))
### Number of factors to use (ref: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3398141/pdf/ukmss-48786.pdf)
### If no prior information on the magnitude of confounding effects is available,
### we recommend using 25% of the number of individuals contained in the study but no more than 100 factors.
#n_peer_fac=3
#if(ncol(expDat)<=30){n_peer_fac=3;cat('\tn samples </= 30, using',ncol(expDat)/n_peer_fac,'factors\n')}
#if(ncol(expDat)>30){n_peer_fac=4;cat('\tn samples > 30, using',ncol(expDat)/n_peer_fac,'factors\n')}
n_peer_fac=4
PEER_setNk(model,ceiling(ncol(expDat)/n_peer_fac)) ## PEER recommends running n/4 factors => these should be checked to determine what they measure || ensure they are not correcting for biological variation
PEER_getNk(model)
PEER_update(model) ### perform the inference.
### The result is the model object with posterior distributions of the variables.
### 2. Observing output
### You can get the posterior mean of the inferred confounders (NxK matrix), their weights (GxK matrix), precision (inverse variance) of the weights (Kx1 matrix), and the residual dataset (NxG matrix):
resid=t(PEER_getResiduals(model)) ## the residual dataset (NxG matrix) ## i.e. corrected data for all factors
factr =PEER_getX(model) ## the posterior mean of the inferred confounders (NxK matrix)
weights =PEER_getW(model) ## weights (GxK matrix) of the inferred confounders (NxK matrix)
precision=PEER_getAlpha(model) ## precision (inverse variance) of the weights (Kx1 matrix)
# print(cor(PEER_getX(model), PEER_getCovariates(model)))
colnames(resid)=colnames(expDat)
rownames(resid)=rownames(expDat)
rownames(factr)=colnames(expDat)
colnames(factr)=paste0('F',1:ncol(factr))
colnames(weights)=colnames(factr)
rownames(weights)=rownames(expDat)
rownames(precision)=colnames(factr)
colnames(precision)='inverse.var.weights'
## ************** need to re-name colnames and rownames in peer output since it does not save these automatically *************************
if(nrow(covDat)>1 & ncol(covDat)>=1){
readme="\n\tResults obtained via peerCovCorrect function using PEER PMC3398141 bayes PMC4158865
\t\t1. resid - the residual dataset (NxG matrix) ## i.e. corrected data for all factors
\t\t2. factr - covaraites the data was corrected for - can be used as 'phenotypes' || the posterior mean of the inferred confounders (NxK matrix)
\t\t3. weights - weights (GxK matrix) of the inferred confounders (NxK matrix)
\t\t4. precision - precision (inverse variance) of the weights (Kx1 matrix) || plot(precision)
\t\t5. model - PEER model object can be used to perform diagnostic plot: PEER_plotModel(model)
\t\t6. expr - original expression matrix
\t\t7. covar - original covariate matrix
\n"
return(list("expr"=expDat,"covar"=covDat,"resid"=resid,"factr"=factr,weights=weights,precision=precision,model=model,"readme"=readme))
}
if(nrow(covDat)==1 & ncol(covDat)==1){
readme="\n\tResults obtained via peerCovCorrect function using PEER PMC3398141 bayes PMC4158865
\t\t1. resid - the residual dataset (NxG matrix) ## i.e. corrected data for all factors
\t\t2. factr - covaraites the data was corrected for - can be used as 'phenotypes' || the posterior mean of the inferred confounders (NxK matrix)
\t\t3. weights - weights (GxK matrix) of the inferred confounders (NxK matrix)
\t\t4. precision - precision (inverse variance) of the weights (Kx1 matrix) || plot(precision)
\t\t5. model - PEER model object can be used to perform diagnostic plot: PEER_plotModel(model)
\t\t6. expr - original expression matrix
\n"
return(list("expr"=expDat,"resid"=resid,"factr"=factr,weights=weights,precision=precision,model=model,"readme"=readme))
}
cat("\tpeercov - peer corrected expression residuals\n")
cat("\tpeerfac - covaraites the data was corrected for - can be used as 'phenotypes'\n")
}
peerCovCorrect<-function(listExpr,covDat=matrix(),sanity=F){
cat("\tNOTE : listExpr expected as list of expression matrixes columns=genes rows=samples\n")
cat("\tNOTE : covDat is optional covariate as a matrix columns=covariates, rows=samples\n")
####-----------------------------------------------------------------------------------------------------------
### PEER data correction for covariates and factors
# PEER package installation
# https://github.com/PMBio/peer/wiki/Installation-instructions
# R CMD INSTALL R_peer_source_1.3.tgz
#### PEER DATA CORRECTION ---------------------------------------------------------------------------------
### Applying PEER in R amounts to creating the model and setting its parameters, followed by inference. The model object will then hold the posterior distributions of the parameters.
### PEER can automatically include an additional factor (covariate) to account for the mean expression. For most use cases, including the mean effect is likely to be a good choice. To active mean factors, use
### Run correlation analyses between the inferred variables and batch confounding effects. If several inferred factors correlated with batch effects/confounders, this can be indicative of
### a more complex, nonlinear effect of these known covariates on the mRNA levels. Scatter plots can help understand the nature of these dependencies.
library(peer)
peercov=list()
peerfac=list()
#pdf(paste("~/caprica/graphics/02.rma.plier-gcbg.",paste(covariates_selected,collapse="."),".peer.plots.pdf",sep=""))
# par(mfrow=c(1,2))
for(i in 1:length(names(listExpr))){
print(paste("--------------------",names(listExpr)[i],"---------------------",i,"of",length(names(listExpr))))
# expr[[names(listExpr)[i]]]=read.delim(paste("~/dtb/primary/ensg.expression.",names(listExpr)[i],".txt",sep=""),row.names=1)
# the whole of covarDat (below) can have no entries + comment out PEER_setCovariates
if(nrow(covDat)==1 & ncol(covDat)==1){
cat("no covariates selected\n")
}
### We run PEER three times:
### 1. Using 8 factors and all covariates
### The data matrix is assumed to have N rows and G columns, where N is the number of samples, and G is the number of genes.
### Now we can create the model object,
model=PEER()
PEER_setPhenoMean(model, t(listExpr[[names(listExpr)[i]]])) ### set the observed data
### add in covariates, can be used to correct the data -> residuals retrieved at the end
### covariates - samples=rows, have to be numeric, :: as.numeric(as.factor())
if(nrow(covDat)!=1 & ncol(covDat)!=1){
# covarDat=as.matrix(covDat[colnames(listExpr[[names(listExpr)[i]]]),colnames(covDat) %in% covariates_selected,drop=F]) # ************* special provision required when ONLY ONE covariate is selected in covarDat -> to retain row and column names
covarDat=as.matrix(covDat[intersect(rownames(covDat),colnames(listExpr[[names(listExpr)[i]]])),]) # ************* special provision required when ONLY ONE covariate is selected in covarDat -> to retain row and column names
if(sanity){
covarDat=sd.check(covarDat,T,F)
}
cat("\t\tcovariates selected\t:",paste(colnames(covarDat),collapse=", ")," : ", length(colnames(covarDat)),"\n")
cat("\t\tcovariates contain\t:", round(sum(rownames(covarDat)==colnames(listExpr[[1]]))/nrow(covarDat),digits=3)*100,"% matching samples \n")
# print(dim(covarDat))
# making sure that this is selected only when there are covariates chosen
PEER_setCovariates(model, covarDat)
}
# NULL
dim(PEER_getPhenoMean(model))
## [1] 102 17410
## (NULL response means no error here), say we want to infer K=10 hidden confounders,
### Number of factors to use (ref: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3398141/pdf/ukmss-48786.pdf)
### If no prior information on the magnitude of confounding effects is available,
### we recommend using 25% of the number of individuals contained in the study but no more than 100 factors.
### 102 samples / 4 => we use 25 factors
PEER_setNk(model,ncol(listExpr[[i]])/4)
# NULL
PEER_getNk(model)
# [1] 25
### and perform the inference.
PEER_update(model)
### The result is the model object with posterior distributions of the variables.
### 2. Observing output
### You can get the posterior mean of the inferred confounders (NxK matrix), their weights (GxK matrix), precision (inverse variance) of the weights (Kx1 matrix), and the residual dataset (NxG matrix):
factors=PEER_getX(model)
dim(factors)
# [1] 102 25
weights=PEER_getW(model)
dim(weights)
# [1] 17410 25
precision=PEER_getAlpha(model)
dim(precision)
# [1] 25 1
peercov[[names(listExpr)[i]]]=PEER_getResiduals(model)
peerfac[[names(listExpr)[i]]]=factors
dim(peercov$cov5peer_vst_filt)
# [1] 102 17410
cor (PEER_getX(model), PEER_getCovariates(model))
#### plotting metrics ------------------------------
# plot(precision,main=paste(names(listExpr)[i]))
# PEER_plotModel(model)
# }
# dev.off()
# peer does not save row/column names (also the matrixes are transposed so need to adjust for that)
#peercor=list()
# for(i in 1:length(names(listExpr))){
print(paste(names(listExpr)[i],i,"of",length(names(listExpr))))
peercorex=as.data.frame(t(peercov[[names(listExpr)[i]]]))
rownames(peercorex)=rownames(listExpr[[names(listExpr)[i]]])
colnames(peercorex)=colnames(listExpr[[names(listExpr)[i]]])
# peercov[[names(listExpr)[i]]]=peercorex
if(nrow(covDat)!=1 & ncol(covDat)!=1){
peerfactr=as.data.frame(peerfac[[names(listExpr)[i]]])
rownames(peerfactr)=rownames(covarDat)
colnames(peerfactr)=c(colnames(covarDat),paste("F",1:(ncol(peerfactr)-ncol(covarDat)),sep=""))
# peerfac[[names(listExpr)[i]]]=peerfactr
}
# write.delim(peercov[[names(listExpr)[i]]],paste("~/dtb/secondary/peerCorrected.",names(listExpr)[i],".txt",sep=""),rownm=T,colnm=T)
# write.delim(peercov[[names(listExpr)[i]]],paste("~/dtb/secondary/peerCorrected.cv.",names(listExpr)[i],".txt",sep=""),rownm=T,colnm=T)
# write.delim(peercov[[names(listExpr)[i]]],paste("~/dtb/secondary/peerCorrected.bonn_restrict.",names(listExpr)[i],".txt",sep=""),rownm=T,colnm=T)
}
readme="\n\tResults obtained via peerCovCorrect function using PEER PMC3398141 bayes PMC4158865
\t\t1. peercov - peer corrected expression residuals
\t\t2. peerfac - covaraites the data was corrected for - can be used as 'phenotypes'\n"
# \t\t3. expr - original expression matrix
# \t\t4. covar - original covariate matrix\n")
return(list("peercov"=peercov,"peerfac"=peerfac,"readme"=readme))
#return(list("peercov"=peercov,"peerfac"=peerfac,"expr"=listExpr,"covar"=covDat,"readme"=readme))
cat("\tpeercov - peer corrected expression residuals\n")
cat("\tpeerfac - covaraites the data was corrected for - can be used as 'phenotypes'\n")
# cat("\texpr - original expression matrix\n")
# cat("\tcovar - original covariate matrix\n")
cat("\texpr and covar - original inputs are no longer returned by peerCovCorrect()\n")
}
pcVarExpl<-function(cexpr){
#
corpc=list()
pcstats=list()
#pie.cols=c("#D55E00","#F0E442", "#009E73", "#56B4E9","#0072B2")
#pdf(paste("~/Dropbox/CapricaPrime/RU/out/plots/02.PC.ENSG.var.explained.",cv_correct,peer,paste(corrected,collapse="."),".pdf",sep=""),height=14,width=7)
# par(mfrow=c(5,2),cex=0.6)
#
for(ireg in 1:length(names(cexpr))){
print(paste("--------------------",names(cexpr)[ireg],"---------------------",ireg,"of",length(names(cexpr))))
pcs=prcomp(t(cexpr[[names(cexpr)[ireg]]]),scale=T,center=T)
# plot(pcs$x[,1],pcs$x[,2],main=paste("PC1 v PC2",names(cexpr)[ireg],paste(corrected,collapse=".")))
pcstat=as.matrix(summary(pcs)$importance["Proportion of Variance",1:5])
# pie(pcstat, main=paste("Variance Explained (%) \n by PC1-5 of expr",names(cexpr)[ireg],paste(corrected,collapse=".")), radius =1,
# labels =paste("PC",1:5," ",round(pcstat*100),"%",sep=""), col=pie.cols)
print(pcstat)
pcstats[[names(cexpr)[ireg]]]=pcstat
#
corpc[[names(cexpr)[ireg]]]=as.data.frame(lm(as.matrix(t(cexpr[[names(cexpr)[ireg]]]))~pcs$x[,1])$residuals)
# top and bottom 10% of genes in the loadings
# octx pc1 - ubiquitin, proteolysis
# octx pc1 - interesting things
# correct for pc1 or use more stuff from *** http://www.nature.com/nprot/journal/v7/n3/full/nprot.2011.457.html?WT.ec_id=NPROT-201203
# write.delim(names(pcs$rotation[(pcs$rotation[,1] > quantile(pcs$rotation[,1],0.9) | pcs$rotation[,1] < quantile(pcs$rotation[,1],0.1)),1]),paste("~/Dropbox/CapricaPrime/RU/out/tables/",bonn_background_restricted,cv_correct,peer,pc1correct_data,paste(corrected,collapse="."),".",names(cexpr)[ireg],".PC1.top.bottom.10pc.loadings.txt",sep=""),rownm=F,colnm=F)
# write.delim(names(pcs$rotation[(pcs$rotation[,2] > quantile(pcs$rotation[,2],0.9) | pcs$rotation[,2] < quantile(pcs$rotation[,2],0.1)),2]),paste("~/Dropbox/CapricaPrime/RU/out/tables/",bonn_background_restricted,cv_correct,peer,pc1correct_data,paste(corrected,collapse="."),".",names(cexpr)[ireg],".PC2.top.bottom.10pc.loadings.txt",sep=""),rownm=F,colnm=F)
# write.delim(names(pcs$rotation[(pcs$rotation[,2] > quantile(pcs$rotation[,2],0.9) | pcs$rotation[,2] < quantile(pcs$rotation[,2],0.1)),2]),paste("~/Dropbox/CapricaPrime/RU/out/tables/",bonn_background_restricted,cv_correct,peer,pc1correct_data,paste(corrected,collapse="."),".",names(cexpr)[ireg],".PC3.top.bottom.10pc.loadings.txt",sep=""),rownm=F,colnm=F)
}
#dev.off()
return(invisible(corpc))
}
frac<-function(subset,full,num=T,perc=F,sig_fig=2){
if(!num){return(round(length(subset)/length(full),digits=sig_fig))}
if(num&perc){return(round(subset/full,digits=sig_fig)*100)}
if(num&!perc){return(round(subset/full,digits=sig_fig))}
}
#list.as.df<-function(in_list){
# dflist=as.data.frame(matrix(NA,nrow=length(in_list),ncol=ncol(in_list[[1]])))
# colnames(dflist)=colnames(in_list[[1]])
# rownames(dflist)=names(in_list)
# for(idat in 1:length(in_list)){
# dflist[idat,]=in_list[[idat]]
# }
# return(unique(dflist))
#}
gnameToENSG<-function(gene_names){
hugo=read.delim("~/Dropbox/annotation/dtb/hgnc_complete_set.8.4.2015.txt")
# sum(nur$Gene.Symbol %in% hugo$Approved.Symbol)
# length(unique(nur$Gene.Symbol))
# sum(esc$Gene.Symbol %in% hugo$Approved.Symbol)
# length(unique(esc$Gene.Symbol))
gene_names=gene_names[gene_names!=""]
gene_names=as.data.frame(gene_names)
colnames(gene_names)="Gene.Name"
cat("\tinput : ",length(gene_names$Gene.Name)," genes || ",length(unique(gene_names$Gene.Name))," unique\n")
mapped=hugo[hugo$Approved.Symbol %in% gene_names$Gene.Name,c("Approved.Symbol","Ensembl.Gene.ID","Entrez.Gene.ID")]
mapped$input.id=mapped$Approved.Symbol
mapped$input.id.type="Approved.Symbol"
cat("\t\t",nrow(mapped)," directly mapped || ",length(unique(mapped$Approved.Symbol))," unique\n")
unmapped=unique(gene_names$Gene.Name[!(gene_names$Gene.Name %in% hugo$Approved.Symbol)])
# unmapped=unmapped[unmapped!=""]
#
remap=list()
imatch=1
for(igen in 1:nrow(hugo)){
# cat("\t\t\t",length(unlist(strsplit(hugo$Synonyms[igen],", ")))," ",sum(unlist(strsplit(hugo$Synonyms[igen],", ")) %in% unmapped),"\n")
if(sum(unlist(strsplit(hugo$Synonyms[igen],", ")) %in% unmapped)>0){
cat("\t\t\t",igen," Synonyms ",length(unlist(strsplit(hugo$Synonyms[igen],", ")))," ",sum(unlist(strsplit(hugo$Synonyms[igen],", ")) %in% unmapped),"\n")
remap[[imatch]]=(hugo[igen,c("Approved.Symbol","Ensembl.Gene.ID","Entrez.Gene.ID")])
remap[[imatch]]$input.id=unlist(strsplit(hugo$Synonyms[igen],", "))[unlist(strsplit(hugo$Synonyms[igen],", ")) %in% unmapped]
remap[[imatch]]$input.id.type="Synonym"
imatch=imatch+1
}
if(sum(unlist(strsplit(hugo$Previous.Symbols[igen],", ")) %in% unmapped)>0){
cat("\t\t\t",igen," Previous Symbols ",length(unlist(strsplit(hugo$Previous.Symbols[igen],", ")))," ",sum(unlist(strsplit(hugo$Previous.Symbols[igen],", ")) %in% unmapped),"\n")
remap[[imatch]]=hugo[igen,c("Approved.Symbol","Ensembl.Gene.ID","Entrez.Gene.ID")]
remap[[imatch]]$input.id=unlist(strsplit(hugo$Previous.Symbols[igen],", "))[unlist(strsplit(hugo$Previous.Symbols[igen],", ")) %in% unmapped]
remap[[imatch]]$input.id.type="Prevous Symbol"
imatch=imatch+1
}
if(sum(unlist(strsplit(hugo$Previous.Names[igen],", ")) %in% unmapped)>0){
cat("\t\t\t",igen," Previoius Names ",length(unlist(strsplit(hugo$Previous.Names[igen],", ")))," ",sum(unlist(strsplit(hugo$Previous.Names[igen],", ")) %in% unmapped),"\n")
remap[[imatch]]=hugo[igen,c("Approved.Symbol","Ensembl.Gene.ID","Entrez.Gene.ID")]
remap[[imatch]]$input.id=unlist(strsplit(hugo$Previous.Names[igen],", "))[unlist(strsplit(hugo$Previous.Names[igen],", ")) %in% unmapped]
remap[[imatch]]$input.id.type="Prevous Name"
imatch=imatch+1
}
if(sum(unlist(strsplit(hugo$Name.Synonyms[igen],", ")) %in% unmapped)>0){
cat("\t\t\t",igen," Name Synonyms ",length(unlist(strsplit(hugo$Name.Synonyms[igen],", ")))," ",sum(unlist(strsplit(hugo$Name.Synonyms[igen],", ")) %in% unmapped),"\n")
remap[[imatch]]=hugo[igen,c("Approved.Symbol","Ensembl.Gene.ID","Entrez.Gene.ID")]
remap[[imatch]]$input.id=unlist(strsplit(hugo$Name.Synonyms[igen],", "))[unlist(strsplit(hugo$Name.Synonyms[igen],", ")) %in% unmapped]
remap[[imatch]]$input.id.type="Name Synonym"
imatch=imatch+1
}
cat(round(igen/nrow(hugo),digits=2),"\r");flush.console()
}
# sum(nrow(mapped),length(remap))/nrow(gene_names)
# nrow(gene_names)-sum(nrow(mapped),length(remap))
length(remap)
remap=list.as.df(remap)
length(unique(remap$Approved.Symbol))
cat("\n\t\t",length(unique(mapped$Approved.Symbol)),"remapped using synonyms\n\n")
dim(remap)
mapped=rbind(mapped,remap)
dim(mapped)
# dim(esc)
# possible duplicates due to multiple 'old id type' for the same gene
#erm[erm$old.id %in% erm$old.id[duplicated(erm$old.id)],]
qc=mapped
mapped=unique(mapped[,-which(colnames(mapped)=="input.id.type")])
mapped=mapped[!(mapped$input.id %in% mapped$input.id[(duplicated(mapped$input.id))]),]
unmapped=gene_names[!(gene_names$Gene.Name %in% mapped$input.id),]
ambigous=mapped[mapped$input.id %in% mapped$input.id[(duplicated(mapped$input.id))],]
print(mapped[mapped$input.id %in% mapped$input.id[(duplicated(mapped$input.id))],])
print(as.matrix(unmapped))
return(list(mapped=mapped,unmapped=unmapped,ambigous=ambigous,qc=qc))
}
bgoverlap<-function(bg_list,union=F,intersect=T){
# NOTE: bg_list - expect list of vectors for list of dataframes use bgcommon
if(union&intersect){
stop('union & intersect are mutually exclusive, set union=T,intersect=F or vice versa')
}
bkg=bg_list[[1]]
for(idat in 2:length(bg_list)){
if(!is.vector(bg_list[[idat]])){stop(paste('\t',names(bg_list)[idat],'is not a vector'))}
if(union){bkg=union(bkg,bg_list[[idat]])}
if(intersect){bkg=intersect(bkg,bg_list[[idat]])}
}
cat('\t',length(bkg),'entries common to',length(bg_list),'lists\n')
return(bkg)
}
bgcommon<-function(list_dat,transform=F,dat_mat='',union=F,help=F,verbose=T){
if(help){
cat("\n\tUSE\t: determine common background (union or intersect) across all entries in expression list\n")
cat("\tNOTE\t: list_dat - list of expression matrices: row=genes, col=samples\n")
cat("\tNOTE\t: transfrom=T also returns the list_dat processed for common bg\n")
cat("\tNOTE\t: dat_mat required only if union=T & transform=T \n")
}
# if(verbose){cat('\tcalculate common background across',length(list_dat),'datasets\n')}
bgcommon=rownames(list_dat[[1]])
for(ilis in 2:length(list_dat)){
if(!union){bgcommon=intersect(bgcommon,rownames(list_dat[[ilis]]))}
if(union){bgcommon=union(bgcommon,rownames(list_dat[[ilis]]))}
}
if(!union&verbose){cat("\n\tintersect:",length(bgcommon),"genes common to all",length(list_dat),"datasets\n")}
if(union&verbose){cat("\n\tunion",length(bgcommon),"genes common to all",length(list_dat),"datasets\n")}
if(!transform){return(invisible(bgcommon))}
if(transform & !union){
if(verbose){cat('\ttransform all datasets to "intersect" background\n')}
listt=list()
for(ilis in 1:length(list_dat)){
listt[[names(list_dat)[ilis]]]=list_dat[[names(list_dat)[ilis]]][bgcommon,,drop=F]
}
return((listt))
}
if(transform & union){
if(verbose){cat('\ttransform all datasets to "union" background\n')}
listt=list()
for(ilis in 1:length(list_dat)){
listt[[names(list_dat)[ilis]]]=dat_mat[bgcommon,colnames(list_dat[[names(list_dat)[ilis]]]),drop=F]
}
return((listt))
}
}
bgfix<-function(mod_list,bg_vec){
bgcommon=intersect(unique(unlist(mod_list)),bg_vec)
cat('\n% genes remaining\n')
mod_common=list()
mod_common[[names(mod_list)[1]]]=intersect(mod_list[[names(mod_list)[1]]],bgcommon)
cat('\n\t',names(mod_list)[1],' \t',round(length(mod_common[[names(mod_list)[1]]])/length(mod_list[[names(mod_list)[1]]]),digits=2),'\n')
for(ilis in 2:length(mod_list)){
mod_common[[names(mod_list)[ilis]]]=intersect(mod_list[[names(mod_list)[ilis]]],bgcommon)
cat('\t',names(mod_list)[ilis],' \t',round(length(mod_common[[names(mod_list)[ilis]]])/length(mod_list[[names(mod_list)[ilis]]]),digits=2),'\n')
}
return(invisible(mod_common))
}
bgintersect<-function(list1,bg1){ #list2,,bg2
#bgintersect<-function(list1,bg.common){
cat("\t\tUSE : background intersect to only retain genes present in bkgrnd provided\n")
# cat("\t\tNOTE : list1 and list1 are expected as lists of IDs, bg1 and bg2 expected as vector, all are required\n\n")
# bg.common=(intersect(bg1,bg2))
# cat("\t",length(bg.common),"genes shared between datasets\t1:",length(bg1),pc(bg.common,bg1)*100,"% || 2:",length(bg2),pc(bg.common,bg2)*100,"%\n\n")
alis=list()
for(ilis in 1:length(list1)){
alis[[names(list1)[ilis]]]=list1[[ilis]][list1[[ilis]] %in% bg1]
# cat("\t",names(list1)[ilis],"\t--------",length(alis[[ilis]]),"/",length(list1[[ilis]]),"\t:",pc((alis[[ilis]]),(list1[[ilis]])),"\t--------",ilis,"of",length(list1),"\n")
}
# blis=list()
# for(ilis in 1:length(list2)){
# blis[[names(list2)[ilis]]]=list2[[ilis]][list2[[ilis]] %in% bg.common]
# cat("\t",names(list2)[ilis],"\t--------",length(blis[[ilis]]),"/",length(list2[[ilis]]),":\t",pc((blis[[ilis]]),(list2[[ilis]])),"\t--------",ilis,"of",length(list2),"\n")
# }
return(invisible(alis))
# return(list1=alis)
}
#bgintersect<-function(list1,list2,bg1,bg2){
#bgintersect<-function(list1,bg.common){
# cat("\t\tUSE : background intersect to only retain genes present in both lists provided\n")
# cat("\t\tNOTE : list1 and list1 are expected as lists of IDs, bg1 and bg2 expected as vector, all are required\n\n")
# bg.common=(intersect(bg1,bg2))
# cat("\t",length(bg.common),"genes shared between datasets\t1:",length(bg1),pc(bg.common,bg1)*100,"% || 2:",length(bg2),pc(bg.common,bg2)*100,"%\n\n")
# alis=list()
# for(ilis in 1:length(list1)){
# alis[[names(list1)[ilis]]]=list1[[ilis]][list1[[ilis]] %in% bg.common]
# cat("\t",names(list1)[ilis],"\t--------",length(alis[[ilis]]),"/",length(list1[[ilis]]),"\t:",pc((alis[[ilis]]),(list1[[ilis]])),"\t--------",ilis,"of",length(list1),"\n")
# }
# blis=list()
# for(ilis in 1:length(list2)){
# blis[[names(list2)[ilis]]]=list2[[ilis]][list2[[ilis]] %in% bg.common]
# cat("\t",names(list2)[ilis],"\t--------",length(blis[[ilis]]),"/",length(list2[[ilis]]),":\t",pc((blis[[ilis]]),(list2[[ilis]])),"\t--------",ilis,"of",length(list2),"\n")
# }
#return(list(list1=alis,list2=blis,bg_common=bg.common))
# return(list1=alis)
#}
empirical_mod_cons<-function(expr_cor_mat,mod_list,nperm=1000,do_plots=F,dat_descr=''){
if(nrow(expr_cor_mat)!=ncol(expr_cor_mat)){
cat('\n\tWARNING: expect a square matrix of gene correlations\n')
}
cat('\t')
perm_stat=list()
clus_stat=list()
diag(expr_cor_mat)=NA
expr_cor_mat=abs(expr_cor_mat)
for(imod in 1:length(mod_list)){
t2=Sys.time()
cat(" --------- ",names(mod_list)[imod],length(mod_list[[imod]]),"genes","---------------------",imod,"of",length(mod_list)," \n")
clus_stat[[names(mod_list)[imod]]]$mean.abscor_matrix=mean(expr_cor_mat[mod_list[[imod]],mod_list[[imod]]],na.rm=T)
# clus_stat[[names(mod_list)[imod]]]$mean2abscor_matrix=mean(apply(expr_cor_mat[mod_list[[imod]],mod_list[[imod]]],1,mean,na.rm=T),na.rm=T) # the results appear identical
clus_stat[[names(mod_list)[imod]]]$clust.length=length(mod_list[[names(mod_list)[imod]]])
for(iper in 1:nperm){
perm.ids=sample(rownames(expr_cor_mat),size=clus_stat[[names(mod_list)[imod]]]$clust.length,replace=FALSE)
perm.cor=as.matrix(expr_cor_mat[perm.ids,perm.ids])
diag(perm.cor)=NA
perm_stat[[names(mod_list)[imod]]][[as.character(iper)]]=mean(abs(perm.cor),na.rm=T)
cat(round(iper/nperm,digits=2),"\r");flush.console()
}
clus_stat[[names(mod_list)[imod]]]$pval=(sum(perm_stat[[names(mod_list)[imod]]]>clus_stat[[names(mod_list)[imod]]]$mean.abscor_matrix)+1)/(nperm+1)
cat('\temprirical P=',clus_stat[[names(mod_list)[imod]]]$pval,'\n')
print(Sys.time()-t2)
}
#### separating this out to optimise plotting axis across all plots
if(do_plots){
# x_lim=c((min(unlist(perm_stat),unlist(clus_stat))-0.2),(max(unlist(perm_stat),unlist(clus_stat))+0.2))
x_lim=c(0,0.5)
for(imod in 1:length(mod_list)){
hist(perm_stat[[names(mod_list)[imod]]],breaks=30,xlim=x_lim,main=paste(names(mod_list)[imod],dat_descr,'\n',nperm,"permutations, p=",round(clus_stat[[names(mod_list)[imod]]]$pval,digits=2)),xlab="Average absolute correlation")
abline(v=clus_stat[[names(mod_list)[imod]]]$mean.abscor_matrix,col="red")
}
}
return(invisible(list(clust_stat=t(as.data.frame(lapply(clus_stat,unlist))),perm_stat=perm_stat)))
}
##### more on visualisation of PCs biplot() and alternative ggbiplot()
###http://www.r-bloggers.com/computing-and-visualizing-pca-in-r/
##### vital info if ever using ggplots
###http://felixfan.github.io/rstudy/2013/11/27/ggplot2-remove-grid-background-margin/
#par(fig=c(0,0.8,0,1), new=TRUE)
#plot(ab$pcs,col=colmix[match(dat_vect,levels(dat_vect))],pch=16)
#par(fig=c(0.7,1,0,1),new=TRUE)
#legend("center",legend=unique(c1$SMTSD),col=1:length(c1$SMTSD),pch=1,cex=1)
pcstat<-function(expr_mat,plot_pcs=c(F,F,F),col="black",n_pcs=5,dat_descr="",pie_radius=1,help=F,verbose=T){
if(help==T){cat("\tINPUTS\t: expr_mat : rows - genes, columns - samples | n_pcs - number of PCs to use\n\n")}
# does not workpar???
# if(sum(plot_pcs>1)){
# par(mfrow=c(1,sum(plot_pcs)))
# par(mfrow=c(2,2))
# }
pcs=prcomp(t(expr_mat),scale=T,center=T)
# coll=c("#0072B2" ,"#E69F00","#009E73","#56B4E9","#D55E00","#66A61E","#7570B3","#882255","#F0E442","#A6761D","#AA4499","#117733","#332288","#999933")
pcstat=round(as.matrix(summary(pcs)$importance["Proportion of Variance",1:n_pcs]),digits=2)*100
# cat("\t\tPCs variance explained :\n\n")
colnames(pcstat)="variance explained %"
print(round(pcstat,digits=3))
if(plot_pcs[1]==T){
pie(pcstat, main=paste("Variance Explained (%) \n by PC1-5 of expr",dat_descr), radius =pie_radius,
labels =paste("PC",1:n_pcs," ",pcstat,"%",sep=""), col=colmix[1:n_pcs])
}
if(plot_pcs[2]==T){
plot(pcs$x[,1],pcs$x[,2],col=col,main=paste("PC1 v PC2",dat_descr),pch=16,frame.plot=F,xlab=paste("PC1 ",pcstat[1],"%",sep=""),ylab=paste("PC2 ",pcstat[2],"%",sep=""))
}
# plot(cbind())
pcs=pcs$x[,1:n_pcs]
if(plot_pcs[3]==T){
matplot(pcs,type="l",lty=1,lwd=2,col=colmix)
}
return(invisible(pcs))
}
#plot.counts<-function(dat_vec){
## USE : pretty plot counts in a vector
## DEPENDENCIES : matst()
# pdat=matst(dat_vec)
# p
#}
hplot<-function(dat_vec,nlab='max',text_col='dodgerblue',...){
print('WARNING : x axis is currently just an index 1:n_unique_values')
dat_stat=matst(dat_vec)
ymax=max(dat_stat$count)
ymin=floor(min(dat_stat$count))
ybuffer=max(ymax*0.05,5)
ymax=ymax+max(ymax*0.1,10)
if(nlab=='max'){nlab=nrow(dat_stat)}
plot(dat_stat$count,type='h',frame.plot=F,ylim=c(ymin,ymax),...)
dat_txt=dat_stat[1:nlab,]
text((dat_txt$count)+ybuffer,labels=dat_txt$percent*100,col=text_col)
}
pcplot<-function(dat_list,scale_dat=F,colmix="",pch=16,dat_descr="",main="",legend_space=8,...){
cat("\tUSE\t: use list of multiple datasets to plot PC1vPC2\n")
cat("\tNOTE\t: list1 - expect a list of expression matrices, rows=genes, columns=samples |*| assumes same row order\n")
cat("\tNOTE\t: if legend is clipping the plot, increase plot width or legend_space param\n")
if(class(dat_list)!="list"){
warning("\tWARNING\t: input is not a list\n")
}
if(colmix[1]==""){
cat("\t\tusing default color mix\n")
colmix=c("#0072B2","#E69F00","#009E73","#56B4E9","#D55E00","#66A61E","#7570B3","#a50f15","#A6761D","#117733","#332288","#b15928","#882255","#999933","#AA4499","#1f78b4","#F0E442","#e31a1c","#6a3d9a","#b2df8a","#08519c","#ff7f00","#fdbf6f","#33a02c","#b15928","#f16913","#238b45","#807dba","#d94801","#41ab5d","#fd8d3c","#4292c6")[1:length(dat_list)]
}
if(colmix[1]=="none"){
cat("\t\tusing default color mix\n")
colmix=rep('black',length(dat_list))
}
if(length(colmix)>length(dat_list)){
warning("\tWARNING\t: colmix is longer than number of elements in list, only first ",length(dat_list)," colors will be used\n")
colmix=colmix[1:length(dat_list)]
}
if(length(colmix)<length(dat_list)){
warning("\tWARNING\t: colmix is shorter than number of elements in list, default mix will be used instead *(to make all same color, use colmix='none'\n")
colmix=c("#0072B2","#E69F00","#009E73","#56B4E9","#D55E00","#66A61E","#7570B3","#a50f15","#A6761D","#117733","#332288","#b15928","#882255","#999933","#AA4499","#1f78b4","#F0E442","#e31a1c","#6a3d9a","#b2df8a","#08519c","#ff7f00","#fdbf6f","#33a02c","#b15928","#f16913","#238b45","#807dba","#d94801","#41ab5d","#fd8d3c","#4292c6")[1:length(dat_list)]
}
datleg=as.data.frame(names(dat_list))
colnames(datleg)="name"
datleg$color=colmix
datleg$n=NA
datleg$n[1]=ncol(dat_list[[1]])
datcol=rep(datleg[1,"color"],datleg$n[1])
for(idat in 2:length(dat_list)){
datleg$n[idat]=ncol(dat_list[[idat]])
datcol=c(datcol,rep(datleg[idat,"color"],datleg$n[idat]))
}
dat_lis=as.data.frame(dat_list)
## scale and center the data before plotting
if(scale_dat){
cat('\t- scale and center data (by columns)\n')
dat_lis=scale(dat_lis,scale=T,center=T)
}
pcs=prcomp(t(dat_lis))
pcstat=round(as.matrix(summary(pcs)$importance["Proportion of Variance",1:2]),digits=3)*100
# legend_space=8
# creates a plot with a wide right margin
cat('\t calculating PCs, will take time with big datasets\n')
par(mar=c(5,4,(4+nrow(datleg)), legend_space))
plot(pcs$x[,1:2],col=datcol,pch=pch,frame.plot=F,xlab=paste("PC1 ",pcstat[1],"%",sep=""),ylab=paste("PC2 ",pcstat[2],"%",sep=""),main=paste0(dat_descr,"\n",main),...)
# create a new plot overlay (with no left margin) with legend on the topright
par(fig=c(0,1,0,1), oma=c(0, 4, 0, 0), mar=c(0, 4, 0, 0), new=TRUE)
# plot(0, 0, type="n", bty="n", xaxt="n", yaxt="n")
plot.new()
legend(x="topright",pch=16,box.lwd=0,box.col="white",col=unique(datcol),legend=names(dat_list))
# would be nice to add biplot style arrows for key loadings (ie genes)
return(invisible(list(legend=datleg,data=dat_list,pcs=pcs$x,pcobj=pcs,cols=datcol)))
}
dat.type<-function(dat_mat){
if(is.matrix(dat_mat)){dat_class=apply(dat_mat,2,class)}
if(is.data.frame(dat_mat)){dat_class=unlist(lapply(dat_mat,class))}
print(matst(dat_class))
}
cov.impact.check<-function(expr_mat,cov_mat,plot_cov_distributions=F,single_gene_analysis=F,sanity=F,n_pcs=5,sd_check=T,dat_descr="",help=F){
if(help==T){
cat("\tINPUTS:\texpr_mat : rows - genes, columns - samples | cov_mat rows - samples , columns - covariates\n")
# cat("\tNOTE\t: cov_mat - non-numeric covariates will be used 'as.factor'\n")
cat("\tNOTE:\tcov_mat - should contain numeric or factor values only\n")
cat("\tNOTE:\tsd.check may be required if there are non-varying covariates or genes\n")
cat("\tUSE:\tsanity=T can be used to correctly order colnames(expr_mat) and rownames(cov)\n")
}
start_time=Sys.time()
if(!sanity){
order_check=sum(colnames(expr_mat)!=rownames(cov_mat))
if(order_check!=0){
warning("\n\t\tWARNING\t: (colnames(expr_mat)==rownames(cov_mat)) shows :",order_check,"not in same order","\n")
}
}
if(sanity){
# matst(colnames(expr_mat)==rownames(cov_mat))
expr_mat=expr_mat[,intersect(colnames(expr_mat),rownames(cov_mat))]
cov_mat=cov_mat[intersect(colnames(expr_mat),rownames(cov_mat)),]
cat("\t\tcolnames(expr_mat) n=",ncol(expr_mat)," rownames(cov_mat) n=",nrow(cov_mat)," same order",sum(colnames(expr_mat)==rownames(cov_mat)),"\n")
expr_mat=sd.check(expr_mat,T,F)
cov_mat=sd.check(cov_mat,F,T)
}
#library(WGCNA)
# sampleTree2=hclust(dist(expr_mat), method="average")
# traitColors=numbers2colors(cov_mat, signed=FALSE);
# plotDendroAndColors(sampleTree2, traitColors,groupLabels =names(datTraits),main ="Sample dendrogram and trait heatmap")
#print(dim(cov_mat))
#if(sd_check==T){
# cov_mat=sd.check(cov_mat)
#}
#print(dim(cov_mat))
###------------------------------------------------------------------------------------------------------------
## PC level analysis
cat("\t========== calculate PCs of expr_mat ==========\n")
pcs=pcstat(expr_mat,plot_pcs=c(T,T,F),n_pcs=5,verbose=F)
univpcP=matrix(NA,nrow=ncol(pcs),ncol=ncol(cov_mat))
colnames(univpcP)=paste(colnames(cov_mat),"Pval",sep=".")
rownames(univpcP)=colnames(pcs)
univpcR=matrix(NA,nrow=ncol(pcs),ncol=ncol(cov_mat))
colnames(univpcR)=paste(colnames(cov_mat),"Rsq",sep=".")
rownames(univpcR)=colnames(pcs)
cat("\t========== correlate of PCs with covariates ==========\n")
for(icov in 1:ncol(cov_mat)){
for(ipcs in 1:ncol(pcs)){
# cat(icov,ipcs,"\n")
univpcP[colnames(pcs)[ipcs],paste(colnames(cov_mat)[icov],"Pval",sep=".")]=lmp((lm(pcs[,ipcs]~cov_mat[,icov])))
univpcR[colnames(pcs)[ipcs],paste(colnames(cov_mat)[icov],"Rsq",sep=".")]=summary(lm(pcs[,ipcs]~cov_mat[,icov]))$r.sq
}
}
# temp measure to avoid breaking due to 'missing' P-values
univpcP[is.na(univpcP)]=1
# parmar=c(5.1,4.1,4.1,2.1)
# try(
# Heatmap(univpcP,mode="pval",main=paste(dat_descr,"\n covariate effect on PCs ",nrow(cov_mat)," samples",sep=""))
# )
# Heatmap(univpcR)
# par(mar=parmar)
library(corrplot)
par(mfrow=c(1,1))
# col2= colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7","darkgoldenrod1", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))#"#FFFFFF"
# col2= colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582","darkgoldenrod1", "#92C5DE", "#4393C3", "#2166AC", "#053061"))#"#FFFFFF"
# corrplot(-log10(univpcP),method="circle", is.corr=F,
# p.mat=univpcP, insig="blank",sig.level=0.01,col=rev(col2(100)),cl.align="l",tl.col="black",
# mar=c(0,0,4,0),tl.cex=0.7,
# title="expression PCs correlation with covars")
try(Heatmap(univpcP,mode="pval",margin=c(10,5),main="expression PCs correlation with covars"))
par(mar=c(5,4,4,2)+0.1)
# distributions of covariates
par(mfrow=c(1,1))
if(plot_cov_distributions){
hist(make.numeric(expr_mat),breaks=100,col="grey",main="expression matrix")
# hist(pcs[1,breaks=100,col="grey",main="expression matrix")
par(mfrow=c(3,2))
for(icov in 1:ncol(cov_mat)){
hist(make.numeric(cov_mat[icov]),breaks=50,col="grey",main=colnames(cov_mat)[icov])
}
par(mfrow=c(1,1))
}
###------------------------------------------------------------------------------------------------------------
## Single gene analysis
if(!single_gene_analysis){
return(invisible(list("pcs"=pcs,"univpcP"=univpcP,"univpcR"=univpcR)))
}
if(single_gene_analysis){
cat("\t========== perform single gene analysis on",nrow(expr_mat),"genes ==========\n")
par(mfrow=c(5,2))
univgenP=list()
univgenFDR=list()
univgenR=list()
for(icov in 1:ncol(cov_mat)){
cat("\t\t ----------- ",colnames(cov_mat)[icov],"\t-----------",icov,"of",ncol(cov_mat),"\n")
genp=list()
genr=list()
# for(igen in 1:100){
for(igen in 1:nrow(expr_mat)){
# genp[[paste(rownames(expr_mat)[igen],colnames(cov_mat)[icov],"Pval",sep=".")]]=lmp((lm(pcs[,ipcs]~cov_mat[,icov])))
test.pair=cbind(t(expr_mat[igen,,drop=F]),cov_mat[,icov,drop=F])
test.pair=test.pair[complete.cases(test.pair),]
lmobj=lm(test.pair[,1]~test.pair[,2])
# lmobj=(lm(t(expr_mat[igen,,drop=F])~.,data=(cov_mat[,icov,drop=F]))) # does not handle NA values
genp[[rownames(expr_mat)[igen]]]=lmp(lmobj)
# genr[[paste(rownames(expr_mat)[igen],colnames(cov_mat)[icov],"Rsq",sep=".")]]=summary(lm(pcs[,ipcs]~cov_mat[,icov]))$r.sq
genr[[rownames(expr_mat)[igen]]]=summary(lmobj)$r.sq
cat(round(igen/nrow(expr_mat),digits=2),"\r");flush.console()
}
univgenP[[paste(colnames(cov_mat)[icov],"Pval",sep=".")]]=unlist(genp)
univgenR[[paste(colnames(cov_mat)[icov],"Rsq",sep=".")]]=unlist(genr)
univgenFDR[[paste(colnames(cov_mat)[icov],"FDR",sep=".")]]=p.adjust(univgenP[[paste(colnames(cov_mat)[icov],"Pval",sep=".")]],method="fdr")
hist((univgenP[[icov]]),main=paste(dat_descr,"n=",nrow(test.pair),"\nlm( gene ~",colnames(cov_mat)[icov],")",sep=" "),xlab="P-value")
# hist(-log10(univgenP[[icov]]),main=paste("gene~",colnames(cov_mat)[icov],"-log10(Pval)",sep=" "))
# boxplot((univgenP[[icov]]),main=paste("gene~",colnames(cov_mat)[icov],"Pval",sep=" "))
# boxplot(-log10(univgenP[[icov]]),main=paste(dat_descr,"\nlm( gene ~",colnames(cov_mat)[icov],")",sep=" "),ylab="-log10(P-value)")
plot.new()
legend(x="center",box.lwd=0,box.col="white",xpd=TRUE,
legend=c(paste(sum(univgenP[[icov]]<0.05,na.rm=T)," (",round(sum(univgenP[[icov]]<0.05,na.rm=T)/length(univgenP[[icov]]),digits=2)*100,"%)"," genes P<5%",sep="")
,paste(sum(univgenFDR[[icov]]<0.05,na.rm=T)," (",round(sum(univgenFDR[[icov]]<0.05,na.rm=T)/length(univgenFDR[[icov]]),digits=2)*100,"%)"," genes FDR<5%",sep="")
),cex=1)
}
par(mfrow=c(1,1))
# par(mar=c(5,4,7,2) + 0.1)
par(mar=c(15,4,4,2))
boxplot(univgenR,main=paste(dat_descr,"\nlm( gene ~ single-covariate )",sep=" "),ylab="R-squared",ylim=c(0,1),las=2,frame=F,pch=16,cex=0.5)
return(invisible(list("pcs"=pcs,"univpcP"=univpcP,"univpcR"=univpcR,"univP"=univgenP,"univR"=univgenR,"univFDR"=univgenFDR)))
}
cat("\n\ttime taken : ",round(Sys.time()-start_time,digits=2),"mins\n\n")
}
Heatm<-function(cor.measures,min=-1,max=1,rowclust=F,colclust=F,ncols=101,dendrogram="none",main="",mode="cor",sig=T,cexrow=0.7,cexcol=0.7,margin=c(12,12)){
library(gplots)
# print("Heatmap( 'matrix to use' , 'min value for legend cols' , 'max ..' , 'clust by rows', 'clust by cols' , 'ncols to use for legend', 'dendrogram=c('none','row','column','both')' )")
# min=-1
# max=1
# default settings for testing plots
#min=-1;max=1;rowclust=F;colclust=F;ncols=101;dendrogram="none";main="";mode="cor"
if(mode=="cor"){
print("plotting correlation based matrix")
# heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=0.7, cexRow=0.7, lheight=lheight,symkey=T,main=main,lmat=lmatrix)#,lwid=c(0.5,0.5))
heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,symkey=T,main=main)
# heatmap.2((cor.measures),breaks=seq(min,max,length=(ncols+1)),col=colorRampPalette(c("#08306b","#08519c","#2171b5","#4292c6","#6baed6","#9ecae1","#c6dbef","#deebf7","#f7fbff","white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight, symkey=T,main=main,lwid=lwidth,lmat=lmatrix)
#"white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"
}
if(mode=="pval"){
if(sig==T){
print("plotting p-value based matrix with cellnotes")
corm=round(cor.measures,digits=3)
cor.match=matrix(NA,nrow=nrow(cor.measures),ncol=ncol(cor.measures))
rownames(cor.match)=rownames(cor.measures)
colnames(cor.match)=colnames(cor.measures)
# Head(cor.match)
for(i in 1:nrow(corm)){
for(j in 1:ncol(corm)){
# cat(i,j,"\n")
if(corm[i,j]>0.05){
cor.match[i,j]=""
# print(">0.05")
}
if(corm[i,j]<=0.01){
cor.match[i,j]=gsub(" ","",paste(rep("*",min(4,floor(-log10(corm[i,j]))-1)),collapse=" ")) # backup square == .
# print("<0.01")
}
if(corm[i,j]<0.05 & corm[i,j]>0.01){
cor.match[i,j]="+"
# print("<0.05")
}
}
}
# heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,symkey=F,main=main) #cexCol=1,cexRow=0.8,
# heatmap.2(-log10(cor.measures),cellnote=(cor.match),notecol="black",breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
}
if(sig==F){
print("plotting p-value based matrix, no cellnotes")
# heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#F0E442","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix) #cexCol=1,cexRow=0.8,
heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,symkey=F,main=main)
# heatmap.2(-log10(cor.measures),breaks=seq(0,max(-log10(cor.measures)),length=(ncols+1)),col=colorRampPalette(c("white","#ffffcc","#ffeda0","#fed976","#feb24c","#fd8d3c","#fc4e2a","#e31a1c","#bd0026","#800026"))(ncols),tracecol=F,dendrogram=dendrogram,Rowv=rowclust,Colv=colclust,margins=margin,density.info="none",keysize=1,cexCol=cexcol,cexRow=cexrow,lhei=lheight,symkey=F,main=main,lwid=lwidth,lmat=lmatrix)
}
}
#min(na.omit(as.vector(cor.measures))),max(na.omit(as.vector(cor.measures)))
}
Load<-function(obj_path,...){
# obj=ls()
# obj=ls()
# all_obj=load(obj_path)
# print(as.matrix(sort(ls()[!(ls() %in% obj)]),decreasing=T))
# print(parent.frame())
# if(exists("readme")){cat(readme)} # needs to be the one read-in
return(as.matrix(sort(load(obj_path, parent.frame(),...),decreasing=F)))
# return(load(obj_path, parent.frame()))
}
mat<-function(dat_mat,decreasing=T){
return(as.matrix(dat_mat))
}
cmat<-function(dat_mat){
return(as.matrix(colnames(dat_mat)))
}
rmat<-function(dat_mat,decreasing=T){
return(as.matrix(rownames(dat_mat)))
}
mats<-function(dat_mat,decreasing=T){
return(as.matrix(sort(dat_mat,decreasing=decreasing)))
}
matst<-function(dat_mat,thresh=NA,sort=T,decreasing=T){
if(sort){
dummy=as.data.frame(sort(table(dat_mat),decreasing=decreasing))
}
if(!sort){
dummy=as.data.frame((table(dat_mat)))
}
if(ncol(dummy)==1){
holder=as.data.frame(rownames(dummy))
holder$count=dummy[,1]
dummy=holder
}
colnames(dummy)=c('entry','count')
dummy$percent=round(dummy$count/sum(as.numeric(dummy$count)),digits=3)
if(!is.na(thresh)){dummy=dummy[dummy$count>=thresh,]}
return(dummy)
}
# writing lists to file / matrix with different lengths
#lapply(modg, write, "~/Dropbox/Cognition/list.writing.test.txt", append=TRUE, ncolumns=5850)
is.missing<-function(data_mat,make_plot=F,use_grid=F,dat_descr="",define_na=c(NA,NaN)){
# needs optimisation for large datasets (calc missn from the start and use that for cat) <<<<<<<§§§§§§§§§§§§§§§§§§§§§§§
n_overlap=nrow(data_mat[complete.cases(data_mat),])
# cat("\t---------------- Total Missing : ",sum(is.na(data_mat))," (",round(sum(is.na(data_mat))/length(is.na(data_mat)),digits=3)*100,"%) ----------------\n",sep="")
cat("\t-------------------- Total Missing : ",sum((unlist(data_mat)%in%define_na))," (",round(sum((unlist(data_mat)%in%define_na))/length(unlist(data_mat)),digits=3)*100,"%) --------------------\n",sep="")
cat("\t ------ ",n_overlap,"samples remaining if 'complete.cases()' ------\n\n")
# missn=as.data.frame(mats(apply(data_mat,2,function(x)sum(is.na(x))),T))
missn=as.data.frame(mats(apply(data_mat,2,function(x)sum(x%in%define_na)),T))
colnames(missn)="n.missing"
missn$pc.missing=round(missn$n.missing/nrow(data_mat),digits=3)
if(sum(missn)<0 & make_plot){cat('\n\t no plot produced : missing detected\n')}
if(sum(missn)>0){
#merge_dist[merge_dist%in%c(NA,'NA','NaN','Inf','-Inf','')]
# data_mat[data_mat%in%c("NA","NaN")]=0
# data_mat[(data_mat%in%define_na)]=1
# data_mat[!(data_mat%in%define_na)]=0
dat_mat=apply(data_mat,2,function(x)(x%in%define_na))
rownames(dat_mat)=rownames(data_mat)
data_mat=dat_mat*1
# Head(data_mat)
# summary(as.numeric(as.matrix(data_mat)))
# data_mat[data_mat%in%c("NA","NaN")]=1
# table(as.numeric(as.matrix(data_mat)))
#Heat((matrix),min=0,max=1,rowclust=T,colclust=T,dendrogram='both')
# intended to remove rows with all missing, but these should be clustered in 1 corner and not cause issues anyhow
# if(!full.row.missing){
# apply(data_mat,1,function(x){sum(is.na(x))==length(x)})
# }
if(dat_descr!=""){
paste0(dat_descr,"\n")
}
#if(make_plot==T & use_grid==F & max(missn)!=0){
if(make_plot & !use_grid){
library(gplots)
heatmap.2(make.numeric(data_mat),breaks=seq(0,1,length=(3)),col=c("#9ebcda","#e6550d"),tracecol=F,dendrogram='both',Rowv=T,Colv=T,margins=c(12,12),density.info="none",keysize=1,cexCol=0.7,cexRow=0.7,symkey=F)
mtext(dat_descr,adj=1,side=3,line=2)
mtext(paste0("n.complete : ",n_overlap),adj=1,side=3)
}
#if(make_plot==T & use_grid==T & max(missn)!=0){
if(make_plot & use_grid){
library(gplots)
heatmap.2(make.numeric(data_mat),breaks=seq(0,1,length=(3)),col=c("#9ebcda","#e6550d"),tracecol=F,dendrogram='both',Rowv=T,Colv=T,margins=c(12,12),density.info="none",keysize=1,cexCol=0.7,cexRow=0.7,symkey=F,colsep=0:ncol(data_mat),rowsep=0:nrow(data_mat),sepcolor="white",sepwidth=c(0.05,0.05))
mtext(dat_descr,adj=1,side=3,line=2)
mtext(paste0("n.complete : ",n_overlap),adj=1,side=3)
}
return(missn)
# print(missn)
# return(invisible(data_mat[complete.cases(data.mat),]))
}
}
Legend<-function(legend,x='topright',...){
cat('\n\t',par()$fig,par()$oma,par()$mar,'\n')
par_mar_cur=par()$mar
par_fig_cur=par()$fig
par_oma_cur=par()$oma
par(fig=c(0,1,0,1), oma=c(0,4,0,0), mar=c(0,4,0,0), new=TRUE)
plot.new()
legend(legend=legend,x=x,box.lwd=0,box.col="white",...)
par(mar=par_mar_cur)#,no.readonly=T)
par(fig=par_fig_cur)#,no.readonly=T)
par(oma=par_oma_cur)#,no.readonly=T)
cat('\n\t',par()$fig,par()$oma,par()$mar,'\n')
}
Plot<-function(xdat,ydat,line45deg=F,...){
ylim.dat=c(floor(min(ydat)),ceiling(max(ydat)))
xlim.dat=c(floor(min(xdat)),ceiling(max(xdat)))
plot(xdat,ydat,pch=16,col=rgb(0, 0, 0,alpha=0.2),xlim=xlim.dat,ylim=ylim.dat,frame.plot=F,...)
# plot(xdat,ydat,pch=18,col=rgb(0, 0, 0,alpha=0.3),xlim=xlim.dat,ylim=ylim.dat,frame.plot=F,...)
if(line45deg){abline(coef=c(0,1),lty=2,col='grey60')}
}
#
cplot<-function(xdat,ydat,line45deg=T,legend_space=10,main='',legend_pos='topright',plot.pch=16,lm_lwd=1,...){
# cat(min(x),max(x),min(y),max(y),"\n")
# dummy=cbind(seq(floor(min(x)),(ceiling(max(x))+log10(ceiling(max(x)))),length.out=10),seq(floor(min(y)),(ceiling(max(y))+log10(ceiling(max(y)))),length.out=10))
# print(dummy)
#
par(mar=c(5,4,legend_space, 3))
plot(x=xdat,y=ydat
,pch=plot.pch
,frame.plot=F
,col=rgb(0, 0, 0,alpha=0.3)
# ,line45deg=line45deg
# ,dat_descr=dat_descr
,...)
# points(x,y,pch=16)
# abline(lm(xdat~ydat),col="dodgerblue")
lmdat=lm(ydat~xdat)
abline(lmdat,col="dodgerblue")
if(line45deg){abline(coef=c(0,1),lty=2,col='grey60',lwd=lm_lwd)}
Legend(legend=paste("spearman P =",signif(cor.test(ydat,xdat,method="spearman")$p.val,digits=2)
," R-sq =",round(cor(ydat,xdat,method="spearman"),digits=3)
,"\nkendall P =",signif(cor.test(ydat,xdat,method="kendall")$p.val,digits=2)
," R-sq =",round(cor(ydat,xdat,method="kendall"),digits=3)
,"\nlm P =",signif(lmp(lmdat),digits=2)
," R-sq =",signif(summary(lmdat)$r.sq,digits=3)))
}
eplot<-function(x,xlab="",ylab="",main="",legend.pos="topright"){
# cat(min(x),max(x),"\n")
dummy=matrix(
seq(floor(min(x)),(ceiling(max(x))+log10(ceiling(max(x)))+1),length.out=length(as.numeric(as.matrix(x))))
,nrow=nrow(x),ncol=ncol(x))
matplot(dummy,type="n",frame.plot=FALSE,xlab=xlab,ylab=ylab,main=main)
}
clust.analyse<-function(cov_mat,do_plots=c(T,T,T,T),sanity=F,box_sig=0.95,clust.method="ward.D2",cor.method="kendall",dat_descr=""){
cat("\tINPUTS\t: cov_mat rows - samples , columns - covariates (numeric or factor)\n")
cat("\t\t\t - clustering method :",clust.method," | correlation :",cor.method," | n.samples :",nrow(cov_mat),"\n\n")
# ward clustering : http://stats.stackexchange.com/questions/109949/what-algorithm-does-ward-d-in-hclust-implement-if-it-is-not-wards-criteria
if(sanity){
cov_mat=sd.check(cov_mat)
}
cor_mat=cor(make.numeric(cov_mat),method=cor.method)
###>> # attempt at introducing p-values of correlation (matrix solution rather than trying to loop thru manually)
###>>
# library("psych")
###>>
# cor_test=corr.test(cov_mat,method="kendall")
if(do_plots[1]){
plot(hclust(as.dist(1-cor_mat),method=clust.method),main=paste(dat_descr,"\n",clust.method,"clutering on",cor.method,"correlations, n=",nrow(cov_mat)))#\nline - 99th percentile dist"))
# abline(h=quantile(clust1$height,0.99),col="red")
plot(hclust(dist(t(cov_mat)),method=clust.method),main=paste(dat_descr,"\n",clust.method,"clutering on distance matrix (not correl), n=",nrow(cov_mat)))#\nline - 99th percentile dist"))
# abline(h=quantile(clust1$height,0.99),col="red")
}
if(do_plots[2]){
library(pvclust)
# Ward Hierarchical Clustering with Bootstrapped p values
cat("\n\tpvclust provides two types of p-values:\n\t\tBP (Bootstrap Probability)\t- computed by normal bootstrap resampling\n\t\tAU (Approximately Unbiased)\t- computed by multiscale bootstrap resampling (better approximation to unbiased p-value than BP)\n\n")
fit=pvclust((cor_mat), method.hclust=clust.method,nboot=10000)
plot(fit) # dendogram with p values
# add rectangles around groups highly supported by the data
# pvrect(fit, alpha=box_sig)
}
if(do_plots[3]){
library(corrplot)
# corrplot(colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols))
# col2=colorRampPalette(c("#0072B2","#56B4E9","white","#F0E442","darkred"))(20)
###>> corrplot.mixed((cor_test$r)^2,p.mat=cor_test$p,insig="blank",col=colmixrb,cl.align="l",tl.col="black",order="hclust",main=paste(cor.method,"samples=",nrow(cov_mat),"ordered using hclust"))
corrplot.mixed(cor_mat,col=colmixrb,cl.align="l",tl.col="black",order="hclust",main=paste(dat_descr,cor.method,"samples=",nrow(cov_mat),"ordered using hclust"))
# p.mat=res1[[1]], insig="blank"
# title="Maximum % of overlap with p hyper > 0.01"
# library(PerformanceAnalytics)
# chart.Correlation(cor_mat, pch=20,main=paste(cor.method,"samples=",nrow(cov_mat)))#bg=c("blue","red","yellow"),
# library(psych)
# pairs.panels(cor_mat) # covarate style plot
}
if(do_plots[4]){
library(gplots)
# cor_mat=cor(make.numeric(cov_mat),method=cor.method)
#
# Heat(cor_mat,rowclust=T,colclust=T,dendrogram="both",main=paste(cor.method,"samples=",nrow(cov_mat)))
Heat(cor_mat,main=paste(dat_descr,"\n",cor.method,"samples=",nrow(cov_mat)))
# Heatmap(cor_mat,rowclust=T,colclust=T,dendrogram="both",main=paste(cor.method,"samples=",nrow(cov_mat)))
}
# if(do_plots[4]==T){
#library(psych)
# biplot(pcs)
# pairs.panels(expr_ma) # covarate style plot
# }
}
#pclust<-function(cov_mat){
# cat("\tINPUTS\t: cov_mat rows - samples , columns - covariates (numeric or factor)\n")
# library(pvclust)
# Ward Hierarchical Clustering with Bootstrapped p values
# fit=pvclust((cov_mat), method.hclust="ward.D2", method.dist="euclidean")
# plot(fit) # dendogram with p values
# add rectangles around groups highly supported by the data
# pvrect(fit, alpha=.95)
#}
clustplot<-function(data,clust.method="ward.D2",cor.method="kendall",verbose=T){
if(verbose==T){
cat("\tHierarchical clustering on 1) correlation and 2) distance\n")
cat("\t\tclust.method :",clust.method," | cor.method :",cor.method," | n :",nrow(data),"\n\n")
}
# library(WGCNA)
par(mfrow=c(2,1))
plot(hclust(as.dist(1-cor(data,method=cor.method)),method=clust.method),main=paste(clust.method,"clutering on",cor.method,"correlations, n=",nrow(data)))#\nline - 99th percentile dist"))
# abline(h=quantile(clust1$height,0.99),col="red")
plot(hclust(dist(t(data)),method=clust.method),main=paste(clust.method,"clutering on distance matrix, n=",nrow(data)))#\nline - 99th percentile dist"))
# abline(h=quantile(clust1$height,0.99),col="red")
}
#rmerge<-function(data_mat1,data_mat2,all=T,...){
# dat_out=merge(data_mat1,data_mat2,by="row.names",...)
# rownames(dat_out)=dat_out$Row.names
# dat_out=dat_out[,-which(colnames(dat_out)=="Row.names")]
# return(dat_out)
#}
rmerge<-function(data_mat1,data_mat2,all=T,verbose=T){
dat_out=merge(data_mat1,data_mat2,by="row.names",all=all)
rownames(dat_out)=dat_out$Row.names
dat_out=dat_out[,-which(colnames(dat_out)=="Row.names")]
if(verbose){(cat('\n\t\tnrow x=',nrow(data_mat1),', y=',nrow(data_mat2),', merged=',nrow(dat_out),'\n\n',sep=''))}
return(dat_out)
}
rmerge.list<-function(dat_lis,all=F){
lis_merge=rmerge(dat_lis[[1]],dat_lis[[2]])
cat('\tnrows inputs:',nrow(dat_lis[[1]]),nrow(dat_lis[[1]])==nrow(dat_lis[[2]])) ## keeping track of nrows in each list
if(length(dat_lis)>2){
for(ilis in 3:length(dat_lis)){
cat(' ',nrow(dat_lis[[1]])==nrow(dat_lis[[ilis]])) ## keeping track of nrows in each list
lis_merge=rmerge(lis_merge,dat_lis[[ilis]])
}
}
cat('\tnrows output:',nrow(dat_lis[[1]])==nrow(lis_merge),'\n') ## keeping track of nrows final output
return(lis_merge)
}
lm.mat<-function(data_mat,verbose=F,symm_fix=T){
## USE: ~ lm for matrix of variables (pairwise) ~ cor.test()
## WARNING: y variable can-not be categorical => if 2 variables are both categorical, rest fixed by symmetry
## INPUT: matrix/data.frame of variables rows=samples, columns=variables (numeric or factor)
## missing values can be used, only relevant variable n will be affected
## for factor variables (in data_mat), min P of lm() is used\n\n")#
data_mat=as.data.frame(data_mat)
lmpstat=matrix(NA,ncol=ncol(data_mat),nrow=ncol(data_mat))
colnames(lmpstat)=colnames(data_mat)
rownames(lmpstat)=colnames(data_mat)
rsqstat=lmpstat
nsample=lmpstat
k=1
for(yvar in colnames(data_mat)){
for(xvar in colnames(data_mat)){
if(yvar!=xvar){
tester=data_mat[,c(yvar,xvar)]
tester=tester[complete.cases(tester),]
nsample[xvar,yvar]=nrow(tester)
if(!is.factor(tester[,yvar])){ ## y variable can-not be a factor
holder=summary(lm(as.matrix(tester[,yvar,drop=F])~.,data=tester[,xvar,drop=F]))
dummy=holder$coefficients
dummy=dummy[-1,,drop=F]
#### using min for factors - 1 p-value per factor ==> only interested in the lowest one
dummy=dummy[which(dummy[,"Pr(>|t|)"]==min(dummy[,"Pr(>|t|)"])),] ## not doing this risk selecting the wrong sign for Rsq below if factor
lmpstat[xvar,yvar]=dummy['Pr(>|t|)']
rsqstat[xvar,yvar]=holder$r.sq*sign(dummy['Estimate']) ## add the direction of relationship
}
}
}
k=lcount(k,length(colnames(data_mat)))
}
if(symm_fix){
## y variable can-not be a factor -> fix missing by symmetry,by construction, only columns are affected (and not rows)
decider=apply(rsqstat,2,function(x){(sum(is.na(x))==length(x))})
# rows and columns - same order by construction..
rsqstat[,decider]=t(rsqstat[decider,])
lmpstat[,decider]=t(lmpstat[decider,])
diag(rsqstat)=1
diag(lmpstat)=1
}
return(invisible(list(lmp=lmpstat,rsq=rsqstat,nsamp=nsample)))
}
univarlm<-function(data_mat,colname_y_var,verbose=F){
if(verbose){
cat("\tUSE: univariate lm for",ncol(data_mat),"variables, y =",colname_y_var,"\n")
cat("\t\tNOTE: missing values can be used, only relevant variable n will be affected\n")
cat("\t\tNOTE: for factor variables, min P is used\n\n")
}
vdr=make.numeric(data_mat[,which(colnames(data_mat)==colname_y_var),drop=F])
data_mat=(data_mat[,-which(colnames(data_mat)==colname_y_var),drop=F])
unistat=as.data.frame(matrix(NA,nrow=ncol(data_mat),ncol=4))
colnames(unistat)=c("lmP","lmRsq","n","sig")
rownames(unistat)=colnames(data_mat)
# Head(unistat)
for(icov in 1:ncol(data_mat)){
tester=rmerge(vdr,data_mat[,colnames(data_mat)[icov],drop=F])
# tester=merge(vdr,data_mat[,colnames(data_mat)[icov],drop=F],by="row.names")
# tester=tester[,-which(colnames(tester)=="Row.names")]
tester=tester[complete.cases(tester),]
unistat[icov,"n"]=nrow(tester)
#### using min for factors - 1 p-value per factor ==> only interested in the lowest one
unistat[icov,"lmP"]=min(summary(lm(tester[,1]~tester[,2]))$coefficients[,"Pr(>|t|)"][-1]) # -1 removes the intercept
# unistat[icov,"lmRsq"]=summary(lm(tester[,1]~tester[,2]))$r.sq*sign(min(summary(lm(tester[,1]~tester[,2]))$coefficients[,"Estimate"][-1]))
unistat[icov,"lmRsq"]=summary(lm(tester[,1]~tester[,2]))$r.sq
}
unistat[unistat$lmP>0.1,"sig"]=""
unistat[unistat$lmP<0.1,"sig"]="."
unistat[unistat$lmP<0.05,"sig"]="+"
unistat[unistat$lmP<0.01,"sig"]="*"
return(unistat[order(unistat$lmP),])
}
multivarlm<-function(data_mat,colname_y_var,verbose=T){
if(verbose==T){
cat("\tUSE: multivariate lm : y =",colname_y_var,"~",ncol(data_mat),"variables, \n\n")
}
lm_out=(lm(data_mat[,colname_y_var,drop=F]~.,data=as.data.frame(data_mat[,-which(colnames(data_mat)==colname_y_var)])))
if(verbose==T){
lm_stat=as.data.frame(sort(summary(lm_out)$coefficients[,"Pr(>|t|)"][-1]),decreasing=T)
colnames(lm_stat)="P"
lm_stat$FDR=p.adjust(lm_stat$P,method="fdr")
lm_stat$sig=""
lm_stat[lm_stat$P<0.1,"sig"]="."
lm_stat[lm_stat$P<0.05,"sig"]="+"
lm_stat[lm_stat$P<0.01,"sig"]="*"
print(lm_stat)
cat("\n\tR-squared :\t\t",summary(lm_out)$r.sq,"\n\tadjusted R-squared :\t",summary(lm_out)$adj.r.sq,"\n\n")
}
return((lm(data_mat[,colname_y_var,drop=F]~.,data=as.data.frame(data_mat[,-which(colnames(data_mat)==colname_y_var)]))))
}
meplot<-function(expr_mat,do_plots=T,net_col=rgb(1, 0, 0, 0.5),me_col='midnightblue'){
# expr_mat=bexpr$M
pca=prcomp(t(expr_mat),scale=T,center=T)
expr_mat=rmerge(pca$x[,1]*sign(pca$rotation[which.max(abs(pca$rotation[,1])),1]),t(expr_mat),all=F)
Head(expr_mat)
expr_mat=(as.data.frame(scale(expr_mat,center=T)))
eplot(expr_mat,xlab="samples",ylab="gene experssion",main=paste("PC1 explains",round(summary(pca)$importance["Proportion of Variance",1],digits=2)*100,"% variation"))
for(igen in 2:ncol(expr_mat)){
lines((expr_mat[,igen]),col=net_col)#rgb(1, 0, 0, 0.5)
}
lines(expr_mat$x,col=me_col,lwd=3)
legend("topright", legend=c("gene expression", "ME 'expression'"), bty="n",lwd=c(2,4), cex=1, col=c(net_col,me_col), lty=c(1, 1), pch=c(NA, NA))
}
net.me<-function(expr_list,nme=1){
mes=list()
for(imod in 1:length(expr_list)){
cat('\t',names(expr_list)[imod])
pcs=prcomp((expr_list[[names(expr_list)[imod]]]),scale=T,center=T)
for(imes in 1:nme){
cat(' ',paste0('ME',imes))
# mes[[paste0('ME',imes)]][[names(expr_list)[imod]]]=pcs$x[,imes]*sign(pcs$rotation[which.max(abs(pcs$rotation[,imes])),imes])
mes[[names(expr_list)[imod]]][[paste0('ME',imes)]]=pcs$x[,imes]*sign(pcs$rotation[which.max(abs(pcs$rotation[,imes])),imes])
}
mes[[names(expr_list)[imod]]]=as.data.frame(mes[[names(expr_list)[imod]]])
cat('\n')
}
return(invisible(mes))
}
list.as.df<-function(in_list){
# cat("\tUSE: convert 'uneven' list to data frame (missing introduced at the end)\n")
#'adapted' from : http://stackoverflow.com/questions/27153979/converting-nested-list-unequal-length-to-data-frame # N.B. 'adapted' is defined as 'shamelessly ripped off', which includes direct copy/paste of explanation below
# get the length of list element ('indx') by looping with sapply
# recent version of R - can use lengths to replace the sapply(.., length) step
# change the length of each element to the max length from the 'indx' (length<-) and thereby pad NA values at the end of the list elements with length less than the max length
# rbind the list elements, convert to data.frame and change the column names.
indx=sapply(in_list, length)
res=as.data.frame(do.call(rbind,lapply(in_list, `length<-`,max(indx))))
return(res)
}
dist.as.vector<-function(dist_mat){
dist_mat[upper.tri(dist_mat,diag=T)]=NA
dist_mat=unlist(dist_mat)
dist_mat=dist_mat[!is.na(dist_mat)]
return(dist_mat)
}
duplicates<-function(dat_vec,...){
return(dat_vec[duplicated(dat_vec)])
}
rm.duplicates<-function(matrix,colName,verbose=T){
clean=matrix[!duplicated(matrix[,which(colnames(matrix)==colName)]),]
if(verbose){cat(" ",sum(duplicated(matrix[,which(colnames(matrix)==colName)]))," duplicates removed || ",round(nrow(clean)/nrow(matrix),digits=3)*100,"% of data remaining || ",sum(duplicated(clean[,which(colnames(clean)==colName)])),"dupilicates remaining \n")}
return(invisible(clean))
}
get.duplicates<-function(dat_mat,col_dup,...){
ndup=sort(table(dat_mat[,col_dup]),decreasing=T)
# ndup=sum(duplicated(dat_mat[,col_dup]))
# Head(ndup)
ndup=ndup[ndup>1]
if(length(ndup)<1){
cat("\tno duplicates found in column :",col_dup,"\n")
return(invisible(list(n.duplicated="",duplicates="",unique=dat_mat)))
}
if(length(ndup)>=1){
# Head(ndup)
cat("\t",length(ndup),"duplicates found, top duplicates:\n")
print(as.matrix(ndup[1:min(15,length(ndup))]))
dat_dup=dat_mat[(dat_mat[,col_dup]%in%names(ndup)),]
dat_dup=unique(dat_dup[order(dat_dup[,col_dup]),])
dat_unq=unique(dat_mat[!(dat_mat[,col_dup]%in%names(ndup)),])
return(invisible(list(n.duplicated=ndup,duplicates=dat_dup,unique=dat_unq)))
}
}
Boxplot<-function(dat_mat,pch=16,cex=0.5,las=2,frame=F,varwidth=T,...){
boxplot(dat_mat,pch=pch,cex=cex,las=las,frame=frame,varwidth=varwidth,...)
}
sva.fac<-function(expr_mat,non_adjust="",adjust="",nsv=""){
## installing sva
#http://www.bioconductor.org/packages/release/bioc/html/sva.html
#source("https://bioconductor.org/biocLite.R") ## try http:// if https:// URLs are not supported
#biocLite("sva")
# data for sva tutorial ## https://www.bioconductor.org/packages/3.3/bioc/vignettes/sva/inst/doc/sva.pdf
#source("https://bioconductor.org/biocLite.R")
#biocLite("bladderbatch")
library(sva)
#library(bladderbatch)
#data(bladderdata)
#source("https://bioconductor.org/biocLite.R")
#biocLite("limma")
library(limma)
# install.packages('pamr')
library(pamr)
dummy=as.data.frame(rep(1,ncol(expr_mat)))
if(non_adjust!=""){
cat('\tusing specified covars to keep :',ncol(adjust),'variables\n')
mod=model.matrix(~.,data=non_adjust)
}
if(non_adjust==""){
cat('\tno variables specified to keep\n')
mod=model.matrix(~1,data=dummy)
}
if(adjust!=""){
cat('\tusing specified covars to adjust for :',ncol(adjust),'variables\n')
mod0=model.matrix(~.,data=adjust)
}
if(adjust==""){
cat('\tno variables specified to adjust for\n')
mod0=model.matrix(~1,data=dummy)
}
if(nsv!=""){
nsv
cat('\tusing pre-specified n.factors :',nsv,'\n')
}
if(nsv==""){
nsv=num.sv(expr_mat,mod,method="leek")
cat('\tn factors suggested by sva :',nsv,'\n')
}
svobj=sva(expr_mat,mod,mod0,n.sv=nsv)
}
# lm.mat<-function(data_mat,verbose=F){
# ## USE: ~ lm for matrix of variables (pairwise) ~ cor.test()
# ## WARNING: y variable can-not be categorical => NA in results (diag() is also left as NA)
# ## INPUT: matrix/data.frame of variables rows=samples, columns=variables (numeric or factor)
# ## missing values can be used, only relevant variable n will be affected
# ## for factor variables (in data_mat), min P of lm() is used\n\n")#
# lmpstat=matrix(NA,ncol=ncol(data_mat),nrow=ncol(data_mat))
# colnames(lmpstat)=colnames(data_mat)
# rownames(lmpstat)=colnames(data_mat)
# rsqstat=lmpstat
# nsample=lmpstat
# k=1
# for(yvar in colnames(data_mat)){
# cat(yvar,'\t')
# for(xvar in colnames(data_mat)){
# if(yvar!=xvar){
# cat(' ',xvar)
# tester=data_mat[,c(yvar,xvar)]
# tester=tester[complete.cases(tester),]
# nsample[xvar,yvar]=nrow(tester)
# if(!is.factor(tester[,yvar])){ ## y variable can-not be a factor
# holder=summary(lm(as.matrix(tester[,yvar,drop=F])~.,data=tester[,xvar,drop=F]))
# #### using min for factors - 1 p-value per factor ==> only interested in the lowest one
# lmpstat[xvar,yvar]=min(holder$coefficients[,"Pr(>|t|)"][-1]) # -1 removes the intercept
# #unistat[icov,"lmRsq"]=summary(lm(tester[,1]~tester[,2]))$r.sq*sign(min(summary(lm(tester[,1]~tester[,2]))$coefficients[,"Estimate"][-1]))
# rsqstat[xvar,yvar]=holder$r.sq
# }
# }
# }
# cat('\n')
# # k=lcount(k,length(colnames(data_mat)))
# }
# return(invisible(list(lmp=lmpstat,rsq=rsqstat,nsamp=nsample)))
# }
## wrapper code to run multlm for multiple MEs 1 at a time.., same applicable for genes
#mlm=list()
#for(imod in 1:nrow(mes)){
# mlm[[rownames(mes)[imod]]]=multlm(c1[,'Verbal.delayed.recall',drop=F],cbind(c1[,!(colnames(c1)=='Verbal.delayed.recall')],(mes[imod,])),T)
#}
multlm<-function(y_mat,x_mat,leave1out=F,verbose=F,help=F){
if(help){
cat('\tUSE\t: perform multivariate linear model analysis : lm(y_mat[,i]~.,data=x_mat)\n')
cat('\tNOTE\t: NA acceptable in x_mat and y_mat - for each model complete.cases is used\n')
cat('\tINPUTS\t: y_mat - y variables, uses 1 column at a time (eg phenotype), rows=samples, cols=variables\n')
cat('\tINPUTS\t: x_mat - x variables, all are used simultaneously (eg clinical covariates / genes etc), rows=samples, cols=variables\n')
}
sample_match=round(sum(rownames(y_mat)==rownames(x_mat))/nrow(x_mat),digits=3)
if(verbose){
cat('\t\t% sample names matching : ',sample_match,'\n')
}
if(sample_match<1){warning(paste0('\t\t% sample names matching : ',sample_match,'\n'))}
lmstat=list()
for(ivar in 1:ncol(y_mat)){
if(verbose){
cat('\t\t',colnames(y_mat)[ivar],'\t----------------------------\n')
}
covars=(cbind(y_mat[,ivar,drop=F],x_mat)) # cbind appears to remove factors => go bk to original input
covars=covars[complete.cases(covars),]
lmod=lm(as.matrix(y_mat[rownames(covars),ivar,drop=F])~.,data=x_mat[rownames(covars),])
# lmod=lm(covars[,1]~.,data=covars[,-1])
# save linear model stuffs --------------------------------------------------------------------------------
lmstat[['n']][[colnames(y_mat)[ivar]]]=nrow(covars)
lmstat[['mlm.var_P']][[colnames(y_mat)[ivar]]]=(summary(lmod)$coefficients[,"Pr(>|t|)"][-1])
lmstat[['mlm.var_Est']][[colnames(y_mat)[ivar]]]=summary(lmod)$coefficients[,"Estimate"][-1]
lmstat[['mlm.Rsq']][[colnames(y_mat)[ivar]]]=summary(lmod)$r.sq
lmstat[['mlm.adjRsq']][[colnames(y_mat)[ivar]]]=summary(lmod)$adj.r.sq
lmstat[['mlm.Pval']][[colnames(y_mat)[ivar]]]=lmp(lmod)
# lmstat[['mlm.FDR']]=p.adjust(lmstat[['mlm.Pval']],method='fdr')
if(leave1out){
# rough estimate of the magnitude of effect of covariate on whole model using leave1out principle ---------
if(verbose){
cat('\t\tperform leave-one-out for each column in x_mat\n')
}
leave1=list()
for(icov in 1:ncol(x_mat)){
lmod=lm(as.matrix(y_mat[rownames(covars),ivar,drop=F])~.,data=x_mat[rownames(covars),-icov])
leave1[['l1o.Rsq']][[colnames(x_mat)[icov]]]=summary(lmod)$r.sq
leave1[['l1o.adjRsq']][[colnames(x_mat)[icov]]]=summary(lmod)$adj.r.sq
}
leave1[['l1o.adjRsq']][leave1[['l1o.adjRsq']]<0]=0 # negative adj.R.sq ~ 0
lmstat[['l1o.Rsq']][[colnames(y_mat)[ivar]]]=lmstat[['mlm.Rsq']]-leave1[['l1o.Rsq']] # calculate the difference in Rsquared between full model and leave-1-out
lmstat[['l1o.adjRsq']][[colnames(y_mat)[ivar]]]=lmstat[['mlm.adjRsq']]-leave1[['l1o.adjRsq']] # calculate the difference in Rsquared between full model and leave-1-out
}
}
if(verbose){
cat('\n\toutput contains :
\t\t1. n - number of samples in model
\t\t2. mlm.var_P - significance of contribution of single variable to model
\t\t3. mlm.var_Est - estimate of single variable (coeff)
\t\t4. mlm.Rsq - overall model fit R-squared
\t\t5. mlm.adjRsq - overall model fit adjusted R-squared : (1 - (1 - R_squared) * ((n - df.int)/error.in.df)) corresponds Wherry Formula-1
\t\t\tmore info : http://stats.stackexchange.com/questions/48703/what-is-the-adjusted-r-squared-formula-in-lm-in-r-and-how-should-it-be-interpret
\t\t6. mlm.Pval - overall model significance P-value
\t\t7. l1o.Rsq - if leave1out=T, rough estimate of the magnitude of effect of covariate on whole model using leave1out principle on R-squared
\t\t8. l1o.adjRsq - if leave1out=T, rough estimate of the magnitude of effect of covariate on whole model using leave1out principle on adjusted R-squared
')
}
return(lmstat)
}
univlm<-function(ymat,xmat){
# univariate lm for each column in ymat against each column in xmat, i.e. ymat[,1]~xmat[,1]
lisp=list()
lisr=list()
lisd=list()
lmstat=list()
lmpval=list()
for(vary in 1:ncol(ymat)){
for(varx in 1:ncol(xmat)){
lmod=lm((ymat[,vary])~(as.matrix(xmat[,varx])))
lisp[[colnames(xmat)[varx]]]=lmp(lmod)
lisr[[colnames(xmat)[varx]]]=summary(lmod)$adj.r.squared
## extract the direction of corrrelation for the variable with lowest p-val (only an issue for factors)
coeff=summary(lmod)$coefficients[-1,,drop=F] ## disregard the intercept line
lisd[[colnames(xmat)[varx]]]=sign(coeff[coeff[,'Pr(>|t|)']==min(coeff[,'Pr(>|t|)']),'Estimate'])
}
lmstat[[colnames(ymat)[vary]]]=as.data.frame(list(p=unlist(lisp),fdr=p.adjust(unlist(lisp),method='fdr'),r.sq=round(unlist(lisr),digits=3),dir=unlist(lisd)))
lmstat[[colnames(ymat)[vary]]]=psig(lmstat[[colnames(ymat)[vary]]],'p')
lmpval[[colnames(ymat)[vary]]]=unlist(lisp) ## matrix of just p-values for plotting etc
}
return(list(lmstat=lmstat,lmpval=as.data.frame(lmpval)))
}
## lm for all genes explained by cov_mat
#multlm<-function(dat_mat,cov_mat,leave1out=F,verbose=T){
# if(verbose){
# cat('\tINPUTS\t: dat_mat - y variables, used 1 at a time, rows=variables, cols=samples')
# cat('\tINPUTS\t: cov_mat - x variables, all are used simultaneously, rows=samples, cols=variables')
# sample_match=round(sum(colnames(dat_mat)==rownames(cov_mat))/ncol(dat_mat),digits=3)
# cat('\t\t% sample names matching : ',sample_match,'\n')
# }
# if(sample_match<1){warning(paste0('\t\t% sample names matching : ',sample_match,'\n'))}
# lmstat=list()
# for(ivar in 1:nrow(dat_mat)){
# covars=(cbind(t(dat_mat[ivar,,drop=F]),c1))
# covars=covars[complete.cases(covars),]
# lmod=lm(t(dat_mat[ivar,rownames(covars),drop=F])~.,data=as.data.frame(make.numeric(cov_mat[rownames(covars),])))
# lmod=lm(covars[,1]~.,data=covars[,-1])
# parse to save linear model stuffs
# lmstat[['n']][[rownames(dat_mat)[ivar]]]=nrow(covars)
# lmstat[['mlm.var_P']][[rownames(dat_mat)[ivar]]]=(summary(lmod)$coefficients[,"Pr(>|t|)"][-1])
# lmstat[['mlm.var_Est']][[rownames(dat_mat)[ivar]]]=summary(lmod)$coefficients[,"Estimate"][-1]
# lmstat[['mlm.Rsq']][[rownames(dat_mat)[ivar]]]=summary(lmod)$r.sq
# lmstat[['mlm.adjRsq']][[rownames(dat_mat)[ivar]]]=summary(lmod)$adj.r.sq
# lmstat[['mlm.Pval']][[rownames(dat_mat)[ivar]]]=lmp(lmod)
# if(leave1out){
# for(icov in 1:ncol(cov_mat){
# }
# }
# lmstat[['mlm.FDR']]=p.adjust(lmstat[['mlm.Pval']],method='fdr')
# }
#}
## alternative way to run the two below for ppi only within the query
ppi<-function(query,dtb_loc='~/Dropbox/PROJ/ppi/dtb/processed/allDTB_collate_macro.GeneMania.Hippie.iRefWeb.Rdata'){
### NOTE : loaded dtb name is 'macro', first interactor column name='a', second interactor column name='b'
cat('\n\tloading database file\n')
Load(dtb_loc)
neta=macro[(macro$a%in%query & macro$b%in%query),]
# Head(neta)
cat('\t number of unique genes interacting :',length(unique(c(neta$a,neta$b))),pc(length(unique(c(neta$a,neta$b))),length(query)),'\n')
cat('\tperforming filtering step, takes a long time based on number of interactors\n')
# neta=ppi.dbfilt(neta)
return(invisible(neta))
}
ppi.interact<-function(query,dtb_loc='~/Dropbox/PROJ/ppi/dtb/processed/allDTB_collate_macro.GeneMania.Hippie.iRefWeb.Rdata'){
### NOTE : loaded dtb name is 'macro', first interactor column name='a', second interactor column name='b'
cat('\n\tloading database file\n')
Load(dtb_loc)
neta=macro[(macro$a%in%query | macro$b%in%query),]
# Head(neta)
cat('\t number of unique genes interacting :',length(unique(c(neta$a,neta$b))),pc(length(unique(c(neta$a,neta$b))),length(query)),'\n')
cat('\tperforming filtering step, takes a long time based on number of interactors\n')
# neta=ppi.dbfilt(neta)
return(invisible(neta))
}
ppi.net<-function(dtb,query,genex,geney,degree=1){
ppi=list()
for(icon in 1:degree){
# print(icon)
if(icon==1){ppi[[paste('d',icon,sep='')]]=dtb[dtb[,which(colnames(dtb)%in%genex)]%in%query | dtb[,which(colnames(dtb)%in%geney)]%in%query,]}
if(icon>1){
query=unique(c(ppi[[paste('d',icon-1,sep='')]][,which(colnames(dtb)%in%c(genex))],ppi[[paste('d',icon-1,sep='')]][,which(colnames(dtb)%in%c(geney))]))
ppi[[paste('d',icon,sep='')]]=dtb[dtb[,which(colnames(dtb)%in%genex)]%in%query | dtb[,which(colnames(dtb)%in%geney)]%in%query,]
}
cat('\tdegree =',icon,' unique genes =',length(unique(c(ppi[[paste('d',icon,sep='')]][,which(colnames(dtb)%in%c(genex))],ppi[[paste('d',icon,sep='')]][,which(colnames(dtb)%in%c(geney))]))),'connections=',nrow(ppi[[paste('d',icon,sep='')]]),'\n')
}
return(invisible(ppi))
}
ppi.dbfilt<-function(dtb){
t1=Sys.time()
dumpty=list()
n.entries=nrow(dtb)
dtb$x=paste(dtb$a,dtb$b)
dtb$y=paste(dtb$b,dtb$a)
while(nrow(dtb)>0){
humpty=dtb[which(dtb$x==dtb$x[1] | dtb$x==dtb$y[1] | dtb$y==dtb$y[1] | dtb$y==dtb$y[1]), ]
dtb=dtb[-which(dtb$x==dtb$x[1] | dtb$x==dtb$y[1] | dtb$y==dtb$y[1] | dtb$y==dtb$y[1]), ]
dumpty[[paste(sort(unique(c(humpty$a,humpty$b))),collapse='_')]]=t(as.data.frame(list(
a=sort(unique(c(humpty$a,humpty$b)))[1]
,b=sort(unique(c(humpty$a,humpty$b)))[2]
,weight=paste(sort(unique(unlist(strsplit(as.character(humpty$weight),'\\|')))),collapse=";")
,method=paste(sort(unique(unlist(strsplit(as.character(humpty$method),'\\|')))),collapse=";")
,type=paste(sort(unique(unlist(strsplit(as.character(humpty$type),'\\|')))),collapse=";")
,dtb=paste(sort(unique(unlist(strsplit(as.character(humpty$dtb),'\\|')))),collapse=";")
,ref=paste(sort(unique(unlist(strsplit(as.character(humpty$ref),'\\|')))),collapse=";")
,pmid=paste(sort(unique(unlist(strsplit(as.character(humpty$pmid),'\\|')))),collapse=";")
,mark=paste(sort(unique(unlist(strsplit(as.character(humpty$mark),'\\|')))),collapse=";")
)))
cat(1-round(nrow(dtb)/n.entries,digits=2),'\r');flush.console()
}
print(Sys.time()-t1)
return(invisible(as.data.frame(t(as.data.frame(dumpty)))))
}
deconv<-function(exp_mat,do_plots=F,verbose=F,...){
set.seed(0) ## required for reproducibility -> in a single + limited test the gene numbers in upper and lower dont change, but not fully tested
if(verbose){
cat('\tUSE : applies deconvolution method to separate bimodal distirbution into two, returns the greater normal\n')
cat('\tNOTE: expr_mat should be the log transformed matrix of gene expression\n')
}
library(mixtools)
## example of a more general case? fitting mean and sd is for comp time efficiency so may not be ideal for other data.. https://www.r-bloggers.com/fitting-mixture-distributions-with-the-r-package-mixtools/
M=apply(exp_mat,1,mean)
S=apply(exp_mat,1,sd)
yy=normalmixEM(M,mu=quantile(M,c(0.25,0.75),lambda=c(0.5,0.5)))
Mth=qnorm(0.95,min(yy$mu),yy$sigma[which.min(yy$mu)])
upper=exp_mat[M>Mth,,drop=F]
lower=exp_mat[M<Mth,,drop=F]
if(do_plots){
hist(exp_mat,col='darkgrey',breaks=100,prob=T,...)
# hist(lower,col=rgb(1,0,0,0.5),breaks=max(30,100*(nrow(lower)/nrow(exp_mat))),add=T)
# hist(upper,col=rgb(0,0,1,0.5),breaks=max(30,100*(nrow(upper)/nrow(exp_mat))),add=T)
lines(density(M, na.rm=T), lty=2, lwd=2)
par(new=T)
lines(density(upper, na.rm=T), lty=1, lwd=2,col='dodgerblue')
par(new=T)
lines(density(lower, na.rm=T), lty=1, lwd=2,col='darkred')
# legend(x="topright",pch=15,box.lwd=0,box.col="white",col=c('darkgrey',rgb(1,0,0,0.5),rgb(0,0,1,0.5)),pt.bg=c('darkgrey',rgb(1,0,0,0.5),rgb(0,0,1,0.5)),legend=c(paste0('n=',nrow(exp_mat)),paste0('n=',nrow(lower)),paste0('n=',nrow(upper))),cex=1)
legend(x="topright",pch=15,box.lwd=0,box.col="white",col=c('darkgrey','darkred','dodgerblue'),pt.bg=c('darkgrey',rgb(1,0,0,0.5),rgb(0,0,1,0.5)),legend=c(paste0('density n=',nrow(exp_mat)),paste0('lower n=',nrow(lower)),paste0('upper n=',nrow(upper))),cex=1)
cat('\tdataset n=',nrow(exp_mat),'lower n=',nrow(lower),'upper n=',nrow(upper),'\n')
}
return(invisible(list(upper=upper,lower=lower)))
#### a mediocre toy dataset to show bimodal distribution
##x=matrix(rnorm(9000, mean = 0, sd = 0.5),ncol=3)
##y=matrix(rnorm(27000, mean = 5, sd = 4),ncol=3)
##z=rbind(x,y)
##deconv(z,T)
}
#cat('\tstuffs\n')
#overlap<-function(A,B){
# both=union(A,B)
# inA=both %in% A
# inB=both %in% B
# return(table(inA,inB))
#}
#set.seed(1)
#A <- sample(letters[1:20],10,replace=TRUE)
#B <- sample(letters[1:20],10,replace=TRUE)
#xtab_set(A,B)
rowstat<-function(data_mat,add=F,round_digits=2,diag_na=T){
cat('\n\tcalculate mean and median for each row, na.rm=T\n\n')
# round(cbind(apply(pcmf,1,mean,na.rm=T),apply(pcmf,1,median,na.rm=T)),digits=2)
data_mat_calc=data_mat
if(diag_na){diag(data_mat_calc)=NA}
if(add){
data_mat$mean=round(apply(data_mat_calc,1,mean,na.rm=T),digits=round_digits)
data_mat$median=round(apply(data_mat_calc,1,median,na.rm=T),digits=round_digits)
return(data_mat)
}
if(!add){
dat_stat=round(cbind(apply(data_mat_calc,1,mean,na.rm=T),apply(data_mat_calc,1,median,na.rm=T)),digits=round_digits)
colnames(dat_stat)=c('mean','median')
return(dat_stat)
}
}
clust<-function(dat_mat,horiz=T,scale_dat=F,clust_method='ward.D2',k=1,cor_method='dist',do_plots=T,plot_cex=0.8,dat_descr='',par_mar=c(4,1,3,20),help=F,...){
## ,... refers to plotDendroAndColors :: library(WGCNA)
if(help){
cat('\n\tINPUT :\tdat_mat - auto-detect data frame or list of data frames - 1 per condition (samples=columns)\n')
cat('\tNOTE :\tcor_method - correlation method to calculate distance see "cor" for options, option: "dist" - distance on raw data, no correlation calculated, "as.dist" treat input as distances\n\n')
cat('\tNOTE :\tk- specify number of clusters for cutree, can use range i.e. 2:5, if k="dynamic", WGCNA function "cutreeHybrid" is used to cut the tree\n\n')
}
dat_is_list=F
if(class(dat_mat)=='list'){
cat('\t- processing list information\n')
dat_is_list=T
colvec=""
for(ilis in 1:length(dat_mat)){
colvec=c(colvec,rep(colmix[ilis],ncol(dat_mat[[ilis]])))
}
colvec=colvec[-1]
dat_mat=as.data.frame(dat_mat)
}
#print(horiz)
#print(dat_is_list)
if(scale_dat){
cat('\t- scale and center data (by columns)\n')
dat_mat=scale(dat_mat,scale=T,center=T)
}
if(do_plots){
library(dendextend) ## moved it here to avoid waiting for clustering only to find that the library does not exist..
}
if(cor_method=='as.dist'){
cat('\t- as.dist(dat_mat)\n')
distmat=as.dist(dat_mat)
}
if(cor_method=='dist'){
cat('\t- calculating eucledian distance\n')
distmat=dist(t(dat_mat))
}
if(cor_method!='dist'&cor_method!='as.dist'){
cat('\t- calculating distance based on',cor_method,'correlation\n')
distmat=as.dist(1-abs(t(cor(dat_mat,method=cor_method))))
}
clustat=hclust(distmat,method=clust_method)
if(k!='dynamic'){
cat('\t- cutree, k=',k,'\n')
trestat=cutree(clustat,k=k)
main_text=paste(dat_descr,'\ndistance=',cor_method,'cluster.method=',clust_method,'k =',paste(k,collapse=', '))
}
if(k=='dynamic'){
library(WGCNA) ## required for k='dynamic' only i.e. cutreeHybrid
cat('\t- dynamic tree cut - WGCNA function, works best with clust_method="average"\n')
trestat=cutreeHybrid(clustat,as.matrix(distmat),minClusterSize=2,deepSplit=0)$labels
main_text=paste(dat_descr,'\ndistance=',cor_method,'cluster.method=',clust_method,'clusters determined by WGCNA: "cutreeHybrid"')
}
# cutree$labels
clustnm=(unique(trestat))
clust=list()
for(iclust in 1:length(clustnm)){
clust[[as.character(clustnm)[iclust]]]=names(trestat)[trestat==clustnm[iclust]]
}
if(do_plots){
cat('\t- plotting results\n')
par_cur=par()$mar
par(mar=par_mar)
if(horiz){
clusden=as.dendrogram(clustat)
if(dat_is_list){labels_colors(clusden)=colvec[order.dendrogram(clusden)]}
clusden=color_branches(clusden, k=k)
if(dat_is_list){plot(clusden, horiz=TRUE,...)}
if(dat_is_list){colored_bars(colvec, clusden, horiz=TRUE)}
## colored_bars can be used to show k-means clusters (supports multiple bars, including colvec), as per EG
# k234=cutree(dend, k=2:4)
# colored_bars(cbind(k234[,3:1], col_car_type), dend, rowLabels=c(paste0("k=", 4:2), "Car Type"))
mtext(text=main_text,side=3,cex=1.5)
}
if(!horiz){
plotDendroAndColors(
clustat
,trestat
#,cutHeight=300
,hang=0.03
#,addGuide=TRUE
,guideHang=0.05
,main=main_text
,cex.colorLabels=plot_cex
,cex.dendroLabels=plot_cex
,cex.rowText=plot_cex
,...)
}
par(mar=par_cur)
}
# cat(readme)
if(dat_is_list){
readme='\n\toutput contains :
\t1. clust - members of modules based on k
\t2. distmat - distance matrix used to perfom hierarchical clustering
\t3. clustat - ouput of hcust(distmat)
\t4. trestat - output of cutree(clustat)
\t5. colvec - vector of colors used
\n'
return(invisible(list(clust=clust,distmat=distmat,clustat=clustat,trestat=trestat,colvec=colvec,readme=readme)))
}
if(!dat_is_list){
readme='\n\toutput contains :
\t1. clust - members of modules based on k
\t2. distmat - distance matrix used to perfom hierarchical clustering
\t3. clustat - ouput of hcust(distmat)
\t4. trestat - output of cutree(clustat)
\n'
return(invisible(list(clust=clust,distmat=distmat,clustat=clustat,trestat=trestat,readme=readme)))
}
}
##_#### fully working clustering version deprecated after expansion to cover lists with different color plots
##_##clust<-function(dat_mat,clust_method='ward.D2',k=1,cor_method='dist',do_plots=F,plot_cex=0.8,dat_descr='',help=F,...){
## ,... refers to plotDendroAndColors :: library(WGCNA)
##_## if(help){
##_## cat('\n\tINPUT :\tdat_mat - matrix, rows=genes, cols=samples\n')
##_## cat('\tNOTE :\tcor_method - correlation method to calculate distance see "cor" for options, option: "dist" - distance on raw data, no correlation calculated\n\n')
##_## cat('\tNOTE :\tk- specify number of clusters for cutree, can use range i.e. 2:5, if k="dynamic", WGCNA function "cutreeHybrid" is used to cut the tree\n\n')
##_## }
##_## if(do_plots){
##_## library(WGCNA) ## moved it here to avoid waiting for clustering only to find the plots failed
##_## }
##_## if(cor_method=='dist'){
##_## cat('\t- calculating eucledian distance\n')
##_## distmat=dist(t(dat_mat))
##_## }
##_## if(cor_method!='dist'){
##_## cat('\t- calculating distance based on',cor_method,'correlation\n')
##_## distmat=as.dist(1-abs(t(cor(dat_mat,method=cor_method))))
##_## }
##_## clustat=hclust(distmat,method=clust_method)
##_## if(k!='dynamic'){
##_## trestat=cutree(clustat,k=k)
##_## main_text=paste(dat_descr,'\ndistance=',cor_method,'cluster.method=',clust_method,'k =',paste(k,collapse=', '))
##_## }
##_## if(k=='dynamic'){
##_## trestat=cutreeHybrid(clustat,as.matrix(distmat),minClusterSize=2,deepSplit=0)$labels
##_## main_text=paste(dat_descr,'\ndistance=',cor_method,'cluster.method=',clust_method,'clusters determined by WGCNA: "cutreeHybrid"')
##_## }
# cutree$labels
##_## clustnm=(unique(trestat))
##_## clust=list()
##_## for(iclust in 1:length(clustnm)){
##_## clust[[as.character(clustnm)[iclust]]]=names(trestat)[trestat==clustnm[iclust]]
##_## }
##_## if(do_plots){
##_## plotDendroAndColors(
##_## clustat
##_## ,trestat
##_## #,cutHeight=300
##_## ,hang=0.03
##_## #,addGuide=TRUE
##_## ,guideHang=0.05,
##_## ,main=main_text
##_## ,cex.colorLabels=plot_cex
##_## ,cex.dendroLabels=plot_cex
##_## ,cex.rowText=plot_cex
##_## ,...
##_## )
##_## }
##_## readme='\n\toutput contains :
##_## \t1. clust - members of modules based on k
##_## \t2. distmat - distance matrix used to perfom hierarchical clustering
##_## \t3. clustat - ouput of hcust(distmat)
##_## \t4. trestat - output of cutree(clustat)
##_## \n'
##_### cat(readme)
##_## return(invisible(list(clust=clust,distmat=distmat,clustat=clustat,trestat=trestat,readme=readme)))
##_##}
clust.list<-function(dat_lis,clust_method='ward.D2',k=1,cor_method='dist',do_plots=F,do_indiv_plots=F,plot_cex=0.8,dat_descr='',help=F){
if(help){
cat('\tINPUT :\tdat_lis - expect list() of matrixes, rows=genes, cols=samples')
cat('\tNOTE :\tcor_method - correlation method to calculate distance see "cor" for options, option: "dist" - distance on raw data, no correlation calculated\n\n')
cat('\tNOTE :\tk- specify number of clusters for cutree, can use range i.e. 2:5, if k="dynamic", WGCNA function "cutreeHybrid" is used to cut the tree\n\n')
}
distmat=list()
clustat=list()
trestat=list()
for(ilis in 1:length(dat_lis)){
cat('\t',names(dat_lis)[ilis],'\t',ilis,'of',length(dat_lis))
dummy=clust(dat_lis[[names(dat_lis)[ilis]]],clust_method=clust_method,k=k,cor_method=cor_method,do_plots=do_indiv_plots,dat_descr=paste(names(dat_lis)[ilis],dat_descr))
distmat[[names(dat_lis)[ilis]]]=dummy$distmat
clustat[[names(dat_lis)[ilis]]]=dummy$clustat
trestat[[names(dat_lis)[ilis]]]=dummy$trestat
}
options(warn=-1)
trem=as.data.frame(trestat[[1]])
for(ireg in 2:length(trestat)){
trem=rmerge(trem,as.data.frame(trestat[[ireg]]),all=T)
}
colnames(trem)=names(trestat)
trem$mean=apply(trem,1,mean,na.rm=T)
trem$median=apply(trem,1,median,na.rm=T)
trem[is.na(trem)]=0
# if(do_plots){
Heat(make.numeric(as.matrix(trem)),rowclust=F)
# }
options(warn=0)
return(invisible(list(distmat=distmat,clustat=clustat,trestat=trestat,trem=trem)))
}
clicky.run<-function(module_genes_list,module_bkrnd,clicky_dir,dat_descr='',id_type='hsapiens__gene_symbol',os_type='osx'){
## os_type='osx' ; os_type='linux'
## hsapiens__ensembl_gene_stable_id
#cat('\tUSE : creates temp directory and runs clicky enrichments on a list of module gene names - list$ModuleName$ModuleGenes')
# system(paste0("mkdir ",clicky_dir,"/tmp_in_files/"))
## clunky, but saves having to paste the clickly path everywhere..
if(class(module_genes_list)!='list'){
stop('\n\tERROR :\trequire module_genes_list as list, each element - vector or matrix of module genes')
}
current_dir=getwd()
working_dir=paste0(clicky_dir,"/tmp_in_files/")
setwd(clicky_dir)
# getwd()
write.table(module_bkrnd,file="tmp_in_files/tmp_bkgrnd.txt", quote=F, sep="\t", row.names=F, col.names=F)
for(imod in 1:length(module_genes_list)){
cat('\t',names(module_genes_list)[imod],dat_descr,'\t',length(module_genes_list[[imod]]),'\t',length(module_bkrnd),'\n')
write.table(module_genes_list[[imod]],file="tmp_in_files/tmp_genelist.txt", quote=F, sep="\t", row.names=F, col.names=F)
if(os_type=='osx'){
system(paste0("python ",clicky_dir,"/bin/runner_modified_commandline_osx.py "
,working_dir,"tmp_genelist.txt "
,working_dir,"tmp_bkgrnd.txt "
,working_dir," "
,paste0(names(module_genes_list)[imod],'_',dat_descr)
," hsapiens ",id_type," ",id_type," 0.05"))
}
if(os_type=='linux'){
system(paste0("python ",clicky_dir,"/bin/clicky_python_commandline_linux.py "
,working_dir,"tmp_genelist.txt "
,working_dir,"tmp_bkgrnd.txt "
,working_dir," "
,paste0(names(module_genes_list)[imod],'_',dat_descr)
," hsapiens ",id_type," ",id_type," 0.05"))
}
# rm(enrich_list)
# enrich_list=gestalt_read(enrich_type=c("GO","Commons","KEGG","WIKI"),
# mod_names= names(module_genes_list)[imod],
# in_path=paste0(outpath,"/"),
# out_path=paste0(outpath,"/graphics"),
# dat_descr="SpeDE_eachRegion")
# pdf(paste0(outpath,"/graphics/",names(module_genes_list)[imod],"_graph.pdf"),width=15, height=25)
# gestalt_plot(enrich_list,p_threshold=0.05)
# dev.off()
}
setwd(current_dir)
#### system(paste0("rm -rf ",clicky_dir,"/tmp_in_files/")) ## if full plotting is done in a different directory, ie wrapping PDF around this function, with all 3 clicky functions enabled as above then can re-enable
}
Heatp<-function(pval_mat,sig=T,dat_descr=''){
# height=22,width=11 ## settings used for WebGestalt pdf() plots
library(gplots)
# if(breaks=="default"){breaks=seq(0,max(-log10(pval_mat)),length=(102))}
# if(col=="default"){col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(length(breaks)-1)}
margin=c(max(13,(125-nrow(pval_mat))),12)
lheight=c(0.06,0.94)
if(sig){
dat_sig=as.data.frame(pval_mat)
dummy=pval_mat<=1e-5
dat_sig[dummy]="****"
dummy=pval_mat<=1e-4 & pval_mat>1e-5
dat_sig[dummy]="***"
dummy=pval_mat<=1e-3 & pval_mat>1e-4
dat_sig[dummy]="**"
dummy=pval_mat<=1e-2 & pval_mat>1e-3
dat_sig[dummy]="*"
dummy=pval_mat<=0.05 & pval_mat>1e-2
dat_sig[dummy]="+"
dummy=pval_mat>0.05
dat_sig[dummy]=""
heatmap.2(-log10(pval_mat),margins=margin,lhei=lheight,cellnote=(dat_sig),col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(101),breaks=seq(0,max(-log10(pval_mat)),length=(102)),notecol="black",tracecol=F,dendrogram="none",Rowv=F,Colv=F,density.info="none",keysize=1,cexCol=0.7,cexRow=0.7,symkey=F,lwid=c(0.7,0.3),lmat=rbind(c(4,3),c(1,2)))
}
if(!sig){
heatmap.2(-log10(pval_mat),margins=margin,lhei=lheight,col=colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(101),breaks=seq(0,max(-log10(pval_mat)),length=(102)),tracecol=F,dendrogram="none",Rowv=F,Colv=F,density.info="none",keysize=1,cexCol=0.7,cexRow=0.7,symkey=F,lwid=c(0.7,0.3),lmat=rbind(c(4,3),c(1,2)))
}
## text in top right corner
par(xpd = NA)
mtext(dat_descr, adj = 1, side = 3)
par(xpd = F)
}
gestaltheat<-function(dat_mat,descr,multi_page=F){
if(multi_page){
if(nrow(dat_mat>100)){
k=0
remainder=(nrow(dat_mat)-(k+100))
cat("\tmulti-page plot :\n")
while(remainder>0){
cat("\t",k+1,k+100,"\n")
dat_plot=dat_mat[(k+1):(k+100),]
k=k+100
remainder=(nrow(dat_mat)-(k+100))
Heatp(dat_plot,sig=T)
par(xpd=NA)
mtext(descr, adj=1, side=3)
par(xpd=F)
}
if(remainder<0){
cat("\t",k+1,k+(100+remainder),"\n")
dat_plot=dat_mat[(k+1):(k+(100+remainder)),]
Heatp(dat_plot,sig=T)
par(xpd=NA)
mtext(descr, adj=1, side=3)
par(xpd=F)
}
}
}
}
clicky.read<-function(enrich_type=c('KEGG','WIKI','Commons','GO'),mod_names="",p_threshold=0.1,in_path=getwd(),out_path="",dat_descr="",verbose=T){
if(verbose==T){
cat("\tUSE\t: read and process files saved from WebGestalt from specified folder, default: in_path=getwd()\n")
# cat("\tNOTE\t: if out_path is provided, the new tables will also be saved at the provided location\n")
cat("\tINPUTS\t: enrich_type currently supported : 'KEGG', 'WIKI', 'Commons', 'GO'\n")
cat("\tOUTPUT\t: list of module enrichments for use with gestalt_plot() function\n\n")
}
dtb=list()
types="na"
mod_names=unique(mod_names)
###-------------------
# get file names matching input criteria
inf=list.files(in_path)#,pattern=paste(enrich_type,"&",mod_names,collapse=""))
inf=inf[grepl(paste("(?=.*",paste(mod_names,collapse="|"),")",sep=""),inf,perl=T)]
inf=inf[grepl(paste("(?=.*",paste(enrich_type,collapse="|"),")",sep=""),inf,perl=T)]
if(length(inf)==0){
stop(paste("\tERROR\t: no files matching criteria",enrich_type,mod_names,"found at",in_path,"\n"))
}
cat("\t\t",length(inf),"files match criteria :\n")
enrich=list()
enrich_type=gsub("GO","GO:",enrich_type)
for(ifle in 1:length(inf)){
path=list()
#ifle=1
cat("\t\t ",(inf)[ifle])
mod=readLines(paste(in_path,inf[ifle],sep=""))
mod=mod[11:length(mod)]
mod=mod[mod!=""]
#
pos=grep(paste("(?=.*",paste(enrich_type,collapse="|"),")",sep=""),mod,perl=T)
if(length(pos)==0){
cat("\tno significant terms\n")
path=list(type="na",term="na")
path$adjPval=1
# if(out_path!=""){write.table(paste("No Significant results for",enrich_type,"at 0.1 FDR"),paste(out_path,"transform.",inf[imod],".txt",sep=""),sep="\t",row.names=F,quote=F)}
}
if(length(pos)>0){
path=list(type=paste(gsub("\t.*","",mod[pos])))
path$term=as.vector(t(as.data.frame(strsplit(mod[pos],"\t"))[2,]))
cat("\tterms found:",length(path$term),"\n")
path$nGenes=gsub(".*O=|;.*","",mod[pos+1])
path$rawPval=gsub(".*rawP=|;.*","",mod[pos+1])
path$adjPval=gsub(".*adjP=","",mod[pos+1])
endc=c((pos[-1]-1),length(mod))
ensg_all=list()
name_all=list()
# add in gene names using overly elaborate method
for(ipat in 1:length(pos)){
gen=(mod[(pos[ipat]+2):(endc[ipat])])
ensg=rep(NA,length(gen))
name=ensg
for(igen in 1:length(ensg)){
ensg[igen]=strsplit(gen[igen],"\t")[[1]][1]
name[igen]=strsplit(gen[igen],"\t")[[1]][3]
}
ensg_all[[ipat]]=paste(ensg,collapse=" ")
name_all[[ipat]]=paste(name,collapse=" ")
}
path$gene.id=unlist(ensg_all)
path$gene.name=unlist(name_all)
}
# for(ityp in 1:length(table(path$types))){
# dtb[[table(path$types)[ityp]]][[inf]]
# }
for(imod in 1:length(mod_names)){
if((grepl(mod_names[imod],inf[ifle]))){
modnm=mod_names[imod]
# print(modnm)
}
}
types=c(types,path$type)
path=as.data.frame(path)
path$module=modnm
enrich[[inf[ifle]]]=as.data.frame(path)
}
###-------------------
# sort the data for plotting
types=types[types!="na"]
types=names(table(types))
path=list()
for(ityp in 1:length(types)){
path[[types[ityp]]]=vector("list",length(mod_names))
names(path[[types[ityp]]])=mod_names
}
for(ifle in 1:length(enrich)){
# print(names(enrich)[ifle])
enri=enrich[[ifle]]
etyp=names(table(enri$type))
for(ityp in 1:length(etyp)){
if(etyp[ityp]%in%types){
# cat(ifle,ityp,"<<\n")
# print(etyp[ityp])
# print(unique(enrich[[ifle]]$module))
enrtyp=enri[enri$type==etyp[ityp],]
rownames(enrtyp)=enrtyp$term
enrtyp=enrtyp[,-(which(!(colnames(enrtyp)%in%"adjPval"))),drop=F]
colnames(enrtyp)=unique(enrich[[ifle]]$module)
# if(nrow(enrtyp)==0){
# print("no enrichment terms")
# enrtyp=as.data.frame(1)
# rownames(enrtyp)="na"
# colnames(enrtyp)=unique(enrich[[ifle]]$module)
# }
path[[etyp[ityp]]][[unique(enrich[[ifle]]$module)]]=enrtyp
}
}
}
names(path)=gsub("[ ]","_",names(path))
for(ityp in 1:length((path))){
for(imod in 1:length(path[[ityp]])){
if(class(path[[ityp]][[imod]])=="NULL"){
dummy=as.data.frame(NA)
rownames(dummy)="no_significant_enrichment"
colnames(dummy)=names(path[[ityp]][imod])
# cat(colnames(dummy),"erk\n")
path[[ityp]][[imod]]=dummy
}
}
}
return(path)
}
clicky.plot<-function(enrich_list,dat_descr="",p_threshold=0.1,height=22,width=11,margins=c(5, 20, 4, 2),points_sig=T){
for(ityp in 1:length(enrich_list)){
cat("\t",names(enrich_list)[ityp],"\n")
for(imod in 1:length(enrich_list[[ityp]])){
print(imod)
cat("\t\t",names(enrich_list[[ityp]])[imod],"\n")
print(nrow(enrich_list[[ityp]][[imod]]))
if(imod==1){enrich=as.data.frame(enrich_list[[ityp]][[imod]])}
if(imod>1){enrich=rmerge(enrich,make.numeric(enrich_list[[ityp]][[imod]]))}
}
enrich=make.numeric(enrich)
if("no_significant_enrichment"%in%rownames(enrich)){
enrich=(enrich[-which(rownames(enrich)=="no_significant_enrichment"),,drop=F])
}
enrich[is.na(enrich)]=1
enrich[enrich>p_threshold]=1
enrich=(make.numeric(enrich))
dim(enrich)
# remove rows where all values == 1 || enrich[-0,] seems to break, hence the logic check first
if(sum(apply(enrich,1,sum)==ncol(enrich))>0){
enrich=(enrich[!(apply(enrich,1,sum)==ncol(enrich)),,drop=F])
}
# print(str(enrich))
# cat("\t\tdim enrich :",str(enrich),"\n")
# Head(enrich)
print(head(enrich))
print(dim(enrich))
# pdf(paste("~/Dropbox/bin/clicky/graphics/dummy.",names(enrich_list)[ityp],".pdf",sep=""),width=width,height=height)
if(nrow(enrich)==0){
print("no plot")
plot.new()
legend("top",legend=paste(dat_descr,names(enrich_list)[ityp],colnames(enrich),"\n","NO SIGNIFICANT TERMS at P<",p_threshold,"\n"),cex=2)
}
if(nrow(enrich)>0){
if(ncol(enrich)==1){
print("single plot")
print(dim(enrich))
enrich=-log10(make.numeric(enrich))
enrich=enrich[order(enrich[,1]),,drop=F]
margins=c(100-(nrow(enrich)*1),(max(nchar(rownames(enrich)))/1.8),2.5,1)
par(mar=margins)
barplot(t(enrich),horiz=T,las=1,main=paste(names(enrich_list)[ityp],"\n",dat_descr,colnames(enrich)))
abline(v=-log10(0.01),col="dodgerblue")
abline(v=-log10(0.05),col="red")
margins=c(5.1,4.1,4.1,2.1) # reset default margins
}
if(ncol(enrich)>1){
print("multi plot")
library(corrplot)
enrich=enrich[do.call(order, (lapply(1:NCOL(enrich), function(i) enrich[, i]))), ]
gestaltheat(enrich,names(enrich_list)[ityp],T)
enrich=-log10(make.numeric(enrich))
}
# dev.off()
}
# return(invisible(enrich))
rm(enrich)
}
}
FUNDNMmap<-function(clusters_list,runname="",inpath="~/Dropbox/SHARED/tools/FUNDNMmap/",outpath=getwd(),selection=F){
cat('\tNOTE:\tclusters_list only ENSG ids currently supported')
library(MetaDE)
library('parallel')
Load(paste(inpath,"/PathoGeneENS.Rdata",sep="")) # PathoGene & Patho_ENSgeneID
Load(paste(inpath,"/ctr_GeneENS.Rdata",sep="")) # ctrGene & ctr_ENSgeneID
map = read.table(file=paste(inpath,'/functional_mut_rate.bias_corrected.local.canonical_tx_only.bed.txt',sep=""),
header=TRUE, sep='\t', blank.lines.skip=TRUE)
### create a matrix for results
FBET = matrix(nrow=length(clusters_list), ncol=23)
row.names(FBET) = names (clusters_list)
colnames(FBET) = c("module size","patho","FET p.value","FET FDR","OR","[95% OR CI] inf","OR [95% OR CI] sup",
"module DNMs in patients","module DNMs in controls",
"non module DNMs in patients","non module DNMs in controls",
"gene names of modules DNMs in patients",
"gene names of modules DNMs in controls",
"BET p.value","BET FDR","Theo Ps (mutation rate of map.M)","Estimated Ps","Ratio Obs/Exp",
"[95% EsPs CI] inf","[95% EsPs CI] sup",
"x=nb of DNM in map.M (success)","n=nb of DNM in map/mutation rate all genes of map (trials)",
#"ENSgeneID of M not in map",
"nb ENSgeneID of M not in map"
)
npDNM= list()
for (pat in 1:length(PathoGene)){
pathoENS=Patho_ENSgeneID[[pat]]
DNM_Gene=PathoGene[[pat]]
ctrlENS=ctr_ENSgeneID[['lgd']]
ctrl_Gene = ctrGene[['lgd']]
### function to fill the matrix of results
#for (i in 1:length(clusters_list)){
FUNC=function(i){
Ms=length(clusters_list[[i]])
#### FET
cat('\t',names(clusters_list)[i],' \tpatho: ',names(PathoGene)[pat],'\t',pat,'\tof ',length(PathoGene),'\n',sep='')
## function to calculate the number Mc of DNMs in CTRL involving a gene of the cluster i
y = lapply(clusters_list[[i]],FUN=function(x) {ctrlENS[which(ctrlENS$ensembl_gene_id == x),'external_gene_name']})
Mc = sum(sapply(as.matrix(unique(y)), FUN=function(ym) {length(which(ctrl_Gene == ym ))}))
McID = paste(unlist(y),collapse=", ")
## number NMc of remaining DNMs in CTRL involving a gene not in the cluster i
NMc = length(ctrl_Gene)-Mc
##function to calculate the number Mee of DNMs in patho involving a gene of the cluster i
z = lapply(clusters_list[[i]],FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id == x),'external_gene_name']})
Mp = sum(sapply(as.matrix(unique(z)), FUN=function(zm) {length(which(DNM_Gene == zm ))}))
MpID = paste(unlist(z),collapse=", ")
## number NMee of remaining DNMs in EE involving a gene not in the cluster i
NMp = length(DNM_Gene)-Mp
# contingency matrice for Fisher Exact Test FET all DNMs and ns DNMs
matr = matrix(c(Mp,Mc,NMp,NMc), nrow=2)
# FET
# FisherM = fisher.test(matr,alternative="greater")
FisherM = fisher.test(matr)
Fisher.p = FisherM$p.value
Fisher.or = FisherM$estimate
Fisher.cinf = FisherM$conf.int[1]
Fisher.cis = FisherM$conf.int[2]
#### BET
# theorical Ps=Theorical probablity of sucess based on mutation rate map
CL.map.ens = intersect(clusters_list[[i]],map$Gene)
#NnID=paste(setdiff(clusters_list[[i]],map$Gene), collapse=", ")
nID=length(setdiff(clusters_list[[i]],map$Gene))
# if lgd (nonsens + missense)
ThPs=sum(sapply(CL.map.ens,FUN=function(x){sum(map[which(map$Gene == x),c("Missense_rate","Nonsense_rate")])}))
# nb of trials=nb of DNM falling in all map genes divided by the mutation rate on all map genes
patho.map.ens = intersect(pathoENS$ensembl_gene_id,map$Gene)
y = lapply(patho.map.ens,FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id == x),'external_gene_name']})
n = round(sum(sapply(unique(y), FUN=function(x) {length(which(DNM_Gene == x ))}))/sum(map[,c("Missense_rate","Nonsense_rate")]))
# nb of sucess=nb of DNM falling in map of the module
z = lapply(CL.map.ens,FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id == x),'external_gene_name']})
xz = sum(sapply(z, FUN=function(x) {length(which(DNM_Gene == x ))}))
BET= binom.test(xz,n,ThPs)
Binomial.p=BET$p.value
EsPs=BET$estimate
RobsE=xz /(n*ThPs)
CI.inf=BET$conf.int [1]
CI.sup=BET$conf.int [2]
#FBET[i,]=c(Ms,names(PathoGene[pat]),Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mp,Mc,NMp,NMc,MpID,McID,Binomial.p,NA,ThPs,EsPs,RobsE,CI.inf,CI.sup,xz,n,NnID,nID)
FBET[i,]=c(Ms,names(PathoGene[pat]),Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mp,Mc,NMp,NMc,MpID,McID,Binomial.p,NA,ThPs,EsPs,RobsE,CI.inf,CI.sup,xz,n,nID)
}
fbet = mclapply(1:length(clusters_list),FUNC,mc.cores=detectCores())
#The fbet output object of the mclapply function is a list of 44 vectors FBET[i,] in the good order
for (i in 1:length(clusters_list)){
FBET[i,]=fbet[[i]]
}
FBET[,"FET FDR"]= p.adjust(FBET[,"FET p.value"],method="fdr")
FBET[,"BET FDR"]= p.adjust(FBET[,"BET p.value"],method="fdr")
write.table(FBET, sep='\t', file=paste(outpath,"/",names(PathoGene)[pat],"_FBET_",runname,".txt",sep=""), row.names=TRUE, quote=FALSE, col.names=NA)
npDNM[[pat]]=FBET
}
names(npDNM) = names(PathoGene)
if(selection==T){
select= intersect(which(as.numeric(npDNM[[1]][,"FET FDR"]) < 0.2),which(as.numeric(npDNM[[1]][,"BET FDR"]) < 0.2))
cat("\tnumber of selected modules for\t\t", names(npDNM)[1]," :",length(select),'\n')
if (length(select)==1){
SignifT= npDNM[[1]][c(select,NA),]
SignifT= SignifT[-which(is.na(rownames(SignifT)) ==T),]
}else{
SignifT= npDNM[[1]][select,]
}
for (pat in 2:length(npDNM)){
select= intersect(which(as.numeric(npDNM[[pat]][,"FET FDR"]) < 0.2),which(as.numeric(npDNM[[pat]][,"BET FDR"]) < 0.2))
cat("\tnumber of selected modules for\t\t", names(npDNM)[pat]," :",length(select),'\n')
if (length(select)==1){
SignifT= rbind(SignifT,npDNM[[pat]][c(select,NA),])
SignifT= SignifT[-which(is.na(rownames(SignifT)) ==T),]
}else{
SignifT= rbind(SignifT,npDNM[[pat]][select,])
}
}
write.table(SignifT, sep='\t', file =paste(outpath,'/significantFBET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}else{
allT= npDNM[[1]]
for (pat in 2:length(npDNM)){
allT= rbind(allT,npDNM[[pat]])
}
write.table(allT, sep='\t', file =paste(outpath,'/ALL_FBET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}
return(invisible(npDNM))
}
fet<-function(sampl,bkgrnd,success,counts=F,samp.success,bkgrnd.success,samp.fail,bkgrnd.fail,...){
# alternative ='greater'
# phyper(success_in_sample, success_in_bkgd, failure_in_bkgd, sample_size, lower.tail=TRUE)
#fisher.test(matrix(c(x, 13-x, 5-x, 34+x), 2, 2), alternative='less');
# Numerical parameters in order:
# (success-in-sample, success-in-left-part, failure-in-sample, failure-in-left-part).
if(!counts){
bkgrnd=bkgrnd[!(bkgrnd%in%sampl)]
fet=list(samp.success=sum(sampl%in%success),bkgrnd.success=sum(bkgrnd%in%success),samp.fail=sum(!(sampl%in%success)),bkgrnd.fail=sum(!(bkgrnd%in%success)))
test_mat=matrix(unlist(fet),nrow=2,dimnames=list(c('samp','bkgrnd'),c('success','fail')))
test_out=fisher.test(test_mat,...)
# print(test_mat)
fet$n.genes=length(sampl)
fet$FETp=(test_out$p.value)
fet$fetOR=round(test_out$estimate) #,digits=3
fet$lowerCI=round(test_out$conf.int[1])#,digits=3
fet$upperCI=round(test_out$conf.int[2])#,digits=3
# fet$samp.success=paste(sampl[sampl%in%success],collapse=' ')
return(invisible((fet)))
}
if(counts){
fet=list(samp.success=samp.success,bkgrnd.success=bkgrnd.success,samp.fail=samp.fail,bkgrnd.fail=bkgrnd.fail)
test_mat=matrix(unlist(fet),nrow=2,dimnames=list(c('samp','bkgrnd'),c('success','fail')))
test_out=fisher.test(test_mat,...)
# print(test_mat)
fet$n.genes=sum(samp.success,samp.fail)
fet$FETp=(test_out$p.value)
fet$fetOR=round(test_out$estimate,digits=3)
fet$lowerCI=round(test_out$conf.int[1],digits=3)
fet$upperCI=round(test_out$conf.int[2],digits=3)
# fet$samp.success=paste(sampl[sampl%in%success],collapse=' ')
return(invisible((fet)))
}
}
msCellFET<-function(clusters_list,runname="",inpath="~/Dropbox/tools/Data_to_load_CellFET",outpath=getwd(),selection=F){
cat('\tNOTE:\tclusters_list only ENSG ids currently supported')
library(MetaDE)
library('parallel')
###load data
Load(paste(inpath,"/HUMQb.Rdata",sep="")) #"HUMQb" human ENSid orthologous genes of mice background genes
Load(paste(inpath,"/hmscDF.Rdata",sep="")) #"hmscDF" human ENSid orthologous of mice single cell enriched by class dataframe
### create a matrix for results
cFET=matrix(nrow=length(clusters_list), ncol=14)
row.names(cFET)=names (clusters_list)
colnames(cFET)=c("cell class","FET p.value","FET FDR","OR","[95% OR CI] inf","OR [95% OR CI] sup",
"module cell enriched genes","module out cell enriched genes",
"non module cell enriched genes","non module out cell enriched genes",
"gene names of in modules cell enriched genes","module size","cell enriched genes size",
"% of genes in module in cell background"
)
print("Cell background is the genes with one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015 (Science)")
resMsc=list()
for (ccl in 1:length(hmscDF)){ # ccl: cell class
cclENS=hmscDF[[ccl]]
### function to fill the matrix of results
#for (i in 1:length(clusters_list)){
FUNC=function(i){
Ms=length(clusters_list[[i]]) #Ms: module size
CB=HUMQb[,'hsapiens_homolog_ensembl_gene'] #CB: cell background
Cs=length(cclENS) #Cs: cell enriched genes size
MCBp=length(intersect(CB,clusters_list[[i]]))/Ms #MCBp: % of genes in module in cell background
#cFET
print(paste(names(clusters_list[i]),", cell class:",names(hmscDF)[ccl]))
#calculate the number Mc of module i cell enriched genes (Mc: in module AND in cell class)
Mc=length(intersect(cclENS,clusters_list[[i]]))
McID=paste(unlist(HUMQb[which(CB %in% intersect(cclENS,clusters_list[[i]])),'external_gene_name']),collapse=", ")
#calculate the number NMc of remaining genes not in module but in cell class
NMc=length(cclENS)-Mc
#calculate the number Mnc of genes in module but not in cell class
Mnc=length(intersect(CB,clusters_list[[i]]))-Mc
#calculate the number NMnc of genes out of module AND not in cell class
NMnc=length(CB)-(Mc+NMc+Mnc)
# contingency matrice for Fisher Exact Test FET all DNMs and ns DNMs
matr=matrix(c(Mc,NMc,Mnc,NMnc), nrow=2)
#FET
#FisherM=fisher.test(matr,alternative="greater")
FisherM=fisher.test(matr)
Fisher.p=FisherM$p.value
Fisher.or=FisherM$estimate
Fisher.cinf=FisherM$conf.int[1]
Fisher.cis=FisherM$conf.int[2]
cFET[i,]=c(names(hmscDF)[ccl],Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mc,Mnc,NMc,NMnc,McID,Ms,Cs,MCBp)
}
cfet=mclapply(1:length(clusters_list),FUNC,mc.cores=detectCores())
#The cfet output object of the mclapply function is a list of n vectors cFET[i,] in the good order
for (i in 1:length(clusters_list)){
cFET[i,]=cfet[[i]]
}
cFET[,"FET FDR"]=p.adjust(cFET[,"FET p.value"],method="fdr")
write.table(cFET, sep='\t', file=paste(outpath,"/",names(hmscDF)[ccl],"_cFET_",runname,".txt",sep=""), row.names=TRUE, quote=FALSE, col.names=NA)
resMsc[[ccl]]=cFET
}
names(resMsc)=names(hmscDF)
if(selection==T){
select=which(as.numeric(resMsc[[1]][,"FET FDR"]) < 0.2)
print(paste("number of selected modules for ", names(resMsc)[1]," :",length(select)))
if (length(select)==1){
SignifT=resMsc[[1]][c(select,NA),]
SignifT=SignifT[-which(is.na(rownames(SignifT)) ==T),]
}else{
SignifT=resMsc[[1]][select,]
}
for (ccl in 2:length(resMsc)){
select=which(as.numeric(resMsc[[ccl]][,"FET FDR"]) < 0.2)
print(paste("number of selected modules for ", names(resMsc)[ccl]," :",length(select)))
if (length(select)==1){
SignifT=rbind(SignifT,resMsc[[ccl]][c(select,NA),])
SignifT=SignifT[-which(is.na(rownames(SignifT)) ==T),]
}else{
SignifT=rbind(SignifT,resMsc[[ccl]][select,])
}
}
write.table(SignifT, sep='\t', file =paste(outpath,'/significant_cFET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}else{
allT=resMsc[[1]]
for (ccl in 2:length(resMsc)){
allT=rbind(allT,resMsc[[ccl]])
}
write.table(allT, sep='\t', file =paste(outpath,'/ALL_cFET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}
return(resMsc)
}
forest.plot<-function(dat_lis,mode='single',x_lim=c(0,10),use_log=NA,phen=NA,module=NA,p_thresh=NA,point_width_scale=4,line_width=0.1,ci_bar_height=0.35){
library(ggplot2)
cat('\tWARNING : currently does not work for a single module\n\n')
# colvec=c('red','darkblue','darkorange','magenta','orange')
colvec=colmix
cat('\tphenotypes detected :',paste(names(dat_lis),collapse=', '),'\n')
dat_lis=dat_lis[rev(names(dat_lis))]
options(warn=-1)
if(!is.na(phen)){
cat('\tdataset selected:',paste(phen,collapse=', '),'\n')
dat_lis=dat_lis[names(dat_lis)%in%phen]
}
if(!is.na(module)){
cat('\tmodules selected:',paste(module,collapse=', '),'\n')
}
plot_dat=list()
for(ilis in 1:length(dat_lis)){
if(!is.na(module)){
dat_lis[[names(dat_lis)[ilis]]]=dat_lis[[names(dat_lis)[ilis]]][rownames(dat_lis[[names(dat_lis)[ilis]]])%in%module,]
}
plot_dat[[names(dat_lis)[ilis]]]=as.data.frame(make.numeric(dat_lis[[names(dat_lis)[ilis]]][,c('module size','FET p.value','OR','[95% OR CI] inf','OR [95% OR CI] sup')]))
colnames(plot_dat[[names(dat_lis)[ilis]]])=c('n.genes','FETp','fetOR','lowerCI','upperCI')
plot_dat[[names(dat_lis)[ilis]]]$module=as.factor(paste(rownames(plot_dat[[names(dat_lis)[ilis]]]),names(dat_lis)[ilis],sep='_'))
plot_dat[[names(dat_lis)[ilis]]]$color=colvec[ilis]#c(rep(colvec[ilis],nrow(plot_dat[[names(dat_lis)[ilis]]])-1),'black')
# plot_dat[[names(dat_lis)[ilis]]]$color=c(rep(colvec[ilis],nrow(plot_dat[[names(dat_lis)[ilis]]])-1),'black')
plot_dat[[names(dat_lis)[ilis]]]$point_width=(plot_dat[[names(dat_lis)[ilis]]]$fetOR*3 )+point_width_scale
}
if(!is.na(p_thresh)){
dummy=list()
for(ilis in 1:length(plot_dat)){
tempr=plot_dat[[names(dat_lis)[ilis]]][plot_dat[[names(dat_lis)[ilis]]]$FETp<p_thresh,]
if(nrow(tempr)>0){
dummy[[names(plot_dat)[ilis]]]=plot_dat[[names(plot_dat)[ilis]]][plot_dat[[names(plot_dat)[ilis]]]$FETp<p_thresh,]
}
}
plot_dat=dummy
}
options(warn=0)
# if(!is.na(use_log)){
# plot_dat$fetOR=log(as.numeric(plot_dat$fetOR),base=use_log)
# plot_dat$lowerCI=log(as.numeric(plot_dat$lowerCI),base=use_log)
# plot_dat$upperCI=log(as.numeric(plot_dat$upperCI),base=use_log)
# }
####### since gplots insists on reversing all the phoenotypes, pre-empt it to keep original rownames
for(ilis in 1:length(plot_dat)){
plot_dat[[names(plot_dat)[ilis]]]=plot_dat[[names(plot_dat)[ilis]]][rev(rownames(plot_dat[[names(plot_dat)[ilis]]])),]
}
gplots_dat=list()
if(mode=='single'){
for(idat in 1:length(plot_dat)){
myplot=ggplot() + # initiate the plot space, saved to myplot variable for later display
# - subsequent additions / modifications of the plot can be done by manipulating this space
# geom_point(data=df, aes(y=Module, x=OR),colour='red', size=3) +
# geom_errorbar(data=pdat,aes(x=Module,y=OR,ymin=lower.95..CI,ymax=upper.95..CI),colour="grey60",width=0.2)
geom_vline(xintercept=1, linetype="dashed")+ # add vertical line
# geom_vline(xintercept=(1), linetype="dashed")+ # add vertical line
geom_errorbarh(data=plot_dat[[idat]], aes(y=module,x=fetOR, xmin=lowerCI, xmax=upperCI), colour="black", height=ci_bar_height,size=2)+ # add HORISONTAL error bars, geom_errorbar used for vertical..
geom_point(data=plot_dat[[idat]], aes(y=module, x=fetOR), shape=15, cex=plot_dat[[idat]]$point_width, colour=plot_dat[[idat]]$color)+ # add plotting points
# xlim(0, 5) # removes datapoints outside the range == scale_x_continuous(limits=c(-5000, 5000))
#
# coord_cartesian(xlim=c(0, 60)) + # moves the window only
coord_cartesian(xlim=x_lim) + # coord_cartesian(xlim=c(-0.5, 1)) # moves the window only
scale_x_continuous(breaks=c(0,1:9,seq(10,2000,by=10))) +
# scale_x_continuous(breaks=c(0,1,5,seq(10,2000,by=10))) +
# scale_size_area() +
xlab("FET odds ratio and CI") +
ylab("module")
## plot line styles || http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# ggtitle(names(table(dat$serie.name)[idat]))
## essential if using the ugly mess that is ggplot2, for more options : http://felixfan.github.io/rstudy/2013/11/27/ggplot2-remove-grid-background-margin/
# myplot + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.background=element_blank(), axis.line=element_line(colour="black"))
#
gplots_dat[[names(plot_dat)[idat]]]=(myplot + theme_bw())
}
}
if(mode=='combined'){
plot_merge=plot_dat[[1]]
if(length(plot_dat)>1){
for(ilis in 2:length(plot_dat)){
plot_merge=rbind(plot_merge,plot_dat[[ilis]])
}}
myplot=ggplot() + # initiate the plot space, saved to myplot variable for later display
# - subsequent additions / modifications of the plot can be done by manipulating this space
# geom_point(data=df, aes(y=Module, x=OR),colour='red', size=3) +
# geom_errorbar(data=pdat,aes(x=Module,y=OR,ymin=lower.95..CI,ymax=upper.95..CI),colour="grey60",width=0.2)
geom_vline(xintercept=1, linetype="dashed")+ # add vertical line
# geom_vline(xintercept=(1), linetype="dashed")+ # add vertical line
geom_errorbarh(data=plot_merge, aes(y=module,x=fetOR, xmin=lowerCI, xmax=upperCI), colour="black", height=ci_bar_height,size=2)+ # add HORISONTAL error bars, geom_errorbar used for vertical..
geom_point(data=plot_merge, aes(y=module, x=fetOR), shape=15, cex=plot_merge$point_width, colour=plot_merge$color)+ # add plotting points
# xlim(0, 5) # removes datapoints outside the range == scale_x_continuous(limits=c(-5000, 5000))
#
# coord_cartesian(xlim=c(0, 60)) + # moves the window only
coord_cartesian(xlim=x_lim) + # coord_cartesian(xlim=c(-0.5, 1)) # moves the window only
scale_x_continuous(breaks=c(0,1:9,seq(10,2000,by=10))) +
# scale_x_continuous(breaks=c(0,1,5,seq(10,2000,by=10))) +
# scale_size_area() +
xlab("FET odds ratio and CI") +
ylab("module")
## plot line styles || http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# ggtitle(names(table(dat$serie.name)[idat]))
## essential if using the ugly mess that is ggplot2, for more options : http://felixfan.github.io/rstudy/2013/11/27/ggplot2-remove-grid-background-margin/
# myplot + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.background=element_blank(), axis.line=element_line(colour="black"))
#
gplots_dat=(myplot + theme_bw())
}
return(gplots_dat)
}
spatiotemp<-function(dat_lis,mod_gene,scale_data=F,row_clust=T,use_cols=NA,ncols=10,...){
#cat('\n\tNOTE: this function requires an object "name_here", available from:\n https://\n\n') ## not yet but needs to be implemented
#### spatiotemproral transcriptome of human brain data:
#Load('~/Dropbox/PROJ/spatem/dtb/expr/GSE25219_GPL5175.exon_array.HUGO_GENE_Mapped.ngene13830.log26.nsamp1192.rin6.5_pmi24.covar_pmi_rin.Rdata')
## built to handle data_list with incomplete sub-list categories (ie missing)
# HIP contains all dev stages -> spatemp() works with all stages in correct order
#ballg=ballg[c('HIP',names(ballg)[names(ballg)!='HIP'])]
#ballc=ballc[names(ballg)]
#save(ballc,ballg,bfolc,bfold,bstats,gexpc,gexpr,ids,readme,sampl,file='~/Dropbox/PROJ/spatem/dtb/expr/GSE25219_GPL5175.exon_array.HUGO_GENE_Mapped.ngene13830.log26.nsamp1192.rin6.5_pmi24.covar_pmi_rin.Rdata')
library(gplots)
if(sum(mod_gene%in%rownames(dat_lis[[1]][[1]]))==0){
stop('\tno query genes not found in dataset')
}
cat('\tnumber of query genes in dataset:\n')
print(matst(mod_gene%in%rownames(dat_lis[[1]][[1]])))
dev_stage=names(dat_lis[[1]])
for(ilis in 2:length(dat_lis)){
dev_stage=unique(c(dev_stage,names(dat_lis[[ilis]])))
}
plot_dat=matrix(NA,nrow=length(dat_lis),ncol=length(dev_stage))
colnames(plot_dat)=dev_stage
rownames(plot_dat)=names(dat_lis)
# Head(plot_dat)
cat('\tcalculate median expression for query genes\n')
for(ireg in 1:length(dat_lis)){
for(idev in 1:length(dat_lis[[ireg]])){
dummy=dat_lis[[names(dat_lis)[ireg]]][[names(dat_lis[[ireg]])[idev]]]
# Head(dummy)
plot_dat[names(dat_lis)[ireg],names(dat_lis[[ireg]])[idev]]=mean(apply(dummy[rownames(dummy)%in%mod_gene,,drop=F],1,mean))
}
cat(round(ireg/length(dat_lis),digits=2),'\r');flush.console()
}
# cat(min(plot_dat,na.rm=T),max(plot_dat,na.rm=T),'\n')
if(!scale_data){
na_val_loc=is.na(plot_dat)
plot_dat[na_val_loc]=(min(plot_dat,na.rm=T))-0.2
}
if(scale_data){
plot_dat=t(make.numeric(scale(t(plot_dat),center=T,scale=F)))
na_val_loc=is.na(plot_dat)
plot_dat[na_val_loc]=(min(plot_dat,na.rm=T))-0.1
}
# Head(plot_dat)
#### check for user defined colors
options(warn=-1)
# if(is.na(use_cols)){heat_cols=c('grey60',colorRampPalette(c("#0072B2","#56B4E9","#abd9e9"))(ncols/2),colorRampPalette(c("#ffffbf","#F0E442","darkred"))(ncols/2))} #'#e31a1c' ,
if(is.na(use_cols)){heat_cols=c('grey60',colorRampPalette(c("#053061","#56B4E9","#abd9e9"))(ncols/2),colorRampPalette(c("#ffffbf","#F0E442","darkred"))(ncols/2))} #'#e31a1c' ,
if(!is.na(use_cols)){
heat_cols=use_cols
}
heatmap.2(
plot_dat
# ,main= dat_descr
,Colv=F
,Rowv=row_clust
# ,col=c('grey60',colorRampPalette(c("white","#ffffbf","#fee090","#fdae61","#f46d43","#d73027","#a50026","darkred"))(ncols))
# ,col=c('grey60',bluered(ncols))
,col=heat_cols
,key.title=NA
,key.xlab=NA
,key.ylab=NA
,trace='none'
,srtCol=0
,keysize=1.1
,density.info='none'
,colsep=1:(ncol(plot_dat)-1),rowsep=1:(nrow(plot_dat)-1),sepcolor="white",sepwidth=c(0.001,0.001)
# ,margins=c(5,5)
,...
)
# plot.new()
par(new=TRUE)
mtext('grey - no samples available',adj=1,side=3,line=1)
# mtext(dat_descr,adj=1,side=3,line=2)
# mtext(paste0("n.complete : ",n_overlap),adj=1,side=3)
options(warn=0)
plot_dat[na_val_loc]=NA
return(invisible(plot_dat))
}
listov<-function(dat_lis,union=T,intersect=F){
## USE : get union or intersect of all elements (vector) in list ~~ bgcommon for list of char vectors
if(union & intersect){stop('both union & intersect == T, expect 1 only')}
if(!union & !intersect){stop('both union & intersect == F, expect 1 only')}
udat=dat_lis[[1]]
for(ilis in 2:length(dat_lis)){
if(union){udat=union(udat,dat_lis[[ilis]])}
if(intersect){udat=intersect(udat,dat_lis[[ilis]])}
}
return(udat)
}
Venn<-function(dat_lis,main='',...){
library(gplots)
venn(dat_lis)
mtext(main,side=3,...)
lisun=length(listov(dat_lis,T,F))
lisin=length(listov(dat_lis,F,T))
mtext(paste0('\n\nintersect/union=',round(lisin/lisun,digits=3)*100,'%'),side=1)
}
dnm.enrich<-function(clusters_list,runname="",selection=F){ #,inpath="~/Dropbox/SHARED/tools/FUNDNMmap/"
cat('\tNOTE:\tclusters_list only ENSG ids currently supported\n\n')
library(MetaDE)
library('parallel')
# Load(paste(inpath,"/PathoGeneENS.Rdata",sep="")) # PathoGene & Patho_ENSgeneID
# Load(paste(inpath,"/ctr_GeneENS.Rdata",sep="")) # ctrGene & ctr_ENSgeneID
# gene_map_dat=read.table(file=paste(inpath,'/functional_mut_rate.bias_corrected.local.canonical_tx_only.bed.txt',sep=""),header=TRUE, sep='\t', blank.lines.skip=TRUE)
### create a matrix for results
FBET=matrix(nrow=length(clusters_list), ncol=23)
row.names(FBET)=names(clusters_list)
colnames(FBET)=c("module size","patho","FET p.value","FET FDR","OR","[95% OR CI] inf","OR [95% OR CI] sup",
"module DNMs in patients","module DNMs in controls",
"non module DNMs in patients","non module DNMs in controls",
"gene names of modules DNMs in patients",
"gene names of modules DNMs in controls",
"BET p.value","BET FDR","Theo Ps(mutation rate of map.M)","Estimated Ps","Ratio Obs/Exp",
"[95% EsPs CI] inf","[95% EsPs CI] sup",
"x=nb of DNM in map.M(success)","n=nb of DNM in map/mutation rate all genes of map(trials)",
#"ENSgeneID of M not in map",
"nb ENSgeneID of M not in map"
)
npDNM=list()
for(pat in 1:length(PathoGene)){
cat('\t=====================',names(PathoGene)[pat],'=====================',pat,' of ',length(PathoGene),'\n')
pathoENS=Patho_ENSgeneID[[pat]]
DNM_Gene=PathoGene[[pat]]
ctrlENS=ctr_ENSgeneID[['lgd']]
ctrl_Gene=ctrGene[['lgd']]
### function to fill the matrix of results
#for(i in 1:length(clusters_list)){
FUNC=function(i){
Ms=length(clusters_list[[i]])
#### FET
cat('\t\t',names(clusters_list)[i],'\n')
## function to calculate the number Mc of DNMs in CTRL involving a gene of the cluster i
y=lapply(clusters_list[[i]],FUN=function(x) {ctrlENS[which(ctrlENS$ensembl_gene_id==x),'external_gene_name']})
Mc=sum(sapply(as.matrix(unique(y)), FUN=function(ym) {length(which(ctrl_Gene==ym ))}))
McID=paste(unlist(y),collapse=", ")
## number NMc of remaining DNMs in CTRL involving a gene not in the cluster i
NMc=length(ctrl_Gene)-Mc
##function to calculate the number Mee of DNMs in patho involving a gene of the cluster i
z=lapply(clusters_list[[i]],FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id==x),'external_gene_name']})
Mp=sum(sapply(as.matrix(unique(z)), FUN=function(zm) {length(which(DNM_Gene==zm ))}))
MpID=paste(unlist(z),collapse=", ")
## number NMee of remaining DNMs in EE involving a gene not in the cluster i
NMp=length(DNM_Gene)-Mp
# contingency matrice for Fisher Exact Test FET all DNMs and ns DNMs
matr=matrix(c(Mp,Mc,NMp,NMc), nrow=2)
# FET
# FisherM=fisher.test(matr,alternative="greater")
FisherM=fisher.test(matr)
Fisher.p=FisherM$p.value
Fisher.or=FisherM$estimate
Fisher.cinf=FisherM$conf.int[1]
Fisher.cis=FisherM$conf.int[2]
#### BET
# theorical Ps=Theorical probablity of sucess based on mutation rate map
CL.map.ens=intersect(clusters_list[[i]],gene_map_dat$Gene)
#NnID=paste(setdiff(clusters_list[[i]],map$Gene), collapse=", ")
nID=length(setdiff(clusters_list[[i]],gene_map_dat$Gene))
# if lgd(nonsens + missense)
ThPs=sum(sapply(CL.map.ens,FUN=function(x){sum(gene_map_dat[which(gene_map_dat$Gene==x),c("Missense_rate","Nonsense_rate")])}))
# nb of trials=nb of DNM falling in all map genes divided by the mutation rate on all map genes
patho.map.ens=intersect(pathoENS$ensembl_gene_id,gene_map_dat$Gene)
y=lapply(patho.map.ens,FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id==x),'external_gene_name']})
n=round(sum(sapply(unique(y), FUN=function(x) {length(which(DNM_Gene==x ))}))/sum(gene_map_dat[,c("Missense_rate","Nonsense_rate")]))
# nb of sucess=nb of DNM falling in map of the module
z=lapply(CL.map.ens,FUN=function(x) {pathoENS[which(pathoENS$ensembl_gene_id==x),'external_gene_name']})
xz=sum(sapply(z, FUN=function(x) {length(which(DNM_Gene==x ))}))
BET=binom.test(xz,n,ThPs)
Binomial.p=BET$p.value
EsPs=BET$estimate
RobsE=xz /(n*ThPs)
CI.inf=BET$conf.int [1]
CI.sup=BET$conf.int [2]
#FBET[i,]=c(Ms,names(PathoGene[pat]),Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mp,Mc,NMp,NMc,MpID,McID,Binomial.p,NA,ThPs,EsPs,RobsE,CI.inf,CI.sup,xz,n,NnID,nID)
FBET[i,]=c(Ms,names(PathoGene[pat]),Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mp,Mc,NMp,NMc,MpID,McID,Binomial.p,NA,ThPs,EsPs,RobsE,CI.inf,CI.sup,xz,n,nID)
}
fbet=mclapply(1:length(clusters_list),FUNC,mc.cores=detectCores())
#The fbet output object of the mclapply function is a list of 44 vectors FBET[i,] in the good order
for(i in 1:length(clusters_list)){
FBET[i,]=fbet[[i]]
}
FBET[,"FET FDR"]=p.adjust(FBET[,"FET p.value"],method="fdr")
FBET[,"BET FDR"]=p.adjust(FBET[,"BET p.value"],method="fdr")
# write.table(FBET, sep='\t', file=paste(outpath,"/",names(PathoGene)[pat],"_FBET_",runname,".txt",sep=""), row.names=TRUE, quote=FALSE, col.names=NA)
npDNM[[pat]]=FBET
}
names(npDNM)=names(PathoGene)
if(selection==T){
select=intersect(which(as.numeric(npDNM[[1]][,"FET FDR"]) < 0.2),which(as.numeric(npDNM[[1]][,"BET FDR"]) < 0.2))
cat("\tnumber of selected modules for\t\t", names(npDNM)[1]," :",length(select),'\n')
if(length(select)==1){
SignifT=npDNM[[1]][c(select,NA),]
SignifT=SignifT[-which(is.na(rownames(SignifT))==T),]
}else{
SignifT=npDNM[[1]][select,]
}
for(pat in 2:length(npDNM)){
select=intersect(which(as.numeric(npDNM[[pat]][,"FET FDR"]) < 0.2),which(as.numeric(npDNM[[pat]][,"BET FDR"]) < 0.2))
cat("\tnumber of selected modules for\t\t", names(npDNM)[pat]," :",length(select),'\n')
if(length(select)==1){
SignifT=rbind(SignifT,npDNM[[pat]][c(select,NA),])
SignifT=SignifT[-which(is.na(rownames(SignifT))==T),]
}else{
SignifT=rbind(SignifT,npDNM[[pat]][select,])
}
}
# write.table(SignifT, sep='\t', file=paste(outpath,'/significantFBET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}else{
allT=npDNM[[1]]
for(pat in 2:length(npDNM)){
allT=rbind(allT,npDNM[[pat]])
}
# write.table(allT, sep='\t', file=paste(outpath,'/ALL_FBET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}
return(invisible(npDNM))
}
cellt.enrich<-function(clusters_list,runname="",outpath=getwd(),selection=F){ #inpath="~/Dropbox/SHARED/tools/Data_to_load_CellFET",
cat('\tNOTE:\tclusters_list only ENSG ids currently supported')
cat("Cell background is the genes with one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)")
library(MetaDE)
library('parallel')
###load data
# Load(paste(inpath,"/HUMQb.Rdata",sep="")) #"HUMQb" human ENSid orthologous genes of mice background genes
# Load(paste(inpath,"/hmscDF.Rdata",sep="")) #"hmscDF" human ENSid orthologous of mice single cell enriched by class dataframe
### create a matrix for results
cFET=matrix(nrow=length(clusters_list), ncol=14)
row.names(cFET)=names(clusters_list)
colnames(cFET)=c("cell class","FET p.value","FET FDR","OR","[95% OR CI] inf","OR [95% OR CI] sup",
"module cell enriched genes","module out cell enriched genes",
"non module cell enriched genes","non module out cell enriched genes",
"gene names of in modules cell enriched genes","module size","cell enriched genes size",
"% of genes in module in cell background"
)
resMsc=list()
for(ccl in 1:length(hmscDF)){ # ccl: cell class
cat('\t=====================',names(hmscDF)[ccl],'=====================',ccl,' of ',length(hmscDF),'\n')
cclENS=hmscDF[[ccl]]
### function to fill the matrix of results
#for(i in 1:length(clusters_list)){
FUNC=function(i){
Ms=length(clusters_list[[i]]) #Ms: module size
CB=HUMQb[,'hsapiens_homolog_ensembl_gene'] #CB: cell background
Cs=length(cclENS) #Cs: cell enriched genes size
MCBp=length(intersect(CB,clusters_list[[i]]))/Ms #MCBp: % of genes in module in cell background
#cFET
cat('\t\t',names(clusters_list)[i],'\n')
#calculate the number Mc of module i cell enriched genes(Mc: in module AND in cell class)
Mc=length(intersect(cclENS,clusters_list[[i]]))
McID=paste(unlist(HUMQb[which(CB %in% intersect(cclENS,clusters_list[[i]])),'external_gene_name']),collapse=", ")
#calculate the number NMc of remaining genes not in module but in cell class
NMc=length(cclENS)-Mc
#calculate the number Mnc of genes in module but not in cell class
Mnc=length(intersect(CB,clusters_list[[i]]))-Mc
#calculate the number NMnc of genes out of module AND not in cell class
NMnc=length(CB)-(Mc+NMc+Mnc)
# contingency matrice for Fisher Exact Test FET all DNMs and ns DNMs
matr=matrix(c(Mc,NMc,Mnc,NMnc), nrow=2)
#FET
#FisherM=fisher.test(matr,alternative="greater")
FisherM=fisher.test(matr)
Fisher.p=FisherM$p.value
Fisher.or=FisherM$estimate
Fisher.cinf=FisherM$conf.int[1]
Fisher.cis=FisherM$conf.int[2]
cFET[i,]=c(names(hmscDF)[ccl],Fisher.p,NA,Fisher.or,Fisher.cinf,Fisher.cis,Mc,Mnc,NMc,NMnc,McID,Ms,Cs,MCBp)
}
# cfet=mclapply(1:length(clusters_list),FUNC,mc.cores=detectCores())
cfet=lapply(1:length(clusters_list),FUNC)
#The cfet output object of the mclapply function is a list of n vectors cFET[i,] in the good order
for(i in 1:length(clusters_list)){
cFET[i,]=cfet[[i]]
}
cFET[,"FET FDR"]=p.adjust(cFET[,"FET p.value"],method="fdr")
# write.table(cFET, sep='\t', file=paste(outpath,"/",names(hmscDF)[ccl],"_cFET_",runname,".txt",sep=""), row.names=TRUE, quote=FALSE, col.names=NA)
resMsc[[ccl]]=cFET
}
names(resMsc)=names(hmscDF)
if(selection==T){
select=which(as.numeric(resMsc[[1]][,"FET FDR"]) < 0.2)
cat("number of selected modules for ", names(resMsc)[1]," :",length(select),'\n')
if(length(select)==1){
SignifT=resMsc[[1]][c(select,NA),]
SignifT=SignifT[-which(is.na(rownames(SignifT))==T),]
}else{
SignifT=resMsc[[1]][select,]
}
for(ccl in 2:length(resMsc)){
select=which(as.numeric(resMsc[[ccl]][,"FET FDR"]) < 0.2)
cat("number of selected modules for ", names(resMsc)[ccl]," :",length(select),'\n')
if(length(select)==1){
SignifT=rbind(SignifT,resMsc[[ccl]][c(select,NA),])
SignifT=SignifT[-which(is.na(rownames(SignifT))==T),]
}else{
SignifT=rbind(SignifT,resMsc[[ccl]][select,])
}
}
# write.table(SignifT, sep='\t', file=paste(outpath,'/significant_cFET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}else{
allT=resMsc[[1]]
for(ccl in 2:length(resMsc)){
allT=rbind(allT,resMsc[[ccl]])
}
# write.table(allT, sep='\t', file=paste(outpath,'/ALL_cFET_',runname,'.txt',sep=''), row.names=TRUE, quote=FALSE, col.names=NA)
}
return(resMsc)
}
dnm.enrich.bg<-function(module_list,bg_vect=NA,use_counts=F){
#cat('\n\tNOTE:\tid_type - options: "name" - gene name / HUGO name; "ensg" - ensembl gene id;"mouse.ensg" - mouse ensembl gene id - one2one orthologs to human only\n')
cat('\tNOTE:\tbg_vect - options: "NA" - one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)\n\n')
## based on bg input - if data.frame or matrix -> create a list of identical bg length=length(module_list)
## + alternatively a list of modules with backgrounds in the same order -> flexibility option for unique backgrounds for each module
## + added benefit of allowing different color coding for each bg (eg darker or different style of line / OR dot)
## option for supporting multiple ID types -> ensg /+/ gene name
## + auto-detect ID option - a possibility but seems unnecessary given only 2 types
##===================================================================================================================================
## generate background as per options, if list provided (same length as modules)=================================
options(warn=-1)
# if(!is.na(bg_vect)){
if(class(bg_vect)=='list'){
if(length(bg_vect)==length(module_list)){
cat('\tusing multiple bg list as provided\n')
}
if(length(bg_vect)!=length(module_list)){
stop('\tlength of bg_vect provided not the same length as module_list. if same bkgrnd for all modules, do not provide as.list()\n')
}
}
if(class(bg_vect)!='list' & !is.na(bg_vect)){
dnmid=lapply(dnmid,function(x){unique(x[x[,'ids']%in%bg_vect|x[,'gene']%in%bg_vect,c('gene','count','ids')])}) ## limit ids to just those in bg_vect
cat('\tusing the provided bkgrnd, same for all modules\n')
}
if(is.na(bg_vect)){
cat('\tno background list provided, using default\n')
# bg_vect=unique(unlist(lapply(dnmid[c('EE','ASD','ID','SCZ','DDD','lgd')],function(x){unique(x[,c('gene')])}))) ## basically all the genes assayed
}
options(warn=0)
##===================================================================================================================================
## perform the fisher.test() using fet() function=================================
plot_dat=list()
for(idat in 1:(length(dnmid)-4)){
cat('\t=====================',names(dnmid)[idat],'=====================',idat,' of ',(length(dnmid)-4),'\n')
dumpty=list()
for(imod in 1:length(module_list)){
cat('\t\t',names(module_list)[imod],'\n')
humpty=list()
## case DNM in module
ms=dnmid[[idat]]
ms=unique(ms[ms$ids%in%module_list[[names(module_list)[imod]]] | ms$gene%in%module_list[[names(module_list)[imod]]],c('gene','count')])
## control DNM in module
mf=dnmid[['lgd']]
mf=unique(mf[mf$ids%in%module_list[[names(module_list)[imod]]] | mf$gene%in%module_list[[names(module_list)[imod]]],c('gene','count')])
## case DNM not in module
bs=dnmid[[idat]]
bs=unique(bs[!(bs$ids%in%module_list[[names(module_list)[imod]]] | bs$gene%in%module_list[[names(module_list)[imod]]]),c('gene','count')])
## control DNM not in module
bf=dnmid[['lgd']]
bf=unique(bf[!(bf$ids%in%module_list[[names(module_list)[imod]]] | bf$gene%in%module_list[[names(module_list)[imod]]]),c('gene','count')])
if(!use_counts){
ms=length(unique(ms$gene))
mf=length(unique(mf$gene))
bs=length(unique(bs$gene))
bf=length(unique(bf$gene))
humpty=fetc(samp.sucess=ms,bkgrnd.success=bs,samp.fail=mf,bkgrnd.fail=bf)
}
if(use_counts){
ms=sum(ms$count)
mf=sum(mf$count)
bs=sum(bs$count)
bf=sum(bf$count)
humpty=fetc(samp.sucess=ms,bkgrnd.success=bs,samp.fail=mf,bkgrnd.fail=bf)
}
humpty$n.genes=length(module_list[[names(module_list)[imod]]])
# if(is.infinite(humpty$upperCI)){humpty$upperCI=1}
# if(is.infinite(humpty$lowerCI)){humpty$upperCI=0}
# if(do_plots){
# colvec=colmix
# humpty$module=as.factor(paste(names(module_list)[imod],names(dnmid)[imod],sep='_'))
# humpty$color=colvec[imod]#c(rep(colvec[imod],nrow(plot_dat[[names(module_list)[imod]]])-1),'black')
# humpty$point_width=(humpty[[names(module_list)[imod]]]$fetOR*3)+point_width_scale
# }
dumpty[[names(module_list)[[imod]]]]=(unlist(humpty))
}
plot_dat[[names(dnmid)[idat]]]=(t(as.data.frame(dumpty)))
}
# if(!do_plots){
return(plot_dat)
# }
# if(do_plots){
##===================================================================================================================================
## generate plots=================================
}
#### this enrichment test is only for DNM in module compared to genome/background
##<<>>dnm.enrich.bg<-function(module_list,bg_vect=NA,use_counts=F){
##<<>>#cat('\n\tNOTE:\tid_type - options: "name" - gene name / HUGO name; "ensg" - ensembl gene id;"mouse.ensg" - mouse ensembl gene id - one2one orthologs to human only\n')
##<<>>cat('\tNOTE:\tbg_vect - options: "NA" - one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)\n\n')
##<<>>## based on bg input - if data.frame or matrix -> create a list of identical bg length=length(module_list)
##<<>>## + alternatively a list of modules with backgrounds in the same order -> flexibility option for unique backgrounds for each module
##<<>>## + added benefit of allowing different color coding for each bg (eg darker or different style of line / OR dot)
##<<>>
##<<>>## option for supporting multiple ID types -> ensg /+/ gene name
##<<>>## + auto-detect ID option - a possibility but seems unnecessary given only 2 types
##<<>>
##<<>>
##<<>>##===================================================================================================================================
##<<>>## generate background as per options, if list provided (same length as modules)=================================
##<<>>options(warn=-1)
##<<>>
##<<>># if(!is.na(bg_vect)){
##<<>> if(class(bg_vect)=='list'){
##<<>> if(length(bg_vect)==length(module_list)){
##<<>> cat('\tusing multiple bg list as provided\n')
##<<>> }
##<<>> if(length(bg_vect)!=length(module_list)){
##<<>> stop('\tlength of bg_vect provided not the same length as module_list. if same bkgrnd for all modules, do not provide as.list()\n')
##<<>> }
##<<>> }
##<<>>
##<<>> if(class(bg_vect)!='list' & !is.na(bg_vect)){
##<<>> dnmid=lapply(dnmid,function(x){unique(x[x[,'ids']%in%bg_vect|x[,'gene']%in%bg_vect,c('gene','count','ids')])}) ## limit ids to just those in bg_vect
##<<>> cat('\tusing the provided bkgrnd, same for all modules\n')
##<<>> }
##<<>>
##<<>> if(is.na(bg_vect)){
##<<>> cat('\tno background list provided, using default\n')
##<<>> bg_vect=unique(unlist(lapply(dnmid[c('EE','ASD','ID','SCZ','DDD','lgd')],function(x){unique(x[,c('gene')])}))) ## basically all the genes assayed
##<<>> }
##<<>>options(warn=0)
##<<>>
##<<>>##===================================================================================================================================
##<<>>## perform the fisher.test() using fet() function=================================
##<<>>
##<<>> plot_dat=list()
##<<>> for(idat in 1:(length(dnmid)-4)){
##<<>> cat('\t=====================',names(dnmid)[idat],'=====================',idat,' of ',(length(dnmid)-4),'\n')
##<<>> dumpty=list()
##<<>> for(imod in 1:length(module_list)){
##<<>>
##<<>> cat('\t\t',names(module_list)[imod],'\n')
##<<>> humpty=list()
##<<>>
##<<>> not_mod=bg_vect[!(bg_vect%in%module_list[[names(module_list)[imod]]])]
##<<>>## case dnm in module
##<<>> ms=dnmid[[idat]]
##<<>> ms=unique(ms[ms$ids%in%module_list[[names(module_list)[imod]]] | ms$gene%in%module_list[[names(module_list)[imod]]],c('gene','count')])
##<<>>
##<<>>## control dnm in module
##<<>> mf=dnmid[['lgd']]
##<<>> mf=unique(mf[mf$ids%in%module_list[[names(module_list)[imod]]] | mf$gene%in%module_list[[names(module_list)[imod]]],c('gene','count')])
##<<>>
##<<>>## case dnm outside module
##<<>> bs=dnmid[[idat]]
##<<>> bs=unique(bs[bs$ids%in%not_mod | bs$gene%in%not_mod,c('gene','count')])
##<<>>
##<<>>## control dnm outside module
##<<>> bf=dnmid[['lgd']]
##<<>> bf=unique(bf[bf$ids%in%not_mod | bf$gene%in%not_mod,c('gene','count')])
##<<>>
##<<>>
##<<>> if(!use_counts){
##<<>> ms=length(unique(ms$gene))
##<<>> mf=length(unique(mf$gene))
##<<>> bs=length(unique(bs$gene))
##<<>> bf=length(unique(bf$gene))
##<<>>
##<<>> humpty=fetc(samp.sucess=ms,bkgrnd.success=bs,samp.fail=mf,bkgrnd.fail=bf)
##<<>> }
##<<>> if(use_counts){
##<<>> ms=sum(ms$count)
##<<>> mf=sum(mf$count)
##<<>> bs=sum(bs$count)
##<<>> bf=sum(bf$count)
##<<>>
##<<>> humpty=fetc(samp.sucess=ms,bkgrnd.success=bs,samp.fail=mf,bkgrnd.fail=bf)
##<<>>
##<<>> }
##<<>> humpty$n.genes=length(module_list[[names(module_list)[imod]]])
##<<>># if(is.infinite(humpty$upperCI)){humpty$upperCI=1}
##<<>># if(is.infinite(humpty$lowerCI)){humpty$upperCI=0}
##<<>>
##<<>># if(do_plots){
##<<>># colvec=colmix
##<<>># humpty$module=as.factor(paste(names(module_list)[imod],names(dnmid)[imod],sep='_'))
##<<>># humpty$color=colvec[imod]#c(rep(colvec[imod],nrow(plot_dat[[names(module_list)[imod]]])-1),'black')
##<<>># humpty$point_width=(humpty[[names(module_list)[imod]]]$fetOR*3)+point_width_scale
##<<>> # }
##<<>> dumpty[[names(module_list)[[imod]]]]=(unlist(humpty))
##<<>> }
##<<>>
##<<>> plot_dat[[names(dnmid)[idat]]]=(t(as.data.frame(dumpty)))
##<<>>
##<<>> }
##<<>>
##<<>># if(!do_plots){
##<<>> return(plot_dat)
##<<>># }
##<<>>
##<<>># if(do_plots){
##<<>> ##===================================================================================================================================
##<<>> ## generate plots=================================
##<<>>
##<<>>}
##<<>>
##<<>>
### appears to work..
cellt.enrich.bg<-function(module_list,bg_list=NA,id_type='name'){
cat('\n\tNOTE:\tid_type - options: "name" - gene name / HUGO name; "ensg" - ensembl gene id;"mouse.ensg" - mouse ensembl gene id - one2one orthologs to human only\n')
cat('\tNOTE:\tbg_list - options: "NA" - one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)\n\n')
## based on bg input - if data.frame or matrix -> create a list of identical bg length=length(module_list)
## + alternatively a list of modules with backgrounds in the same order -> flexibility option for unique backgrounds for each module
## + added benefit of allowing different color coding for each bg (eg darker or different style of line / OR dot)
## option for supporting multiple ID types -> ensg /+/ gene name
## + auto-detect ID option - a possibility but seems unnecessary given only 2 types
#dtb_path='/Users/ks/Dropbox/PROJ/annot/dtb/processed'
#load(paste0(dtb_path,'/cellt.enrich.zeizel.dtb.Rdata'))
##===================================================================================================================================
## generate background as per options, if list provided (same length as modules)=================================
options(warn=-1)
# if(!is.na(bg_list)){
if(class(bg_list)=='list'){
if(length(bg_list)==length(module_list)){
cat('\tusing multiple bg list as provided\n')
}
if(length(bg_list)!=length(module_list)){
stop('\tlength of bg_list provided not the same length as module_list. if same bkgrnd for all modules, do not provide as.list()\n')
}
}
if(class(bg_list)!='list' & !is.na(bg_list)){
cellid=lapply(cellid,function(x){x[,id_type][x[,id_type]%in%bg_list]}) ## limit ids to just those in bg_list
cat('\tusing the provided bkgrnd, same for all modules\n')
bg_list=list()
for(imod in 1:length(module_list)){
bg_list[[names(module_list)[imod]]]=cellid$bkgrnd
}
}
if(is.na(bg_list)){
cat('\tno background list provided, using default\n')
bg_list=list()
for(imod in 1:length(module_list)){
bg_list[[names(module_list)[imod]]]=cellid$bkgrnd
}
}
options(warn=0)
##===================================================================================================================================
## perform the fisher.test() using fet() function=================================
plot_dat=list()
for(idat in 1:(length(cellid)-1)){
cat('\t=====================',names(cellid)[idat],'=====================',idat,' of ',(length(cellid)-1),'\n')
dumpty=list()
for(imod in 1:length(module_list)){
humpty=list()
cat('\t\t',names(module_list)[imod],'\n')
humpty=fet(sampl=module_list[[names(module_list)[imod]]],bg_list[[imod]],success=cellid[[idat]])
# if(is.infinite(humpty$upperCI)){humpty$upperCI=1}
# if(is.infinite(humpty$lowerCI)){humpty$upperCI=0}
# if(do_plots){
# colvec=colmix
# humpty$module=as.factor(paste(names(module_list)[imod],names(cellid)[imod],sep='_'))
# humpty$color=colvec[imod]#c(rep(colvec[imod],nrow(plot_dat[[names(module_list)[imod]]])-1),'black')
# humpty$point_width=(humpty[[names(module_list)[imod]]]$fetOR*3)+point_width_scale
# }
dumpty[[names(module_list)[[imod]]]]=(unlist(humpty))
}
plot_dat[[names(cellid)[idat]]]=(t(as.data.frame(dumpty)))
}
# if(!do_plots){
return(plot_dat)
# }
# if(do_plots){
##===================================================================================================================================
## generate plots=================================
}
forestp<-function(dat_lis,mode='single',x_lim=c(0,10),use_log=NA,phen=NA,module=NA,p_thresh=NA,point_width_scale=4,line_width=0.1,ci_bar_height=0.35){
library(ggplot2)
cat('\tWARNING : currently does not work for a single module\n\n')
# colvec=c('red','darkblue','darkorange','magenta','orange')
colvec=colmix
cat('\tphenotypes detected :',paste(names(dat_lis),collapse=', '),'\n')
options(warn=-1)
if(!is.na(phen)){
cat('\tdataset selected:',paste(phen,collapse=', '),'\n')
dat_lis=dat_lis[names(dat_lis)%in%phen]
}
if(!is.na(module)){
cat('\tmodules selected:',paste(module,collapse=', '),'\n')
}
plot_dat=list()
for(ilis in 1:length(dat_lis)){
if(!is.na(module)){
dat_lis[[names(dat_lis)[ilis]]]=dat_lis[[names(dat_lis)[ilis]]][rownames(dat_lis[[names(dat_lis)[ilis]]])%in%module,]
}
plot_dat[[names(dat_lis)[ilis]]]=as.data.frame(make.numeric(dat_lis[[names(dat_lis)[ilis]]][,c('n.genes','FETp','fetOR.odds ratio','lowerCI','upperCI')]))
colnames(plot_dat[[names(dat_lis)[ilis]]])=c('n.genes','FETp','fetOR','lowerCI','upperCI')
plot_dat[[names(dat_lis)[ilis]]]$module=as.factor(paste(rownames(plot_dat[[names(dat_lis)[ilis]]]),names(dat_lis)[ilis],sep='_'))
plot_dat[[names(dat_lis)[ilis]]]$color=colvec[ilis]#c(rep(colvec[ilis],nrow(plot_dat[[names(dat_lis)[ilis]]])-1),'black')
# plot_dat[[names(dat_lis)[ilis]]]$color=c(rep(colvec[ilis],nrow(plot_dat[[names(dat_lis)[ilis]]])-1),'black')
plot_dat[[names(dat_lis)[ilis]]]$point_width=(plot_dat[[names(dat_lis)[ilis]]]$fetOR*3 )+point_width_scale
}
if(!is.na(p_thresh)){
dummy=list()
for(ilis in 1:length(plot_dat)){
tempr=plot_dat[[names(dat_lis)[ilis]]][plot_dat[[names(dat_lis)[ilis]]]$FETp<p_thresh,]
if(nrow(tempr)>0){
dummy[[names(plot_dat)[ilis]]]=plot_dat[[names(plot_dat)[ilis]]][plot_dat[[names(plot_dat)[ilis]]]$FETp<p_thresh,]
}
}
plot_dat=dummy
}
options(warn=0)
# if(!is.na(use_log)){
# plot_dat$fetOR=log(as.numeric(plot_dat$fetOR),base=use_log)
# plot_dat$lowerCI=log(as.numeric(plot_dat$lowerCI),base=use_log)
# plot_dat$upperCI=log(as.numeric(plot_dat$upperCI),base=use_log)
# }
gplots_dat=list()
if(mode=='single'){
for(idat in 1:length(plot_dat)){
myplot=ggplot() + # initiate the plot space, saved to myplot variable for later display
# - subsequent additions / modifications of the plot can be done by manipulating this space
# geom_point(data=df, aes(y=Module, x=OR),colour='red', size=3) +
# geom_errorbar(data=pdat,aes(x=Module,y=OR,ymin=lower.95..CI,ymax=upper.95..CI),colour="grey60",width=0.2)
geom_vline(xintercept=1, linetype="dashed")+ # add vertical line
# geom_vline(xintercept=(1), linetype="dashed")+ # add vertical line
geom_errorbarh(data=plot_dat[[idat]], aes(y=module,x=fetOR, xmin=lowerCI, xmax=upperCI), colour="black", height=ci_bar_height,size=2)+ # add HORISONTAL error bars, geom_errorbar used for vertical..
geom_point(data=plot_dat[[idat]], aes(y=module, x=fetOR), shape=15, cex=plot_dat[[idat]]$point_width, colour=plot_dat[[idat]]$color)+ # add plotting points
# xlim(0, 5) # removes datapoints outside the range == scale_x_continuous(limits=c(-5000, 5000))
#
# coord_cartesian(xlim=c(0, 60)) + # moves the window only
coord_cartesian(xlim=x_lim) + # coord_cartesian(xlim=c(-0.5, 1)) # moves the window only
scale_x_continuous(breaks=c(0,1:x_lim[2])) +
# scale_x_continuous(breaks=c(0,1,5,seq(10,2000,by=10))) +
# scale_size_area() +
xlab("FET odds ratio and CI") +
ylab("module")
## plot line styles || http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# ggtitle(names(table(dat$serie.name)[idat]))
## essential if using the ugly mess that is ggplot2, for more options : http://felixfan.github.io/rstudy/2013/11/27/ggplot2-remove-grid-background-margin/
# myplot + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.background=element_blank(), axis.line=element_line(colour="black"))
#
gplots_dat[[names(plot_dat)[idat]]]=(myplot + theme_bw())
}
}
if(mode=='combined'){
plot_merge=plot_dat[[1]]
if(length(plot_dat)>1){
for(ilis in 2:length(plot_dat)){
plot_merge=rbind(plot_merge,plot_dat[[ilis]])
}}
myplot=ggplot() + # initiate the plot space, saved to myplot variable for later display
# - subsequent additions / modifications of the plot can be done by manipulating this space
# geom_point(data=df, aes(y=Module, x=OR),colour='red', size=3) +
# geom_errorbar(data=pdat,aes(x=Module,y=OR,ymin=lower.95..CI,ymax=upper.95..CI),colour="grey60",width=0.2)
geom_vline(xintercept=1, linetype="dashed")+ # add vertical line
# geom_vline(xintercept=(1), linetype="dashed")+ # add vertical line
geom_errorbarh(data=plot_merge, aes(y=module,x=fetOR, xmin=lowerCI, xmax=upperCI), colour="black", height=ci_bar_height,size=2)+ # add HORISONTAL error bars, geom_errorbar used for vertical..
geom_point(data=plot_merge, aes(y=module, x=fetOR), shape=15, cex=plot_merge$point_width, colour=plot_merge$color)+ # add plotting points
# xlim(0, 5) # removes datapoints outside the range == scale_x_continuous(limits=c(-5000, 5000))
#
# coord_cartesian(xlim=c(0, 60)) + # moves the window only
coord_cartesian(xlim=x_lim) + # coord_cartesian(xlim=c(-0.5, 1)) # moves the window only
scale_x_continuous(breaks=c(0,1:x_lim[2])) +
# scale_x_continuous(breaks=c(0,1,5,seq(10,2000,by=10))) +
# scale_size_area() +
xlab("FET odds ratio and CI") +
ylab("module")
## plot line styles || http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# ggtitle(names(table(dat$serie.name)[idat]))
## essential if using the ugly mess that is ggplot2, for more options : http://felixfan.github.io/rstudy/2013/11/27/ggplot2-remove-grid-background-margin/
# myplot + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.background=element_blank(), axis.line=element_line(colour="black"))
#
gplots_dat=(myplot + theme_bw())
}
return(gplots_dat)
}
bg.list<-function(module_list,bg_list){
# cat('\tUSE:\tcheck and, if necessary, generate a list of backgrounds the same length as mod_list based on bg_list class\n')
if(class(bg_list)=='list'){
if(length(bg_list)==length(module_list)){
cat('\tusing multiple bg list as provided\n')
}
if(length(bg_list)!=length(module_list)){
stop('\tlength of bg_list provided not the same length as module_list. if same bkgrnd for all modules, do not provide as.list()\n')
}
}
if(class(bg_list)!='list'){
cat('\tusing same bkgrnd for all modules, as provided (assumes vector)\n')
bg_nw=list()
for(imod in 1:length(module_list)){
bg_nw[[names(module_list)[imod]]]=bg_list
}
}
return(bg_nw)
}
net.cons<-function(alis,blis,abg=NA,bbg=NA,do_plots=T,p_thresh=0.01,main="Maximum % of overlap with p hyper > 0.01"){
# cat('\tUSE:\tcalculte overlaps between two lists')
# cat('\tUSE:\toptional - add backgrounds for bg.common() - not implemented')
#cat('\tNOTE:\tbg_list - options: 'NA' - one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)\n\n')
##===================================================================================================================================
## generate background as per options, if list provided (same length as modules)=================================
options(warn=-1)
if(!is.na(abg)){
abg=bg.list(alis,abg)
}
if(!is.na(bbg)){
bbg=bg.list(blis,bbg)
}
options(warn=0)
results_mat=as.data.frame(matrix(NA,nrow=length(alis),ncol=length(blis)))
rownames(results_mat)=names(alis)
colnames(results_mat)=names(blis)
if(class(abg)=='list'){
cons_stat=list(
pc_test=results_mat
,phyper_test=results_mat
,pc_repl=results_mat
,phyper_repl=results_mat
)
}
if(class(abg)!='list'){
cons_stat=list(
pc_test=results_mat
,pc_repl=results_mat
)
}
for(amod in 1:length(alis)){
for(bmod in 1:length(blis)){
# cat(amod,' | ',bmod,'\n')
if(class(abg)=='list'){
bg_comn=intersect(abg[[amod]],bbg[[bmod]])
# cat('\t\t background overlap: ',round(length(bg_comn)/length(abg[[amod]]),digits=2),round(length(bg_comn)/length(bbg[[bmod]]),digits=2),'\n')
alis[[amod]]=alis[[amod]][alis[[amod]]%in%bg_comn]
blis[[bmod]]=blis[[bmod]][blis[[bmod]]%in%bg_comn]
comn=intersect(alis[[amod]],blis[[bmod]])
cat('\t\t gene overlap: ',names(alis[amod]),round(length(comn)/length(alis[[amod]]),digits=2),' | ',names(blis[bmod]),round(length(comn)/length(blis[[bmod]]),digits=2),'\n')
}
if(class(abg)!='list'){
comn=intersect(alis[[amod]],blis[[bmod]])
cat('\t\t gene overlap: ',names(alis[amod]),round(length(comn)/length(alis[[amod]]),digits=2),' | ',names(blis[bmod]),round(length(comn)/length(blis[[bmod]]),digits=2),'\n')
}
cons_stat$pc_test[names(alis[amod]),names(blis[bmod])]=round(length(comn)/length(alis[[amod]]),digits=2)
cons_stat$pc_repl[names(alis[amod]),names(blis[bmod])]=round(length(comn)/length(blis[[bmod]]),digits=2)
if(class(abg)=='list'){
cons_stat$phyper_test[names(alis[amod]),names(blis[bmod])]=phyper(
q=length(comn) # n.success.sample
,m=sum(blis[[bmod]] %in% bg_comn) # n.success.popn
,n=length(bg_comn) # n.population
,k=length(alis[[amod]]) # n.sample
,lower.tail=F)
cons_stat$phyper_repl[names(alis[amod]),names(blis[bmod])]=phyper(
q=length(comn) # n.success.sample
,m=sum(alis[[amod]] %in% bg_comn) # n.success.popn
,n=length(bg_comn) # n.population
,k=length(blis[[bmod]]) # n.sample
,lower.tail=F)
}
}
}
cons_stat$pc_max=pmax(cons_stat$pc_test,cons_stat$pc_repl)
if(class(abg)=='list'){
cons_stat$phyper_min=pmin(cons_stat$phyper_test,cons_stat$phyper_repl)
}
if(do_plots){
library(corrplot)
if(class(abg)!='list'){
corrplot(make.numeric(cons_stat$pc_max),method="circle",is.corr=FALSE
,cl.lim=c(0, 1),col=rev(c(colrb)),cl.align="l",tl.col="black"
,mar=c(0,0,4,0)
,title=main
)
}
if(class(abg)=='list'){
corrplot(make.numeric(cons_stat$pc_max),method="circle", is.corr=FALSE
,p.mat=make.numeric(cons_stat$phyper_min), insig="blank",sig.level=p_thresh
,cl.lim=c(0, 1),col=rev(c(colrb)),cl.align="l",tl.col="black"
,mar=c(0,0,4,0)
,title=main
)
}
}
return(invisible(cons_stat))
}
net.overlap<-function(alis,abg=NA,do_plots=T,rev_col=F,...){
# cat('\t USE:\t calculte overlaps between two lists')
# cat('\t USE:\t optional - add backgrounds for bg.common() - not implemented')
#cat('\t NOTE:\t bg_list - options: 'NA' - one2one human orthologous of mice genes used to build the list of cell class enriched genes by Zeisel et al 2015(Science)\n\n')
##===================================================================================================================================
## generate background as per options, if list provided (same length as modules)=================================
options(warn=-1)
if(!is.na(abg)){
abg=bg.list(alis,abg)
}
options(warn=0)
results_mat=as.data.frame(matrix(NA,nrow=length(alis),ncol=length(alis)))
rownames(results_mat)=names(alis)
colnames(results_mat)=names(alis)
stats_mat=as.data.frame(matrix(NA,nrow=length(alis),ncol=3))
rownames(stats_mat)=names(alis)
colnames(stats_mat)=c('n.genes','n.overlap')
if(class(abg)=='list'){
cons_stat=list(
pc=results_mat
,n.overlap=results_mat
,phyper=results_mat
)
}
if(class(abg)!='list'){
cons_stat=list(
pc=results_mat
,n.overlap=results_mat
)
}
for(amod in 1:length(alis)){
for(bmod in 1:length(alis)){
if(class(abg)=='list'){
bg_comn=intersect(abg[[amod]],bbg[[bmod]])
cat('\t\t background overlap: ',round(length(bg_comn)/length(abg[[amod]]),digits=2),round(length(bg_comn)/length(bbg[[bmod]]),digits=2),'\n')
humpty=alis[[amod]][alis[[amod]]%in%bg_comn]
dumpty=alis[[bmod]][alis[[bmod]]%in%bg_comn]
comn=intersect(alis[[amod]][alis[[amod]]%in%bg_comn],alis[[amod]][alis[[amod]]%in%bg_comn])
}
if(class(abg)!='list'){
comn=intersect(alis[[amod]],alis[[bmod]])
cat('\t\t gene overlap: ',names(alis[amod]),round(length(comn)/length(alis[[amod]]),digits=2),' \t| ',names(alis[bmod]),round(length(comn)/length(alis[[bmod]]),digits=2),'\n')
humpty=alis[[amod]]
dumpty=alis[[bmod]]
}
cons_stat$pc[names(alis[amod]),names(alis[bmod])]=round(length(comn)/length(alis[[amod]]),digits=2)
cons_stat$pc[names(alis[bmod]),names(alis[amod])]=round(length(comn)/length(alis[[bmod]]),digits=2)
cons_stat$pc[names(alis[amod]),names(alis[bmod])]=round(length(comn)/length(alis[[amod]]),digits=2)
cons_stat$pc[names(alis[bmod]),names(alis[amod])]=round(length(comn)/length(alis[[bmod]]),digits=2)
if(class(abg)=='list'){
cons_stat$phyper[names(alis[amod]),names(alis[bmod])]=phyper(
q=length(comn) # n.success.sample
,m=sum(alis[[bmod]] %in% bg_comn) # n.success.popn
,n=length(bg_comn) # n.population
,k=length(alis[[amod]]) # n.sample
,lower.tail=F)
cons_stat$phyper[names(alis[bmod]),names(alis[amod])]=phyper(
q=length(comn) # n.success.sample
,m=sum(alis[[amod]] %in% bg_comn) # n.success.popn
,n=length(bg_comn) # n.population
,k=length(alis[[bmod]]) # n.sample
,lower.tail=F)
}
}
}
if(do_plots){
library(corrplot)
diag(cons_stat$pc)=0.000001
if(!rev_col){corrplot(make.numeric(cons_stat$pc),p.mat=make.numeric(cons_stat$pc)*100,sig.level=0.001,col=rev(c(colrb)),cl.align="l",tl.col="black",method='circle',is.corr=F,insig='p-value',...)}#,cl.lim=c(0,70)
if(rev_col){corrplot(make.numeric(cons_stat$pc),p.mat=make.numeric(cons_stat$pc)*100,sig.level=0.001,col=c(colrb),cl.align="l",tl.col="black",method='circle',is.corr=F,insig='p-value',...)}#,cl.lim=c(0,70)
diag(cons_stat$pc)=1
}
return(invisible(cons_stat))
}
cyt.connect<-function(cor_mat,thresh=0.001,use_pcor=T){
# USE: \t- convert correlation matrix to cytoscape connections
# NOTE:\t- pcor=T - use Aracne can to calculate partial correlations
# NOTE:\t- thresh=0.001, pcor=F - use correlation matrix to get connections that pass the R2 threshold
if(nrow(cor_mat)!=ncol(cor_mat)){
stop('\n\tERROR: expect a square correlation matrix \n\n')
}
if(use_pcor){
cat('\tcalculating partial correlations using Aracne PMID: 16723010\n')
library(minet)
cor_mat=aracne(abs(cor_mat))
}
for(irow in 1:nrow(cor_mat)){
humpty=as.data.frame(cor_mat[irow,][cor_mat[irow,]>=thresh])
colnames(humpty)='pcor'
humpty$a=rownames(cor_mat)[irow]
humpty$b=rownames(humpty)
# cat(nrow(humpty),'\n')
# print(irow)
if(irow==1){dumpty=humpty}
if(irow>1){dumpty=rbind(dumpty,humpty)}
#print(nrow(dumpty))
}
dumpty=dumpty[dumpty$a!=dumpty$b,] ## remove genes interacting with themselves
rownames(dumpty)=1:nrow(dumpty)
cat('\t',length(unique(c(dumpty$a,dumpty$b))),'of',nrow(cor_mat),' : ',(length(unique(c(dumpty$a,dumpty$b)))/nrow(cor_mat))*100,'% of genes have connections','\n')
## filtering to remove duplicate connections ie a~b, b~a
dtb=dumpty
rm(humpty)
rm(dumpty)
## more elegant way to do this is first sort the genes then paste them..........
dtb$x=paste(dtb$a,dtb$b)
dtb$y=paste(dtb$b,dtb$a)
dumpty=list()
k=nrow(dtb)
cat('\tremoving duplicate connectoins\n')
while(nrow(dtb)>0){
humpty=dtb[which(dtb$x==dtb$x[1] | dtb$x==dtb$y[1] | dtb$y==dtb$y[1] | dtb$y==dtb$y[1]), ]
dtb=dtb[-which(dtb$x==dtb$x[1] | dtb$x==dtb$y[1] | dtb$y==dtb$y[1] | dtb$y==dtb$y[1]), ]
dumpty[[paste(sort(unique(c(humpty$a,humpty$b))),collapse='_')]]=t(as.data.frame(list(
a=sort(unique(c(humpty$a,humpty$b)))[1]
,b=sort(unique(c(humpty$a,humpty$b)))[2]
,pcor=max(humpty$pcor)
)))
cat(round(1-nrow(dtb)/k,digits=2),'\r');flush.console()
}
cat('\n')
return(invisible(as.data.frame(t(as.data.frame(dumpty)))))
## if partial correl does not give all genes connections, it is feasible to add a single connection to missing genes based on max of correl
}
install.bioc<-function(pkg_name){
source("https://bioconductor.org/biocLite.R")
biocLite(pkg_name)
}
install.dependencies<-function(){
## USE : install all packages used by one of the functions (yes these can indeed be specified as dependencies, not implemented yet)
## NOTE: example error below means the mirror used is not functional/unavailable, try another..
# Warning: unable to access index for repository https://mirrors.ebi.ac.uk/CRAN/src/contrib:
# cannot download all files
# Warning: unable to access index for repository https://mirrors.ebi.ac.uk/CRAN/bin/macosx/mavericks/contrib/3.3:
# cannot download all files
# Warning message:
# package ‘gplots’ is not available (for R version 3.3.0)
system('git clone https://github.com/jalvesaq/colorout.git')
system('sudo R CMD INSTALL colorout')
library(colorout)
install.packages('devtools')
install.packages('gplots')
## installing WGCNA - requires dependencies first - simplest is to use instruction as per website :
## http://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/#cranInstall
source("http://bioconductor.org/biocLite.R")
biocLite(c("AnnotationDbi", "impute", "GO.db", "preprocessCore"))
install.packages("WGCNA")
install.packages('MetaDE')
install.packages('mixtools')
install.packages('pamr')
install.packages("psych")
install.packages('corrplot')
install.packages('gplots')
install.packages('ggplot2')
install.packages('pvclust')
install.packages('dendextend')
install.bioc('minet')
install.bioc('limma')
install.bioc('biomaRt')
}
lcount<-function(x,length){
cat(round(x/length,digits=2),"\r");flush.console()
return(x+1)
}
lprogr<-function(xvar,xful){
cat(xvar,which(xful==xvar),'of',length(xful),'\n')
}
overlap<-function(A,B,n=5){
## modified to run only for unique A & B, otherwise numbers can be misleading
unA=unique(A)
unB=unique(B)
cat('\n\tlength(A) : ',length(A),'\t unique(A) : ',length(unA),' \t',round(length(unA)/length(A),digits=3)*100,'%\n')
cat('\tlength(B) : ',length(B),'\t unique(B) : ',length(unB),' \t',round(length(unB)/length(B),digits=3)*100,'%','\n')
both=union(unA, unB)
intr=intersect(unA,unB)
inA=both %in% unA
inB=both %in% unB
cat('\n')
print(table(inA, inB))
exA=unA[!(unA%in%intr)]
exB=unB[!(unB%in%intr)]
cat('\n\t',length(intr),'\tinA & inB :\t',paste(sort(intr[1:n]) ,collapse=', '),'\n')
cat('\t',length(exA),'\tinA notB :\t', paste(sort(exA[1:n]),collapse=', '),'\n')
cat('\t',length(exB),'\tinB notA :\t', paste(sort(exB[1:n]),collapse=', '),'\n')
return(invisible(list(inter=intr,union=both,ina=exA,inb=exB)))
}
Intersect <- function(a,b,...){
## full credit to Abhishek K Baikady at http://stackoverflow.com/questions/3695677/how-to-find-common-elements-from-multiple-vectors
Reduce(intersect, list(a,b,...))
}
Union <- function(a,b,...){
## full credit to Abhishek K Baikady at http://stackoverflow.com/questions/3695677/how-to-find-common-elements-from-multiple-vectors
Reduce(union, list(a,b,...))
}
idconvert<-function(ids,verbose=T){
ids=toupper(ids)
if(length(unique(ids))<length(ids)){
warning('\tWARNING : some ids are not unique, ',length(unique(ids)),' of ',length(ids),' are unique after "toupper" case conversion\n')
}
if(verbose){
cat('\tnum genes recognised : ',sum(ids%in%idmap$ids),', ',frac(sum(ids%in%idmap$ids),length(ids),num=T)*100,'%\n',sep='')
}
idmap$ids=toupper(idmap$ids)
## idmap[idmap$ids%in%ids,c('gene','ids','name')]
return(idmap[idmap$ids%in%ids,c('gene','ids','name')])
## for(igen )
}
idconvert.ensg<-function(dat_lis){
## conveting list of vectors containing only official HUGO gene names (rownames) to ENSG
dat_out=list()
for(ilis in names(dat_lis)){
dat_out[[ilis]]=idmap.ensg[idmap.ensg$gene%in%(dat_lis[[ilis]]),'ids']
cat('\t',names(dat_lis[ilis]),'% genes mapped',round(length(dat_out[[ilis]])/length(dat_lis[[ilis]]),digits=3),'\n')
}
return(dat_out)
}
check.data<-function(dat_lis){
dummy=list()
for(ilis in names(dat_lis)){
dummy[[ilis]]=rownames(dat_lis[[ilis]])
cat('\t',ilis,'\tnrow=',length(dummy[[ilis]]),'\tncol=',ncol(dat_lis[[ilis]]),'\n')
}
stata=list()
for(ilis in 1:length(dummy)){
if(ilis==1){
holder=dummy[[names(dummy)[ilis]]]
}
if(ilis>1){
stata[[paste(names(dummy)[ilis-1],names(dummy)[ilis],sep='_')]]=sum(holder==dummy[[names(dummy)[ilis]]])/length(holder)
cat('\t\t% genes in same order between',names(dummy)[ilis-1],'&',names(dummy)[ilis],'wrt',names(dummy)[ilis-1],stata[[paste(names(dummy)[ilis-1],names(dummy)[ilis],sep='_')]],'\n')
}
}
}
applydiffcoex <- function(beta2,corr_mat_list,signtype=signType) # for multiple conditions
{
correl=vector(mode="list", length=length(corr_mat_list)); # empty (for adjacencies)
compDij=vector(mode="list", length=length(corr_mat_list)); # empty
#compute the cij0 (for all the conditions)
for (k in 1:length(corr_mat_list))
{
datmat=as.matrix(corr_mat_list[[k]])
correl[[k]]= sign(datmat)*(datmat)^2
diag(correl[[k]])=0 # change the diagonal to 0 instead of 1
}
# correl holds information about adjacency: sign(corr)*(corr)^2
cij0=Reduce("+",correl)/length(correl); # Reduce adds two matrices (correl is a list of matrices)
## A general-purpose adder:
#Example
#add <- function(x) Reduce("+", x)
#add(list(1, 2, 3))
## Like sum(), but can also used for adding matrices etc., as it will
## use the appropriate '+' method in each reduction step.
#compute Dij
for (element in 1:length(correl))
{
compDij[[element]]=abs(correl[[element]]- cij0)/2;
}
Dij=(1/(length(corr_mat_list)-1))*Reduce("+", compDij)^(beta2/2) # compDij is a list of matrices
dissTOM=TOMdist(Dij, TOMType=signtype); # WGCNA function
collectGarbage()
return(dissTOM)
}
#===================================================================================================================================
# # DiffCoEx (same as Kirill's but with some options and explanations)
#===================================================================================================================================
wgcna_diffcoex_L <- function (list_expr, pow=6, minModuleSize=40, mergeHeight=0.15, datDescr="", signType="unsigned")
{
print(" NOTE : Input data (list_expr) is expected as a list with each entry : rows=genes, columns=samples")
library("WGCNA")
enableWGCNAThreads()
print("Can deal with only one softpower (pow) at a time ")
set.seed(0)
bicorL=list() # correlation of gene expression
for (ireg in 1:length(list_expr)) {
print("■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■")
print(paste(names(list_expr)[ireg], ireg, "of", length(names(list_expr))))
t0=Sys.time()
COND=list_expr[[ireg]]
# CONDav=scale(COND, scale=F)
CONDav=COND
bicorL[[ireg]]=bicor(t(as.matrix(CONDav))) # bicor correlation of conditions
}
names(bicorL)=names(list_expr)
collectGarbage()
t0=Sys.time()
softPower=pow
print(softPower)
# dissTOM=applydiffcoex(softPower, bicorL, signtype=signType) # *
dissTOM=diffcoex_paper(softPower, bicorL, signtype=signType) # *
print(Sys.time() - t0)
collectGarbage()
geneTree=hclust(as.dist(dissTOM), method="average")
collectGarbage()
dynamicMods=cutreeDynamic(dendro=geneTree, distM=dissTOM,
method="hybrid", cutHeight=0.996, deepSplit=T,
pamRespectsDendro=FALSE, minClusterSize=minModuleSize)
dynamicColors=labels2colors(dynamicMods)
collectGarbage()
Datall=t(as.data.frame(list_expr)) # merge the conditions
collectGarbage()
mergedColor=mergeCloseModules(Datall, dynamicColors, cutHeight=mergeHeight)$color
print(paste("mergedColor=", length(unique(mergedColor)),
unique(mergedColor)))
print(Sys.time() - t0)
collectGarbage()
mstat=as.data.frame(table(mergedColor)) # using dynamicColors not Merged
mstat=mstat[order(mstat[, 2], decreasing=T), ]
msta0=mstat[(mstat[, 1]=="grey"), ]
msta0$module="M0"
mstat=mstat[!(mstat[, 1]=="grey"), ]
mstat$module=paste0("M", 1:nrow(mstat))
mstat=rbind(mstat, msta0)
colnames(mstat)=c("color", "ngenes", "module")
mstat$color=as.character(mstat$color)
if (datDescr !="") {
mstat$module=paste(mstat$module, dat_descr, sep="_")
}
module_list=list()
for (imod in 1:nrow(mstat)) {
module_list[[mstat$module[imod]]]=rownames(list_expr[[1]])[mergedColor==
mstat$color[imod]]
}
module_list[["bkgrnd"]]=rownames(list_expr[[1]])
module_expr=list()
for (ilis in 1:length(list_expr)) {
for (imod in 1:length(module_list)) {
module_expr[[names(list_expr)[ilis]]][[names(module_list)[imod]]]=list_expr[[names(list_expr)[ilis]]][module_list[[names(module_list)[imod]]],
]
}
}
mbg=as.data.frame("bkgrnd")
mbg$length=nrow(list_expr[[1]])
mbg$name="bkgrnd"
colnames(mbg)=colnames(mstat)
mstat=rbind(mstat, mbg)
readme="\n\tModules are named based on size M1 - biggest, M0 - unclustered, bkgrnd - all input genes, output contains :\n \t1. module_list - list containing names of genes in each module\n \t2. module_expr - expression matrix of all genes in module / input dataset\n \t3. mstat - key used to name modules, includes module size\n \t4. GeneTree - object to plot the WGCNA style dendrogram\n \n"
cat(readme)
return(invisible(list(module_list=module_list, module_expr=module_expr,
mstat=mstat, plotobj=geneTree, readme=readme)))
}
diffcoex_paper <- function(beta2,bicorL,signtype=signType){
AdjMatC1=sign(bicorL[[1]])*(bicorL[[1]])^2
AdjMatC2=sign(bicorL[[2]])*(bicorL[[2]])^2
message("Condition - Control")
diag(AdjMatC1)=0
diag(AdjMatC2)=0
beta1=beta2
dissTOMC1C2=TOMdist((abs(AdjMatC1-AdjMatC2)/2)^(beta1/2))
} # for 2 conditions (condition-control)
wgcna.diffcoex<-function(list_expr,pow=5,minModuleSize=40,mergeHeight=0.15,datDescr="",signType="unsigned"){
print(" NOTE : Input data (list_expr) is expected as a list with each entry : rows=genes, columns=samples")
library('WGCNA')
enableWGCNAThreads()
print("Can deal with only one softpower (pow) at a time ")
set.seed(0) # reproducibility
bicorL = list()
for(ireg in 1:length(list_expr)){
print("■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■")
print(paste(names(list_expr)[ireg],ireg,"of",length(names(list_expr))))
t0=Sys.time()
COND= list_expr[[ireg]]
#add a line to substract mean of gene expression row by row in each condition
# CONDav = scale(COND,scale=F) ## the data is likely scaled already (ie if removed a covariate etc)
bicorL[[ireg]]= bicor(t(as.matrix(COND))) # iteration to adapt to the seq chosen
}
names(bicorL)=names(list_expr)
collectGarbage()
t0=Sys.time()
softPower=pow
print(softPower)
dissTOM=applydiffcoex(softPower,bicorL,signtype=signType)
print(Sys.time()-t0) #41.17 min
collectGarbage()
geneTree=hclust(as.dist(dissTOM), method="average")
#print(Sys.time()-t0)
collectGarbage()
dynamicMods=cutreeDynamic(dendro=geneTree, distM=dissTOM,
method="hybrid",cutHeight=.996,
deepSplit=T, pamRespectsDendro=FALSE,
minClusterSize=minModuleSize)
dynamicColors=labels2colors(dynamicMods)
# print(paste("dynamicColors =",length(unique(dynamicColors)),unique(dynamicColors)))
#print(Sys.time()-t0)
collectGarbage()
Datall = t(as.data.frame(list_expr))
collectGarbage()
mergedColor=mergeCloseModules(Datall,dynamicColors,cutHeight=mergeHeight)$color
print(paste("mergedColor =",length(unique(mergedColor)),unique(mergedColor)))
print(Sys.time()-t0)
collectGarbage()
#' Plot the dendrogram and colors underneath
# pdf(file=paste(outDir,"/plots/",datDescr,"_power",softPower,"_dendogram.pdf",sep=""),width=10,paper='a4r')
# plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColor), "Hybrid Tree Cut",
# dendroLabels=FALSE, hang=0.03,addGuide=TRUE,
# guideHang=0.05,main=paste("Cluster Dendrogram ",datDescr," (power ",softPower,")",sep=""))
# dev.off()
# print(Sys.time()-t0)
mstat=as.data.frame(table(mergedColor))
mstat=mstat[order(mstat[,2],decreasing=T),]
msta0=mstat[(mstat[,1]=='grey'),]
msta0$module='M0'
mstat=mstat[!(mstat[,1]=='grey'),]
mstat$module=paste0('M',1:nrow(mstat))
mstat=rbind(mstat,msta0)
colnames(mstat)=c('color','ngenes','module')
mstat$color=as.character(mstat$color)
if(datDescr!=''){mstat$module=paste(mstat$module,dat_descr,sep="_")} ## add info after module name
## module membership ------------------------------------------
module_list=list()
for(imod in 1:nrow(mstat)){
module_list[[mstat$module[imod]]]=rownames(list_expr[[1]])[mergedColor==mstat$color[imod]] ## assuming all module gene names are in the same order, if not, modules are probably unreliable anyhow
}
module_list[['bkgrnd']]=rownames(list_expr[[1]])
module_expr=list()
for(ilis in 1:length(list_expr)){
## module expression ------------------------------------------
for(imod in 1:length(module_list)){
module_expr[[names(list_expr)[ilis]]][[names(module_list)[imod]]]=list_expr[[names(list_expr)[ilis]]][module_list[[names(module_list)[imod]]],]
}
}
## add bkgrnd info to mstat
mbg=as.data.frame('bkgrnd')
mbg$length=nrow(list_expr[[1]])
mbg$name='bkgrnd'
colnames(mbg)=colnames(mstat)
mstat=rbind(mstat,mbg)
readme='\n\tModules are named based on size M1 - biggest, M0 - unclustered, bkgrnd - all input genes, output contains :
\t1. module_list - list containing names of genes in each module
\t2. module_expr - expression matrix of all genes in module / input dataset
\t3. mstat - key used to name modules, includes module size
\t4. GeneTree - object to plot the WGCNA style dendrogram
\n'
cat(readme)
return(invisible(list(module_list=module_list,module_expr=module_expr,mstat=mstat,plotobj=geneTree,readme=readme)))
}
cmap.meta<-function(lmod,bkg='NA',de_thresh=0.01,n_genes=5){ ## combine the stuffs below to use with function rather than combined stuff as is atm
cat('\n\tNOTE: this function requires two objects: "metsum" & "degen", available from:\nhttps://www.dropbox.com/s/5nyydp1y5htikba/004.DTB.full_info.single_DEG.meta_randM.batch_corrected.scored.Rdata?dl=0\n\n')
cat('\n\tNOTE: currently list of drugs to test is taken from "degen" only, adding a drug name/data to only "metsum" will not perform enrichments\n\n')
# INPUT: lmod - list of module names (for each expect chracter vector $up $down of ENSG, if unavailable, arbitrarily assign all genes to $up or $down)
#### input format :
##> str(bkg)
# chr [1:13210] "ENSG00000121410" "ENSG00000175899" "ENSG00000166535" ...
#> str(lmod)
#List of 4
# $ dif.hubs :List of 1
# ..$ up: chr [1:130] "ENSG00000198826" "ENSG00000105011" "ENSG00000066279" "ENSG00000087586" ...
# $ E2F1.hubs:List of 1
# ..$ up: chr [1:66] "ENSG00000198826" "ENSG00000066279" "ENSG00000087586" "ENSG00000178999" ...
# $ E2F1.M5 :List of 2
# ..$ down: chr [1:12] "ENSG00000127837" "ENSG00000159842" "ENSG00000149925" "ENSG00000100307" ...
# ..$ up : chr [1:101] "ENSG00000198826" "ENSG00000066279" "ENSG00000156802" "ENSG00000176208" ...
# $ M5 :List of 2
# ..$ down: chr [1:216] "ENSG00000127837" "ENSG00000165029" "ENSG00000127220" "ENSG00000159842" ...
# ..$ up : chr [1:262] "ENSG00000159251" "ENSG00000198826" "ENSG00000105011" "ENSG00000066279" ...
## - lmod - list of modules - for each module 2 lists of gene ids "ENSG" - "up" & "down" - genes up/down-regulated between treatment and control
#de_thresh=0.01
#n_genes=5
mstat=list()
sigen=list()
k=1
options(warn=-1)
if(bkg=='NA'){
cat('\tusing default background - all cmap genes')
bkg=unique(c(rownames(metsum[[1]]),rownames(degen[[1]])))
}
options(warn=0)
dkey=gsub('DE_Drug_(.*)_Cell_.*_Array_.*_Conc_.*_Conc_.*_time_.*_res','\\1',names(degen))
# matst(names(metsum)%in%dkey)
ukey=unique(dkey)
#idru='wortmannin'
cat('\n\tperform enrichment test for module genes in cmap\n')
for(idru in ukey){
if(idru%in%names(metsum)){
dummy=metsum[[idru]]
rownames(dummy)=gsub('_at','',rownames(dummy))
dummy=dummy[rownames(dummy)%in%bkg,]
dummy$log.fc=dummy$logFC.mean
dummy$adj.p=dummy$randmMod.fdr
dummy=dummy[,c('log.fc','adj.p')]
holder=list()
# holder$n.bg=nrow(dummy)
holder$n.bg.all=rownames(dummy)
holder$n.bg.sig=rownames(dummy[dummy$adj.p<de_thresh,])
holder$n.bg.sig_up=rownames(dummy[dummy$adj.p<de_thresh & dummy$log.fc>0,])
holder$n.bg.sig_down=rownames(dummy[dummy$adj.p<de_thresh & dummy$log.fc<0,])
sigen[[paste0('mixedM_',idru)]]$bkg=(holder)
if(length(holder$n.bg.sig)>n_genes){
# if(idru%in%names(metsum)){mstat$meta[[idru]]$bg=as.data.frame(humpty)}
# if(!idru%in%names(metsum)){mstat$single[[idru]]$bg=as.data.frame(humpty)}
for(imod in names(lmod)){
if('up' %in% names(lmod[[imod]])){
dumpty=dummy[rownames(dummy)%in%lmod[[imod]]$up,]
holder[[imod]]$up_downreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc<0,]
holder[[imod]]$up_upreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc>0,]
mstat[[paste0('mixedM_',idru)]][[paste(imod,'up_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_down,counts=F))
mstat[[paste0('mixedM_',idru)]][[paste(imod,'up_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_up,counts=F))
sigen[[paste0('mixedM_',idru)]][[paste(imod,'up',sep='.')]]=holder[[imod]]
}
if('down' %in% names(lmod[[imod]])){
dumpty=dummy[rownames(dummy)%in%lmod[[imod]]$down,]
holder[[imod]]$down_downreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc<0,]
holder[[imod]]$down_upreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc>0,]
mstat[[paste0('mixedM_',idru)]][[paste(imod,'down_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_down,counts=F))
mstat[[paste0('mixedM_',idru)]][[paste(imod,'down_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_up,counts=F))
sigen[[paste0('mixedM_',idru)]][[paste(imod,'down',sep='.')]]=holder[[imod]]
}
}
rm(dumpty)
}
k=lcount(k,length(ukey))
rm(dummy)
rm(holder)
}
if(!(idru%in%names(metsum))){
dummy=degen[dkey==idru]
if(length(dummy)>1){ ## if removing the above constraint will need to introduce another loop for each experiment within same drug
cat(idru,length(dummy))
}
if(length(dummy)==1){
dummy=dummy[[1]]
rownames(dummy)=gsub('_at','',rownames(dummy))
dummy=dummy[rownames(dummy)%in%bkg,]
dummy$log.fc=dummy$logFC
dummy$adj.p=dummy$adj.P.Val
dummy=dummy[,c('log.fc','adj.p')]
}
holder=list()
# holder$n.bg=nrow(dummy)
holder$n.bg.all=rownames(dummy)
holder$n.bg.sig=rownames(dummy[dummy$adj.p<de_thresh,])
holder$n.bg.sig_up=rownames(dummy[dummy$adj.p<de_thresh & dummy$log.fc>0,])
holder$n.bg.sig_down=rownames(dummy[dummy$adj.p<de_thresh & dummy$log.fc<0,])
sigen[[paste0('single_',idru)]]$bkg=(holder)
if(length(holder$n.bg.sig)>n_genes){
# if(idru%in%names(metsum)){mstat$meta[[idru]]$bg=as.data.frame(humpty)}
# if(!idru%in%names(metsum)){mstat$single[[idru]]$bg=as.data.frame(humpty)}
for(imod in names(lmod)){
if('up' %in% names(lmod[[imod]])){
dumpty=dummy[rownames(dummy)%in%lmod[[imod]]$up,]
holder[[imod]]$up_downreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc<0,]
holder[[imod]]$up_upreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc>0,]
mstat[[paste0('single_',idru)]][[paste(imod,'up_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_down,counts=F))
mstat[[paste0('single_',idru)]][[paste(imod,'up_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_up,counts=F))
sigen[[paste0('single_',idru)]][[paste(imod,'up',sep='.')]]=holder[[imod]]
}
#sampl=rownames(dumpty)
#bkgrnd=rownames(dummy)
#success=holder$n.bg.sig_down
#success=holder$n.bg.sig_up
#table(sampl%in%bkgrnd)
#table(success%in%sampl)
#table(success%in%bkgrnd)
if('down' %in% names(lmod[[imod]])){
dumpty=dummy[rownames(dummy)%in%lmod[[imod]]$down,]
holder[[imod]]$down_downreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc<0,]
holder[[imod]]$down_upreg=dumpty[dumpty$adj.p<de_thresh & dumpty$log.fc>0,]
mstat[[paste0('single_',idru)]][[paste(imod,'down_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_down,counts=F))
mstat[[paste0('single_',idru)]][[paste(imod,'down_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(dumpty),bkgrnd=rownames(dummy),success=holder$n.bg.sig_up,counts=F))
sigen[[paste0('single_',idru)]][[paste(imod,'down',sep='.')]]=holder[[imod]]
}
}
# rm(dumpty)
}
k=lcount(k,length(ukey))
# rm(dummy)
# rm(holder)
}
}
# str(mstat[['mixedM_wortmannin']])
# str(sigen[['mixedM_wortmannin']])
# (sigen[['mixedM_wortmannin']][['E2F1.M5.down_upreg']]$down_upreg)
sigdru=list()
sigsum=list()
sigpc1=list()
sigpc2=list()
cat('\n\tcompiling results..\n\n')
for(idru in names(mstat)){
humpty=mstat[[idru]]
dumpty=as.data.frame(matrix(unlist(humpty),nrow=length(humpty),byrow=T))
rownames(dumpty)=names(humpty)
colnames(dumpty)=colnames(humpty[[1]])
dumpty$n_hits_pc=dumpty$samp.success/(dumpty$samp.success+dumpty$bkgrnd.success)
dumpty$n_mod_pc=dumpty$samp.success/(dumpty$n.genes)
# if(sum(dumpty$FETp<0.05)>=1){
sigdru[[idru]]=dumpty
sigsum[[idru]]=(dumpty[,c('FETp'),drop=F])
sigpc1[[idru]]=(dumpty[,c('n_hits_pc'),drop=F])
colnames(sigsum[[idru]])=paste(as.character(idru))#,colnames(sigsum[[idru]]))
colnames(sigpc1[[idru]])=paste(as.character(idru))#,colnames(sigpcs[[idru]]))
sigpc2[[idru]]=(dumpty[,c('n_mod_pc'),drop=F])
colnames(sigpc2[[idru]])=paste(as.character(idru))#,colnames(sigpcs[[idru]]))
## colnames(sigsum[[idru]])=paste(as.character(idru),colnames(sigsum[[idru]]),sep='.') #paste0('Drug_',as.character(idru))
# }
}
dnam=names(sigsum)
sigsum=t(as.data.frame(sigsum))
rownames(sigsum)=dnam
dnam=names(sigpc1)
sigpc1=t(as.data.frame(sigpc1))
rownames(sigpc1)=dnam
dnam=names(sigpc2)
sigpc2=t(as.data.frame(sigpc2))
rownames(sigpc2)=dnam
#sigsum['single_etoposide',]
#sigsum['single_monobenzone',]
#sigsum['single_trifluridine',]
#sigpcs['single_etoposide',]
#sigpcs['single_monobenzone',]
#sigpcs['single_trifluridine',]
#mstat['single_dequalinium chloride']
#### M2.up_downreg
## inf## 'single_dequalinium.chloride'
## NaN## 'single_dicycloverine'
# Head(sigsum)
readme='\tcmap differentially expressed genes from drug treatment, raw and meta-analysis
\t\tNOTE : currently individual experiments used to create the meta-analysis results are ignored
\t\t$sigsum - summary results - p-values of enrichment only
\t\t$sigpc1 - summary results - % of differentially expressed genes in module / background (n.success.module/n.success.bkg)
\t\t$sigpc2 - summary results - % of differentially expressed genes in module / all module genes (n.success.module/n.genes.module)
\t\t$sigdru - summary results - summary fisher exact test statistics
\t\t$sigen - lists of genes used to calculate fisher exact test statistics above
'
cat(readme)
return(invisible(list(sigsum=(sigsum),sigpc1=(sigpc1),sigpc2=(sigpc2),sigdru=sigdru,sigen=sigen,readme=readme)))#,dumpty=dumpty)))
}
cmap.enrich<-function(modg,bkg=NA,de_thresh=0.1,n_genes=5){ ## combine the stuffs below to use with function rather than combined stuff as is atm
cat('\n\tNOTE: this function REQUIRES properly formatted database named "degdb", cmap version available from :
\t\thttps://www.dropbox.com/s/18l1w2jbqld2mej/cmap.enrich.data.Rdata?dl=0\n\n')
#cat('\n\tNOTE: this function requires two objects: "metsum" & "degen", available from:\nhttps://www.dropbox.com/s/5nyydp1y5htikba/004.DTB.full_info.single_DEG.meta_randM.batch_corrected.scored.Rdata?dl=0\n\n')
#cat('\n\tNOTE: currently list of drugs to test is taken from "degen" only, adding a drug name/data to only "metsum" will not perform enrichments\n\n')
# INPUT: lmod - list of module names (for each expect chracter vector $up $down of ENSG, if unavailable, arbitrarily assign all genes to $up or $down)
#### input format :
##> str(bkg)
# chr [1:13210] "ENSG00000121410" "ENSG00000175899" "ENSG00000166535" ...
#> str(lmod)
#List of 4
# $ dif.hubs :List of 1
# ..$ up: chr [1:130] "ENSG00000198826" "ENSG00000105011" "ENSG00000066279" "ENSG00000087586" ...
# $ E2F1.hubs:List of 1
# ..$ up: chr [1:66] "ENSG00000198826" "ENSG00000066279" "ENSG00000087586" "ENSG00000178999" ...
# $ E2F1.M5 :List of 2
# ..$ down: chr [1:12] "ENSG00000127837" "ENSG00000159842" "ENSG00000149925" "ENSG00000100307" ...
# ..$ up : chr [1:101] "ENSG00000198826" "ENSG00000066279" "ENSG00000156802" "ENSG00000176208" ...
# $ M5 :List of 2
# ..$ down: chr [1:216] "ENSG00000127837" "ENSG00000165029" "ENSG00000127220" "ENSG00000159842" ...
# ..$ up : chr [1:262] "ENSG00000159251" "ENSG00000198826" "ENSG00000105011" "ENSG00000066279" ...
## - lmod - list of modules - for each module 2 lists of gene ids "ENSG" - "up" & "down" - genes up/down-regulated between treatment and control
#de_thresh=0.01
#n_genes=5
mstat=list()
sigen=list()
#options(warn=-1)
if(is.na(bkg)[1]){ ## first element check
cat('\tusing default background - all cmap genes\n')
bkg=bgcommon(degdb)
}
if(!(length(degdb)>0)){
stop('\t\tlength(degdb) < = 0\t\tie no database found..')
}
#options(warn=0)
#dkey=gsub('(.*)__(.*)__(.*)__(.*)__(.*)__(.*)__(.*)__(.*)','\\1;\\2',names(degdb))
# matst(names(metsum)%in%dkey)
#ukey=unique(dkey)
#idru='wortmannin'
k=1
cat('\n\tperform enrichment test for module genes in cmap\n')
for(idru in names(degdb)){
dummy=degdb[idru][[1]] ## guaranteed to be a single list since using the full names
dummy=dummy[rownames(dummy)%in%bkg,]
if(nrow(dummy)==0){stop('no overlap between provided bkg & degb')}
holder=list()
# holder$n.bg=nrow(dummy)
holder$n.bg.all=(dummy)
holder$n.bg.sig=(dummy[dummy$FDR<de_thresh,])
holder$n.bg.sig_up=(dummy[dummy$FDR<de_thresh & dummy$logFC>0,])
holder$n.bg.sig_down=(dummy[dummy$FDR<de_thresh & dummy$logFC<0,])
sigen[[idru]]$bkg=(holder)
if(nrow(holder$n.bg.sig)>=n_genes){
# print(idru)
# if(idru%in%names(metsum)){mstat$meta[[idru]]$bg=as.data.frame(humpty)}
# if(!idru%in%names(metsum)){mstat$single[[idru]]$bg=as.data.frame(humpty)}
for(imod in names(modg)){
if('up' %in% names(modg[[imod]])){
# print(imod)
dumpty=dummy[rownames(dummy)%in%modg[[imod]]$up,]
holder[[imod]]$up_module=(dumpty)
holder[[imod]]$up_downreg=(dumpty[dumpty$FDR<de_thresh & dumpty$logFC<0,])
holder[[imod]]$up_upreg=(dumpty[dumpty$FDR<de_thresh & dumpty$logFC>0,])
if(nrow(holder[[imod]]$up_downreg)>=n_genes){
# print('up')
mstat[[idru]][[paste(imod,'up_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(holder[[imod]]$up_module),bkgrnd=rownames(holder$n.bg.all),success=rownames(holder$n.bg.sig_down),counts=F))
mstat[[idru]][[paste(imod,'up_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(holder[[imod]]$up_module),bkgrnd=rownames(holder$n.bg.all),success=rownames(holder$n.bg.sig_up),counts=F))
sigen[[idru]][[paste(imod,'up',sep='.')]]=holder[[imod]]
}
# if(length(holder[[imod]]$up_downreg)<n_genes){
# mstat[[idru]][[paste(imod,'up_downreg',sep='.')]] =NA
# mstat[[idru]][[paste(imod,'up_upreg',sep='.')]] =NA
# sigen[[idru]][[paste(imod,'up',sep='.')]] =NA
# }
}
if('down' %in% names(modg[[imod]])){
# print(imod)
dumpty=dummy[rownames(dummy)%in%modg[[imod]]$down,]
holder[[imod]]$down_module=(dumpty)
holder[[imod]]$down_downreg=(dumpty[dumpty$FDR<de_thresh & dumpty$logFC<0,])
holder[[imod]]$down_upreg=(dumpty[dumpty$FDR<de_thresh & dumpty$logFC>0,])
if(nrow(holder[[imod]]$down_upreg)>=n_genes){
# print('down')
mstat[[idru]][[paste(imod,'down_downreg',sep='.')]]=as.data.frame(fet(sampl=rownames(holder[[imod]]$down_module),bkgrnd=rownames(holder$n.bg.all),success=rownames(holder$n.bg.sig_down),counts=F))
mstat[[idru]][[paste(imod,'down_upreg',sep='.')]] =as.data.frame(fet(sampl=rownames(holder[[imod]]$down_module),bkgrnd=rownames(holder$n.bg.all),success=rownames(holder$n.bg.sig_up),counts=F))
sigen[[idru]][[paste(imod,'down',sep='.')]]=holder[[imod]]
}
# if(length(holder[[imod]]$up_downreg)<n_genes){
# mstat[[idru]][[paste(imod,'down_downreg',sep='.')]] =NA
# mstat[[idru]][[paste(imod,'down_upreg',sep='.')]] =NA
# sigen[[idru]][[paste(imod,'down',sep='.')]] =NA
# }
sigen[[paste0('mixedM_',idru)]][[paste(imod,'down',sep='.')]]=holder[[imod]]
}
rm(dumpty)
}}
k=lcount(k,length(degdb))
rm(dummy)
rm(holder)
}
sigdru=list()
sigsum=list()
sigpc1=list()
sigpc2=list()
cat('\n\tcompiling results..\n\n')
for(idru in names(mstat)){
humpty=mstat[[idru]]
dumpty=as.data.frame(matrix(unlist(humpty),nrow=length(humpty),byrow=T))
rownames(dumpty)=names(humpty)
colnames(dumpty)=colnames(humpty[[1]])
dumpty$n_hits_pc=dumpty$samp.success/(dumpty$samp.success+dumpty$bkgrnd.success)
dumpty$n_mod_pc=dumpty$samp.success/(dumpty$n.genes)
# if(sum(dumpty$FETp<0.05)>=1){
sigdru[[idru]]=dumpty
sigsum[[idru]]=(dumpty[,c('FETp'),drop=F])
sigpc1[[idru]]=(dumpty[,c('n_hits_pc'),drop=F])
colnames(sigsum[[idru]])=paste(as.character(idru))#,colnames(sigsum[[idru]]))
colnames(sigpc1[[idru]])=paste(as.character(idru))#,colnames(sigpcs[[idru]]))
sigpc2[[idru]]=(dumpty[,c('n_mod_pc'),drop=F])
colnames(sigpc2[[idru]])=paste(as.character(idru))#,colnames(sigpcs[[idru]]))
## colnames(sigsum[[idru]])=paste(as.character(idru),colnames(sigsum[[idru]]),sep='.') #paste0('Drug_',as.character(idru))
# }
}
cat('\t\t1 of 3\n')
mergy=''
l=1
for(idru in names(sigsum)){
mergy=rmerge(mergy,sigsum[[idru]],verbose=F,all=T)
l=lcount(l,length(sigsum))
}
mergy=mergy[-1,-1]
sigsum=as.data.frame(t(mergy))
cat('\t\t2 of 3\n')
mergy=''
l=1
for(idru in names(sigpc1)){
mergy=rmerge(mergy,sigpc1[[idru]],verbose=F,all=T)
l=lcount(l,length(sigpc1))
}
mergy=mergy[-1,-1]
sigpc1=round(as.data.frame(t(mergy)),digits=3)
cat('\t\t3 of 3\n')
mergy=''
l=1
for(idru in names(sigpc2)){
mergy=rmerge(mergy,sigpc2[[idru]],verbose=F,all=T)
l=lcount(l,length(sigpc2))
}
mergy=mergy[-1,-1]
sigpc2=round(as.data.frame(t(mergy)),digits=3)
#sigsum['single_etoposide',]
#sigsum['single_monobenzone',]
#sigsum['single_trifluridine',]
#sigpcs['single_etoposide',]
#sigpcs['single_monobenzone',]
#sigpcs['single_trifluridine',]
#mstat['single_dequalinium chloride']
#### M2.up_downreg
## inf## 'single_dequalinium.chloride'
## NaN## 'single_dicycloverine'
# Head(sigsum)
readme='\n\tcmap differentially expressed genes from drug treatment, raw and meta-analysis
\t\tNOTE : currently individual experiments used to create the meta-analysis results are ignored
\t\t$sigsum - summary results - p-values of enrichment only
\t\t$sigpc1 - summary results - % of differentially expressed genes in module / background (n.success.module/n.success.bkg)
\t\t$sigpc2 - summary results - % of differentially expressed genes in module / all module genes (n.success.module/n.genes.module)
\t\t$sigdru - summary results - summary fisher exact test statistics
\t\t$sigen - lists of genes used to calculate fisher exact test statistics above
'
cat(readme)
return(invisible(list(sigsum=(sigsum),sigpc1=(sigpc1),sigpc2=(sigpc2),sigdru=sigdru,sigen=sigen,readme=readme)))#,dumpty=dumpty)))
}
annot.combine<-function(expr_mat,annot_mat,annot_from,annot_to,combine_method='median'){
## OPTIONS : supported "combine_method"=c('mean','median','sum') ## sum for RNA-seq : transcript per million (TPM) if mapped such that reads are not shared across transcripts (eg cufflinks), else median is more appropriate
## process expression matrix with corresponding annotation matrix
## - re-mapping id types and taking median of non-uniquly mapping ids
## - annot_from & annot_to - expect the name of colname containing current and new ids respectively
if(!combine_method%in%c('mean','median','sum')){
stop(paste0('combine_method="',combine_method,'" not currently supported'))
}
## identify multiple id to gene mappings and take median expression
ids=intersect(annot_mat[,annot_from],rownames(expr_mat))
cat('\n\texpr_mat',round(length(ids)/nrow(expr_mat),digits=3)*100,'% ids intresect with annot_mat\n')
cat('\tannot_mat',annot_from,round(length(ids)/length(unique(annot_mat[,annot_from]))*100,digits=3),'% ids intresect with expr_mat\n')
expr_mat=expr_mat[ids,]
annot_mat=annot_mat[annot_mat[,annot_from]%in%ids,]
cat('\n\tremove non-specific (duplicated) ids in',annot_from,' :\n')
annot_mat=get.duplicates(annot_mat,annot_from)$unique
cat('\n\tget genes with multiple ids in',annot_to,' :\n')
dupli=get.duplicates(annot_mat,annot_to)
unic=dupli$unique
dupl=dupli$duplicates
if(nrow(unic)>0){
ids=intersect(unic[,annot_from],rownames(expr_mat))
expu=expr_mat[sort(ids),]
rownames(unic)=unic[,annot_from]
unic=unic[rownames(expu),]
matst(rownames(expu)==rownames(unic))
matst(unic[,annot_from]==rownames(unic))
rownames(expu)=unic[,annot_to]
cat('\n\t',frac(nrow(expu),nrow(expr_mat))*100,'% ids uniquely mapped\n')
expr_out=expu
}
udup=unlist(unique(dupl[,annot_to,drop=F]))
if(nrow(unic)!=nrow(expr)){
if(length(udup)>1){
cat('\n\tmerge',length(udup),'genes, using ',combine_method,' on all mapped probes :\n')
# idup=udup[5]
dupmed=list()
k=1
for(idup in udup){
humpty=expr_mat[as.character(dupl[dupl[,annot_to]==idup,annot_from]),]
if(combine_method=='median'){holder=apply(humpty,2,median,na.rm=T)}
if(combine_method=='mean'){holder=apply(humpty,2,mean,na.rm=T)}
if(combine_method=='sum'){holder=apply(humpty,2,sum,na.rm=T)}
dupmed[[idup]]=holder
# dumpty=cor(humpty,holder)
# diag(dumpty)=NA
# dumpty=c(min(dumpty,na.rm=T),max(dumpty,na.rm=T),length(dumpty))
# cat('\t\t\t',paste(round(dumpty,digits=2),collapse=",\t"),'\n')
k=lcount(k,length(udup))
}
dupfin=t(as.data.frame(dupmed))
rownames(dupfin)=names(dupmed)
expr_out=rbind(expu,dupfin)
}
}
cat('\n\tfinal output contains',nrow(expr_out),' genes, ',round(nrow(expr_out)/nrow(expr_mat),digits=2)*100,'% of original input\n')
return(invisible(expr_out))
}
pubchem.idmatch<-function(query,pubchem_db){
# INPUTS: pubchem_db - colnames("pubchem_CID" "synonym" "lower") lower=tolower(pubchem_db$synonym
# INPUTS: query - character string of synonyms to search the "lower"
query=unique(tolower(query))
cat('\n\tconvert',length(query),'unique synonyms to pubchem IDs :\n')
holder=pubchem_db[pubchem_db$lower%in%query,]
cat('\t\t',length(unique(holder$query)),'of',length(unique(query)),'ids mapped',round(length(unique(holder$query))/length(unique(query)),digits=3)*100,'%\n')
return(unique(holder[order(holder$lower),]))
}
##~~<>pubchem.idmatch<-function(query,tolower=T,loose=F,grep=F){
# cat('\tNOTE:\t- function requires synu & synd objects from: /Data/pubchem/dtb/extras/compound.CID_Synonym_filtered.sep2016.duplicates.Rdata\n')
# query - expect vector of drug names
##~~<> if(!tolower){
##~~<> query=unique(query)
##~~<> cat('\tmapping',length(query),'compounds - same case matches\n')
##~~<> mapped=synu[synu$synonym%in%query,,drop=F]
##~~<> cat('\t\t',round(length(unique(mapped$synonym))/length(query),digits=2),'\t"unique" matches - may contain duplicated pubchem compound ids\n')
# if(loose){
# ambigous=synd[synd$synonym%in%query,,drop=F]
# cat('\t\t',round(length(unique(ambigous$synonym))/length(query),digits=2),'\tambigous matches - names that match multiple compounds\n')
# }
##~~<> if(grep){
##~~<> partial=synu[grepl(paste(query,collapse='|'),synu$synonym),,drop=F]
##~~<> cat('\t\t',round(length(unique(partial$synonym))/length(query),digits=2),'\tgrep matches in "unique" names\n')
##~~<> }
##~~<> }
##~~<> if(tolower){
##~~<> query=unique(tolower(query))
##~~<> cat('\tmapping',length(query),'compounds - all lower case\n')
##~~<> mapped=synu[synu$lower%in%query,,drop=F]
##~~<> cat('\t\t',round(length(unique(mapped$lower))/length(query),digits=2),'\t"unique" matches - may contain duplicated pubchem compound ids\n')
# if(loose){
# ambigous=synd[synd$lower%in%query,,drop=F]
# cat('\t\t',round(length(unique(ambigous$lower))/length(query),digits=2),'\tambigous matches - names that match multiple compounds\n')
# }
##~~<> if(grep){
##~~<> partial=synu[grepl(paste(query,collapse='|'),synu$lower),,drop=F]
##~~<> cat('\t\t',round(length(unique(partial$lower))/length(query),digits=2),'\tgrep matches in "unique" names\n')
##~~<> }
##~~<> }
##~~<> if(!loose&!grep){
##~~<> return(list(mapped=mapped))
##~~<> }
##~~<> if(loose&!grep){
##~~<> return(list(mapped=mapped,ambigous=ambigous))
##~~<> }
##~~<> if(loose&grep){
##~~<> return(list(mapped=mapped,ambigous=ambigous,partial=partial))
##~~<> }
##~~<> cat('\n')
##~~<>}
distpc <- function(x,perc) ecdf(x)(perc)
## estimate quantile of a value in a distribution
# SOURCE: http://stats.stackexchange.com/questions/50080/estimate-quantile-of-value-in-a-vector
# USE: ecdf_fun(1:10,8)
# NOTES: this function looks really 'creepy', no brackets i get, but where is return?? how does (perc) work?? spooky..
geo.matrix<-function(datid,path){
## USE: download GEO matrix data and process into smth usable
cur_dir=getwd()
datid=toupper(datid)
# dir.create(paste0(path,'/dtb/'))
# dir.create(paste0(path,'/dtb/',datid))
dir.create(paste0(path,'/',datid))
# dir.create(file.path(path,'dtb',datid))
# paste0(path,'/dtb/',datid)
#setwd(file.path(path,'dtb',datid))
setwd(file.path(paste0(path,'/',datid)))
getwd()
system(paste0('wget -r -nH --cut-dirs=7 ftp://ftp.ncbi.nlm.nih.gov/geo/series/',substr(datid,1,5),'nnn/',datid,'/matrix/'))
# system(paste0('wget -r -nH --cut-dirs=7 https://www.ncbi.nlm.nih.gov/geo/download/?acc=',datid,'&format=file'))
# -r ## recursively Dounload
# -nH (--no-host-directories) ## cuts out hostname
# --cut-dirs=X ## (cuts out X directories)
system(paste0('gunzip ',datid,'_series_matrix.txt.gz'))
# list.files()
gse=readLines(file.path(paste0(path,'/',datid,'/',datid,'_series_matrix.txt')))
cord=c(
grep('!Sample_title',gse)
,grep('!series_matrix_table_begin',gse)
,grep('!series_matrix_table_end',gse)
)
meta=as.data.frame(strsplit(gse[1:(cord[1]-2)],'\t'))
colnames(meta)=meta[1,]
meta=t(meta[-1,])
samp=as.data.frame(strsplit(gse[cord[1]:(cord[2]-1)],'\t'))
for(icol in colnames(samp)){
samp[,icol]=(gsub('"','',samp[,icol],fixed=T))
}
colnames(samp)=samp[1,]
samp=samp[-1,]
colnames(samp)=(gsub('"','',colnames(samp),fixed=T))
colnames(samp)=(gsub('!Sample_','',colnames(samp),fixed=T))
rownames(samp)=samp$geo_accession
# Head(samp)
cat('\t',datid,'contains phenotype data for',nrow(samp),'samples\n')
expr=as.data.frame(strsplit(gse[(cord[2]+1):(cord[3]-1)],'\t'))
colnames(expr)=as.character(expr[1,])
rownames(expr)=as.character(expr[,1])
expr=t(expr[-1,-1])
if(nrow(expr)==0){
cat('\t\tno expression data available\n')
return(invisible(list(samp=samp,meta=meta)))
}
if(nrow(expr)>0){
cat('\t',datid,'contains expression data for',ncol(expr),'samples\n')
# expr=as.data.frame(strsplit(gse[(cord[2]+1):(cord[3]-1)],'\t'))
# colnames(expr)=as.character(expr[1,])
# rownames(expr)=as.character(expr[,1])
# expr=t(expr[-1,-1])
expr=make.numeric(expr)
rownames(expr)=(gsub('"','',rownames(expr),fixed=T))
colnames(expr)=(gsub('"','',colnames(expr),fixed=T))
Head(expr)
overlap(rownames(samp),colnames(expr))
samp=samp[colnames(expr),]
colnames(expr)=gsub(' ','.',samp$title)
cat('\t\texpression data available,',nrow(expr),' genes\n')
setwd(cur_dir)
return(invisible(list(expr=expr,samp=samp,meta=meta)))
}
}
geo.query<-function(file_loc){
cat('\treading file\n')
## INPUTS: - location of the file downloaded from geo as 'query' for datasets / series / both
arri=readLines(file_loc)
pos=as.data.frame(grep('^[0-9]*[.] ',arri)) ## all entries start as: '1. ' '2. ' '3. ' etc..
colnames(pos)='start'
pos$end=c((pos$start[2:nrow(pos)]-1),length(arri))
cat('\t',nrow(pos),'datasets detected\n')
#nstat=list()
cat('\t\tparsing entries\n')
gdat=list()
for(idat in 1:nrow(pos)){
dummy=arri[pos[idat,'start']:pos[idat,'end']]
dummy=dummy[dummy!='']
# grepl('Organism:',dummy)
# grepl('Type:',dummy)
# grepl('Platform:',dummy)
# grepl('download:',dummy)
# grepl('DataSet',dummy)
if(length(dummy)==7){
gdat[[paste0('n',idat)]]=as.data.frame(dummy)
}
lcount(idat,nrow(pos))
## nstat[[as.character(idat)]]=length(dummy) ## used to determine what the datasets look like, as below
}
cat('\n\t\tcleaning up\n')
#nstat=t(as.data.frame(nstat))
#matst(nstat)
#names(nstat[nstat[,1]==6,])[1]
#names(nstat[nstat[,1]==7,])[1]
#names(nstat[nstat[,1]==8,])[1]
gdat=t(as.data.frame(gdat))
# Head(gdat)
geod=as.data.frame(tolower(gdat[,1]))
colnames(geod)='title'
geod$full=tolower(gdat[,2])
geod$organism=gsub('Organism:\t','',gdat[,3])
geod$type=gsub('Type:\t\t','',gdat[,4])
# geod$type=gsub('Type:\t\t','',gdat[,4])
holder=strsplit(gdat[,5],' ')
dummy=list()
k=1
for(idat in names(holder)){
dummy[[idat]]=holder[[idat]][(length(holder[[idat]])-1)]
k=lcount(k,length(holder))
}
geod$n=as.numeric(unlist(dummy))
geod$platforms=gdat[,5]
geod$ftp=gdat[,6]
geod$ids=gdat[,7]
rownames(geod)=1:nrow(geod)
dummy=geod$ids
geod$id=gsub('.*\t\tAccession: (G.*)\tID: (.*)','\\1',dummy)
geod$otr=gsub('.*\t\tAccession: (G.*)\tID: (.*)','\\2',dummy)
geod$ftp=gsub('.* (ftp:.*)','\\1',geod$ftp)
dummy=geod$platforms
geod$platform=gsub('Platform: (.*) (.*) Samples','\\1',dummy)
geod$n.samp=gsub('Platform: (.*) (.*) Samples','\\2',dummy)
return(invisible(geod))
}
ll<-function(dir_path=getwd()){
system(paste0('ls -lht ',dir_path))
}
cd<-function(target_path,show=T){
if(target_path=='..'){target_path= gsub('(.*[/]).*','\\1',target_path);setwd(target_path)}
if(target_path!='..'){setwd(paste0(getwd(),'/',target_path))}
if(show){ll()}
}
pwd<-function(){
getwd()
}
mkdir<-function(dir_name,dir_path=getwd(),show=T,warn=T){
# if(show){ll()}
dir.create(file.path(dir_path,dir_name),showWarnings=warn)
if(show){ll(dir_path)}
}
rmdir<-function(dir_name,dir_path=getwd(),show=T){
# if(show){ll()}
system(paste0('rmdir ',dir_name))
if(show){ll(dir_path)}
}
cp<-function(file_names,target_path,show=T){
if(show){ll(target_path)}
system(paste('cp',paste(file_names,collapse=' '),target_path))
if(show){ll(target_path)}
}
read.zip <- function(file,verbose=T,...){
## SOURCE : http://stackoverflow.com/questions/8986818/automate-zip-file-reading-in-r
zipFileInfo = unzip(file, list=TRUE)
if(nrow(zipFileInfo) > 1){
stop(paste0("more than one data file inside ",file))
}
else{
if(verbose){cat(' ',file,'\tcontains 1 file : ',as.character(zipFileInfo$Name),'\n')}
read.csv(unz(file, as.character(zipFileInfo$Name)), ...)
}
}
# http://www.programiz.com/r-programming/examples/odd-even
# http://www.programiz.com/r-programming/examples/prime-number <<<<<< related : prime number
# check if the input number is odd or even.
# A number is even if division by 2 give a remainder of 0. If remainder is 1, it is odd.
#num = as.integer(readline(prompt="Enter a number: "))
#if((num %% 2) == 0) {
# print(paste(num,"is Even"))
#} else {
# print(paste(num,"is Odd"))
#}
list.overlap<-function(alis,blis='',do.pcs=T,do.fet=T,verbose=T,do_plots=F,...){ ## integration required for T/F flags to change inpu
# if(do_plots){library(corrplot)} ## dont want to run the loop if breaks after nearly done
if(length(blis)==1){ if(blis==''){blis=alis}} ## convoluted to avoid the length>1 warning msg
print(length(blis))
matpc=as.data.frame(matrix(NA,nrow=(length(alis)),ncol=(length(blis))))
print(dim(matpc))
colnames(matpc)=names(blis)
rownames(matpc)=names(alis)
matpv=matpc
k=1
for(ilis in names(alis)){
if(verbose){k=lcount(k,length(alis))}
for(jlis in names(blis)){
humpty=alis[[ilis]]
dumpty=blis[[jlis]]
inters=intersect(humpty,dumpty)
unions=union(humpty,dumpty)
if(do.pcs){
matpc[ilis,jlis]=(length(inters)/length(unions))
# matpc[ilis,jlis]=(length(inters)/length(dumpty))
}
if(do.fet){
matpv[ilis,jlis]=fet(sampl=humpty,bkgrnd=unions,success=inters,counts=F,alternative = "greater")$FETp
# matpv[ilis,jlis]=fet(sampl=dumpty,bkgrnd=unions,success=inters,counts=F,alternative = "greater")$FETp
}
}
}
if(do_plots){
if(do.pcs){Heat(as.matrix(round(matpc,digits=2)*100),values=T,values.rm='0',Colv=F,Rowv=F,main='(intersect / union) *100 2sf',...)}
if(do.fet){Heat(as.matrix(-log10(matpv)),values=T,Colv=F,Rowv=F,main='-log10(fet P-val)',...)}
}
if(do.pcs&do.fet){return(invisible(list(perc=matpc,pval=matpv)))}
if(do.pcs){return(invisible(matpc))}
if(do.fet){return(invisible(matpv))}
}
demands<-function(expr,anno,netw,case_cont_ind){
# INPUTs: case_cont_ind=list(case=colnames(expr)[case],cont=colnames(expr)[cont]) ## if numeric, used directly as index
library(DeMAND)
dobj=demandClass(exp=expr, anno=anno, network=netw)
if(is.numeric(case_cont_ind$case)){
caseInd=case_cont_ind$case
controlInd=case_cont_ind$cont
print('num')
}
if(!is.numeric(case_cont_ind$case)){
caseInd=(which(colnames(exp)%in%case_cont_ind$case))
controlInd=(which(colnames(exp)%in%case_cont_ind$cont))
print('name')
}
dobj=runDeMAND(dobj, fgIndex=caseInd, bgIndex=controlInd)
return(dobj)
}
cid.parent<-function(dat_mat,dnam_col_name,cid_col_name,no_orphans=T,parent_dtb=""){
## INPUT : parent_dtb, if using load() obj='pare' - colnames("cid","parent") - "cid" used to search, "parent" column added to dat_mat, "cid" with no parent are same as input
## alternatively specify a table colnames("cid","parent") to be used as is..
if(class(parent_dtb)=='character'){ ## avoid potential problem of trying to match a huge dtb to ""
if(parent_dtb==""){
parent_dtb='/Data/drud/pubchem/dtb/cidmap/pubchem.cid_to_parent.Rdata'
cat('\tloading "parent_dtb" from default location:\t',parent_dtb,'\n')
Load(parent_dtb)
parent_dtb=pare
}
}
## contingencies required - dtb is provided
# if(parent_dtb!=""){
# parent_dtb='/Data/drug_db/pubchem/dtb/extras/compound/compound.cid_to_parent.sep2016.Rdata'
# cat('\tloading "parent_dtb" from specified location:\t',parent_dtb,'\n')
# parent_dtb=pare
# }
udru_in=unique(dat_mat[,dnam_col_name])
parent_dtb=parent_dtb[parent_dtb$cid%in%dat_mat[,cid_col_name],]
cat('\tmapping CID to parent CID\n')
holder=merge(dat_mat,parent_dtb,by.x=cid_col_name,by.y='cid',all=T)
if(no_orphans==F){cat('\t',length(unique(holder[,dnam_col_name][(is.na(holder$parent))])),'\tcompounds to not have a "parent"\n')}
if(no_orphans==T){
cat('\t',length(unique(holder[,dnam_col_name][(is.na(holder$parent))])),'\tcompounds to not have a "parent" -> use provided CID\n')
holder$parent[is.na(holder$parent)]=holder[,cid_col_name][is.na(holder$parent)]
}
udru_ot=unique(holder[,dnam_col_name])
if((sum(udru_ot%in%udru_in)/length(udru_in))!=1){
cat('\tlost drug names?!\n')
}
return(invisible(holder))
}
cid.match<-function(query,pubchem_db="",parent_db=""){
# INPUTS: query=vector of "synonyms" / drug names to map to PubChem cid
# INPUTS: pubchem_db - colnames("pubchem_CID" "synonym" "lower") lower=tolower(pubchem_db$synonym
# INPUTS: query - character string of synonyms to search the "lower"
if(class(pubchem_db)=='character'){ ## avoid potential problem of trying to match a huge dtb to ""
if(pubchem_db==""){
pubchem_db='/Data/drud/pubchem/dtb/cidmap/pubchem.cid.mesh.lower.Rdata'
cat('\tloading "pubchem_db" from default location:\t',pubchem_db,'\n')
Load(pubchem_db)
pubchem_db=cidmap
}
}
if(class(parent_db)=='character'){ ## avoid potential problem of trying to match a huge dtb to ""
if(parent_db==""){
parent_db='/Data/drud/pubchem/dtb/cidmap/pubchem.cid_to_parent.Rdata'
cat('\tloading "parent_db" from default location:\t',parent_db,'\n')
Load(parent_db)
parent_db=pare
}
}
query=unique(tolower(query[query!='']))
cat('\n\tquery contains',length(query),'ids after unique(tolower(query)) & query!=""\n')
holder=unique(pubchem_db[pubchem_db$synon%in%(as.character(query)),])
dummy=parent_db[parent_db$cid%in%holder$cid,]
holder=merge(holder,dummy,by='cid',all=T)
holder$parent[is.na(holder$parent)]=holder$cid[is.na(holder$parent)]
# get.duplicates(holder,'cid')
# get.duplicates(holder,'synon')
# get.duplicates(holder,'parent')
dim(holder)
holder=unique(holder[,c('parent','synon')]) ## should not loose any synonyms by definition of using unique()
dim(holder)
# overlap(holder$synon,dummy$synon)
## remove duplicates in both cid and synon, should get rid of non-overlapping matches to
# dupcid=unique(holder$cid[duplicated(holder$cid)])
# dupsyn=unique(holder$synon[duplicated(holder$synon)])
# dupdat=holder[(holder$cid%in%dupcid & holder$synon%in%dupsyn),]
# sindat=holder[!(holder$cid%in%dupcid & holder$synon%in%dupsyn),]
## remove duplicates in both cid and synon, should get rid of cid that overlap with 2 different synon while keeping one random cid // synon
# dupdat=holder[(duplicated(holder$cid) & duplicated(holder$synon)),]
# sindat=holder[!(duplicated(holder$cid) & duplicated(holder$synon)),]
# overlap(dupdat$synon,sindat$synon)
dim(holder)
holder=holder[!(duplicated(holder$parent) & duplicated(holder$synon)),]
dim(holder)
usyn=unique(holder$synon)
## possible way to remove duplicates across multiple entries (handled above instead) ie !duplicated(parent and synon)
#isyn='naproxen'
# for(isyn in usyn){
# humpty=holder[holder$synon==isyn,]
# dumpty=holder[holder$cid%in%humpty$cid & holder$synon!=isyn,]
# for(imat in unique(dumpty$synon)){
#
# }
# overlap(humpty$cid,dumpty$cid)
## cid overlap completely - perfect synonyms
## cid partial overlap - keep non overlapping
##
# }
colnames(holder)=c('cid','synon')
cat('\t',round(length(unique(holder$synon))/length(query),digits=3)*100,'% query ids matched\n')
return(unique(holder[order(holder$synon),]))
}
tanidist<-function(sdf_set){
## USE : generate a matrix of tanimoto (similarity by default) distances based on sdfset object generated via ChemmineR
## DEPENDENCIES : rmerge() - custom function in R.helper
library('ChemmineR')
## runing the default sets for now
## make apset etc using "ChemmineR", straight from https://www.bioconductor.org/packages/release/bioc/vignettes/ChemmineR/inst/doc/ChemmineR.html
cidlis=sdfid(sdf_set) ## make sure the CID are same as internal ones in sdfset
cid(sdf_set)=cidlis
apset=sdf2ap(sdf_set)
fpchar=desc2fp(apset, descnames=1024, type="character")
fpset=as(fpchar, "FPset")
params=genParameters(fpset)
k=1
for(icid in cidlis){
holder=fpSim(fpset[icid], fpset, sorted=TRUE, method="Tanimoto", addone=1, cutoff=0, top="all", alpha=1, beta=1, parameters=params,scoreType="similarity")
dummy=holder[,'similarity',drop=F]
colnames(dummy)=icid
if(k==1){
distmat=dummy
}
if(k!=1){
distmat=rmerge(distmat,dummy,verbose=F)
}
# print(k)
k=lcount(k,length(cidlis))
}
distmat=1-distmat
return(distmat[cidlis,cidlis])
}
gsea.enrich<-function(genlis,rnkdat,dat_descr='',gsea_path="/Data/ks/lib/",nperm=10000,min_clust_size=10,max_clust_size=5000,do_plots=F,...){
## WARNING: if running multiple instances of the enrichment, select a different gsea_path to avoid conflicts / breaking the functions.. likely need to additional copies of the script to run as well
# warning('function will break if folders with "my_analysis" in the name already exist in gsea_path')
# gsea_path="/Users/ks/Dropbox/bin/gsea/"
system(paste0('mkdir ',gsea_path,'/working'))
# if(dat_descr!=''){dat_descr=paste0(dat_descr,'.')}
#### .gmt file - module 1 per row, name followed by \t genes in module ------------------------------------------
# write.delim(t(c(names(genlis),t(as.data.frame(genlis)))),file=paste0(gsea_path,'/working/',dat_descr,'gsea_enrich.gmt'),row.names=F,col.names=F)
# write.delim(t(c(names(genlis),t(as.data.frame(genlis)))),file=paste0(gsea_path,'/working/gsea_enrich.gmt'),row.names=F,col.names=F)
cat('\twrite genlis.gmt\n')
sink(paste0(gsea_path,'/working/gsea_enrich.gmt'))
for(ilis in names(genlis)){
cat(as.vector(paste(c(ilis,ilis,genlis[ilis][[1]]),collapse="\t")),"\n",sep="")
}
sink()
#### .rnk file - 2 columns "IDs", "P" ------------------------------------------
if(!ncol(rnkdat)%in%c(1,2)){stop('rnkdat does not have 2 coluns')}
if(ncol(rnkdat)==2){
bkg=rnkdat[,1]
cat('\tncol(rnkdat)==2, assuming rnkdat already in correct format colnames = c("IDs","P"), where P==rank measure\n')
# write.delim(rnkdat,file=paste0(gsea_path,'/working/',dat_descr,'gsea_enrich.rnk'),row.names=F,col.names=T)
write.delim(rnkdat,file=paste0(gsea_path,'/working/gsea_enrich.rnk'),row.names=F,col.names=T)
}
if(ncol(rnkdat)==1){
cat('\tncol(rnkdat)==1, assuming rnkdat has appropriate rownames and ranks in the column\n')
rnkdat=as.data.frame(rnkdat)
colnames(rnkdat)='P'
rnkdat$IDs=rownames(rnkdat)
rnkdat=rnkdat[,c('IDs','P')]
bkg=rownames(rnkdat)
# write.delim(rnkdat,file=paste0(gsea_path,'/working/',dat_descr,'gsea_enrich.rnk'),row.names=F,col.names=T)
write.delim(rnkdat,file=paste0(gsea_path,'/working/gsea_enrich.rnk'),row.names=F,col.names=T)
}
#### .chip file - colunms: "Probe Set ID", "Gene Symbol", "Gene Title" - same is easiest (background) ------------------------------------------
# build using the bkg list - derived based on the rnk list above
chipdat=list("Probe Set ID"=bkg,"Gene Symbol"=bkg,"Gene Title"=bkg)
chipdat=as.data.frame(chipdat)
colnames(chipdat)=c("Probe Set ID","Gene Symbol","Gene Title") ## assuming GSEA is actually picky about colnames
# write.delim(chipdat,file=paste0(gsea_path,'/working/',dat_descr,'gsea_enrich.chip'),row.names=F,col.names=T)
write.delim(chipdat,file=paste0(gsea_path,'/working/gsea_enrich.chip'),row.names=F,col.names=T)
options(scipen=999) ## prevent R using scientific notation to numbers
##-Xmx5000mm flags the amount of memory available to java. The default is -Xmx512m
## all hail Aida for working out the full cmd code to run GSEA, i still remember her frustration when she was doing this..
## -rpt_label adds a label to the out_folder which can make it too long -> mid part of the name becomes ".." - the part required for grep to work ==> hardcoded as blank
system(
paste0("java -cp ",gsea_path,"gsea2-2.2.3.jar -Xmx50g xtools.gsea.GseaPreranked -gmx "
# ,gsea_path,'/working/',dat_descr,'gsea_enrich.gmt'
,gsea_path,'/working/gsea_enrich.gmt'
," -collapse false -mode Max_probe -norm meandiv -nperm ",nperm
# ," -rnk ",gsea_path,'/working/',dat_descr,'gsea_enrich.rnk'
," -rnk ",gsea_path,'/working/gsea_enrich.rnk'
, " -scoring_scheme classic -rpt_label -chip ",gsea_path,'/working/gsea_enrich.chip'
," -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max "
,max_clust_size, " -set_min ",min_clust_size," -zip_report false -out ",gsea_path, " -gui false"
)
)
out_dir_nam=list.files(gsea_path,pattern='my_analysis')
# out_dir_nam=out_dir_nam[out_dir_nam!='working']
resfnam=list.files(paste0(gsea_path,'/',out_dir_nam),pattern='.xls')
humpty=read.delim(paste0(gsea_path,'/',out_dir_nam,'/',resfnam[grepl('gsea_report_for_na_pos',resfnam)]))
dumpty=read.delim(paste0(gsea_path,'/',out_dir_nam,'/',resfnam[grepl('gsea_report_for_na_neg',resfnam)]))
bumpty=read.delim(paste0(gsea_path,'/',out_dir_nam,'/',resfnam[grepl('na_pos_versus_na_neg',resfnam)]))
fumpty=read.delim(paste0(gsea_path,'/',out_dir_nam,'/',resfnam[grepl('gene_set_sizes',resfnam)]))
resfnam=resfnam[!grepl('gsea_report_for_na_pos|gsea_report_for_na_neg|na_pos_versus_na_neg|gene_set_sizes',resfnam)]
resfnam=gsub('[.]xls','',resfnam)
sidat=list()
for(isig in resfnam){
sidat[[isig]]=read.delim(paste0(gsea_path,'/',out_dir_nam,'/',isig,'.xls'))
}
#pdf(paste0(gsea_path,'img/dummy.pdf'),height=5,width=10) ## for testing purposes, to save plots, more efficient to create a pdf() before running the function & dev.off() after
if(do_plots){
for(isig in names(sidat)){
holder=sidat[isig][[1]]
ylimdat=c(min(holder$RUNNING.ES)-0.1,(max(holder$RUNNING.ES)))
plot(x=holder$RANK.IN.GENE.LIST,y=holder$RUNNING.ES,type='l',lwd=4,col='darkgreen',frame.plot=F,ylim=ylimdat,main=paste0(isig,'\n',dat_descr),las=1,xlab='rank in gene list',ylab='GSEA enrichment score',...) #ylim=ylimdat
rug(x=holder$RANK.IN.GENE.LIST, ticksize = 0.1, side = 1, lwd = 0.5, col = par("fg"),quiet = getOption("warn") < 0)
mtext(paste0('FDR pos ',humpty[humpty$NAME==isig,]$FDR.q.val,' FDR neg ',dumpty[dumpty$NAME==isig,]$FDR.q.val),adj=1,side=1,line=4)
}
}
#dev.off()
cat('\tclean-up - removing ',paste0('rm ',gsea_path,'/',out_dir_nam),'directory generated by GSEA\n')
system(paste0('rm ',gsea_path,'/',out_dir_nam,'/*'))
system(paste0('rm ',gsea_path,'/',out_dir_nam,'/edb/*'))
system(paste0('rmdir ',gsea_path,'/',out_dir_nam,'/edb'))
system(paste0('rmdir ',gsea_path,'/',out_dir_nam))
cat('\tclean-up - removing ',paste0('rm ',gsea_path,'/working *'),'input files for GSEA\n')
system(paste0('rm ',gsea_path,'/working/*'))
return(invisible(list(genlis=holder,pos=humpty,neg=dumpty,bkg=bumpty,set_size=fumpty,sigdat=sidat)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.