knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(class.source='fold-show') knitr::opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE)
library(devtools) library(usethis)
# used rstudio version for Rcpp! usethis::use_package('data.table',type = "Imports",min_version = T) usethis::use_package('stringr',type = "Imports",min_version = T)
```r library(data.table) library(stringr) library(parallel)
external <- readRDS("~/Dropbox/Gene_lists/input_data/PBSnj/all_SNPs_and_pvalues/pbsnj.subspecies.pvalues.rds") external[,end:=start]
internal <- readRDS("~/Dropbox/Gene_lists/input_data/3P-CLR/all_windows_and_pvalues/threepclr.anc.pvalues_unfiltered.rds") internal[,end:=start]
genes.coordinates <- fread("~/Dropbox/Gene_lists/input_data/genome_annotation/all.genes.bed", col.names = c("chr","start","end","gene")) genes.coordinates[,start:=start+2]
genes.coordinates[,start:=start-2000] genes.coordinates[,end:=end+2000]
winsize <- 100e3 windows <- external[,.(start=min(start),end=max(start)),by=chr][order(chr)] windows <- windows[,.(start=seq(from=start,to=end,by=winsize)),by=chr][,.(chr,start,end=start+winsize-1)] windows[,id:= .I]
setkey(windows,chr,start,end) setkey(external,chr,start,end) window.feature.density <- foverlaps(external,windows,nomatch=0) dropped.windows <- windows[!id %in% window.feature.density[,unique(id)]] windows <- windows[id %in% window.feature.density[,unique(id)]]
setkey(genes.coordinates,chr,start,end) setkey(dropped.windows,chr,start,end) dropped.genes <- foverlaps(genes.coordinates,dropped.windows, nomatch=NULL)
setkey(genes.coordinates,chr,start,end) setkey(windows,chr,start,end) genes.windows <- foverlaps(genes.coordinates,windows, nomatch=NULL)
genes.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)] genes.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]
saveRDS(genes.windows,"genes.windows.rds")
external <- external[e.p < 0.01 | c.p < 0.01 | n.p < 0.01 | w.p < 0.01 ] setkey(windows,chr,start,end) setkey(internal,chr,start,end) setkey(external,chr,start,end) internal <- foverlaps(internal,windows) internal[,win.relative_start:=ifelse(start < i.start, i.start-start,1)] external <- foverlaps(external,windows) external[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]
saveRDS(external[c.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,c.p)],"c.p.sites.rds") saveRDS(external[e.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,e.p)],"e.p.sites.rds") saveRDS(external[n.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,n.p)],"n.p.sites.rds") saveRDS(external[w.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,w.p)],"w.p.sites.rds") saveRDS(internal[anc.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,i.p=anc.p)],"i.p.sites.rds") saveRDS(genes.windows,"genes.windows.rds") saveRDS(windows,"windows.rds")
external.pops <- c("e.p") external.p.cut <- c(0.00025) int.p <- (0.5/100) allowMultiHits=TRUE source('R/make.cand.sets.R') test.snps <- getAllCandidateSNPs(ext.pops.vec = external.pops, ext.p.cut.vec = external.p.cut, external.dt=external, int.p.cut=NA, internal.dt=NA, allowMultiHits=allowMultiHits)
test.genes <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)] saveRDS(test.genes,"test.genes.rds") setkey(test.snps,chr,start,end) setkey(test.genes,chr,start,end) if(allowMultiHits==TRUE){ obs.genes <- sort(foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1) } else{obs.genes <- foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene)]}
n.perms <- 100000
make_Permuations <- function(genes.windows,test.snps,allowMultiHits){ rel.id.pos <- genes.windows[,.(id=unique(id),new=sample(unique(id),uniqueN(id),replace = FALSE)),by=chr] perm.genes <- genes.windows[rel.id.pos, on=.(id)][,.(chr=new,start=win.relative_start,end=win.relative_end,gene)] setkey(test.snps,chr,start,end) setkey(perm.genes,chr,start,end) if(allowMultiHits==TRUE) { return(sort(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)) }else{return(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene)])} } perm.sets <- mclapply(1:n.perms, function(x) make_Permuations(genes.windows,test.snps,allowMultiHits),mc.cores = 10)
func.gene.set.file <- "~/Dropbox/Gene_lists/input_data/gene_sets/REAC.biosystems.gene.set_and.all.txt" func.gene.set <- fread(func.gene.set.file,sep="\t",col.names = c("id","name","genes"),header=FALSE)
func.gene.set <- func.gene.set[name!="all.genes"] func.gene.set <- func.gene.set[id!="GO:9999999"] func.gene.setl <- func.gene.set[,strsplit(genes, " "),by=id]
saveRDS(func.gene.set,"REAC.set.rds") saveRDS(func.gene.setl,"REAC.setL.rds")
min.geneset.size <- 10 results <- data.table() p.dist <- data.table() for(i in seq_along(func.gene.set$id)) { idt <- func.gene.set$id[i] # get the category genes cat.genes = func.gene.setl[id==idt]$V1 n.cat.genes = length(cat.genes) if(n.cat.genes < min.geneset.size) next # for each perm, the pr of genes that are in the test category if(allowMultiHits==TRUE){ cat.perms <- sapply(perm.sets, function(x) { n.int = length(x[x %in% cat.genes]); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) }) }else{cat.perms <- sapply(perm.sets, function(x) { n.int = length(intersect(x,cat.genes)); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) })}
# observed candiate genes in test cateogry if(allowMultiHits==TRUE){ obs.genes.in.category <- obs.genes[obs.genes %in% cat.genes] }else{obs.genes.in.category <- intersect(obs.genes,cat.genes)}
n.obs.genes.in.category <- length(obs.genes.in.category) u.n.obs.genes.in.category <- length(unique(obs.genes.in.category)) pr.obs.genes.in.category <- n.obs.genes.in.category/length(obs.genes) mean.n.perms <- mean(cat.perms)*length(obs.genes) emp.pvalue <- (length(which(cat.perms >= pr.obs.genes.in.category))+1)/(n.perms+1) results <- rbindlist(list(results,data.table(id=idt, mean.n.perms, n.cat.genes, n.obs.genes.in.category, u.n.obs.genes.in.category, emp.pvalue, genes=paste(obs.genes.in.category,collapse=",")))) # capture the p-value distribution. pvalues <- data.table(table(cat.perms))[order(cat.perms)] lengthp <- dim(pvalues)[1] for(j in 1:lengthp){ n.equal.greater = pvalues[j:lengthp,sum(N)-1] # how many proportions are equal or greater than this? if(emp.pvalue >= pvalues[j]$cat.perms) { n.equal.greater <- n.equal.greater+1 } pr.p.value = (n.equal.greater+1)/(n.perms+1) pvalues[j,p:=pr.p.value] } pvalues[,cat.perms:=NULL] p.dist <- rbindlist(list(p.dist,pvalues),use.names = TRUE) p.dist <- p.dist[,.(N=sum(N)),by=p] }
results <- results[func.gene.set[,.(name,id)],on=.(id)][order(emp.pvalue)][!is.na(emp.pvalue)]
p.dist <- p.dist[order(-p)] max.p <- 0 for(i in 1:dim(results)[1]) { emp <- results[i]$emp.pvalue Rexp <- p.dist[p <= emp,sum(N)]/n.perms Robs <- results[emp.pvalue <= emp,.N] rawfdr <- Rexp/Robs if(rawfdr > 1){ rawfdr <- 1 } if(rawfdr > max.p){ max.p <- rawfdr } if(rawfdr <= max.p) { rawfdr <- max.p } results[i,fdr:=rawfdr] }
results[,BH:=p.adjust(results$emp.pvalue,method = "BH")]
saveRDS(perm.sets,"e_0.00025_100K_multihit.permuations.rds")
saveRDS(results,file = "c_0.00025_int.0.5%_100K_multihit.permuations_KEGG.enrichment.rds")
saveRDS(results,file = "e.c_0.00025_int.0.5%_100K_multihit.permuations_mergedhtvip.enrichment.rds")
saveRDS(results,file = "e.c_0.00025_int.0.5%_5000.permuations_BIOC.enrichment.rds")
saveRDS(results,file = "e.c_0.00025_int.0.5%_5000.permuations_REAC.enrichment.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.