knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(class.source='fold-show')
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE)

My default template

library(devtools)
library(usethis)
# used rstudio version for Rcpp!
usethis::use_package('data.table',type = "Imports",min_version = T)
usethis::use_package('stringr',type = "Imports",min_version = T)

```r library(data.table) library(stringr) library(parallel)

input data, and tidying

external <- readRDS("~/Dropbox/Gene_lists/input_data/PBSnj/all_SNPs_and_pvalues/pbsnj.subspecies.pvalues.rds") external[,end:=start]

internal <- readRDS("~/Dropbox/Gene_lists/input_data/3P-CLR/all_windows_and_pvalues/threepclr.anc.pvalues_unfiltered.rds") internal[,end:=start]

genes.coordinates <- fread("~/Dropbox/Gene_lists/input_data/genome_annotation/all.genes.bed", col.names = c("chr","start","end","gene")) genes.coordinates[,start:=start+2]

gene def +- 2kb

genes.coordinates[,start:=start-2000] genes.coordinates[,end:=end+2000]

make windows covering the genome

winsize <- 100e3 windows <- external[,.(start=min(start),end=max(start)),by=chr][order(chr)] windows <- windows[,.(start=seq(from=start,to=end,by=winsize)),by=chr][,.(chr,start,end=start+winsize-1)] windows[,id:= .I]

check windows have at least one SNP in them.

setkey(windows,chr,start,end) setkey(external,chr,start,end) window.feature.density <- foverlaps(external,windows,nomatch=0) dropped.windows <- windows[!id %in% window.feature.density[,unique(id)]] windows <- windows[id %in% window.feature.density[,unique(id)]]

any genes overlap dropped windows?

setkey(genes.coordinates,chr,start,end) setkey(dropped.windows,chr,start,end) dropped.genes <- foverlaps(genes.coordinates,dropped.windows, nomatch=NULL)

a few, or pieces of a few. largely reflects assembly then/

map genes to windows.

setkey(genes.coordinates,chr,start,end) setkey(windows,chr,start,end) genes.windows <- foverlaps(genes.coordinates,windows, nomatch=NULL)

n windows per gene

n.windows.gene <- genes.windows[,.(n.windows = uniqueN(id)),by=gene]

n.windows.gene[n.windows > 1]

think easiest thing is assocaute genes with windows, and relativise coordiantes

essentially window becomes chr in overlap. genes 'split over' chromosomes.

candidate SNPs 'stay' where they are

genes.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)] genes.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]

genes.windows[,max_id_chr:=max(id),by=chr]

genes.windows[,min_id_chr:=min(id),by=chr]

saveRDS(genes.windows,"genes.windows.rds")

lets relativise the coordinates of our statistics

first filter external sites, dont need all these sites!

external <- external[e.p < 0.01 | c.p < 0.01 | n.p < 0.01 | w.p < 0.01 ] setkey(windows,chr,start,end) setkey(internal,chr,start,end) setkey(external,chr,start,end) internal <- foverlaps(internal,windows) internal[,win.relative_start:=ifelse(start < i.start, i.start-start,1)] external <- foverlaps(external,windows) external[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]

output rds files

saveRDS(external[c.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,c.p)],"c.p.sites.rds") saveRDS(external[e.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,e.p)],"e.p.sites.rds") saveRDS(external[n.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,n.p)],"n.p.sites.rds") saveRDS(external[w.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,w.p)],"w.p.sites.rds") saveRDS(internal[anc.p < 0.1,.(chr=id,start=win.relative_start,end=win.relative_start,i.p=anc.p)],"i.p.sites.rds") saveRDS(genes.windows,"genes.windows.rds") saveRDS(windows,"windows.rds")

external.pops <- c("e.p") external.p.cut <- c(0.00025) int.p <- (0.5/100) allowMultiHits=TRUE source('R/make.cand.sets.R') test.snps <- getAllCandidateSNPs(ext.pops.vec = external.pops, ext.p.cut.vec = external.p.cut, external.dt=external, int.p.cut=NA, internal.dt=NA, allowMultiHits=allowMultiHits)

make the object for genes, relative to windows

test.genes <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)] saveRDS(test.genes,"test.genes.rds") setkey(test.snps,chr,start,end) setkey(test.genes,chr,start,end) if(allowMultiHits==TRUE){ obs.genes <- sort(foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1) } else{obs.genes <- foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene)]}

permuations

shifter <- function(x, n = 1) {

if (n == 0) x else c(tail(x, -n), head(x, n))

}

n.perms <- 100000

perm.sets <- lapply(1:n.perms, function(i) obs.genes)

for(i in 1:n.perms) {

rel.id.pos <- genes.windows[,.(id=unique(id),new=sample(unique(id),uniqueN(id),replace = FALSE)),by=chr]

perm.genes <- genes.windows[rel.id.pos, on=.(id)][,.(chr=new,start=win.relative_start,end=win.relative_end,gene)]

setkey(test.snps,chr,start,end)

setkey(perm.genes,chr,start,end)

if(allowMultiHits==TRUE) {

perm.sets[[i]] <- sort(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)

}else{perm.sets[[i]] <- foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene)]}

}

make_Permuations <- function(genes.windows,test.snps,allowMultiHits){ rel.id.pos <- genes.windows[,.(id=unique(id),new=sample(unique(id),uniqueN(id),replace = FALSE)),by=chr] perm.genes <- genes.windows[rel.id.pos, on=.(id)][,.(chr=new,start=win.relative_start,end=win.relative_end,gene)] setkey(test.snps,chr,start,end) setkey(perm.genes,chr,start,end) if(allowMultiHits==TRUE) { return(sort(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)) }else{return(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene)])} } perm.sets <- mclapply(1:n.perms, function(x) make_Permuations(genes.windows,test.snps,allowMultiHits),mc.cores = 10)

gene sets

func.gene.set.file <- "~/Dropbox/Gene_lists/input_data/gene_sets/REAC.biosystems.gene.set_and.all.txt" func.gene.set <- fread(func.gene.set.file,sep="\t",col.names = c("id","name","genes"),header=FALSE)

do not need and all

func.gene.set <- func.gene.set[name!="all.genes"] func.gene.set <- func.gene.set[id!="GO:9999999"] func.gene.setl <- func.gene.set[,strsplit(genes, " "),by=id]

saveRDS(func.gene.set,"REAC.set.rds") saveRDS(func.gene.setl,"REAC.setL.rds")

perm anlaysis

min.geneset.size <- 10 results <- data.table() p.dist <- data.table() for(i in seq_along(func.gene.set$id)) { idt <- func.gene.set$id[i] # get the category genes cat.genes = func.gene.setl[id==idt]$V1 n.cat.genes = length(cat.genes) if(n.cat.genes < min.geneset.size) next # for each perm, the pr of genes that are in the test category if(allowMultiHits==TRUE){ cat.perms <- sapply(perm.sets, function(x) { n.int = length(x[x %in% cat.genes]); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) }) }else{cat.perms <- sapply(perm.sets, function(x) { n.int = length(intersect(x,cat.genes)); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) })}

# observed candiate genes in test cateogry if(allowMultiHits==TRUE){ obs.genes.in.category <- obs.genes[obs.genes %in% cat.genes] }else{obs.genes.in.category <- intersect(obs.genes,cat.genes)}

n.obs.genes.in.category <- length(obs.genes.in.category) u.n.obs.genes.in.category <- length(unique(obs.genes.in.category)) pr.obs.genes.in.category <- n.obs.genes.in.category/length(obs.genes) mean.n.perms <- mean(cat.perms)*length(obs.genes) emp.pvalue <- (length(which(cat.perms >= pr.obs.genes.in.category))+1)/(n.perms+1) results <- rbindlist(list(results,data.table(id=idt, mean.n.perms, n.cat.genes, n.obs.genes.in.category, u.n.obs.genes.in.category, emp.pvalue, genes=paste(obs.genes.in.category,collapse=",")))) # capture the p-value distribution. pvalues <- data.table(table(cat.perms))[order(cat.perms)] lengthp <- dim(pvalues)[1] for(j in 1:lengthp){ n.equal.greater = pvalues[j:lengthp,sum(N)-1] # how many proportions are equal or greater than this? if(emp.pvalue >= pvalues[j]$cat.perms) { n.equal.greater <- n.equal.greater+1 } pr.p.value = (n.equal.greater+1)/(n.perms+1) pvalues[j,p:=pr.p.value] } pvalues[,cat.perms:=NULL] p.dist <- rbindlist(list(p.dist,pvalues),use.names = TRUE) p.dist <- p.dist[,.(N=sum(N)),by=p] }

preliminary results

results <- results[func.gene.set[,.(name,id)],on=.(id)][order(emp.pvalue)][!is.na(emp.pvalue)]

fdr correction

p.dist <- p.dist[order(-p)] max.p <- 0 for(i in 1:dim(results)[1]) { emp <- results[i]$emp.pvalue Rexp <- p.dist[p <= emp,sum(N)]/n.perms Robs <- results[emp.pvalue <= emp,.N] rawfdr <- Rexp/Robs if(rawfdr > 1){ rawfdr <- 1 } if(rawfdr > max.p){ max.p <- rawfdr } if(rawfdr <= max.p) { rawfdr <- max.p } results[i,fdr:=rawfdr] }

aslo BH method

results[,BH:=p.adjust(results$emp.pvalue,method = "BH")]

save perm sets

saveRDS(perm.sets,"e_0.00025_100K_multihit.permuations.rds")

save KEGG

saveRDS(results,file = "c_0.00025_int.0.5%_100K_multihit.permuations_KEGG.enrichment.rds")

save htvip

saveRDS(results,file = "e.c_0.00025_int.0.5%_100K_multihit.permuations_mergedhtvip.enrichment.rds")

BIOC

saveRDS(results,file = "e.c_0.00025_int.0.5%_5000.permuations_BIOC.enrichment.rds")

REACTOME

saveRDS(results,file = "e.c_0.00025_int.0.5%_5000.permuations_REAC.enrichment.rds")



joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.