original_rscrlpt.R

#!/usr/bin/env Rscript
library(data.table)
library(stringr)
library(parallel)
# argv
args <- commandArgs(trailingOnly = TRUE)
# stats a,b,c...n p-value cutiffs,
if(length(args)< 2){
  stop("error: must have at least one stat and one p-value cutoff specified")
}
# stats
# stats are e.p c.p n.p w.p or i.p specificed as comma delimited string: e.p,c.p
stats <- strsplit(args[1], ",")[[1]]
# for each stat, what is the pvalue cutoff that defines candidates: 0.00025,0.00025
pvalues <- as.numeric(strsplit(args[2], ",")[[1]])
# which of KEGG, VIP or CD4 (siv responsive)
funcset <- strsplit(args[3], ",")[[1]]
# TRUE = count each statistic idenpendent, allowing a gene to b e counted twice. FALSE = count only unique genes in the union of stats.
allowMultiHits <- args[4]
nperms <- as.numeric(args[5])

# read in data
setwd("/Users/joshuaschmidt/Box/multistat.block.perm/multi.stat.block.perm")
test.genes <- readRDS("test.genes.rds")
genes.windows <- readRDS("genes.windows.rds")
test.snps <- data.table()
for(i in seq_along(stats)){
  instat <- paste0(stats[i],".sites.rds")
  tmp <- readRDS(instat)
  tmp <- tmp[get(stats[i]) < pvalues[i]][,.(chr,start,end)]
  tmp[,stat:=stats[i]]
  test.snps <- rbindlist(list(test.snps,tmp))
}
setkey(test.snps,chr,start,end)
if(allowMultiHits==FALSE){
  test.snps[,stat:=NULL]
  test.snps <- unique(test.snps)
}

# obs genes
setkey(test.snps,chr,start,end)
setkey(test.genes,chr,start,end)
if(allowMultiHits==TRUE){
  obs.genes <- sort(foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)
} else{obs.genes <- foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene)]}

# do the permutations

make_Permuations <- function(genes.windows,test.snps,allowMultiHits){
  rel.id.pos <- genes.windows[,.(id=unique(id),new=sample(unique(id),uniqueN(id),replace = FALSE)),by=chr]
  perm.genes <- genes.windows[rel.id.pos, on=.(id)][,.(chr=new,start=win.relative_start,end=win.relative_end,gene)]
  setkey(test.snps,chr,start,end)
  setkey(perm.genes,chr,start,end)
  if(allowMultiHits==TRUE) {
    return(sort(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1))
  }else{return(foverlaps(test.snps,perm.genes,nomatch = NULL)[order(gene),unique(gene)])}
}
perm.sets <- mclapply(1:nperms, function(x) make_Permuations(genes.windows,test.snps,allowMultiHits),mc.cores = 10)

# read in functional gene sets

func.gene.set <- readRDS(paste0(funcset,".set.rds"))
func.gene.setl <- readRDS(paste0(funcset,".setL.rds"))

# calc enrichment in functional set categories
min.geneset.size <- 10
results <- data.table()
p.dist <- data.table()
for(i in seq_along(func.gene.set$id)) {
  idt <- func.gene.set$id[i]
  # get the category genes
  cat.genes = func.gene.setl[id==idt]$V1
  n.cat.genes = length(cat.genes)
  if(n.cat.genes < min.geneset.size) next
  # for each perm, the pr of genes that are in the test category
  if(allowMultiHits==TRUE){
    cat.perms <- sapply(perm.sets, function(x) { n.int = length(x[x %in% cat.genes]); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) })
  }else{cat.perms <- sapply(perm.sets, function(x) { n.int = length(intersect(x,cat.genes)); n.perm=length(x); p.perm = n.int/n.perm; return(p.perm) })}
  
  # observed candiate genes in test cateogry
  if(allowMultiHits==TRUE){
    obs.genes.in.category <- obs.genes[obs.genes %in% cat.genes]
  }else{obs.genes.in.category <- intersect(obs.genes,cat.genes)}
  
  n.obs.genes.in.category <- length(obs.genes.in.category)
  u.n.obs.genes.in.category <- length(unique(obs.genes.in.category))
  pr.obs.genes.in.category <- n.obs.genes.in.category/length(obs.genes)
  mean.n.perms <- mean(cat.perms)*length(obs.genes)
  emp.pvalue <-  (length(which(cat.perms >= pr.obs.genes.in.category))+1)/(nperms+1)
  results <- rbindlist(list(results,data.table(id=idt, mean.n.perms, n.cat.genes, n.obs.genes.in.category, u.n.obs.genes.in.category, emp.pvalue, genes=paste(obs.genes.in.category,collapse=","))))
  # capture the p-value distribution.
  pvalues <- data.table(table(cat.perms))[order(cat.perms)]
  lengthp <- dim(pvalues)[1]
  for(j in 1:lengthp){
    n.equal.greater = pvalues[j:lengthp,sum(N)-1] # how many proportions are equal or greater than this?
    if(emp.pvalue >= pvalues[j]$cat.perms) {
      n.equal.greater <- n.equal.greater+1
    }
    pr.p.value =   (n.equal.greater+1)/(nperms+1)
    pvalues[j,p:=pr.p.value]
  }
  pvalues[,cat.perms:=NULL]
  p.dist <- rbindlist(list(p.dist,pvalues),use.names = TRUE)
  p.dist <- p.dist[,.(N=sum(N)),by=p]
}

# preliminary results
results <- results[func.gene.set[,.(name,id)],on=.(id)][order(emp.pvalue)][!is.na(emp.pvalue)]
# fdr correction
p.dist <- p.dist[order(-p)]
max.p <- 0
for(i in 1:dim(results)[1]) {
  emp <- results[i]$emp.pvalue
  Rexp <- p.dist[p <= emp,sum(N)]/nperms
  Robs <- results[emp.pvalue <= emp,.N]
  rawfdr <- Rexp/Robs
  if(rawfdr > 1){
    rawfdr <- 1
  }
  if(rawfdr > max.p){
    max.p <- rawfdr
  }
  if(rawfdr <= max.p) {
    rawfdr <- max.p
  }
  results[i,fdr:=rawfdr]
}
# also BH method
results[,BH:=p.adjust(results$emp.pvalue,method = "BH")]

# save perm sets
setwd("/Users/joshuaschmidt/Projects/ancestral_chimp/mult.stat.enrichments")
# outperm <- paste0("svec.",args[1],"_pvec.",args[2],"_perms.",round(nperms/1000),"K_multHit.",allowMultiHits,"_set.",funcset,"_permuations.rds")
# saveRDS(perm.sets,outperm)
outenrich <- paste0("svec.",args[1],"_pvec.",args[2],"_perms.",round(nperms/1000),"K_multHit.",allowMultiHits,"_set.",funcset,"_enrichment.rds")
saveRDS(results,outenrich)
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.