R/gene-assoc.r

#!/usr/bin/env R
#
# This file is part of peakAnalysis,
# http://github.com/alexjgriffith/alpha-score/, 
# and is Copyright (C) University of Ottawa, 2015. It is Licensed under 
# the three-clause BSD License; see LICENSE.txt.
# Author : Alexander Griffith
# Contact: griffitaj@gmail.com
#

#' Genomic Regions
#'
#' Implements the Stanford GREAT genomic region generation
#' @export
genomeDomains<-function(chrom,tss,strand,proxUp,proxDown,distal,n=length(chrom)){
    extendsMax<-function(n,chrom,basalDomains,tss,distal){
        withinChrom<-(chrom[n]==chrom)
        a<-basalDomains[withinChrom,1]-tss[n]
        ap<-a[a> basalDomains[n,2]-tss[n] & a<distal]
        ab<-a[a>0 & a< basalDomains[n,2]-tss[n]]
        
        if(length(ab))
            lm<-basalDomains[n,2]
        else if(length(ap))
            lm<-tss[n]+min(ap)
        else
            lm<-tss[n]+distal            
        b<-tss[n]-basalDomains[withinChrom,2]
        bp<-b[ b> tss[n]-basalDomains[n,1] & b<distal]
        bb<-b[b>0 & b< tss[n]-basalDomains[n,1]]
        if(length(bb))
            um<-basalDomains[n,1]
        else if(length(bp>0))
            um<-tss[n]-min(bp)
        else
            um<-tss[n]-distal
        geneDomain<-c(um,lm)
        geneDomain
    }
    determ<-function(csl,gene,fun){
        if(as.character(fun)=="+")
            psl<-c(csl,as.character(gene))
        else{
            psl<-csl[which(csl != as.character(gene))]}
        psl
    }
    basalDomains<-t(apply(cbind(tss-strand*proxUp,tss+strand*proxDown),1,sort))
    genomeDomains<-do.call(rbind,lapply(seq(n),extendsMax,chrom,basalDomains,tss,distal))
    return(genomeDomains)
}


#' Genomic Regions
#' @export
genomicRegions<-function(
    a,chrom,
    chroms=unique(chrom)){
beg<-list()
n=0
b<-cbind(c(as.numeric(a[,1]),as.numeric(a[,2])),seq(length(a[,1])),unlist(lapply(c(0,1),function(x)rep(x,length(a[,1])))))

for(cro in chroms){    
    region=(chrom==cro)
    k<-b[region,]
    c<-k[order(k[,1]),]
    buffer<-c(c[1,2])
    reg<-list()
    start<-c[1,1]
    for(i in seq(2,length(which(region))*2)){
        reg<-append(reg,list(c(cro,start,c[i,1],buffer)))
        if(c[i,3]==0){
            buffer<-c(buffer,c[i,2])
        }else{
            buffer<-buffer[c[i,2]!=buffer]}        
        start<-c[i,1]
    } 
    n<-n+1
    beg<-append(beg,reg[unlist(lapply(reg,function(x) length(x)>3))])
    #beg<-append(beg,reg)
    
}
return(beg)
}

#' Peak Gene Regions
#'
#' compare peak locations to gene regions generated by genomicRegions
#' Return a list of genes which corospond to the overlaped gene regions
#' @examples
#' heightFile<-"~/Dropbox/UTX-Alex/jan/combined_heights.bed"
#' geneList<-read.delim(paste(fileLocation,"hg19.RefSeqGenes.csv",sep=""))
#' data<-loadHeightFile(heightFile)$data
#' reg<-mapply(function(pc,loc)buildRegions(data,pc,loc)[,1],
#'             list(1,1,3,c(3,5),c(3,7),c(3,7),7),
#'             list("top","bottom","top",c("top","top"),c("top","bottom"),c("top","bottom"),"top"))
#' chrom<-as.character(geneList$chrom)
#' tss<-as.numeric(geneList$txStart)
#' start<-Sys.time()
#' a<-genomicRegions(chrom,tss,5000,1000,50000)
#' Sys.time()-start
#' #
#' jurkGenes<-peakGeneRegions(bedData[reg[,2],],a,geneList)
#' filenames<-lapply(cbind("erythroid","t-all","ecfc","other","hspc","meka","diff"),paste, "-genes.txt",sep="")
#' for (i in seq(7)){
#'    genes<-peakGeneRegions(bedData[reg[,i],],a,geneList)
#'    write.table(genes,filenames[[i]],quote=FALSE,col.names=FALSE,row.names=FALSE)}
#' @export
peakGeneRegions<-function(bedData,genes,geneList){
    aChr<-as.character(lapply(genes,"[[",1))
    aStart<-as.numeric(lapply(genes,"[[",2))
    aEnd<-as.numeric(lapply(genes,"[[",3))
    peaks<-apply(bedData[,2:3],1,mean)
    pc<-"chr0"    
    chroms<-(aChr==pc)
    cStart<-aStart[chroms]
    cEnd<-aEnd[chroms]
    locs<-c()
    for (i in seq(length(bedData[,1]))){
        if(pc!=as.character(bedData[i,1])){
            pc<-as.character(bedData[i,1])
            chroms<-which((aChr==pc))
            cStart<-aStart[chroms]
            cEnd<-aEnd[chroms]
        }
        gene<-chroms[which(peaks[i]>cStart &peaks[i]<cEnd)]
        locs<-c(locs,gene)
    }
    geneList[
                   as.numeric(unique(unlist(lapply(genes[unique(locs)],function(x) {x[4:length(x)]}))))
    ]
    #locs
}

#' Nearest Gene
#' @examples
#' heightFile<-"~/Dropbox/UTX-Alex/jan/combined_heights.bed"
#' data<-loadHeightFile(heightFile)$data
#' geneList<-read.delim(paste(fileLocation,"hg19.RefSeqGenes.csv",sep=""))
#' reg<-mapply(function(pc,loc)buildRegions(data,pc,loc)[,1],
#'             list(1,1,3,c(3,5),c(3,7),c(3,7),7),
#'             list("top","bottom","top",c("top","top"),c("top","bottom"),c("top","bottom"),"top"))
#' a<-geneAssocFlipAux(geneList,bedData,c(50000,500,0,0))
#' names<-mapply(function(a,b) paste(a,b,sep="-"), as.character(geneList$name),as.character(geneList$name2))
#' filenames<-lapply(cbind("erythroid","t-all","ecfc","other","hspc","meka","diff"),paste, "-genes.txt",sep="")
#' for (i in seq(7)){
#'    print(filenames[[i]])
#'    filename<-filenames[[i]]
#'    sequence<-reg[,i]
#'    b<-lapply(a,function(x,y) list(x[[1]],x[[2]][which(x[[2]]%in%y)]) ,which(sequence))
#'    c<-nearestGene(b,sequence,as.numeric(geneList$txStart),names,bedData)
#'    write.table(unique(unlist(c)),filename ,col.names = FALSE,row.names=FALSE,quote=FALSE)
#' }
#' sequence<-ascore(data,1,"top",6)
#' filename<-"erythroid-6-genes.txt"
#' @export
nearestGene<-function(b,sequence,geneStart,name,peak){
  c<-do.call(rbind,lapply(b,function(x) cbind(x[[1]],(function(x)if(length(x>0)) x else 0)(x[[2]]))))
  sequence2<-intersect(which(sequence),unique(c[,2]))
  lapply(sequence2, function(n) name[getMin(abs(geneStart[c[which(c[,2]==n),1]]-peak[n,2]))])
}

#' Rotate List
#' @export
rotateList<-function(b,range){
  c<-do.call(rbind,lapply(b,function(x) cbind(x[[1]],(function(x)if(length(x>0)) x else 0)(x[[2]]))))
  lapply(do.call(seq,as.list(range)), function(n) c[which(c[,2]==n),1])
}

#' Get Min
#' will return the single value if there is only a single value in the list
#' @export
getMin<-function(x){
    if(length(x)>1)
        which.min(x)
    else
        1 #unlist(x)
}

#' Gene Assoc Flip Aux
#' @export
geneAssocFlipAux<-function(geneList,point,bounds){
    peak<-(as.numeric(point[,2])+as.numeric(point[,3]))/2
    tss<-as.numeric(geneList$txStart)
    ess<-as.numeric(geneList$txEnd)
    chrom<-as.character(geneList$chrom)
    name<-as.numeric(rownames(geneList))#as.character(geneList$name)
    lapply(seq(length(name)),function(x) geneAssocFlip(name[x],tss[x],ess[x],chrom[x],bedData,peak,c(50000,0,0,0)))
}

#' Gene Assoc Flip
#' @export
geneAssocFlip<-function(name,tss,ess,chrom,point,peak,bounds){
    a<-which(chrom==as.character(point[,1]))
    b<-a[which(tss-bounds[1]<peak[a])]
    r<-b[which(ess+bounds[3]>peak[b])]
    e<-r[c(which(tss-bounds[2]>peak[r]),
           which(ess+bounds[4]<peak[r]))]
    list(name,e)}

#' Gene Assoc
#' @export
geneAssoc<-function(point,geneList,bounds){
    # Currently Returns the enhancer locations
    peak<-(as.numeric(point[[2]])+as.numeric(point[[3]]))/2
    tss<-geneList$txStart
    ess<-geneList$txEnd
    chrom<-geneList$chrom
    a<-which(chrom==as.character(point[[1]]))
    b<-a[which(tss[a]-bounds[1]<peak)]
    r<-b[which(ess[b]+bounds[3]>peak)]
    e<-r[c(which(tss[r]-bounds[2]>peak),which(ess[r]+bounds[4]<peak))]
    e}

#' lene Assoc
#' parallel gene assoc
#' @export
leneAssoc<-function(point,geneList,bounds){
    peak<-(as.numeric(point[[2]])+as.numeric(point[[3]]))/2
    tss<-geneList$txStart
    ess<-geneList$txEnd
    chrom<-geneList$chrom
    a<-which(chrom==as.character(point[[1]]))
    b<-a[which(tss[a]-bounds[1]<peak)]
    r<-b[which(ess[b]+bounds[3]>peak)]
    locations<-r[c(which(tss[r]-bounds[2]>peak),which(ess[r]+bounds[4]<peak))]
    t<-geneList$txStart[unlist(locations)]
    gene<-as.character(geneList$name[locations[order(abs(t-peak))]])
    if(identical(gene,character(0))){"None"}else(gene)}

#' Min Gene
#' @export
minGene<-function(x,y){
    x[order(abs(x-y))]}


#' Gene Association
#'
#' Assosiate each ChIP enrinchement region with the closest peak within a bound defined by the user
#'
#' @param bedData three column data.frame 
#' @param geneList a data.frame loaded without a header from the refseq gene data base
#' @param bounds c(A,B,C,D) Region 1 = (tss-A,tss+B) Region 2 = (ess-C ess+D)
#' @param n Number of nodes to use.
#'
#' @examples
#' fileLocation="~/Dropbox/UTX-Alex/jan/"
#' bedData<-read.delim(paste(fileLocation,"combined_sorted.bed",sep=""),header=0)
#' geneList<-read.delim(paste(fileLocation,"hg19.RefSeqGenes.csv",sep=""))
#' genes<-geneAssociation(bedData,geneList,  c(50000,0,0,0))
#' write.table(cbind(bedData,unlist(t(lapply(genes, function(x) {if(identical(x,character(0))){"None"} else{x}})))) ,"combined_tagged_genes.bed" ,col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
#' 
#' @export
#' @template authorTemplate
geneAssociation<-function(bedData,geneList,bounds,n=FALSE){
    if(is.logical(n)){        
        peak<-(as.numeric(bedData[,2])+as.numeric(bedData[,3]))/2    
        locations<-apply(bedData,1,geneAssoc,geneList,bounds)
        t<-lapply(locations,function(locations) as.character(geneList$name2)[unlist(locations)])
        
        tssGenes<-Map(minGene,t,peak)
        a<-lapply( tssGenes, function(x){as.character(geneList$name2[x])})}
    else{
        cs<-makeForkCluster(n,renice=0)
        a<-parApply(cs,bedData,1,leneAssoc,geneList=geneList,bounds=bounds)
    stopCluster(cs)}
    return (a)}
alexjgriffith/mulcal documentation built on May 10, 2019, 8:53 a.m.