R/main.r

## This file is part of mT1
## http://github.com/alexjgriffith/mT1/, 
## and is Copyright (C) University of Ottawa, 2016. It is Licensed under 
## the GPL License; see LICENSE.txt.
## Author : Alexander Griffith
## Contact: griffitaj@gmail.com

#' Is IUPAC
#' 
#' Check if a string contains only iupac characters
#' @param string An atomic character
#' @return A logical, True if all IUPAC false otherwise
#' @examples
#' isIUPAC("CANNTG")
#' isIUPAC("CAXCTG")
#' @export
isIUPAC<-function(string){
    if(is.null(IUPACDNA))
        stop("missing IUPACDNA")
    spl<-strsplit(string,split="")[[1]]
    all(unlist(lapply(spl,function(s)!is.null(unlist(IUPACDNA[s])))))
}

#' IUPAC to Base
#'
#' This function Transforms an IUPAC DNA sequence into a string of 
#' neucleotides. Combo characters are replaced by their components in 
#' bracket. For example CNC -> C[ACGT]C. With the rl flag set to TRUE the
#' expanded set of sequences will be returned. For example CNC
#' would become c("CAC","CCC","CGC","CTC").
#' @param char the input string of IUPAC characters
#' @param rl flag to return all variants
#' @return Character containing neucleotides + brackets
#' @examples
#' mT1:::.IUPACtoBase("HGATAA")
#' mT1:::.IUPACtoBase("CANNTG")
#' mT1:::.IUPACtoBase("CANNTG",TRUE)#!/usr/bin/env R
.IUPACtoBase<-function (char, rl = FALSE){
    ## break atomic Character into a Character vector where the nchar
    ## in each Character is 1
    IUPAC <- strsplit(char, "")[[1]]
    if(is.null(IUPACDNA))
        stop("missing IUPACDNA")
    ## List of IUPAC characters and their composition
    ## IUPAC characters let you represent possible combinations of
    ## neucleotides as a single character. For example N = [ACGT]
    IUPACCharacters<-IUPACDNA
    ## Ensure all characters in char are IUPAC
    if (any(!IUPAC %in% names(IUPACCharacters))) {
        stop("Input string contains non IUPAC characters.")
    }
    ## Transform IUPAC characters to their component base pairs
    vals <- sapply(IUPAC, function(x) {
        (IUPACCharacters[x])
    })
    ## If the rl flag is set to true, set Base to a list of the
    ## component base pairs
    if (rl) {
        Base <- c("")
        for (i in vals) {
            kit <- c()
            for (j in Base) kit <- c(kit, paste(j, i, sep = ""))
            Base <- kit
        }
    }
    ## If the rl flag is false (default) concat all Base pair components.
    ## If the components are not singleton enclose them in brackets. This
    ## form allows for easy use in grep.
    else {
        Base <- c("")
        for (i in vals) if (length(i) == 1) 
            Base <- paste(Base, i, sep = "")
        else Base <- paste(Base, "[", do.call(paste, as.list(c(i, 
            sep = ""))), "]", sep = "")
    }
    return(Base)
}

#' Complement
#'
#' Takes a string of neucleotides (ACTG) and returns the complement
#' (TGAC). This works on strings only, for DNAStringSet please refer
#' to the `Biostrings::complement` function. To transform IUPAC characters
#' into neucleotide form refer to \code{\link{IUPACtoBase}}.
#' @param string the input string of nucleotides 
#' @return The compliment composite string as an atomic Character
#' @examples
#' mT1:::.complement(mT1:::.IUPACtoBase("CANNTG"))
#' mT1:::.complement("CA[AC]TT[ACGT]GG")
.complement<-function (string){
    ## Possible base pairs + Brackets
    chars <- c("A", "G", "C", "T", "[", "]")
    names(chars) <- c("T", "C", "G", "A", "]", "[")
    ## Generate the complement
    paste(rev(sapply(strsplit(string, "")[[1]], function(x) {
        (chars[x])
    })), collapse = "")
}

#' Get JASPAR
#'
#' A function to be used in conjunction with the mT1_jaspar.
#' Pass `getJASPAR` a composite motif from the jaspar database
#' and it will return the name of the motif.
#' @param name A composite motif. IUPAC DNA characters only.
#' @return The name of the motif as an atomic character, if it came
#' from the jaspar database, otherwise NULL.
#' @examples
#' # load the required Jaspar motif data
#' a<-mT1_jaspar$names[1]
#' getJASPAR(a)
#' @export
getJASPAR<-function(name){
    with(mT1_jaspar,{
        ## make sure required global variables are loaded
        if(is.null(mT1_jaspar$jfid)||is.null(mT1_jaspar$jnames)){
            stop(paste0("jaspar.RData must be loaded. ",
                        "load(system.file(\"extdata\",\"jaspar.RData\"",
                        ",package=\"mT1\"))"))
        }
        ## Find appropriate jasapar final id from the name index
        ret<-jfid[jnames == name]
        ## Test if null, currently this step is redundant
        ifelse(is.null(ret),NULL,ret)
    })
}

#' Motif Distance
#'
#' Find the distances between two motifs within set of genomic ranges.
#' The subsets are represented by DNAStringSet from the Biostrings package.
#' If the index of genomic ranges for each motif is already known use
#' \code{\link{diffMotif}}. If the distribution of motifs are already known
#' look into mT1:::.motifDiffFromDens.
#' 
#' @param fasta  A DNAStringSet with a set of genomic ranges
#' @param motif1 A motif as an atomic Character of IUPAC characters
#' @param motif2 A motif as an atomic Character of IUPAC characters
#' @return a numeric vector represented by two columns, the first being
#' the index the second being the position of the motif within that index.
#' distance
#' @examples
#' ## load fasta file
#' fasta<-mT1_fasta
#' ## Find distances
#' distances<-motifDistance(fasta,"CANNTG","HGATAA")
#'
#' ## Determine the range of the motif locations and plot the
#' ## frequency
#' width<-fasta@@ranges@@width[1] # all fasta strings should be the same width
#' r<-seq(-width+1,width-1)
#' y<-combHeights(r,distances[,2])[[1]]
#' plot(r,y,main="CANNTG-HGATAA")
#' @export
motifDistance<-function(fasta,motif1,motif2){
    motifs<-c(motif1,motif2)
    ## Find the location of all motifs
    mloc<-lapply(motifs,function(x) grep(.IUPACtoBase(x), fasta))
    ## Find the location of all motif complements
    cloc<-lapply(motifs,function(x) grep(.complement(.IUPACtoBase(x)),fasta))
    names(mloc)<-motifs
    names(cloc)<-motifs
    ## Call diff motifs to find the difference distribution between motif1
    ## and motif2
    diffMotif(fasta,motifs,mloc,cloc)
}

#' Motif Difference
#' 
#' Find the distances between two motifs within a subset of the genome
#' represented by fasta. If the distribution of motifs are already known
#' look into mT1:::.motifDiffFromDens.
#' @param fasta A DNAStringSet
#' @param motifs The list of motif Character Atomics
#' @param mloc A list of indicies  of each motif in fasta
#' @param cloc A list of indicies of each composite motif in fasta
#' @param i Motif index 1
#' @param j Motif index index 2
#' @param min numerical atomic, minimum length of mloc[[i or j]]
#' @param combine What to do if two motifs occur in the same region
#' Merged|First
#' @return A numeric vector with two columns , the first being the
#' genomic index the second being the distance.
diffMotif<-function(fasta,motifs,mloc,cloc,i=1,j=2,min=200,combine="Merged"){
    ## Select the motifs from index i an and j
    n1<-motifs[[i]]
    n2<-motifs[[j]]
    ## Find the genomic regions that are shared between motifs
    sharedm<-intersect(mloc[[n1]],mloc[[n2]])
    sharedc<-intersect(cloc[[n1]],cloc[[n2]])    
    both<-intersect(sharedc,sharedm)
    ## if there are not enough overlaping locations return NA
    if(length(union(sharedm,sharedc))<min)
        return(NA);
    ## Find the position of each motif in genomic regions that are shared
    l1m<-gregexpr(.IUPACtoBase(n1), fasta[sharedm,],ignore.case=TRUE)
    l2m<-gregexpr(.IUPACtoBase(n2), fasta[sharedm,],ignore.case=TRUE)
    l1c<-gregexpr(.complement(.IUPACtoBase(n1)),
                  fasta[sharedc,],ignore.case=TRUE)
    l2c<-gregexpr(.complement(.IUPACtoBase(n2)),
                  fasta[sharedc,],ignore.case=TRUE)
    ## Locations under the same peak are combined in two ways
    ## First: Keep the smallest distance
    ## Merged: Keep all distances   
    if(combine == "First"){
        ## Determine the distance between motifs
        mh<-cbind(loc=sharedm,pos=mapply(function(a,b){
            x<-unique(c(outer(a,b,Vectorize(function(i,j) i-j))))
            x[which.min(abs(x))]},l1m,l2m))
        ch<-cbind(loc=sharedc,pos=mapply(function(a,b){
            x<-unique(c(outer(a,b,Vectorize(function(i,j) j-i))))
            x[which.min(abs(x))]},l1c,l2c))
        ## Ensure that regions that have a motif and its complement found only
        ## return a single distance
        p2<-ch[!ch[,1] %in% both,]
        p3<-mh[!mh[,1] %in% both,]        
        ret<-rbind(p2,p3)
        if(length(both)>0){
            p1<-cbind(both,apply(cbind(ch[ch[,1] %in% both,2],
                                       mh[mh[,1] %in% both,2]),1,
                                 function(x) x[which.min(abs(x))]))
            ret<-rbind(ret,p1)
        }
    }
    else if(combine == "Merged"){
        ## A function that takes the distances between motifs and returns
        ## a two column numerical array with the distances and their
        ## genomic index.
        keep<-function(l1,l2,fun,shared){
            pos<-mapply(function(a,b) {
                unique(c(outer(a,b,Vectorize(fun))))},l1,l2,SIMPLIFY=FALSE)
            lens<-lapply(pos,length)
            loc<-unlist(mapply(function(x,n){
                rep(x,n)} ,shared,lens,SIMPLIFY=FALSE))
            cbind(loc=loc,pos=unlist(pos))
        }
        ## Difference in motif length;
        dchar<-nchar(n2)-nchar(n1)
        ## Determine the distances between motifs
        mh<-keep(l1m,l2m,function(i,j){i-j},sharedm)
        ch<-keep(l1c,l2c,function(i,j){j-i+dchar},sharedc)
        ## Make sure palandromic motifs are not listed twice
        p2<-ch[!ch[,1] %in% both,]
        p3<-mh[!mh[,1] %in% both,]        
        x<-merge(as.data.frame(ch),as.data.frame(mh))
        ret<-rbind(do.call(cbind,c(x[order(x[,1]),])),p2,p3)
    }        
    colnames(ret)<-c("loc","pos")
    ## A function used to make sure that occurances where the two motifs
    ## overlap are set to 0. This prevents the occurrence of high valued
    ## variants such as CANNTG and CACCTG with a prefered distance of 0.
    clean<-function(df){
        x<-df[,2]
        change<- x>= (- nchar(n2)) & x<= nchar(n1)        
        df[(!change), ]
    }
    clean(ret[order(ret[,1]),])
}

#' Motif PDF
#'
#' Generate a distribution of locations for a single motif within a set
#' of genomic regions.
#' @param fasta A DNAStringSet from Biostrings.
#' @param motif An atomic Character containing only DNA IUPAC characters
#' @return A numeric vector with two columns , the first being the index
#' the second being the distance
#' @examples
#' ## load fasta file
#' fasta<-mT1_fasta
#' pdf<-motifPDF(fasta,"CANNTG")
#' x<-seq(1:100)
#' y<-combHeights(x,pdf[,2])[[1]]
#' ## plot the results
#' plot(x,y,main="CANNTG",ylab="Frequency",xlab="Index")
#' @export
motifPDF<-function(fasta,motif){
    ## Find the indicies of the genomic ranges where the motif and
    ## motif complement are present
    mloc<-grep(.IUPACtoBase(motif), fasta)
    cloc<-grep(.complement(.IUPACtoBase(motif)),fasta)
    ## Use findlocs to build the distribution.
    findLocs(fasta,mloc,cloc,motif)
}

#' Find Motif Locations
#'
#' Finds the position of motifs on each index of fasta. The results can
#' be used to build the emperical PDF of motif locations.
#' @param fasta A DNAStringSet
#' @param mloc fasta The indicies that have  motifs
#' @param cloc fasta The indicies that have compliment motifs
#' @param n1 The motif name, an Atomic Character of only IUPAC characters
#' @param combine If two motifs are found on the same index Merged|First
#' @return Two columns , the first being the instance the second being the
#' distance
findLocs<-function(fasta,mloc,cloc,n1,combine="Merged"){
    ## Find the positions of the individual motif n1
    l1m<-gregexpr(.IUPACtoBase(n1), fasta[mloc],ignore.case=TRUE)
    l1c<-gregexpr(.complement(.IUPACtoBase(n1)),fasta[cloc],ignore.case=TRUE)
    both<-intersect(mloc,cloc)
    ## The remainder of this function is very similar to motifDiffs
    ## applied to a single motif
    keep<-function(l,loc){
        pos=unlist(lapply(l,unique))
        len<-sapply(l,length)
        eloc<-unlist(mapply(function(x,n)rep(x,n),loc,len,SIMPLIFY=FALSE))
        cbind(loc=eloc,pos=pos)
    }
    if(combine == "First"){
        mh<-cbind(loc=mloc,sapply(l1m,function(x) x[which.min(abs(x))]))
        ch<-cbind(loc=cloc,sapply(l1c,function(x) x[which.min(abs(x))]))    
        p2<-ch[!ch[,1] %in% both,]
        p3<-mh[!mh[,1] %in% both,]
        ret<-rbind(p2,p3)
        if(length(both)>0){        
            p1<-cbind(both,apply(cbind(ch[ch[,1] %in% both,2],
                                       mh[mh[,1] %in% both,2]),1,
                                 function(x) x[which.min(abs(x))]))
            ret<-rbind(ret,p1)
        }
        colnames(ret)<-c("loc","pos")
    }
    else if(combine == "Merged"){        
        mh<-keep(l1m,mloc)
        ch<-keep(l1c,cloc)
        p2<-ch[!ch[,1] %in% both,]
        p3<-mh[!mh[,1] %in% both,]
        x<-merge(as.data.frame(ch),as.data.frame(mh))
        ret<-rbind(do.call(cbind,c(x[order(x[,1]),])),p2,p3)
        colnames(ret)<-c("loc","pos")
    }
    else{
        ## This option should be used if you want to sort through the
        ## occurances of motifs and their complements later.
        mh<-cbind(keep(l1m,mloc),0)
        ch<-cbind(keep(l1c,cloc),1)
        ret<-rbind(mh,ch)
        colnames(ret)<-c("loc","pos","dir")
    }        
    ret[order(ret[,1]),]        
}

#' Make Title
#'
#' A generic function that  Generates a title to be used for plotting
#' an object.
#' @param x The obj to make title from
#' @param ... possible properties
#' @rdname makeTitle
makeTitle<-function(x,...){
    UseMethod("makeTitle",x)
}


#' @return An Atomic Character with motif info
#' @param i The index of interest from print(x)
#' @rdname makeTitle
#' @method makeTitle mT1
#' @export
makeTitle.mT1<-function(x,i,...){
    with(x,{
        do.call(paste,append(lapply(combs[i,],function(x)
            motifs[x]),list(sep="-")))
    })
}

#' @method summary mT1
#' @export
summary.mT1<-function(object,...){
    main<-data.frame(t(apply(object$combs[!object$sig,],1,
                             function(i) object$motifs[i])))
    xa<-seq(-object$width+1,object$width-1)
    if(length(main)>0){
        colnames(main)<-c("motif1","motif2")
        mpv<-sapply(which(!object$sig),function(i)
            min(object$pvalue[[i]]))
        mpvl<-sapply(which(!object$sig),function(i)
            xa[which.min(object$pvalue[[i]])])
        return(data.frame(main,mpv,mpvl))
    }
    else
        return (NULL)
}

#' @method print mT1
#' @export
print.mT1<-function(x,...){
    #cat("mT1\n\n")
    cat("motifs:  ")
    cat(paste0(x$motifs[1],"\n"))
    cat(paste("        ",x$motifs[2:length(x$motifs)],collapse="\n"))
    cat("\n\n")
    cat(paste0("combs: ",length(x$combs), "\n"))
    cat(paste0("sufficent: ",sum(!x$sig)),"\n")
    main<-summary.mT1(x)
    print(main)
}

#' @method plot mT1
#' @export
plot.mT1<-function(x,motif1=NULL,motif2=NULL,i=NULL,...){
    if(!is.null(i)){
        main<-data.frame(t(apply(x$combs[!x$sig,],1,function(i) x$motifs[i])))
        motif1<-as.character(main[i,1])
        motif2<-as.character(main[i,2])
    }
    with(x,{        
        xa<-seq(-width+1,width-1)
        n1<-which(motifs == motif1)[1]
        n2<-which(motifs == motif2)[1]
        i<-union(which(combs[,1] == n1 & combs[,2] == n2),
                 which(combs[,1] == n2 & combs[,2] == n1))[1]
        par(mfrow=c(3,2))
        plot(density(dens[[n1]][,2]-width/2),main=motif1)
        plot(density(dens[[n2]][,2]-width/2),main=motif2)
        plot(xa,mp[[i]],type="l",ylab="p",xlab="Index",main="Convolution")
        plot(xa,hs[[i]],type="l",xlab="Index",ylab="Frequency",
             main=paste0(motifs[combs[i,1]],"-",motifs[combs[i,2]]),...)
        plot(combHeights(seq(0,max(hs[[i]])),
                         hs[[i]])[[1]],type="l",xlab="Height",
             ylab="Frequency",main="Freq")
        plot(xa,pvalue[[i]],type="l",ylab="p-value",xlab="Index",
             main="Tests",...)
    })
}

#' Motif Difference From Density
#'
#' If two set of motif locations and positions are known this function
#' can be used to determine the difference in motif location.
#' 
#' @param a The distribution of motif a
#' @param b The distribution of motif b
#' @param diff The difference in motif size between a and b
#' @return A two column vector with location indexes and positions
.motifDiffFromDens<-function(a,b,diff){
    ## todo: clean up .motifDiffFromDens, giving better names to t2 and t3
    af<-as.data.frame(a)
    bf<-as.data.frame(b)
    colnames(af)<-c("loc","a","dir")
    colnames(bf)<-c("loc","b","dir")
    t2<-merge(af[af$dir == 1,c(1,2)],bf[bf$dir == 1,c(1,2)])
    back<-data.frame(loc=t2[,1],pos=t2[,3]-t2[,2]+diff)
    t3<-merge(af[af$dir == 0,c(1,2)],bf[bf$dir == 0,c(1,2)])
    forw<-data.frame(loc=t3[,1],pos=t3[,2]-t3[,3])    
    ret<-rbind(back,forw)    
    #ret<-rbind(cbind(t3[,1],forw,0),cbind(t2[,1],back,1))
    #colnames(ret)<-c("loc","pos","dir")
    ret[!duplicated(ret),]
}



#' Motifs in Tandem with One Another
#'
#' Generates a mT1 object from DNAStringSet and a vector of motifs. A mT1
#' object can be used for the investigation of preferred motif positions
#' and identifying composite motifs. There must be at least 3 motifs being
#' compared. For the comparison of two motifs look at motifDiff. The purpose
#' of a mT1 object is to identify novel preferred distances from a large
#' set of motifs.
#' @param fasta The set of genomic locations as a DNAStringSet
#' @param motifs The list of three or more motifs being compared,
#' IUPAC chars only.
#' @param verbose A flag to print motifs as completed.
#' @param cl A cluster from makeForkCluster
#' @return A mT1 object
#' @examples
#' library(Biostrings) # needed to get fasta data from genomic co-ords
#' library(BSgenome.Hsapiens.UCSC.hg19) # genome
#' ## load a set of Jaspar motifs as strings
#' ## load(system.file("extdata","jaspar.RData",package="mT1"))
#' jaspar<-mT1_jaspar
#' ## Example set of peaks
#' ## load(system.file("extdata","peaks.RData",package="mT1"))
#' peaks<-mT1_peaks
#' ## Transform genomic co-ords into neucleotides
#' genome<-BSgenome.Hsapiens.UCSC.hg19
#' fasta<-getSeq(genome,peaks$chr,start=peaks$start,end=peaks$end)
#' 
#' ## Motifs to compare
#' motifs<-c("CANNTG","GATAA",jaspar$jsublM[1:2])
#' 
#' ## Find the preferred distances between `motifs` under the peaks
#' my_sampleMT1<-mT1(fasta,motifs)
#' 
#' ## plot based on string
#' plot(my_sampleMT1,"CANNTG","GATAA")
#' 
#' ## Plot based on printed order
#' print(my_sampleMT1)
#' plot(my_sampleMT1,i=1)
#' @export
mT1<-function(fasta,motifs,verbose=FALSE,cl=NULL){
    makePind<-function(l,n,name=NULL){
        if(!is.null(name))
            cat(name)
        if(l<n){
            pind<-seq(l)
            per <- 1/(l-2)*100
            pb <- txtProgressBar(style=3,width=60,title=name)
        }
        else{
            pind<-c(ceiling(seq(1,l-ceiling(l/(n+1)),by=l/(n+1))),l)
            per<-1/n*100
            pb <- txtProgressBar(style=3,width=60,label=name)
        }
        return(list(pind=pind,per=per,pb=pb,go=TRUE,name=name))
    }
    checkPind<-function(complete,pind){
        if(pind$go){
            perc<-(which(complete ==pind$pind)-1)*pind$per
            if(length(perc)==1){                    
                setTxtProgressBar(pind$pb, perc/100,label=pind$name)
                if(which(complete==pind$pind)==length(pind$pind) |
                   perc/100==1){
                    close(pind$pb)
                    pind$go<-FALSE
                }

            }
        }
        pind
    }
    ## Make sure motifs are unique
    tmots<-unique(motifs)
    if(length(tmots)!=length(motifs)){
        warning("motifs must be unique")
        motifs<-tmots
    }
    ## Ensure that there are 3 motifs at least. For the comparison of
    ## two motifs refer to motifDiff    
    if(length(motifs)<3){
        warning("must provide mT1 with more than 2 motifs")
        return (NA)
    }
    ## find which fasta indicies have the motifs of interest
    if(verbose){
        pind<-makePind(length(motifs),20,"Finding motif co-ords ...\n")
    }
    mloc<-lapply(motifs,function(x){
        if(verbose){
            pind<-checkPind(which(motifs==x),pind)
        }
        grep(.IUPACtoBase(x), fasta)})
    if(verbose){
        pind<-makePind(length(motifs),20,"Finding complement co-ords ...\n")
    }
    cloc<-lapply(motifs,function(x){
        if(verbose){
            pind<-checkPind(which(motifs==x),pind)
        }
        grep(.complement(.IUPACtoBase(x)),fasta)})
    names(mloc)<-motifs
    names(cloc)<-motifs
    ## Determine the individual PDFs for each motif
    if(verbose){
        pind<-makePind(length(motifs),20,"Finding motif locations ...\n")
    }
    a<-lapply(motifs,function(x){
        if(verbose){
            pind<-checkPind(which(motifs==x),pind)
        }
        findLocs(fasta,mloc[[x]],cloc[[x]],x,"FALSE")})
    ## combination of all motifs
    tofind<-t(combn(seq(length(motifs)),2))

    if(verbose){
        pind<-makePind(dim(tofind)[1],20,
                       paste0("Difference Analysis Begining.",
                              " Searching through ",dim(tofind)[1],
                              " combinations\n"))        
    }
    t1<-apply(tofind,1,function(x){        
        if(verbose){
            checkPind(which(tofind[,1]==x[1] &  tofind[,2]==x[2]),pind)
        }
        m1<-a[[x[1]]]
        m2<-a[[x[2]]]
        diff<-nchar(motifs[x[2]])-nchar(motifs[x[1]])
        if(length(intersect(m1[,1],m2[,1]))>300){
            y<-.motifDiffFromDens(m1,m2,diff)        
            change<- y[,2]>= (- nchar(motifs[x[2]])) &
                y[,2]<= nchar(motifs[x[1]])
            y<-y[(!change), ]
        }
        else
            y<-NA
        y
    })
    ## Which indicies have values
    large<-sapply(t1,function(x) all(unlist(is.na(x))))
    mT1<-list(diff=t1,dens=a,sig=large,motifs=motifs,combs=tofind)
    ## determine the p-values for each motif comb
    ## If biostrings is loaded nchar(fasta) will be equivalent
    ## Check this after any update to IRanges
    width<-fasta@ranges@width[1]
    ## Determine probabilities
    if(verbose){
        pind<-makePind(dim(tofind)[1],20, "Finding P-Values ...\n")
    }
    prob<-apply(cbind(tofind,seq(dim(tofind)[1])),1,
                function(x){
                    if(verbose)
                        checkPind(x[3],pind)                    
                    .ePD(t1[[x[3]]],a[[x[1]]][,2],a[[x[2]]][,2],
                                 width,motifs[[x[2]]])})
    if(verbose)
        cat("Finalizing mT1 object ...\n")
    ## Build and return the mT1 object
    mT1<-append(mT1,
                list(fasta=fasta,width=width,hs=lapply(prob,function(x) x[["hs"]]),
                mp=lapply(prob,function(x) x[["mp"]]),
                pvalue=lapply(prob,function(x) x[["pvalue"]])))
    ## Set class type mT1
    attr(mT1,"class")<-"mT1"
    mT1
}

#' @method c mT1
#' @export
c.mT1<-function(...,recursive=FALSE){
    wl<-list(...)
    if(length(wl)<2)
        return (wl[[1]])
    ## this function can be greatly simplified by looping over
    ## c rbind and first
    toRbind<-c("combs")
    toC<-c("diff","dens","sig","motifs","hs" ,"mp","pvalue")
    toTest<-c("fasta","width")
    res<-list()
    for(n in toC){        
        res[[n]]<-do.call(c,lapply(wl,function(x){
            if(is.null(x[[n]]))
                stop(paste0(n," not found in mT1"))
            x[[n]]}))
    }
    for(n in toRbind){
        res[[n]]<-do.call(rbind,lapply(wl,function(x){
            if(is.null(x[[n]]))
                stop(paste0(n," not found in mT1"))
            x[[n]]}))
    }
    fasta<-lapply(wl,function(x) x[["fasta"]])
    if(!Reduce(function(a,b)a == all(b == fasta[[1]]),
               init=TRUE,x=fasta))
        stop("different genomic ranges")
    width<-unlist(lapply(wl,function(x) x[["width"]]))
    if(any(!(width[1] == width)))
        stop("different widths")
    else{        
        width <- width[1]
        
    }
    first<-function(...){
        lapply(list(...),function(x) x[[1]])
    }
    for(n in toTest){
        res[[n]]<-do.call(first,lapply(wl,function(x){
            if(is.null(x[[n]]))
                stop(paste0(n," not found in mT1"))
            x[[n]]}))
    }
    res$width<-width
    attr(res,"class")<-"mT1"
    return( res)
}

#' Add Motif
#'
#' Takes a motif and creates a new object based on the class of object
#'
#' @param object A object with an addMotif method
#' @param motif A Atomic Character containing IUPAC characters
#' @param ... Variables to pass to object.
#' @return mT1 object with new motif
#' @rdname addMotif
#' @examples
#' new<-addMotif(mT1_sampleMT1,"CACCTG")
#' @export
addMotif<-function(object,motif,...){
    UseMethod("addMotif",object)
}

#' @rdname addMotif
#' @method addMotif default
#' @export
addMotif.default<-function(object,motif,...){
    motif
}

#' @rdname addMotif
#' @method addMotif mT1
#' @export
addMotif.mT1<-function(object,motif,...){
    ## Much of this function is shared with mT1
    ## I need to extract the similarities and move them
    ## to a set of private mT1 helper functions
    return ( with(object,{
        mloc<-grep(.IUPACtoBase(motif), fasta)
        cloc<- grep(.complement(.IUPACtoBase(motif)),fasta)        
        a<-findLocs(fasta,mloc,cloc,motif,"FALSE")
        nm1<-length(motifs)
        tofind<-cbind(nm1+1,seq(nm1))
        t1<-apply(tofind,1,function(x){
                m1<-a
                m2<-dens[[x[2]]]
                diff<-nchar(motifs[x[2]])-nchar(motif)
                if(length(intersect(m1[,1],m2[,1]))>300){
                    y<-.motifDiffFromDens(m1,m2,diff)        
                    change<- y[,2]>= (- nchar(motifs[x[2]])) &
                        y[,2]<= nchar(motif)
                    y<-y[(!change), ]
                }
                else
                    y<-NA
                y
        })
        prob<-apply(cbind(tofind,seq(dim(tofind)[1])),1,
                    function(x) .ePD(t1[[x[3]]],a[,2],dens[[x[2]]][,2],
                                     width,motifs[[x[2]]]))
        large = sapply(t1,function(x) all(unlist(is.na(x))))
        ret<-list(diff=t1,dens=list(a),sig=large,motifs=motif,combs=tofind,
             width=width,
             fasta=fasta,
             hs=lapply(prob,function(x) x[["hs"]]),
             mp=lapply(prob,function(x) x[["mp"]]),
             pvalue=lapply(prob,function(x) x[["pvalue"]]))
        class(ret)<-"mT1"
        c(object,ret)
    }))
}

#' Binomial Test
#'
#' A wrapper around binom.test that ensures that n!=0 and that k is at
#' least a minimum number.
#' @param k count
#' @param n total
#' @param p prob
#' @param kmin minimum k cut off
#' @param nmin minimum n cut off
#' @return a atomic numeric p-value
#' @examples
#' ## How many times are two motifs found D base pairs apart
#' frequencyAtD<-10
#' ## How may times are two motifs found any D appart
#' possibleAtD<-200
#' ## If one motif distance was computed what is the probability of it
#' ## being D
#' probabilityAtD<-0.03
#' btest(frequencyAtD,possibleAtD,probabilityAtD)
#' @export
btest<-function(k,n,p,kmin=10,nmin=600){
    if(n<nmin) # n must be greater than 0
        return(0)
    ## binom.test needs to be speed up, half of the time is spent here
    else if(k>kmin){ # ensure that there are more k than min
        ##return(log(binom.test(k,n,p)$p.value,10))
        return (log(dbinom(k,n,p),10))}
    else
        return(0)
}

#' Expected Motif Probability
#'
#' Determine the expected probability of finding two motifs at a specific
#' distance based on the empirical PDFs of the individual motifs
#' @param a numerical vector PDF of motif a
#' @param b numerical vector PDF of motif b
#' @param width width of fasta indicies
#' @param nb Difference in length between motif a and motif b
#' @return expectation for each motif distance 
#' @export
eMP<-function(a,b,width,nb=0){
    refl<-function(x,width,sizex){        
        abs(x-width+sizex-2)
    }
    y<-combHeights(seq(1,width),a,refl(b,width,nb))
    mod<-convolve(y[[1]],y[[2]],type="open")
    mod[mod<0]<-0
    mod/sum(mod)
}

#' Expected Probability Distance
#'
#' Determine the pvalue from a vector of distances and PDFs from the
#' two motifs being compared.
#' @param t1 A width 2 numerical vector of indicies and distances
#' @param a A numerical vector PDF of motif a
#' @param b A numerical vector PDF of motif b
#' @param width A numeric atomic, the width of fasta indicies, should
#' @param ma String motif A
#' @param mb String motif B
#' be uniform
#' @return list(hs,mp,p-values)
.ePD<-function(t1,a,b,width,mb){
    ## make sure there are rows in t1
    if(any(is.na(t1))){        
        return(list(hs=NA,mp=NA,pvalue=NA))
    }
    hs<-combHeights(seq((-1* width+1),width-1),t1[,2])[[1]]
    n<-sum(hs)
    if(max(hs)>n){
        stop("n >hs")
    }
    mp<-eMP(a,b,width,nchar(mb))
    pvalue<-dbinom(hs,n,mp,log=TRUE)
    pvalue[hs<1]<-0
    list(hs=hs,mp=mp,pvalue=pvalue)
}

#' Combination Heights
#'
#' Determines the number of occurrences in each member of ... along
#' sequence x.
#' @param x A sequence of possible values represented as indicies
#' @param ... The vectors of values to be mapped to x and summed
#' @return List with length of x range
#' @examples
#' x<-seq(20)
#' y<-runif(200,1,20)
#' par(mfrow=c(1,2))
#' plot(y)
#' plot(x,combHeights(x,y)[[1]])
#' @export
combHeights<-function(x,...){
    if(length(as.list(...)) == 0){
        warning("combHeights should be passed one additional variable")
        return(NA)
    }
    min<-min(x)
    max<-max(x)
    lapply(list(...),function(h){        
        y<- rep(0,length(x))
        for(i in h){
            if(i>=min & i<=max)
                y[i-min +1]<-y[i-min +1]+1
        }
        y
    })
}
alexjgriffith/mT1 documentation built on May 10, 2019, 8:53 a.m.