R/GenomicProfileGenericFunctions.R

Defines functions .what.is.geneRef .what.is.chromatinState .what.is.goodnessOfFit .what.is.ChIPScore .what.is.occupancy .what.is.predictedProfile .internalUnlist .cleanUpAfterYourself .unpackMetrics .optimalExtraction .OptimalFormatConversion .allMetrics .geometricRatio .peakExtractionFScore .meanGoFScore .cleanGoF .AccessExtract searchSites .generateChIPProfile .generateChIPProfileRcpp .computeBPFrequency .updateGenomicProfiles .is.parameterOptions .is.genomicProfiles .scoreDNAStringSet .getIndexOfPWMThresholded .parsePFMFromFile .parseJASPARPFM .parseTransfacPFM .parseRawPFM .parsePFM .computePWMFromPFM .computePWM

Documented in searchSites

### Computes the PWM matrix from the PFM matrix
### PFM: the PFM matrix from file
### pseudocount=1: the pseudocount used to compute the PWM
### BPFrequency=rep(0.25,4): a vector with the genome wide
###frequencies of the four nucleotides.
### naturalLog=FALSE: if this is TRUE the method uses the natural logarithm
####when computing the PWM (Berg and von Hippel, 1987),
####otherwise the method uses logarithm in base 2 (Stormo, 2000)
### noOfSites=NULL: the number of DNA sequences used to compute the PFM.
###If NULL it will be computed based on the PFM
### return: a 4 row matrix of the PWM, with rows: A, C, G, T
.computePWM <- function(PFM, pseudocount, BPFrequency,naturalLog,noOfSites){
    if(min(PFM) < 0){
    #the provided PFM is the actual PWM
        PWM <- PFM
    } else{
    #get the sum on each colum (it should be equal)
        if(noOfSites == 0){
            noOfSites <- max(apply(PFM,2,sum))
    }
    #normalise the pfm
    for(i in seq_len(ncol(PFM))){
        PFM[,i] <- PFM[,i]/max(apply(PFM,2,sum))
    }

    #generate the pwm based on the pfm
    PWM <- .computePWMFromPFM(PFM,max(noOfSites), pseudocount,
        BPFrequency,naturalLog)
    }


    return(PWM);
}



### Computed the PWM matrix using the PFM matrix
### PFM: the PFM matrix
### noOfSites: the total number of sites used to compute the PFM matrix
### pseudocount=1: the pseudocount used to compute the PWM
### BPFrequency=rep(0.25,4): a vector with the genome wide
###frequencies of the four nucleotides .
### naturalLog=FALSE: if this is TRUE the method uses the natural
###logarithm  when computing the PWM (Berg and von Hippel, 1987),
###otherwise logarithm  in base 2 is used (Stormo, 2000)
### return: a 4 row matrix of the PWM, with rows: A, C, G, T
.computePWMFromPFM <- function(PFM,noOfSites, pseudocount,
    BPFrequency,naturalLog){
    PWM <- matrix(0,nrow = nrow(PFM),ncol = nrow(PFM))
    PWM <- PFM
    for(i in seq_len(nrow(PFM))){
        for(j in seq_len(ncol(PFM))){
            PWM[i,j] <- (PFM[i,j]*noOfSites+BPFrequency[i]*pseudocount)/
                (noOfSites+pseudocount)
            if(naturalLog){
                PWM[i,j] <- log(PWM[i,j]/BPFrequency[i])
            } else{
                PWM[i,j] <- log2(PWM[i,j]/BPFrequency[i])
            }
        }
    }
    return(PWM)
}

### Attempts to parse a file and extract the PFM. It uses the
###following formats: RAW, TRANSFAC, Jaspar or constructs one from sequences.
###For the correct formats see
###http://www.benoslab.pitt.edu/stamp/help.html#input
### filename: the file storing the motif
### format="raw": the format of the PWM: raw, transfac, jaspar,
###seqs (list of binding sequences)
### return: a PFM matrix, with rows: A, C, G, T
.parsePFM <- function(filename, format){
    PFM <- NULL
    if(format == "raw"){
        PFM <- .parseRawPFM(filename)
    }
    if(format == "transfac"){
        PFM <- .parseTransfacPFM(filename)
    }
    if(format == "JASPAR"){
        PFM <- .parseJASPARPFM(filename)
    }
    if(format == "sequences"){
        PFM <- .parsePFMFromFile(filename)
    }
    #transpose the PFM/PWM if required
    if(nrow(PFM) != 4){
        if(ncol(PFM) == 4){
            PFM <- t(PFM);
        } else{
        stop("Could not load PFM/PWM of 4 rows or columns. Wrong PFM/PWM")
        }
    }
    return(PFM)
}


### Extracts the PFM matrix from a raw PSSM style file.
###It also works for MEME output
### filename: the file containing the motif
### return: the PFM matrix
.parseRawPFM <- function(filename){
    buffer <- read.table(filename,row.names = NULL,stringsAsFactors = FALSE)
    if(ncol(buffer) == 4){
        PFM <- t(buffer)
        rownames(PFM) <- c("A","C","G","T")
    } else{
        PFM <- NULL
    }
    return(PFM)
}


### Extracts the PFM matrix from a transfac style file.
### filename: the file containing the motif
### return: a 4 row PFM matrix, with rows: A, C, G, T
.parseTransfacPFM <- function(filename){
    buffer <- read.table(filename,row.names = NULL,stringsAsFactors = FALSE)
    if(ncol(buffer) > 4){
        PFM <- t(buffer[,2:5])
        if(is.character(PFM[,1])){
        PFM <- PFM[,2:ncol(PFM)]
        }
        rownames(PFM)=c("A","C","G","T")
    } else{
    PFM <- NULL
    }
    return(PFM)
}

### Extracts the PFM matrix from a jaspar style file
### filename: the file containing the motif
### return: a 4 row PFM matrix, with rows: A, C, G, T
.parseJASPARPFM <- function(filename){
    buffer <- readLines(filename)
    error <- FALSE
    for(i in seq_along(buffer)){
        if(!error){
            buffer[i] <- sub("[","",buffer[i],fixed=TRUE)
            buffer[i] <- sub("]","",buffer[i],fixed=TRUE)
            cells <- strsplit(buffer[i]," +",fixed=FALSE)[[1]]

            if(length(cells) > 1){
                values <- as.numeric(cells[2:(length(cells))])
                if(i == 1){
                    PFM <- values
                } else{
                    PFM <- rbind(PFM,values)
                }

            } else{
                error <- TRUE
            }
        }
    }
    if(error){
        PFM <- NULL
    } else {
        rownames(PFM) <- c("A","C","G","T")
    }
    return(PFM)
}




### Reads a list of sequences from a file and produces the PFM mtrix;
### fileame: the filename with the sequences
### return: a 4 row PFM matrix, with rows: A, C, G, T
.parsePFMFromFile <- function(fileame){
    sequences <- readDNAStringSet(fileame)
    if(!is.null(sequences) & length(sequences) > 0){
        PFMExtended <- consensusMatrix(sequences,as.prob = TRUE,
            baseOnly = TRUE)
        PFM <- PFMExtended[1:4,]
    } else{
        PFM <- NULL
    }
    return(PFM)
}

### generates a list of all PWM scores that are above a certain threshold
### PWMScore: a list a PWM scores vectors for each DNA segments
### PWMThreshold: the PWM score threshold
### minPWMScore: the min PWM score
### maxPWMScore: the max PWM score
### return: a list consisting of a vector with the indexes of the PWM
###scores higher than a threshold for each chromosome (or locus)
.getIndexOfPWMThresholded <- function(PWMScore, PWMThreshold){
    indexes <- vector("list",length(PWMScore))
    names(indexes) <- names(PWMScore)
    for(i in seq_along(PWMScore)){
        indexes[[i]] <- which(PWMScore[[i]] > PWMThreshold)
    }
    return(indexes)
}




.scoreDNAStringSet <- function(PWM,DNASequenceSet, strand="+",
    strandRule="max"){
    scoreSet <- vector("list", length(DNASequenceSet))
    IndexPositive <- vector("list",length(DNASequenceSet))
    IndexNegative <- vector("list",length(DNASequenceSet))
    names(scoreSet) <- names(DNASequenceSet)
    for(i in 1:length(DNASequenceSet)){
        sequenceLength <- (length(DNASequenceSet[[i]]) - ncol(PWM) + 1)
        scorePositive <- NULL
        scoreNegative <- NULL

        if(length(grep("+",strand))){
        ## Warnings from BioStrings Package
        scorePositive <- PWMscoreStartingAt(PWM,DNASequenceSet[[i]],
            starting.at = seq_len(sequenceLength))
        }

        if(length(grep("-",strand))){
            scoreNegative <- rev(PWMscoreStartingAt(PWM,
                reverseComplement(DNASequenceSet[[i]]),
                starting.at = seq_len(sequenceLength)))
        }

        if(is.null(scorePositive) & is.null(scoreNegative)){
            # no strand is specified so consider the positive strand
            scoreSet[[i]] <- PWMscoreStartingAt(PWM,DNASequenceSet[[i]],
                starting.at = seq_len(sequenceLength))
        } else if(!is.null(scorePositive) & !is.null(scoreNegative)){
        # both strands are specified
            if(strandRule == "sum"){
                scoreSet[[i]] <- scorePositive + scoreNegative
            } else if(strandRule =="mean"){
                scoreSet[[i]] <- (scorePositive + scoreNegative)/2
            } else{
                scoreSet[[i]] <- pmax(scorePositive,scoreNegative)
                IndexPositive[[i]] <- which(scorePositive ==
                    pmax(scorePositive,scoreNegative))
                IndexNegative[[i]] <- which(scoreNegative ==
                    pmax(scorePositive,scoreNegative))

            }
        } else {
            if(!is.null(scoreNegative)){
                scoreSet[[i]] <- scoreNegative
            } else{
                scoreSet[[i]] <- scorePositive
            }
        }
    }
    return(list(scoreSet,IndexPositive,IndexNegative))
}


# Validity check function for Genomic Profile Parameters
.is.genomicProfiles<- function(object){
    if(is(object,"genomicProfiles")){
        return(TRUE)
    } else {
        return(FALSE)
    }
}

# Validity check function for occupancy Profile Parameters
.is.parameterOptions <- function(object){
    if(is(object,"parameterOptions")){
        return(TRUE)
    } else {
        return(FALSE)
    }
}

.updateGenomicProfiles<-function(genomicProfiles,parameterOptions){
    slots<-slotNames(parameterOptions)
    localEnvir<-environment()
    lapply(slots,function(slots){

           if(!any(slot(genomicProfiles,slots)%in%slot(parameterOptions,slots))){
           slot(genomicProfiles,slots,check=TRUE)<-slot(parameterOptions,slots)
           assign("genomicProfiles",genomicProfiles,envir=localEnvir)
         }})
    return(genomicProfiles)
}

# Extracting and computing Base Pair frequency from BSGenome when
#contructing genomicProfileParamter
.computeBPFrequency <- function(object){
    if(is(object,"BSgenome")){
    object <- getSeq(object)
    }
    if(is(object,"DNAStringSet")){
    object <- object
    }
    if(length(object)>1){
    BPFrequency <- apply((alphabetFrequency(object,baseOnly=TRUE)[,1:4]),
    2,sum)
    BPFrequency <- BPFrequency/sum(as.numeric(BPFrequency))
    } else {
    BPFrequency <- (alphabetFrequency(object,baseOnly=TRUE)[,1:4])
    BPFrequency <- BPFrequency/sum(as.numeric(BPFrequency))
    }
    return(BPFrequency)
}
# Compute Thetha from MSE and Corr
#Generate chip Profile
.generateChIPProfileRcpp <- function(inputVector, mean, sd, smooth = NULL,
    norm = TRUE, peakSignificantThreshold=NULL){

    if(is.null(peakSignificantThreshold)){
        peakSignificantThreshold <- 0.0001
    }
    originalMax=max(inputVector);

    var = sd^2;
    shp = mean^2/var
    scl = var/mean
    l = length(inputVector)


    f = dgamma(0:length(inputVector), shape = shp, scale = scl)
    bigF = rev(cumsum(rev(f)))
    peakSignificantSize = min(which(bigF<peakSignificantThreshold))

    kernel = c(rev(bigF[seq(2,peakSignificantSize)]),
    bigF[seq_len(peakSignificantSize)])
    leftKernel=(floor(length(kernel)/2) - 1)
    rightKernel=length(kernel)-leftKernel-1

    rawVector = inputVector;
    rawVector[which(inputVector <= mean(inputVector))]=0
    rawVector = c(rep(0,times=leftKernel), rawVector, rep(0,times=rightKernel))

    peaks = roll_sum(rawVector, length(kernel), weights = kernel)

    if(!is.null(smooth)){
        leftKernel=(floor(smooth/2) - 1)
        rightKernel=smooth-leftKernel-1
        peaks = roll_mean(c(rep(0,times=leftKernel),peaks,
        rep(0,times=rightKernel)), smooth, weights = rep(1, smooth))
    }

    #print((kernel))

    if(norm && max(peaks)!=0){
        peaks=originalMax*peaks/max(peaks)
    }
    peaks <- c(peaks[2:length(peaks)],0)
    return(peaks)
}

#Generate chip Profile
.generateChIPProfile <- function(inputVector, mean, sd,
    smooth = NULL, norm = TRUE, quick = TRUE,peakSignificantThreshold=NULL){

    if(is.null(peakSignificantThreshold)){
        peakSignificantSize <- 1250
    }

    originalMax <- max(inputVector)

    var <- sd^2
    shp <- mean^2/var
    scl <- var/mean
    l <- length(inputVector)



    f <- dgamma(0:length(inputVector), shape = shp, scale = scl)
    gamma <- rev(cumsum(rev(f)))
    sp <- c(rev(gamma[seq(2,peakSignificantSize)]),
    gamma[seq_len(peakSignificantSize)])
    lp <- length(sp)
    hp <- (lp - 1)/2

    peak.centres <- which(inputVector > mean(inputVector))
    peaks <- vector("numeric", l)

    if(!quick){
        for(pc in peak.centres) {
            thisPeak <- vector("numeric", l)
            thisPeak[pc:l] <- gamma[1:(l-pc+1)]
            if(pc!=1){
                thisPeak[1:(pc-1)] <- gamma[pc:2]
            }
            peaks <- peaks + thisPeak * inputVector[pc]
        }
    } else {
        for(pc in peak.centres) {
            thisPeak <- sp
            left <- pc - hp
            right <- pc + hp
            cutLeft <- max(0, 0 - left)
            cutRight <- (-min(0, l - right))
            if(cutLeft != 0){
                thisPeak <- thisPeak[-(1:(cutLeft+1))]
            }
            if(cutRight != 0){
                thisPeak <- thisPeak[-((lp-cutRight+1):lp)]
            }
            left <- max(1, left)
            right <- left + length(thisPeak) - 1
            peaks[left:right] <- peaks[left:right] + thisPeak * inputVector[pc]
        }
    }

    if(!is.null(smooth)){
        if((smooth %% 2) == 0){
            smooth <- smooth - 1
        }
        mid <- round(smooth/2,0) + 1
        d <- smooth - mid
        for(i in mid:(length(peaks) - d)) {
            peaks[i] <- mean(peaks[(i-d):(i+d)])
        }
    }


    if(norm && max(peaks) != 0){
        peaks <- originalMax*peaks/max(peaks)
    }

    return(peaks)
}


## Search sites in occupancy and Chip if a lot of them
## wow that is some good english right there

searchSites <- function(Sites,lambdaPWM="all",
    BoundMolecules="all",Locus="all"){

    if(!is.null(names(Sites)) & all(names(Sites) %in%c("Optimal","Occupancy","ChIPProfiles","goodnessOfFit"))){

       bufferSites<-list("Optimal"=Sites[[1]],
                         "Occupancy"=searchSites(unname(Sites$Occupancy),lambdaPWM=lambdaPWM,
                                                 BoundMolecules=BoundMolecules,
                                                 Locus=Locus),
                         "ChIPProfiles"=searchSites(unname(Sites$ChIPProfile),lambdaPWM=lambdaPWM,
                                                  BoundMolecules=BoundMolecules,
                                                  Locus=Locus),
                          "goodnessOfFit"=searchSites(unname(Sites$goodnessOfFit),lambdaPWM=lambdaPWM,
                                                  BoundMolecules=BoundMolecules,
                                                  Locus=Locus))
       return(bufferSites)
    }else if(is(Sites,"genomicProfiles") | is(Sites,"list")){

        if(is(Sites,"genomicProfiles")){
            Sites <- profiles(Sites)
        }






    #Setting Up for search

    lambda <- "lambda = "
    bound <- "boundMolecules = "

    buffer <- c()
    bufferSites <- Sites

    #Search for Scaling Factor
    if(all(lambdaPWM=="all")){
        bufferSites <- bufferSites
        buffer <- 0
    } else {
        localNames <- sapply(strsplit(sapply(strsplit(names(bufferSites)," &"),
        "[[",1),lambda),"[[",2)
        for( i in seq_along(lambdaPWM)){
            buffer <- c(buffer,grep(paste0("^",lambdaPWM[i],"$"),
            localNames))

        }

        if(length(buffer)>=1){
        bufferSites <- bufferSites[buffer]
        }
    }

    #Search for boundMolecules
    buffer <- c()
    if(all(BoundMolecules=="all")){
        bufferSites <-bufferSites
        buffer <- 0
    } else {
        localNames <- sapply(strsplit(names(bufferSites),bound),"[[",2)
        for( j in seq_along(BoundMolecules)){
            buffer <- c(buffer, grep(paste0("^",as.character(as.integer(BoundMolecules[j])),"$"),
            as.character(as.integer(localNames))))

        }
#browser()
        if(length(buffer)>=1){
            bufferSites <- bufferSites[buffer]
        }
    }
    #Search for Loci

    if(all(Locus == "all")){
        bufferSites <- bufferSites
    } else {
        if(is(bufferSites,"list")){
        for( k in seq_along(bufferSites)){
            LocalLocus <- names(bufferSites[[k]])
            buffer <- which(LocalLocus==Locus)
            bufferSites[[k]] <- bufferSites[[k]][buffer]
        }
        } else {
            LocalLocus <- names(bufferSites)
            buffer <- which(Locus %in% LocalLocus)
            bufferSites <- bufferSites[buffer]
        }
    }
    return(bufferSites)
  } else{
    stop("Oops Something went wrong. We are not sure what you are parsing to Sites")
  }
}

### Extracting Non Accesible DNA
.AccessExtract<-function(subject,query,isCS=FALSE){
    setLocal<-vector("list",length(subject))

    for(i in seq_along(subject)){
        localIntersect<-setdiff(subject[i], query)
        if(isCS){
          setLocal[[i]]<-data.frame("chr"=as.character(seqnames(localIntersect)),
                                    "start"=start(localIntersect),
                                    "end"=end(localIntersect),
                                    "CS" = mcols(localIntersect)[,1])
        }else {
          setLocal[[i]]<-data.frame("chr"=as.character(seqnames(localIntersect)),
                                    "start"=start(localIntersect),
                                    "end"=end(localIntersect))
        }

    }
    names(setLocal)<-names(subject)
    return(setLocal)
}


#### building a skelton object for accuracy estimates

.cleanGoF <- function(genomicProfiles,ValidationSet,stepSize=10){
     #just in case it needs some reordering

     ValidationSet<-ValidationSet[match(names(genomicProfiles[[1]]),names(ValidationSet))]

     setUpGoF<-lapply(genomicProfiles,function(gp,chip,stepSize){

                      predicted <-lapply(gp,function(x){
                                         return(x$ChIP)})

                      idx<-lapply(gp,function(x){
                                  return(floor(seq(1,sum(width(x)),length.out=length(x))))})
                      validation<-mapply(function(x,idx){
                                  return(x[idx])},x=chip,idx=idx,SIMPLIFY=FALSE)
                      ## internalCheck
                      internalCheck <- all.equal(sapply(predicted,length),sapply(validation,length))
                      if(!internalCheck){
                         stop("predicted and valdiation set are not the same length")
                      }

                      paralleValues<-mapply(function(x,y){
                                     local<-list("prediction"=x,"validation"=y)},
                                     predicted,validation,SIMPLIFY=FALSE)
                      return(paralleValues)
       },ValidationSet,stepSize)

       return(setUpGoF)

}


.meanGoFScore <-function(GoF){
    tags<-names(GoF[[1]])
    idx<-seq_along(tags)

    meanGoF <- sapply(idx,function(idx,GoF){
                      res<-mean(sapply(GoF,"[[",idx),na.rm=TRUE)
                      return(res)
    },GoF=GoF)
    names(meanGoF)<-paste0(tags,"Mean")


    for(i in seq_along(GoF)){
       GoF[[i]]<-c(GoF[[i]],meanGoF)
    }
    return(GoF)
}



## Fscore computation for all regions
.peakExtractionFScore<-function(predicted,locusProfile,region=TRUE){
    # making sure that they are same length just for idx purposes
    if(length(predicted)!=length(locusProfile)){
        stop("precited Profile and locusProfile are not the same length")
    }
    ## threshold increments
    high<-max(max(predicted),max(locusProfile))
    low<-min(min(predicted),min(locusProfile))
    if(low==0){
        subs<-seq(low,high,length.out=21)
        subs<-sapply(subs,"^",2)[seq(2,21)]
    } else{
        subs<-seq(low,high,length.out=20)
        subs<-sapply(subs,"^",2)
    }

    subprof<-matrix(0,ncol=11,nrow=length(subs))

    #rownames(paste0("thresh=",subs))
    #browser()
    matpred<-matrix(0,ncol=length(subs), nrow=length(locusProfile))
    matloc<-matrix(0,ncol=length(subs), nrow=length(locusProfile))

    for(i in seq_along(subs)){

        localProfile<-rep(0,length(locusProfile))
        ## thresh extraction

        localProfile[locusProfile>=subs[i]]<-1
        if(all(localProfile==1)){
             localProfile[1]<-0
        }
        if(all(localProfile==0)){
             localProfile[1]<-1
        }
        matpred[,i]<-factor(predicted)
        matloc[,i]<-factor(localProfile)
    }

    ## Predictions and performance
    pred<-prediction(matpred,matloc)
    prec<-mean(sapply(performance(pred,"prec")@y.values,mean,na.rm=T))
    rec<-mean(sapply(performance(pred,"rec")@y.values,mean,na.rm=T))
    fscore<-mean(sapply(performance(pred,"f")@y.values,mean,na.rm=T))
    acc<-mean(sapply(performance(pred,"acc")@y.values,mean,na.rm=T))
    mcc<-mean(sapply(performance(pred,"mat")@y.values,mean,na.rm=T))

    auc<-mean(sapply(performance(pred,"auc")@y.values,mean,na.rm=T))


    return(c("precision"=prec,"recall"=rec,"f1"=fscore,"accuracy"=acc,"MCC"=mcc,"AUC"=auc))
}

##### geometricRatio

.geometricRatio<-function(predcited,locusProfile,step){
       diff<-abs(locusProfile-predcited)
       leftEdge<-diff[seq(1,length.out=length(diff)-1)]
       rightEdge<-diff[seq(2,length.out=length(diff)-1)]

       areaDiff<-rep(0,length(diff)-1)
       areaShared<-rep(0,length(diff)-1)
       area<-rep(0,length(diff)-1)

       for(i in seq_along(areaDiff)){
       ## Computing Diff and shared
          ## Diff
          areaDiff[i]<-step*((leftEdge[i]+rightEdge[i])/2)
          ## Shared
          if(locusProfile[i]>=predcited[i] & locusProfile[i+1]>=predcited[i+1]){
              areaShared[i]<-step*((predcited[i]+predcited[i+1])/2)
          } else {
              areaShared[i]<-step*((locusProfile[i]+locusProfile[i+1])/2)
          }
        }
        ## Considering zeroes
        if(sum(areaShared)!=0){
            area<-sum(areaDiff)/sum(areaShared)
        } else {

            area<-sum(areaDiff)
        }

    return(area)

}


### all metric extraction just to make things cleaner(should be using switch )

.allMetrics<-function(predicted,locusProfile,stepSize){
    # Checking for bullshit and doing some correlation stuff

    if((sd(predicted,na.rm=TRUE)==0) | (sd(locusProfile,na.rm=TRUE)==0)){
        corrP<-0
        corrS<-0
        corrK<-0
    } else {

        corrP<-cor(na.omit(predicted),na.omit(locusProfile),method="pearson")

        corrS<-cor(na.omit(predicted),na.omit(locusProfile),method="spearman")

        corrK<-cor(na.omit(predicted),na.omit(locusProfile),method="kendall")

    }

    ## ks distance
    ks<-ks.test(predicted,locusProfile)
    D<-unname(ks[[1]])

    ## Geometric
    geo<-.geometricRatio(predicted,locusProfile,stepSize)

    ## fscore
    AUC<-.peakExtractionFScore(predicted,locusProfile)

    metrics<-c("pearson"=corrP,
        "spearman"=corrS,
        "kendall"=corrK,
        "KsDist"=D,
        "geometric"=geo,
         AUC)
    return(metrics)
}



## rankBased method
.OptimalFormatConversion<-function(goodnessOfFit){
     ## for some reason arrays return this weird thing..
     ## things are reordered but not in the way i want them
     ## Im not entirely sure how they work
     ## For now we will use this method
     ## just easier to build
     fitVec<-seq_along(goodnessOfFit[[1]][[1]])
     lociVec<-seq_along(goodnessOfFit[[1]])
     paramVec<-seq_along(goodnessOfFit)

     convertedFormat<-lapply(fitVec,function(fitVec,lociVec,paramVec,goodnessOfFit){


                            param<-lapply(paramVec,function(paramVec,fitVec,lociVec,goodnessOfFit){
                                      loci<-sapply(goodnessOfFit[[paramVec]],"[[",fitVec)
                                      return(loci)

                            },fitVec,lociVec,goodnessOfFit)
                            mat<-do.call("rbind",param)
                            rownames(mat)<-names(goodnessOfFit)
                            return(mat)
     },lociVec,paramVec,goodnessOfFit)
     names(convertedFormat)<-names(goodnessOfFit[[1]][[1]])
     return(convertedFormat)
}


### Processing optimal matricies and parameters


.optimalExtraction <-function(goodnessOfFit,rank=FALSE){

    accu<-profiles(goodnessOfFit)
    bm<-boundMolecules(goodnessOfFit)
    lambda<-lambdaPWM(goodnessOfFit)

    ## converting format
    metrics<-.OptimalFormatConversion(accu)


    if(rank){
         metrics<-metrics[!grepl("Mean",names(metrics))]
         tag<-names(metrics)
         idx<-seq_along(tag)

         # extracting Rank based matrics
         metrics<-lapply(idx,function(idx,metrics,tag){
                              local<-metrics[[idx]]
                              localTag<-tag[idx]
                              internalIdx<-seq_len(ncol(local))
                              TopHits<-lapply(internalIdx,function(idx,met,tags){
                                              if(tags %in% c("geometric","MSE","KsDist")){
                                                 best<-which(met[,idx]==min(met[,idx]))
                                                 buffer<-as.vector(met[best,idx])
                                                 names(buffer)<-rownames(met)[best]


                                              }else{
                                                best<-which(met[,idx]==max(met[,idx]))
                                                buffer<-as.vector(met[best,idx])
                                                names(buffer)<-rownames(met)[best]
                                              }
                                              return(buffer)
                              },local,localTag)
                              #names(TopHits)<-colnames(local)
                              TopHits<-sort(table(names(unlist(TopHits))),decreasing=TRUE)
                              return(TopHits)},metrics,tag)
          ## extracting optimal parameter
          optimalParama<-mapply(function(met,tags){
                              unpacked<-.unpackMetrics(met)
                              if(tags %in% c("geometric","MSE","KsDist")){
                                  optimal<-unpacked[which(unpacked[,"score"]==min(unpacked[,"score"])),c("lambda","boundMolecules")]
                              } else{
                                  optimal<-unpacked[which(unpacked[,"score"]==max(unpacked[,"score"])),c("lambda","boundMolecules")]
                              }
                              return(optimal)

          },met=metrics,tags=tag,SIMPLIFY=FALSE)
          names(optimalParama)<-tag

          # building rank based matricies
          mattemp<-matrix(0,ncol=length(bm),nrow=length(lambda))
          rownames(mattemp)<-lambda
          colnames(mattemp)<-bm
          Loc<-lapply(metrics,function(met,mattemp){
                      unpacked<-.unpackMetrics(met)

                      idx<-seq_len(nrow(unpacked))


                      for(i in idx){
                        mattemp[as.character(unpacked[i,"lambda"]),as.character(unpacked[i,"boundMolecules"])]<-as.numeric(unpacked[i,"score"])
                      }

                      return(mattemp)},mattemp)
          names(Loc)<-tag

          FinalOptimalBuild<-list("OptimalParameters"=optimalParama,"OptimalMatrix"=Loc,"method"="rank")

    } else {
        metrics<-metrics[!grepl("Mean",names(metrics))]
        tag<-names(metrics)
        idx<-seq_along(tag)
        metrics<-lapply(metrics,function(met){
                        return(apply(met,1,mean))
        })

        ## Optimal Paramters
        optimalParama<-mapply(function(met,tag){
                              unpacked<-.unpackMetrics(met)
                              if(tag %in% c("geometric","MSE","KsDist")){
                                  optimal<-unpacked[which(unpacked[,"score"]==min(unpacked[,"score"])),c("lambda","boundMolecules")]
                              } else{
                                  optimal<-unpacked[which(unpacked[,"score"]==max(unpacked[,"score"])),c("lambda","boundMolecules")]
                              }
                              return(optimal) },met=metrics,tag=tag,SIMPLIFY=FALSE)
       ## building matricies
       mattemp<-matrix(0,ncol=length(bm),nrow=length(lambda))
       rownames(mattemp)<-lambda
       colnames(mattemp)<-bm
       Loc<-lapply(metrics,function(met,mattemp){
                   unpacked<-.unpackMetrics(met)

                   idx<-seq_len(nrow(unpacked))


                   for(i in idx){
                      mattemp[as.character(unpacked[i,"lambda"]),as.character(unpacked[i,"boundMolecules"])]<-as.numeric(unpacked[i,"score"])
                   }

                   return(mattemp)},mattemp)
       names(Loc)<-tag

       FinalOptimalBuild<-list("OptimalParameters"=optimalParama,"OptimalMatrix"=Loc,"method"="MeanScore")
    }

   return(FinalOptimalBuild)


}

.unpackMetrics<-function(metrics){

      lambda <- "lambda = "
      bound <- "boundMolecules = "
      lambdaLoc <- sapply(strsplit(sapply(strsplit(names(metrics)," &"),
      "[[",1),lambda),"[[",2)
      bmLoc <- sapply(strsplit(sapply(strsplit(names(metrics)," &"),
      "[[",2),bound),"[[",2)

      unpacked<-cbind("lambda"=as.numeric(lambdaLoc),"boundMolecules"=as.numeric(bmLoc),"score"=as.numeric(metrics))

      return(unpacked)
}




## cleaning function to drop unneccesary objects

.cleanUpAfterYourself<-function(...){
    args<-list(...)
    rm(args)
    gc()
}

### So for some reason unlist
.internalUnlist<-function(object){
    localList<-GRangesList()
    for(i in seq_along(object)){
      localList<-c(localList,object[[i]])
    }
    return(localList)
}



####

.what.is.predictedProfile<-function(predictedProfile){
    if(.is.genomicProfiles(predictedProfile)){
       if(.tags(predictedProfile)=="ChIPProfile"){
         predictedProfile <-profiles(predictedProfile)
    }
  }

    if(is(predictedProfile,"list") &
              any(grepl("lambda",names(predictedProfile)))){
        stepSize<-unique(width(predictedProfile[[1]][[1]]))
        loci <- unique(unlist(GRangesList(lapply(predictedProfile[[1]], function(x){
                return(GRanges(seqnames=as.character(seqnames(x)),
                               ranges=IRanges(start(x)[1], end(x)[length(x)])))
        }))))

         loci<-rep(loci,length(predictedProfile))

         predictedProfile<-.internalUnlist(predictedProfile)
         predictedProfile<-lapply(predictedProfile, function(x){return(x$ChIP)})

         #warning("No Parameter Combination selected - We will just plot everything",immediate.=TRUE)
    } else if(is(predictedProfile,"list") &
              any(!grepl("lambda",names(predictedProfile)))){
           stepSize<-unique(width(predictedProfile[[1]]))
           loci <- unique(unlist(GRangesList(lapply(predictedProfile, function(x){
                   return(GRanges(seqnames=as.character(seqnames(x)),
                                  ranges=IRanges(start(x)[1], end(x)[length(x)])))
           }))))
           predictedProfile <- lapply(predictedProfile, function(x){return(x$ChIP)})
    } else if(is(predictedProfile,"GRanges")){
         stepSize<-unique(width(predictedProfile))
         loci<- GRanges(seqnames=as.character(predictedProfile),
                        ranges=IRanges(start(predictedProfile)[1],end(predictedProfile)[length(predictedProfile)]))
         predictedProfiles<-list(predictedProfile$ChIP)
    } else{
       stop("Oops Somthing went wrong. We are not sure what your are parsing to predictedProfile")
    }
    # building loci from predicted
    return(list("loci"=loci,"predictedProfile"=predictedProfile,"stepSize"=stepSize))
}


.what.is.occupancy<-function(occupancy,PWM=FALSE){
  if(.is.genomicProfiles(occupancy)){
     if(.tags(occupancy)=="Occupancy"){
       occupancy <-profiles(occupancy)
  }
}

    if(is(occupancy,"list") &
       any(grepl("lambda",names(occupancy)))){
            occupancy<-.internalUnlist(occupancy)

           #warning("No Parameter Combination selected - We will just plot everything",immediate.=TRUE)
      } else if(is(occupancy,"list") &
                any(!grepl("lambda",names(occupancy)))){


          occupancy-occupancy
      } else if(is(occupancy,"GRanges")){

           occupancy<-list(occupancy)
      } else{
         stop("Oops Somthing went wrong. We are not sure what your are parsing to predictedProfile")
      }

    return(occupancy)
}



.what.is.ChIPScore <- function(ChIPScore){
    if(is(ChIPScore,"ChIPScore")){

        ChIPScore<-scores(ChIPScore)


    }

      ## once un packed we can move twords checking the other elements
    if(is(ChIPScore,"list")){
         ChIPScore<-ChIPScore
    } else if(is(ChIPScore,"numeric")){
          ChIPScore<-list(ChIPScore)
    }else{
       stop("Oops Somthing went wrong. We are not sure what you are parsing to ChIPScore")
    }
    return(ChIPScore)
}


.what.is.goodnessOfFit<-function(goodnessOfFit){
  if(.is.genomicProfiles(goodnessOfFit)){
     if(.tags(goodnessOfFit)=="GoF"){
       goodnessOfFit <-profiles(goodnessOfFit)
  }
}

    if(is(goodnessOfFit,"list") &
       any(grepl("lambda",names(goodnessOfFit)))){

                  goodnessOfFit<-unlist(goodnessOfFit,recursive=FALSE)

           #warning("No Parameter Combination selected - We will just plot everything",immediate.=TRUE)
      } else if(is(goodnessOfFit,"list") &
                any(!grepl("lambda",names(goodnessOfFit)))){
          goodnessOfFit<-goodnessOfFit

      } else if(is(goodnessOfFit,"numeric")) {
           goodnessOfFit <-list(goodnessOfFit)
      } else{
         stop("Oops Somthing went wrong. We are not sure what your are parsing to predictedProfile")
      }
}

.what.is.chromatinState <- function(cs,lociLocal){
    
    if(is(cs,"GRanges")){
        csList <- list()
        for(i in seq_along(lociLocal)){
            tmp <- cs[queryHits(findOverlaps(cs, lociLocal[i]))]
           
            start(tmp) <- pmax(start(tmp),start(lociLocal[i]))
            end(tmp) <- pmin(end(tmp),end(lociLocal[i]))
            if(ncol(mcols(tmp))==0){
                tmp$stateID <- "Inaccessible DNA"
            } else {
                # Making sure that it is correctly named 
                colnames(mcols(tmp))[1] <- "stateID"
            }
            csList[[i]] <- as.data.frame(tmp)
            
        }
    } else {
        stop("Oopss Somthing went wrong. Please provide GRanges for chromatinState")
    }
    return(csList)
}

.what.is.geneRef <- function(gr,lociLocal){
    if(is(gr,"GRanges")){
        grList <- list()
        for(i in seq_along(lociLocal)){
           tmp <- gr[queryHits(findOverlaps(gr, lociLocal[i]))]
           start(tmp) <- pmax(start(tmp),start(lociLocal[i]))
           end(tmp) <- pmin(end(tmp),end(lociLocal[i]))
           tmp <- as.data.frame(tmp)
           grList[[i]] <- tmp[,c("seqnames","start","end","width","strand","type","gene_id")]
        }
    } else {
        stop("Oopss Somthing went wrong. Please provide GRanges for chromatinState")
    }
    return(grList)
}
patrickCNMartin/ChIPanalyser documentation built on Nov. 24, 2022, 12:02 a.m.