R/calcExp.R
In casper: Characterization of Alternative Splicing based on Paired-End Reads

Documented in calcExp mergeExp

setMethod("procExp", signature(distrs='readDistrsList'), function(distrs, genomeDB, pc, readLength, islandid, rpkm=TRUE, priorq=2, priorqGeneExpr=2, citype='none', niter=10^3, burnin=100, mc.cores=1, verbose=FALSE, totReads, tgroups=5, min.gt.freq=NULL) {

    if(!'gene_type' %in% colnames(genomeDB@aliases)) stop("gene_type column must be present in genomeDB")
    types <- table(genomeDB@aliases$gene_type)
    if(!is.null(min.gt.freq)){
        freqs <- types/sum(types)
        mer <- names(freqs[freqs < min.gt.freq])
    } else {
        freqs <- sort(types, decreasing=T)
        mer <- names(freqs[tgroups:length(freqs)])
    }
    newgt <- genomeDB@aliases$gene_type
    newgt[newgt %in% mer] <- 'merged'
    newgt <- as.character(newgt)
    names(newgt) <- rownames(genomeDB@aliases)
    #genomeDB@aliases$gene_type_merged <- as.character(newgt)
    
    ord <- length(unique(newgt))-rank(table(newgt)) +1
    types <- tapply(newgt, genomeDB@aliases$island_id, unique)
    sel <- which(unlist(lapply(types, length))>1)
    types[sel] <- lapply(types[sel], function(x) x[which.min(ord[x])])
    types <- unlist(types)
    
#    genomeDB@aliases$type2use <- types[genomeDB@aliases$island_id]

    types <- types[islandid]
    ans <- lapply(unique(types), function(type){
        dis <- new("readDistrs", lenDis=distrs@lenDis[[type]], stDis=distrs@stDis[[type]])
        x <- procExp(dis, genomeDB, pc, readLength, islandid=intersect(islandid, names(types)[types==type]), rpkm, priorq, priorqGeneExpr, citype, niter, burnin, mc.cores, verbose, totReads)
    })

    x <- lapply(ans, exprs)
    x <- do.call(rbind, x)

    f <- lapply(ans, featureData)
    f <- new("AnnotatedDataFrame", data.frame(do.call(rbind, lapply(f, slot, 'data')), distr=newgt[rownames(x)]))

    new('ExpressionSet', exprs=x, featureData=f)
    
})

setMethod("procExp", signature(distrs='readDistrs'), function(distrs, genomeDB, pc, readLength, islandid, rpkm=TRUE, priorq=2, priorqGeneExpr=2, citype='none', niter=10^3, burnin=100, mc.cores=1, verbose=FALSE, totReads) {
    
  #Format input
  startcdf <- distrs@stDis(seq(0,1,.001))
  lendis <- as.double(distrs@lenDis/sum(distrs@lenDis))
  lenvals <- as.integer(names(distrs@lenDis))
  readLength <- as.integer(readLength)
  priorq <- as.double(priorq)
  citype <- as.integer(switch(citype, none=0, asymp=1, exact=2))
  if (length(citype)==0) stop("citype should take the value 'none', 'asymp', or 'exact'")
  niter <- as.integer(niter)
  burnin <- as.integer(burnin)
  verbose <- as.integer(verbose)
  if ((niter<=burnin) & citype==2) stop("Too many burnin iterations specified. Decrease burnin or increase niter")
  #if (missing(islandid)) islandid <- names(genomeDB@islands)[elementNROWS(genomeDB@islands)>1]
  if (missing(islandid)) islandid <- names(genomeDB@islands)

  exons <- as.integer(names(genomeDB@islands@unlistData))
  names(exons) <- rep(names(genomeDB@islands), elementNROWS(genomeDB@islands))
  exons <- split(unname(exons), names(exons))
  exonwidth <- width(genomeDB@islands@unlistData)
  names(exonwidth) <- rep(names(genomeDB@islands), elementNROWS(genomeDB@islands))
  exonwidth <- split(unname(exonwidth), names(exonwidth))
  strand <- as.character(strand(genomeDB@islands@unlistData))[cumsum(c(1, elementNROWS(genomeDB@islands)[-length(genomeDB@islands)]))]
  names(strand) <- names(genomeDB@islands)
  
  if (!all(islandid %in% names(exons))) stop('islandid not found in genomeDB@islands')
  if (!all(islandid %in% names(pc))) stop('islandid not found in pc')
  if (!all(islandid %in% names(genomeDB@transcripts))) stop('islandid not found in genomeDB@transcripts')
  
      #Define basic function
  
  f <- function(z) {
    islandid <- as.integer(z)
    exons <- exons[z]
    exonwidth <- exonwidth[z]
    transcripts <- genomeDB@transcripts[z]
    tmp <- strand[z]
    strand <- vector(mode='integer', length=length(tmp))
    sel <- tmp=='+'
    strand[sel] <- 1
    sel <- tmp=='-'
    strand[sel] <- -1
    sel <- tmp=='*'
    strand[sel] <- 0
    strand <- as.list(as.integer(strand))
    pc <- pc[z]
    ans <- calcKnownMultiple(exons=exons,exonwidth=exonwidth,transcripts=transcripts,islandid=as.list(islandid),pc=pc,startcdf=startcdf,lendis=lendis,lenvals=lenvals,readLength=readLength,priorq=priorq, strand=strand, citype=citype, niter=niter, burnin=burnin, verbose=verbose)
    if (citype==0) {
      ans <- lapply(ans, function(z) { res=vector("list",2); res[[1]]= z[[1]]; names(res[[1]])= z[[2]]; res[[2]]=z[[5]]; res })
    } else if (citype==1) {
      ans <- lapply(ans, function(z) { res=vector("list",3); res[[1]]= z[[1]]; res[[2]]= z[[3]]; names(res[[1]])= names(res[[2]])= z[[2]]; res[[3]] = z[[5]]; res })
    } else if (citype==2) {
      ans <- lapply(ans, function(z) { res=vector("list",4); res[[1]]= z[[1]]; res[[2]]= z[[3]]; res[[3]]= matrix(z[[4]],nrow=niter-burnin); names(res[[1]])= names(res[[2]])= z[[2]]; res[[4]] = z[[5]]; res })
    }
    ans
  }

  #Run
  sel <- !sapply(pc[islandid], is.null)
  all <- islandid
  islandid <- islandid[sel]
  sel <- elementNROWS(genomeDB@islands[islandid])>1
  islandid <- islandid[sel]
  if(length(islandid)>0){
    if (verbose) cat("Obtaining expression estimates...\n")
    if (mc.cores>1 && length(islandid)>mc.cores) {
      if ('parallel' %in% loadedNamespaces()) {
                    #split into smaller jobs
        islandid <- split(islandid, cut(1:length(islandid), mc.cores))
        ans <- parallel::mclapply(islandid,f,mc.cores=mc.cores, mc.preschedule=FALSE)
        ans <- unlist(ans, recursive=F)
        names(ans) <- unlist(islandid)
      } else stop('parallel library has not been loaded!')
    } else {
      ans <- f(islandid)
      names(ans) <- islandid
    }
  } else ans <- NULL

  nreads <- unlist(lapply(pc[all], sum))
  nreads[names(ans)] <- unlist(sapply(ans, '[[', length(ans[[1]])))  #count only reads with positive probability under provided variants
  
  sel <- sapply(ans, function(z) sum(z[[1]])==0) #genes with a single variant
  ans[sel] <- NULL

  miss <- lapply(genomeDB@transcripts[all[!(all %in% names(ans))]], names)  #careful: some genes repeated in miss though they appear only once in transcripts, e.g. '20306' sum(names(miss)=='20306')

  #Format as ExpressionSet

  if (verbose) cat("Formatting output...\n")
  transcript <- unlist(lapply(ans, function(z) names(z[[1]])))
  island_id <- rep(names(ans),sapply(ans,function(z) length(z[[1]])))
  fdata <- data.frame(transcript=transcript, island_id=island_id)
  if(length(miss)>0){
    transcript <- c(transcript, unname(unlist(miss)))
    island_id <- c(island_id, rep(names(miss), sapply(miss, length)))
    fdata <- data.frame(transcript=transcript, island_id=island_id)
    ntx.miss <- sapply(miss,length) 
    misse <- rep(1/ntx.miss,ntx.miss) 
    a0 <- rep(priorq*ntx.miss,ntx.miss) 
    missSE <- sqrt(priorq * (a0-priorq) / (a0^2 * (a0+1))) 
    names(misse) <- unlist(miss)
    exprsx <- matrix(c(unlist(lapply(ans,'[[',1)), misse) ,ncol=1)
    rownames(exprsx) <- c(unlist(lapply(ans, function(z) names(z[[1]]))), names(misse))
  } else {
    exprsx <- matrix(unlist(lapply(ans, '[[', 1)), ncol=1)
    rownames(exprsx) <- unlist(lapply(ans, function(z) names(z[[1]])))
  }
    exprsx <- round(exprsx,10)
    if (citype==1) {
      se <- unlist(lapply(ans,'[[',2))
      if(length(miss)>0) se <- c(se, missSE)
      if (!rpkm) {
        alpha <- .05
        fdata$ci95.low <- qnorm(alpha/2,mean=exprsx,sd=se)
        fdata$ci95.high <- qnorm(1-alpha/2,mean=exprsx,sd=se)
        sel <- round(fdata$ci95.low,10)<0 & !is.na(se); fdata$ci95.low[sel] <- 0
        se[sel][se[sel]==0] <- 1e-20
        fdata$ci95.high[sel] <- unlist(mapply(function(m,s) qtnorm(1-alpha/2,mean=m,sd=s,lower=0,upper=1), as.list(exprsx[sel,1]), as.list(se[sel])))
        sel <- round(fdata$ci95.high,10)>1 & !is.na(se); fdata$ci95.high[sel] <- 1
        fdata$ci95.low[sel] <- unlist(mapply(function(m,s) qtnorm(alpha/2,mean=m,sd=s,lower=0,upper=1), as.list(exprsx[sel,1]), as.list(se[sel])))
      }
      fdata$se <- se
    }
    if (citype==2) {
      if(sum(unlist(lapply(ans, function(x) is.na(x[[3]]))))==0){
        if (mc.cores>1) {
          if ('parallel' %in% loadedNamespaces()) {
           se <- unlist(parallel::mclapply(ans, function(z) (colMeans(z[[3]]^2) - colMeans(z[[3]])^2) * nrow(z[[3]])/(nrow(z[[3]]) - 1), mc.cores=mc.cores))
         } else stop('parallel library has not been loaded!')
        } else se <- unlist(lapply(ans, function(z) (colMeans(z[[3]]^2) - colMeans(z[[3]])^2) * nrow(z[[3]])/(nrow(z[[3]]) - 1)))
        
        if(length(miss)>0) se <- c(se, missSE)
        fdata$se <- se
        if (!rpkm) {
          q <- lapply(ans,function(z) apply(z[[3]],2,quantile,probs=c(.025,.975)))
          q <- t(do.call(cbind,q))
          if(length(miss)>0) q <- rbind(q, matrix(NA, nrow=length(misse), ncol=2))
          fdata$ci95.low <- q[,1]; fdata$ci95.high <- q[,2]
          tmp <- cbind(exprsx, fdata[,c('ci95.low','ci95.high')])
          tmp <- as.data.frame(t(apply(tmp, 1, function(x){
            if(!any(is.na(x))){
              y <- x
              if(x[2]>x[1]) y[2]=y[1]
              if(x[3]<x[1]) y[3]=y[1]
            } else y <- x
            y
            })))
          fdata$ci95.low <- tmp$ci95.low
          fdata$ci95.high <- tmp$ci95.high
        }
      }
    }
  rownames(exprsx) <- rownames(fdata) <- fdata$transcript

  fdata <- cbind(fdata, nreads[as.character(fdata$island_id)])
  colnames(fdata)[ncol(fdata)] <- "explCnts"

  #Compute log(RPKM)
  if (rpkm) {
    if (citype != 0) se.logpi <- fdata$se / exprsx
    nreads <- nreads[unique(as.character(fdata$island_id))]
    totReads <- sum(nreads) + priorqGeneExpr*length(nreads) 
    txLength <- txLength(genomeDB=genomeDB)
    apost <- nreads + (priorqGeneExpr-1)
    thest <- apost/sum(apost)
    theta <- data.frame(island_id=names(nreads), thest=thest)
    exprsx <- as.matrix(logrpkm(fdata, th=theta, pi=exprsx, genomeDB=genomeDB))
    colnames(exprsx) <- NULL
    if (citype != 0) {
      se.theta <- sqrt(apost * (1-apost/totReads) / (totReads * (totReads+1))) 
      se.logtheta <- se.theta / thest
      fdata$se <- sqrt(se.logpi^2 + se.logtheta[as.character(fdata$island_id)]^2)
      alpha <- 0.05
      err <- qnorm(alpha/2)*fdata$se; fdata$ci95.low <- exprsx + err; fdata$ci95.high <- exprsx - err #Added
    }
  }
  #Return ExpressionSet
  tx2gene <- unique(genomeDB@aliases[,c('tx_name','gene_id')])
  names(tx2gene)[1] <- 'transcript'
  fdata$transcript <- as.character(fdata$transcript)
  tx2gene$transcript <- as.character(tx2gene$transcript)
  fdata <- merge(fdata, tx2gene, by='transcript')
  rownames(fdata) <- fdata$transcript
  if(citype!=0) {
    fdata <- fdata[rownames(exprsx),c('transcript','gene_id','island_id','explCnts','se','ci95.low','ci95.high')]
  } else fdata <- fdata[rownames(exprsx),c('transcript','gene_id','island_id','explCnts')]
  fdata <- new("AnnotatedDataFrame",fdata)
  ans <- new("ExpressionSet",exprs=exprsx,featureData=fdata)
  ans
})

mergeExp <- function(minus, plus){
  exp <- rbind(exprs(plus), exprs(minus))
  fdata <- new("AnnotatedDataFrame", rbind(fData(plus), fData(minus)))
  ans <- new("ExpressionSet",exprs=exp,featureData=fdata)
  ans
}

calcExp <- function(distrs, genomeDB, pc, readLength, islandid, rpkm=TRUE, priorq=2, priorqGeneExpr=2, citype='none', niter=10^3, burnin=100, mc.cores=1, verbose=FALSE) {
  totReads <- getNreads(pc); totReads <- sum(totReads) + priorqGeneExpr*length(totReads) #Modified: totReads includes prior sample size
  
  if (missing(readLength)) stop("readLength must be specified")
  if (genomeDB@denovo) stop("genomeDB must be a known genome")
  if (pc@denovo) stop("pc must be a pathCounts object from known genome")
  if(pc@stranded){
    if(missing(islandid)) islandid <- c(names(pc@counts$plus), names(pc@counts$minus))
    is <- as.character(strand(genomeDB@islands@unlistData))[cumsum(c(1, elementNROWS(genomeDB@islands)[-length(genomeDB@islands)]))]
    names(is) <- names(genomeDB@islands)
    plusGI <- islandid[is[islandid]=="+"]
    plus <- NULL
    if(length(plusGI)>0) {
      plusDB <- genomeBystrand(genomeDB, "+")
      plus <- procExp(distrs, plusDB, pc=pc@counts$plus, readLength=readLength, islandid=plusGI, rpkm=rpkm, priorq=priorq, priorqGeneExpr=priorqGeneExpr, niter=niter, burnin=burnin, mc.cores=mc.cores, citype=citype, totReads=totReads)
    }
    minusGI <- islandid[is[islandid]=="-"]
    minus <- NULL
    if(length(minusGI)>0) {
      minusDB <- genomeBystrand(genomeDB, "-")
      minus <- procExp(distrs, minusDB, pc=pc@counts$minus, readLength=readLength, islandid=minusGI, rpkm=rpkm, priorq=priorq, priorqGeneExpr=priorqGeneExpr, niter=niter, burnin=burnin, mc.cores=mc.cores, citype=citype, totReads=totReads)
    } 
    if(is.null(plus) & is.null(minus)) stop("No counts in islandid genes")
    if(!(is.null(plus) | is.null(minus))) { ans <- mergeExp(plus, minus) }
    else { if(is.null(plus)) {ans <- minus } else ans <- plus }
  } else {
    if(missing(islandid)) islandid <- names(pc@counts[[1]])
    if(sum(!unlist(lapply(pc@counts[islandid], is.null)))) stop("No counts in islandid genes")
    ans <- procExp(distrs, genomeDB, pc=pc@counts[[1]], readLength=readLength, islandid=islandid, rpkm=rpkm, priorq=priorq, priorqGeneExpr=priorqGeneExpr, niter=niter, burnin=burnin, mc.cores=mc.cores, citype=citype, totReads=totReads)
  }

  ans
}
    
calcKnownMultiple <- function(exons, exonwidth, transcripts, islandid, pc, startcdf, lendis, lenvals, readLength, priorq, strand, citype, niter, burnin, verbose) {
  ans <- .Call("calcKnownMultiple",exons,exonwidth,transcripts,islandid,pc,startcdf, lendis, lenvals, readLength, priorq, strand, citype, niter, burnin, verbose, PACKAGE="casper")
  return(ans)
}


lhoodGrid <- function(pc, distrs, genomeDB, readLength, islandid, grid, priorq=2) {
  if (length(islandid)>1) stop("Only 1 gene is allowed, please specify a single gene in argument 'islandid'")
  if (missing(readLength)) stop("readLength must be specified")
  if (genomeDB@denovo) stop("genomeDB must be a known genome")
  if (pc@denovo) stop("pc must be a pathCounts object from known genome")

  #Format input
  startcdf <- distrs@stDis(seq(0,1,.001))
  lendis <- as.double(distrs@lenDis/sum(distrs@lenDis))
  lenvals <- as.integer(names(distrs@lenDis))
  readLength <- as.integer(readLength)
  priorq <- as.double(priorq)

  exons <- as.integer(names(genomeDB@islands[[islandid]]))
  exonwidth <- width(genomeDB@islands[[islandid]])
  strand <- as.character(strand(genomeDB@islands@unlistData))[cumsum(c(1, elementNROWS(genomeDB@islands)[-length(genomeDB@islands)]))]
  names(strand) <- names(genomeDB@islands)
  strand   <- strand[islandid]
  transcripts <- genomeDB@transcripts[[islandid]]
  if (length(transcripts)==1) stop("Single transcript specified, estimation not run")
  strand <- as.integer(ifelse(strand=='+', 1, -1))
  if (pc@stranded) {
     if (strand==1) pc <- pc@counts[['plus']][[islandid]] else pc <- pc@counts[['minus']][[islandid]]
  } else {
     pc <- pc@counts[[1]][[islandid]]
  }
  islandid <- as.integer(islandid)
  
  if (missing(grid)) {
    grid <- lapply(1:(length(transcripts)-1), function(z) seq(-10,10,length=10^(5/(length(transcripts)-1))))
    #grid <- lapply(1:length(transcripts), function(z) seq(0.0001,.9999,length=10^(5/length(transcripts))))
  } else {
    if (!is.list(grid)) stop("grid must be of class list")
    if (length(grid) != length(transcripts)-1) stop("grid must have one row less than the number of known transcripts")
  }
  milogit <- function(theta) cbind(rep(1,nrow(theta)),exp(theta))/(1+rowSums(exp(theta))) #matrix conversion
  gridmat <- t(milogit(expand.grid(grid)))
  
  ans <- .Call("lhoodGrid",gridmat,exons,exonwidth,transcripts,pc,startcdf,lendis,lenvals,readLength,priorq,strand)
  names(ans[[2]]) <- ans[[3]]; ans[[3]] <- NULL
  names(ans) <- c('logpos','piest','logpos.piest','S','pathprob','marginalLhood')
  rownames(ans$pathprob) <- names(ans$piest)
  ans$pathprob <- t(ans$pathprob)
  names(ans$marginalLhood) <- c('laplace','IS')
  ans$grid <- grid
  return(ans)  
}