R/sim.R
In metaseqR2: An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

Documented in downsampleCounts estimateAufcWeights estimateSimParams makeSimDataSd makeSimDataTcc

estimateAufcWeights <- function(counts,normalization,statistics,nsim=10,
    N=10000,samples=c(3,3),ndeg=c(500,500),top=500,modelOrg="mm9",fcBasis=1.5,
    drawFpc=FALSE,rc=NULL,...) {
    if (!requireNamespace("zoo"))
        stopwrap("R pacakage zoo is required in order to estimate AUFC ",
            "weights!")

    if (ncol(counts)<4)
        stopwrap("Cannot estimate AUFC weights with an initial dataset with ",
            "less than 4 samples!")
    else if (ncol(counts)>=4 && ncol(counts)<10) {
        reind <- sample(seq_len(ncol(counts)),20,replace=TRUE)
        counts <- counts[,reind]
    }
    parList <- estimateSimParams(counts,...)

    disp("Running simulations... This procedure requires time... Please ",
        "wait...")
    simResults <- cmclapply(seq_len(nsim),function(x,normalization,statistics,N,
        parList,samples,ndeg,fcBasis,modelOrg) {
        D <- makeSimDataSd(N=N,param=parList,samples=samples,ndeg=ndeg,
            fcBasis=fcBasis,modelOrg=modelOrg)
        dd <- D$simdata
        
        if (!is.null(modelOrg)) {
            tmp <- metaseqr2(
                counts=dd,
                sampleList=list(G1=paste("G1_rep",seq_len(samples[1]),sep=""),
                    G2=paste("G2_rep",seq_len(samples[2]),sep="")),
                contrast=c("G1_vs_G2"),
                annotation="embedded",
                embedCols=list(
                    idCol=4,
                    gcCol=5,
                    nameCol=7,
                    btCol=8
                ),
                org=modelOrg,
                countType="gene",
                normalization=normalization,
                statistics=statistics,
                metaP="simes",
                figFormat="png",
                preset="all_basic",
                exportWhere=tempdir(),
                restrictCores=rc,
                qcPlots=NULL,
                report=FALSE,
                runLog=FALSE,
                outList=TRUE
            )
        }
        else {
            tmp <- metaseqr2(
                counts=dd,
                sampleList=list(G1=paste("G1_rep",seq_len(samples[1]),sep=""),
                    G2=paste("G2_rep",seq_len(samples[2]),sep="")),
                contrast=c("G1_vs_G2"),
                annotation="embedded",
                embedCols=list(
                    idCol=4,
                    gcCol=5,
                    nameCol=7,
                    btCol=8
                ),
                countType="gene",
                normalization=normalization,
                statistics=statistics,
                metaP="simes",
                figFormat="png",
                preset="all_basic",
                exportWhere=tempdir(),
                qcPlots=NULL,
                report=FALSE,
                runLog=FALSE,
                outList=TRUE,
                restrictCores=rc
            )
        }

        # Retrieve several p-values
        pList <- vector("list",length(statistics))
        for (s in statistics) {
            field <- paste("p-value",s,sep="_")
            pList[[s]] <- tmp$data[[1]][,field]
            names(pList[[s]]) <- rownames(tmp$data[[1]])
        }
        pMatrix <- do.call("cbind",pList)
        return(list(simdata=D,pvalues=pMatrix))
    },normalization,statistics,N,parList,samples,ndeg,fcBasis,modelOrg,rc=rc)

    disp("Estimating AUFC weights... Please wait...")
    fpcObj <- cmclapply(simResults,function(x) {
        trueDe <- x$simdata$truedeg
        names(trueDe) <- rownames(x$simdata$simdata)
        pMatrix <- x$pvalues
        trueDe <- trueDe[rownames(pMatrix)]
        fdc <- diagplotFtd(trueDe,pMatrix,type="fpc",draw=FALSE)
    },rc=rc)
    avgFpc <- diagplotAvgFtd(fpcObj,draw=drawFpc)

    x <- seq_len(top)
    aufc <- apply(avgFpc$avgFtdr$means[seq_len(top),],2,function(x,i) {
        return(sum(diff(i)*rollmean(x,2)))
    },x)
    weightAufc <- (sum(aufc)/aufc)/sum(sum(aufc)/aufc)
    return(weightAufc)
}

makeSimDataTcc <- function(...) {
    if (suppressWarnings(!requireNamespace("TCC")))
        stopwrap("Bioconductor package TCC is required to create ",
            "simulated data!")
    #tcc <- simulateReadCounts(Ngene=Ngene,PDEG=PDEG,DEG.assign=DEG.assign,
    #    DEG.foldchange=DEG.foldchange,replicates=replicates)
    tcc <- TCC::simulateReadCounts(...)
    n <- nrow(tcc$count)
    # Now we have to simulate annotation
    chromosome <- paste("chr",1+round(20*runif(n)),sep="")
    start <- 1 + round(1e+6*runif(n))
    end <- start + 250 + round(1e+6*runif(n))
    gene_id <- gene_name <- rownames(tcc$count)
    gc_content <- runif(n)
    strand <- sample(c("+","-"),n,replace=TRUE)
    biotype <- sample(paste("biotype",seq_len(10)),n,replace=TRUE)
    simData <- data.frame(
        chromosome=chromosome,
        start=start,
        end=end,
        gene_id=gene_id,
        gc_content=gc_content,
        strand=strand,
        gene_name=gene_name,
        biotype=biotype
    )
    simData <- cbind(simData,tcc$count)
    return(list(simdata=simData,simparam=tcc$simulation))
}

makeSimDataSd <- function(N,param,samples=c(5,5),ndeg=rep(round(0.1*N),2),
    fcBasis=1.5,libsizeRange=c(0.7,1.4),libsizeMag=1e+7,modelOrg=NULL,
    simLengthBias=FALSE) {
    if (!is.null(modelOrg)) {
        modelOrg <- tolower(modelOrg)
        checkTextArgs("modelOrg",modelOrg,c("hg18","hg19","mm9","mm10",
            "rno5","dm3","rn5","rn6","danrer7","pantro4","tair10"),
            multiarg=FALSE)
        ann <- getAnnotation(modelOrg,"gene")
        realGc <- as.numeric(ann$gc_content)
        realStart <- as.numeric(ann$start)
        realEnd <- as.numeric(ann$end)
        realStrand <- as.character(ann$strand)
    }
    muHat <- param$muHat
    phiHat <- param$phiHat

    if (simLengthBias) {
        sind <- sort(muHat,index.return=TRUE)$ix
        muHat <- muHat[sind]
        phiHat <- phiHat[sind]
        if (length(muHat)>=N)
            ii <- sort(sample(seq_len(length(muHat)),N))
        else
            ii <- sort(sample(seq_len(length(muHat)),N,replace=TRUE))
    }
    else {
        if (length(muHat)>=N)
            ii <- sample(seq_len(length(muHat)),N)
        else
            ii <- sample(seq_len(length(muHat)),N,replace=TRUE)
    }

    s1 <- samples[1]
    s2 <- samples[2]
    L1 <- round(libsizeMag*runif(s1,min=libsizeRange[1],
        max=libsizeRange[2]))
    L2 <- round(libsizeMag*runif(s2,min=libsizeRange[1],
        max=libsizeRange[2]))

    lambda1 <- do.call("cbind",rep(list(muHat[ii]),s1))
    mu1 <- sweep(lambda1,2,L1/sum(lambda1[,1]),"*")
    sim1 <- matrix(0,N,s1)
    for (j in seq_len(s1))
        sim1[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu1[,j])

    v <- numeric(N)
    if (sum(ndeg)>0) {
        iUpdown <- sample(seq_len(length(v)),sum(ndeg))
        regDir <- rep(c(1,-1),c(ndeg[1],ndeg[2]))
        v[iUpdown] <- regDir
        lambda2 <- ((fcBasis + rexp(N))^v)*lambda1
        mu2 <- sweep(lambda2,2,L2/sum(lambda2[,1]),"*")
        sim2 <- matrix(0,N,s2)
        for (j in seq_len(s2))
            sim2[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu2[,j])
    }
    else {
        lambda2 <- lambda1
        mu2 <- sweep(lambda2,2,L2/sum(lambda2[,1]),"*")
        sim2 <- matrix(0,N,s2)
        for (j in seq_len(s2))
            sim2[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu2[,j])
    }

    # Now we have to simulate annotation
    chromosome <- paste("chr",1+round(20*runif(N)),sep="")
    gene_id <- gene_name <- paste("gene",seq_len(N),sep="_")
    if (!is.null(modelOrg)) {
        if (length(realGc)>=N)
            sampleInd <- sample(seq_len(length(realGc)),N)
        else
            sampleInd <- sample(seq_len(length(realGc)),N,replace=TRUE)
        gc_content <- realGc[sampleInd]
        start <- realStart[sampleInd]
        end <- realEnd[sampleInd]
        strand <- realStrand[sampleInd]
        if (simLengthBias) {
            lenix <- sort(end-start,index.return=TRUE)$ix
            start <- start[lenix]
            end <- end[lenix]
            gc_content <- gc_content[lenix]
            strand <- strand[lenix]
        }
    }
    else {
        gc_content <- runif(N)
        start <- 1 + round(1e+6*runif(N))
        end <- start + 250 + round(1e+6*runif(N))
        strand <- sample(c("+","-"),N,replace=TRUE)
        if (simLengthBias) {
            lenix <- sort(end-start,index.return=TRUE)$ix
            start <- start[lenix]
            end <- end[lenix]
            gc_content <- gc_content[lenix]
            strand <- strand[lenix]
        }
    }
    biotype <- sample(paste("biotype",seq_len(10)),N,replace=TRUE)
    simData <- data.frame(
        chromosome=chromosome,
        start=start,
        end=end,
        gene_id=gene_id,
        gc_content=gc_content,
        strand=strand,
        gene_name=gene_name,
        biotype=biotype
    )
    colnames(sim1) <- paste("G1_rep",seq_len(s1),sep="")
    colnames(sim2) <- paste("G2_rep",seq_len(s2),sep="")
    rownames(sim1) <- rownames(sim2) <- names(v) <- gene_id

    return(list(simdata=cbind(simData,sim1,sim2),truedeg=v))
}

estimateSimParams <- function(realCounts,libsizeGt=3e+6,rowmeansGt=5,
    eps=1e-11,rc=NULL,draw=FALSE) {
    if (is.data.frame(realCounts))
        mat <- as.matrix(realCounts)
    else if (is.matrix(realCounts))
        mat <- realCounts
    else if (file.exists(realCounts)) {
        realData <- read.delim(realCounts,row.names=1)
        mat <- as.matrix(realData)
    }
    else
        stopwrap("The input count data must be either a file, a matrix or a ",
            "data frame!")
    
    lowLib <- which(apply(mat,2,sum)<libsizeGt)
    if (length(lowLib)==ncol(mat))
        stopwrap("Cannot estimate simulation parameters as the library sizes ",
            "are too small! Try lowering the value of the libsizeGt ",
            "parameter...")
    if (length(lowLib)>0)
        mat <- mat[,-lowLib]
    disp("Downsampling counts...")
    dmat <- downsampleCounts(mat)
    lowCo <- which(apply(dmat,1,
        function(x) if (mean(x)<5) TRUE else FALSE))
    if (length(lowCo)>0)
        dmat <- dmat[-lowCo,]
    muHat <- apply(dmat,1,mean)
    disp("Estimating initial dispersion population...")
    phiEst <- apply(dmat,1,function(x) {
        m <- mean(x)
        v <- var(x)
        phi <- (v-m)/m^2
        return(phi)
    })
    phiInd <- which(phiEst>0)
    phiEst <- phiEst[phiInd]
    dmat <- dmat[phiInd,]
    disp("Estimating dispersions using log-likelihood...\n")
    init <- cmclapply(seq_along(seq_len(nrow(dmat))),function(i,d,p) {
        list(y=d[i,],h=p[i])
    },dmat,phiEst,rc=rc)
    phiHat <- unlist(cmclapply(init,function(x,eps) {
        optimize(mlfo,c(x$h-1e-2,x$h+1e-2),y=x$y,tol=eps)$minimum
    },eps,rc=rc))
    if (draw) {
        dev.new()
        plot(log10(muHat[phiInd]),log10(phiHat),col="blue",pch=20,cex=0.5,
            xlab="",ylab="")
        title(xlab="mean",ylab="dispesion",font=2,cex=0.9)
        grid()
    }
    return(list(muHat=muHat[phiInd],phiHat=phiHat))
}

downsampleCounts <- function(counts) {
    libSizes <- apply(counts,2,sum)
    targetSize <- min(libSizes)
    toRemove <- libSizes-targetSize
    ii <- which(toRemove>0)
    dcounts <- counts
    for (i in ii) {
        tmp <- round(toRemove[i]*(counts[,i]/sum(counts[,i])))
        victimSize <- sum(tmp)
        if (victimSize>toRemove[i]) {
            dif <- victimSize - toRemove[i]
            #victims <- sample(seq_len(length(tmp)),dif)
            victims <- sort(tmp,decreasing=TRUE,
                index.return=TRUE)$ix[seq_len(dif)]
            tmp[victims] <- tmp[victims] - 1
        }
        else if (victimSize<toRemove[i]) {
            dif <- toRemove[i] - victimSize
            #victims <- sample(seq_len(length(tmp)),dif)
            victims <- sort(tmp,decreasing=TRUE,
                index.return=TRUE)$ix[seq_len(dif)]
            tmp[victims] <- tmp[victims] + 1
        }
        dcounts[,i] <- dcounts[,i] - tmp
    }
    return(dcounts)
}

mlfo <- function(phi,y) {
    N <- length(y)
    mu <- mean(y)
    -(sum(lgamma(y+1/phi)) - N*lgamma(1/phi) - sum(lgamma(y+1)) + 
        sum(y*log(mu*phi/(1+mu*phi))) - (N/phi)*log(1+mu*phi))
}

makePermutation <- function(counts,sampleList,contrast,repl=FALSE) {
    cnts <- strsplit(contrast,"_vs_")[[1]]
    virtualContrast <- paste(paste("VirtCond",seq_len(length(cnts)),sep=""),
        collapse="_vs_")
    vitualSampleList <- vector("list",length(sampleList))
    names(vitualSampleList) <- paste("VirtCond",seq_along(sampleList),sep="")
    # Avoid the extreme case of returning a vector with all samples the same
    if (repl) {
        resample <- rep(1,ncol(counts))
        while(length(unique(resample))==1)
            resample <- sample(seq_len(ncol(counts)),ncol(counts),replace=repl)
    }
    else
        resample <- sample(seq_len(ncol(counts)),ncol(counts),replace=repl)
    virtualCounts <- counts[,resample]
    samples <- paste("VirtSamp",seq_len(ncol(counts)),sep="")
    colnames(virtualCounts) <- samples
    nsample <- vapply(sampleList,length,numeric(1))
    virtualSamples <- split(samples,rep(seq_len(length(nsample)),nsample))
    names(virtualSamples) <- names(vitualSampleList)
    for (n in names(vitualSampleList))
        vitualSampleList[[n]] <- virtualSamples[[n]]
    return(list(counts=virtualCounts,sampleList=vitualSampleList,
        contrast=virtualContrast))
}

calcOtr <- function(truth,p,sig=0.05) {
    if (is.list(p))
        pmat <- do.call("cbind",p)
    else if (is.data.frame(p))
        pmat <- as.matrix(p)
    else if (is.matrix(p))
        pmat <- p
    if (is.null(colnames(pmat)))
        colnames(pmat) <- paste("p",seq_len(ncol(pmat)),sep="_")

    sigGenes <- trueIsects <- missed <- vector("list",ncol(pmat))
    names(sigGenes) <- names(trueIsects) <- names(missed) <- colnames(pmat)
    for (n in colnames(pmat)) {
        sigGenes[[n]] <- names(which(pmat[,n]<sig))
        trueIsects[[n]] <- intersect(sigGenes[[n]],names(which(truth!=0)))
        missed[[n]] <- setdiff(names(which(truth!=0)),trueIsects[[n]])
    }
    result <- data.frame(
        P=vapply(sigGenes,length,numeric(1)),
        TP=vapply(trueIsects,length,numeric(1)),
        FN=vapply(missed,length,numeric(1))
    )
    result$FP <- result$P - result$TP
    otr <- result$TP/(result$FP+result$FN)
    names(otr) <- rownames(result)
    return(list(result=result,otr=otr))
}

calcF1Score <- function(truth,p,sig=0.05) {
    if (is.list(p))
        pmat <- do.call("cbind",p)
    else if (is.data.frame(p))
        pmat <- as.matrix(p)
    else if (is.matrix(p))
        pmat <- p
    if (is.null(colnames(pmat)))
        colnames(pmat) <- paste("p",seq_len(ncol(pmat)),sep="_")

    sigGenes <- trueIsects <- missed <- vector("list",ncol(pmat))
    names(sigGenes) <- names(trueIsects) <- names(missed) <- colnames(pmat)
    for (n in colnames(pmat)) {
        sigGenes[[n]] <- names(which(pmat[,n]<sig))
        trueIsects[[n]] <- intersect(sigGenes[[n]],names(which(truth!=0)))
        missed[[n]] <- setdiff(names(which(truth!=0)),trueIsects[[n]])
    }
    result <- data.frame(
        P=vapply(sigGenes,length,numeric(1)),
        TP=vapply(trueIsects,length,numeric(1)),
        FN=vapply(missed,length,numeric(1))
    )
    result$FP <- result$P - result$TP
    f1 <- 2*result$TP/(2*result$TP+result$FP+result$FN)
    names(f1) <- rownames(result)
    return(list(result=result,f1=f1))
}

Any scripts or data that you put into this service are public.

metaseqR2 documentation built on Nov. 8, 2020, 7:34 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

metaseqR2
An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

R/sim.R
In metaseqR2: An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

Defines functions calcF1Score calcOtr makePermutation mlfo downsampleCounts estimateSimParams makeSimDataSd makeSimDataTcc estimateAufcWeights

Documented in downsampleCounts estimateAufcWeights estimateSimParams makeSimDataSd makeSimDataTcc

Try the metaseqR2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

metaseqR2 An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

R/sim.R In metaseqR2: An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

Defines functions calcF1Score calcOtr makePermutation mlfo downsampleCounts estimateSimParams makeSimDataSd makeSimDataTcc estimateAufcWeights

Documented in downsampleCounts estimateAufcWeights estimateSimParams makeSimDataSd makeSimDataTcc

Try the metaseqR2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

metaseqR2
An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms

R/sim.R
In metaseqR2: An R package for the analysis and result reporting of RNA-Seq data by combining multiple statistical algorithms