R/readSimFFPE_targeted.R

Defines functions targetReadSimFFPE

Documented in targetReadSimFFPE

targetReadSimFFPE <- function(referencePath, PhredScoreProfile,
                              targetRegions, outFile, coverage,
                              readLen=150, meanInsertLen=250, sdInsertLen=80,
                              enzymeCut=FALSE, chimericRatio=0.08,
                              localMatchRatio=0.1, padding=NA, 
                              minGap=NA, windowLen=10000, matchWinLen=10000,
                              meanLogSeedLen=1.7, sdLogSeedLen=0.4, 
                              seedPassRate=0.78, sdTargetDist=120, 
                              sameStrandProb=0.5, spikeWidth=1500, 
                              betaShape1=0.5, betaShape2=0.5, 
                              sameTarRegionProb=0, chimMutRate=0.005, 
                              noiseRate=0.0015, highNoiseRate=0.08, 
                              highNoiseProb=0.015, pairedEnd=TRUE, 
                              prefix="SimFFPE", threads=1,
                              localChimeric=TRUE, distantChimeric=TRUE,
                              normalReads=TRUE, overWrite=FALSE) {
    
    if(missing(referencePath)) stop("referencePath is required")
    if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
    if(missing(targetRegions)) stop("targetRegions is required")
    if(missing(outFile)) stop("outFile is required")
    if(missing(coverage)) stop("coverage is required")
    if(missing(readLen)) stop("readLen is required")
    if(missing(meanInsertLen)) stop("meanInsertLen is required")
    if(missing(sdInsertLen)) stop("sdInsertLen is required")
    
    if(max(betaShape1, betaShape2) > 1) {
        stop ("betaShape1 and betaShape2 should not be greater than 1")
    }
    if(min(betaShape1, betaShape2) <= 0) {
        stop ("betaShape1 and betaShape2 should be greater than 0")
    }
    
    if (nrow(PhredScoreProfile) != readLen) {
        stop("The number of rows in PhredScoreProfile should be the same as", 
             " the input readLen")
    }
    
    if (overWrite) {
        overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
    }

    covFFPE <- coverage * chimericRatio / seedPassRate
    covNorm <- coverage - covFFPE

    if(!pairedEnd) {
        enzymeCut <- FALSE
    }
    
    reference <- readDNAStringSet(referencePath)
    reference <- reference[width(reference) >= matchWinLen]
    names(reference) <- unlist(lapply(names(reference),
                                      function(x) strsplit(x, " ")[[1]][1]))
    
    if (is(targetRegions, "GenomicRanges")) {
        targetRegions <- data.frame(seqnames(targetRegions), 
                                    start(targetRegions), 
                                    end(targetRegions))
    }
    colnames(targetRegions) <- c("chr", "start", "end")
    targetRegions <- targetRegions[targetRegions$chr %in% names(reference), ]
    targetRegions <- targetRegions[targetRegions$end >= targetRegions$start, ]

    if (distantChimeric) {
        allChr <- names(reference)
        totalLen <- do.call("sum", lapply(reference, length))
        chrProb <- unlist(lapply(reference, length)) / totalLen
    }

    if (is.na(padding)) {
        padding <- ceiling(meanInsertLen / 2)
    }
    if (is.na(minGap)) {
        minGap <- readLen
    }

    nReadsLocal <- 0
    nReadsDistant <- 0
    nReadsNorm <- 0
    matchPadding <- 5000

    cl <- makeCluster(threads)
    registerDoParallel(cl)
    message("Using ", threads, " thread(s) for simulation.")
    for (chr in allChr[allChr %in% unique(targetRegions$chr)]) {
        chrName <- chr
        targetRegion <- targetRegions[targetRegions$chr == chrName, ]

        if (nrow(targetRegion) > 0) {
            fullSeq <- reference[[chr]]
            message("Simulating FFPE reads on the chromosome ", chr, "...")

            targetRegion$start <- unlist(lapply(targetRegion$start,
                                                function(x)
                                                    max(0, x - padding)))
            chrLen <- length(fullSeq)
            targetRegion$end <- unlist(lapply(targetRegion$end,
                                              function(x)
                                                  min(chrLen, x + padding)))

            targetRegion <- targetRegion[order(targetRegion$start),]

            if (nrow(targetRegion) > 1) {
                starts <- targetRegion$start
                starts <- starts[2:length(starts)]
                ends <- targetRegion$end
                ends <- ends[seq_len(length(ends) - 1)]
                ifMerge <- (starts - ends) <= minGap
                allStartPos <- c(targetRegion$start[1], starts[!ifMerge])
                allEndPos <- c(ends[!ifMerge],
                               targetRegion$end[nrow(targetRegion)])
                keep <- (allEndPos - allStartPos + 1) >= readLen
                allStartPos <- allStartPos[keep]
                allEndPos <- allEndPos[keep]
                startPoses <- unlist(mapply(function(x, y) seq(from = x,
                                                               to = y,
                                                               by = windowLen),
                                            allStartPos, allEndPos))
                endPoses <- mapply(function(x, y) seq(from = x,
                                                      to = y,
                                                      by = windowLen) +
                                       windowLen - 1,
                                   allStartPos, allEndPos)
                for (i in seq_along(endPoses)) {
                    endPoses[[i]][endPoses[[i]] > allEndPos[i]] <- allEndPos[i]
                }
                endPoses <- unlist(endPoses)
                endPoses[endPoses > length(fullSeq)] <- length(fullSeq)
                allStartPos <- unlist(allStartPos)
                allEndPos <- unlist(allEndPos)
            } else {
                startPoses <- targetRegion$start
                endPoses <- targetRegion$end
                allStartPos <- targetRegion$start
                allEndPos <- targetRegion$end
            }

            if (localChimeric) {
                if (pairedEnd) {
                    nSeeds <- round((endPoses - startPoses + 1) *
                                        covFFPE * localMatchRatio / readLen / 2)
                } else {
                    nSeeds <- round((endPoses - startPoses + 1) *
                                        covFFPE * localMatchRatio / readLen)
                }
                targetStarts <- startPoses - matchPadding
                targetEnds <- endPoses + matchPadding
                remove <- targetStarts < 0 | targetEnds > length(fullSeq)
                targetStarts[remove] <- startPoses[remove]
                targetEnds[remove] <- endPoses[remove]
                paddings <- rep(matchPadding, length(targetStarts))
                paddings[remove] <- 0
                startPos <- NULL
                endPos <- NULL
                targetStart <- NULL
                targetEnd <- NULL
                nSeed <- NULL
                paddingLen <- NULL
                simReads <- foreach(startPos = startPoses,
                                    endPos = endPoses,
                                    targetStart = targetStarts,
                                    targetEnd = targetEnds,
                                    nSeed = nSeeds,
                                    paddingLen = paddings,
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    .errorhandling    = "remove",
                                    .packages = c("Biostrings"),
                                    .export = c(
                                        "targetRegionalChimericReads",
                                        "targetRegionalChimericSeqs",
                                        "generateEnzymicCutSeq",
                                        "addRandomMutation",
                                        "generateReads",
                                        "addNoise",
                                        "rtruncnorm",
                                        "round")
                ) %dopar% {
                    targetRegionalChimericReads(
                        fullSeq = fullSeq,
                        startPos = startPos,
                        endPos = endPos,
                        targetStart = targetStart,
                        targetEnd = targetEnd,
                        padding = paddingLen,
                        nSeed = nSeed,
                        meanLogSeedLen = meanLogSeedLen,
                        sdLogSeedLen = sdLogSeedLen,
                        sdTargetDist = sdTargetDist,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        enzymeCut = enzymeCut,
                        readLen = readLen,
                        chimMutRate = chimMutRate,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProb = highNoiseProb,
                        pairedEnd = pairedEnd,
                        sameStrandProb = sameStrandProb)
                }
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "localChimeric",
                             chr = chr,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)
                message("Generated ", length(simReads), 
                        " local chimeric reads ",
                        "on chromosome ", chr, ".")
                nReadsLocal <- nReadsLocal + length(simReads)
                simReads <- NULL
            }

            if (distantChimeric) {
                if (pairedEnd) {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covFFPE *
                                        (1 - localMatchRatio) / readLen / 2)
                } else {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covFFPE *
                                        (1 - localMatchRatio) / readLen)
                }

                simReads <- foreach(startPos = allStartPos,
                                    windowLen = allEndPos - allStartPos + 1,
                                    nSeed = nSeeds,
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    .packages = "Biostrings",
                                    .errorhandling = "remove",
                                    .export = c("generateDistantChimericReads",
                                                "generateDistantChimericSeqs",
                                                "generateReads",
                                                "addRandomMutation",
                                                "addNoise",
                                                "generateTargetSeqs",
                                                "rbeta", "round",
                                                "rtruncnorm")
                ) %dopar% {
                    generateDistantChimericReads(
                        fullSeq = fullSeq,
                        referencePath = referencePath,
                        startPos = startPos,
                        windowLen = windowLen,
                        matchWinLen = matchWinLen,
                        nSeed = nSeed,
                        meanLogSeedLen = meanLogSeedLen,
                        sdLogSeedLen = sdLogSeedLen,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        readLen = readLen,
                        allChr = allChr,
                        chrProb = chrProb,
                        chimMutRate = chimMutRate,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProb = highNoiseProb,
                        pairedEnd = pairedEnd,
                        spikeWidth = spikeWidth,
                        betaShape1 = betaShape1,
                        betaShape2 = betaShape2,
                        sameTarRegionProb = sameTarRegionProb,
                        sameStrandProb = 0.5)
                }
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "distantChimeric",
                             chr = chr,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)

                message("Generated ", length(simReads), 
                        " distant chimeric reads ",
                        "on chromosome ", chr, ".")
                nReadsDistant <- nReadsDistant + length(simReads)
                simReads <- NULL
            }

            if (normalReads) {
                if (pairedEnd) {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covNorm /
                                        readLen / 2)
                } else {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covNorm
                                    / readLen)
                }
                simReads <- foreach(start = allStartPos,
                                    end = allEndPos,
                                    nSeed = nSeeds,
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    .errorhandling  = "remove",
                                    .packages = c("Biostrings"),
                                    .export = c("addNoise",
                                                "rtruncnorm",
                                                "generateNormalReads")
                ) %dopar% {
                    generateNormalReads(
                        fullSeq = fullSeq[start:end],
                        nSeq = nSeed,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        readLen = readLen,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProb = highNoiseProb,
                        pairedEnd = pairedEnd)
                }

                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "Normal",
                             chr = chr,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)

                message("Generated ", length(simReads), " normal reads ",
                        "on chromosome ", chr, ".")
                nReadsNorm <- nReadsNorm + length(simReads)
                simReads <- NULL
            }
        }
    }
    stopCluster(cl)
    message("In totoal ", nReadsLocal, " local chimeric reads, ",
            nReadsDistant, " distant chimeric reads, ",
            nReadsNorm, " normal reads were generated.",
            "\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
            " reads were generated.", "\nSimulation done.")
}

Try the SimFFPE package in your browser

Any scripts or data that you put into this service are public.

SimFFPE documentation built on Nov. 8, 2020, 5:44 p.m.