R/read_generator.R

Defines functions generateNormalReads generateReads

generateReads <- function(seqs, readLen, pairedEnd, noiseRate,
                          highNoiseRate, highNoiseProp) {
    noiseRate <- noiseRate / 0.75
    highNoiseRate <- highNoiseRate / 0.75
    revComp <- reverseComplement(seqs)
    nSeq <- length(seqs)
    if (pairedEnd) {
        ranges <- IRangesList(rep(list(IRanges(start = 1, width = readLen)),
                                  nSeq))
        reads1 <- unlist(extractAt(x = seqs, at = ranges))
        reads2 <- unlist(extractAt(x = revComp, at = ranges))
        removedReads <- union(which(grepl("N", reads1)),
                              which(grepl("N", reads2)))
        if (length(removedReads) > 0) {
            reads <- c(reads1[-removedReads], reads2[-removedReads])
        } else {
            reads <- c(reads1, reads2)
        }
        nReads <- length(reads)
        if (nReads > 0) {
            noiseRates <- rep(noiseRate, length(reads))
            noiseRates[sample(seq_len(nReads),
                              size = nReads * highNoiseProp,
                              replace = TRUE)] <- highNoiseRate
            reads <- addNoise(reads, noiseRates=noiseRates, readLen = readLen)
            reads <- as.character(reads)
            if (nReads > 2) {
                mid <- nReads / 2
                mid2 <- round(mid / 2)
                reads1 <- c(rbind(reads[seq_len(mid2)],
                                  reads[(mid + 1):(mid + mid2)]))
                reads2 <- c(rbind(reads[(mid + mid2 + 1):nReads],
                                  reads[(mid2 + 1):mid]))
                reads <- c(reads1, reads2)
            }
            return(reads)
        }

    } else {
        mid <- round(nSeq / 2)
        ranges <- IRangesList(rep(list(IRanges(start = 1, width = readLen)),
                                  mid))
        reads1 <- unlist(extractAt(x = seqs, at = ranges))
        reads2 <- unlist(extractAt(x = revComp, at = ranges))
        reads <- c(reads1, reads2)

        reads <- reads[!grepl("N", reads)]
        nReads <- length(reads)
        if (nReads > 0) {
            noiseRates <- rep(noiseRate, length(reads))
            noiseRates[sample(seq_len(nReads),
                              size = nReads * highNoiseProp,
                              replace = TRUE)] <- highNoiseRate
            reads <- addNoise(reads, noiseRates=noiseRates, readLen = readLen)
            reads <- as.character(reads)
            return(reads)
        }
    }
}

addNoise <- function (reads, noiseRates, readLen) {
    noiseIndex <- matrix(nrow = length(reads), ncol = readLen)
    noises <- character(length = length(reads))
    for (i in seq_along(reads)) {
        noiseIndex[i, ] <- runif(readLen) <= noiseRates[i]
        noises[i] <- paste0(sample(c("A", "C", "G", "T"),
                                   size = sum(noiseIndex[i, ]),
                                   replace = TRUE), collapse = "")
    }


    reads <- replaceLetterAt(x = reads,
                             at = noiseIndex,
                             letter =  noises)

    return(reads)
}


generateNormalReads <- function(fullSeq, nSeq, meanInsertLen, sdInsertLen,
                                readLen, noiseRate, highNoiseRate,
                                highNoiseProp, pairedEnd) {


    starts <- sample(seq_len(length(fullSeq) - meanInsertLen + 1),
                     size = nSeq, replace = TRUE)
    starts <- starts[starts > 0]
    insertLens <- as.integer(rtruncnorm(n = length(starts), a = readLen,
                                        mean = meanInsertLen,
                                        sd = sdInsertLen))
    ends <- starts + insertLens - 1
    starts <- starts[ends < length(fullSeq)]
    ends <- ends[ends < length(fullSeq)]

    revComp <- reverseComplement(fullSeq)
    noiseRate <- noiseRate / 0.75
    highNoiseRate <- highNoiseRate / 0.75
    fullLen <- length(fullSeq)

    if (pairedEnd && length(starts) > 0) {
        reads1 <- extractAt(x = fullSeq,
                            at = IRanges(start = starts, width = readLen))
        reads2 <- extractAt(x = revComp,
                            at = IRanges(start = fullLen - ends + 1,
                                         width = readLen))

        removedReads <- union(which(grepl("N", reads1)),
                              which(grepl("N", reads2)))
        if (length(removedReads) > 0) {
            reads <- c(reads1[-removedReads], reads2[-removedReads])
        } else {
            reads <- c(reads1, reads2)
        }

        nReads <- length(reads)
        if (nReads > 0) {
            noiseRates <- rep(noiseRate, length(reads))
            noiseRates[sample(seq_len(nReads),
                              size = nReads * highNoiseProp,
                              replace = TRUE)] <- highNoiseRate
            reads <- addNoise(reads, noiseRates=noiseRates, readLen = readLen)
            reads <- as.character(reads)
            mid <- nReads / 2
            mid2 <- round(mid / 2)
            reads1 <- c(rbind(reads[seq_len(mid2)],
                              reads[(mid + 1):(mid + mid2)]))
            reads2 <- c(rbind(reads[(mid + mid2 + 1):nReads],
                              reads[(mid2 + 1):mid]))
            reads <- c(reads1, reads2)
            return(reads)
        }

    } else if (!pairedEnd && length(starts) > 0){
        mid <- round(length(starts) / 2)
        reads1 <- extractAt(x = fullSeq,
                            at = IRanges(start = starts[seq_len(mid)],
                                         width = readLen))
        reads2 <-
            extractAt(x = revComp,
                      at = IRanges(start = fullLen - 
                                       ends[(mid+1):length(ends)] + 1,
                                   width = readLen))
        reads <- c(reads1, reads2)

        reads <- reads[!grepl("N", reads)]
        nReads <- length(reads)
        if (nReads > 0) {
            noiseRates <- rep(noiseRate, length(reads))
            noiseRates[sample(seq_len(nReads),
                              size = nReads * highNoiseProp,
                              replace = TRUE)] <- highNoiseRate
            reads <- addNoise(reads, noiseRates=noiseRates, readLen = readLen)
            reads <- as.character(reads)
            return(reads)
        }
    }
}
LanyingWei/SimFFPE documentation built on Nov. 22, 2020, 3:37 a.m.