R/scAPAtrap_funlib.R

Defines functions generateAlignBam generateRefIndex extractBcAndUb generatescExpMa countPeaks generateFinalBam .computePAcoord mergePA generateSAF write.bed findPeaks splitPeaks loadBpCoverages .findIntersectBypeak findTails findChrTails .readChrInfo separateBamBystrand dedupByPos findUniqueMap

Documented in dedupByPos findChrTails findPeaks findTails findUniqueMap loadBpCoverages separateBamBystrand splitPeaks

# 2020/7/18 funclibs for scAPAtrap
#-----------------------------------Preprocess----------------------------------------------
#' findUniqueMap
#'
#' @param samtools.path
#' @param input
#' @param thread
#' @param sort
#' @param index
#' @export
findUniqueMap <- function(samtools.path, input, thread, sort = T, index = T) {
    output <- paste0(gsub(".bam$", "", input), ".Uniq.bam")
    command <- paste0(samtools.path, " view -@ ", thread, " -h -F 256 -bS ", input, " > ", output)
    system(command = command, wait = T)

    res <- output
    if (sort) {
        output.sorted <- paste0(gsub(".Uniq.bam$", "", output), ".UniqSorted.bam")
        command <- paste0(samtools.path, " sort -@ ", thread, " -o ", output.sorted, " ", output)
        system(command = command, wait = T)
        res <- output.sorted
        file.remove(output)
    }
    if (index) {
        command <- paste0(samtools.path, " index -@ ", thread, " ", res)
        system(command = command, wait = T)
    }

    return(res)
}

#-----umi_tools dedup----
#' dedupPos
#' @param umitools.path
#' @param input
#' @param TenX
#' @export
dedupByPos <- function(umitools.path, input, TenX = T) {
    output <- paste0(gsub(".bam", "", input), ".dedup.bam")
    if (TenX) {
        dedup.command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB")
        system(command = dedup.command, wait = T)
    } else {
        dedup.command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique")
        system(command = dedup.command, wait = T)
    }
    file.remove(input)
    file.remove(paste0(input, ".bai"))
    return(output)
}


#' separateBamBystrand
#'
#'@param samtools.path
#'@param input
#'@param thread
#'@export
separateBamBystrand <- function(samtools.path, input, thread) {
    outputF <- paste0(gsub(".bam", "", input), ".forward.bam")
    outputR <- paste0(gsub(".bam", "", input), ".reverse.bam")
    samtools.Fcommand <- paste0(samtools.path, " view -@ ", thread, " -h -F 0x10 -bS ", input, " > ", outputF)
    system(command = samtools.Fcommand, wait = T)
    samtools.Rcommand <- paste0(samtools.path, " view -@ ", thread, " -h -f 0x10 -bS ", input, " > ", outputR)
    system(command = samtools.Rcommand, wait = T)

    command <- paste0(samtools.path, " index -@ ", thread, " ", outputF)
    system(command = command, wait = T)
    command <- paste0(samtools.path, " index -@ ", thread, " ", outputR)
    system(command = command, wait = T)

    file.remove(input)
    return(c(outputF, outputR))
}

#--------------------Calculate the peak----------------------------
#--------findTails-------------

.readChrInfo <- function(file) {
    chr_length <- read.delim(file = file)
    colnames(chr_length) <- c("flag", "chr", "coord")
    chr_length$chr <- gsub("SN:", "", chr_length$chr)
    chr_length$coord <- as.numeric(gsub("LN:", "", chr_length$coord))
    chr_length <- subset(chr_length, chr != "MT")
    chr.g <- with(chr_length, GRanges(seqnames = chr, ranges = IRanges(start = 1, end = coord)))

    return(chr.g)
}

#' findChrTails
#'
#' @description 遍历每条染色体,通过末端的A-rich发现位点。需要注意的是BAM文件需要构建索引
#' @param bamfile bam文件的路径
#' @param which 参考Rsamtools的ScanBamParam的参数解释
#' @param what 参考Rsamtools的ScanBamParam的参数解释
#' @return
#' @export
findChrTails <- function(bamfile, which, what) {
    library(dplyr)
    param <- Rsamtools::ScanBamParam(what = what, which = which)
    gal1 <- GenomicAlignments::readGAlignments(bamfile, use.names = TRUE, param = param)
    s_1 <- (grepl("[0-9]*M[1-9]{2,}S", gal1@cigar) & as.vector(gal1@strand) == "+")
    s_2 <- (grepl("[1-9]{2,}S[0-9]*M", gal1@cigar) & as.vector(gal1@strand) == "-")

    bam1 <- gal1[s_1]
    bam2 <- gal1[s_2]

    bam1 <- bam1[grepl("(A{3,}[^A]{0,2})*A{6,}([^A]{0,2}A{3,})*.{0,2}?$", bam1@elementMetadata@listData$seq)]
    bam2 <- bam2[grepl("^.{0,2}?(T{3,}[^T]{0,2})*T{6,}([^T]{0,2}T{3,})*", bam2@elementMetadata@listData$seq)]

    final_bam1 <- data.frame(chr = as.vector(seqnames(bam1)), strand = as.vector(strand(bam1)), coord = end(bam1))
    final_bam2 <- data.frame(chr = as.vector(seqnames(bam2)), strand = as.vector(strand(bam2)), coord = start(bam2))

    bam <- rbind(final_bam1, final_bam2)
    bam <- dplyr::group_by(bam, chr, strand, coord) %>% summarise(count = n())

    return(bam)
}


#' findTails
#'
#' @param bamfile bam的文件路径
#' @param what 参考Rsamtools的ScanBamParam的参数解释
#' @export
findTails <- function(bamfile, write = T) {
    library(dplyr)
    what <- c("rname", "strand", "pos", "cigar", "seq")
    param <- Rsamtools::ScanBamParam(what = what)
    gal1 <- GenomicAlignments::readGAlignments(bamfile, use.names = TRUE, param = param)
    s_1 <- (grepl("[0-9]*M[1-9]{2,}S", gal1@cigar) & as.vector(gal1@strand) == "+")
    s_2 <- (grepl("[1-9]{2,}S[0-9]*M", gal1@cigar) & as.vector(gal1@strand) == "-")

    bam1 <- gal1[s_1]
    bam2 <- gal1[s_2]

    bam1 <- bam1[grepl("(A{3,}[^A]{0,2})*A{6,}([^A]{0,2}A{3,})*.{0,2}?$", bam1@elementMetadata@listData$seq)]
    bam2 <- bam2[grepl("^.{0,2}?(T{3,}[^T]{0,2})*T{6,}([^T]{0,2}T{3,})*", bam2@elementMetadata@listData$seq)]

    final_bam1 <- data.frame(chr = as.vector(seqnames(bam1)), strand = as.vector(strand(bam1)), coord = end(bam1))
    final_bam2 <- data.frame(chr = as.vector(seqnames(bam2)), strand = as.vector(strand(bam2)), coord = start(bam2))

    bam <- rbind(final_bam1, final_bam2)
    bam <- dplyr::group_by(bam, chr, strand, coord) %>% summarise(count = n())

    chr <- colnames(bam)[1]
    return(bam)
}

.findIntersectBypeak <- function(qrypeak, sbjpeak, d = 50) {
    gr1 <- with(qrypeak, GRanges(seqnames = chr, ranges = IRanges(start = start, end = end), strand = strand))

    gr2 <- with(sbjpeak, GRanges(seqnames = chr, ranges = IRanges(start = coord, end = coord), strand = strand))

    ov = GenomicRanges::findOverlaps(gr1, gr2, maxgap = d, minoverlap = 0L, type = c("any"), select = "all", ignore.strand = FALSE)

    return(qrypeak[unique(ov@from), ])
}


#' loadBpCoverages
#'
#' @param files
#' @chrs
#' @export
loadBpCoverages <- function(files, chrs) {
    # files <- derfinder::rawFiles(dir_name,sampledirs = sampledir,fileterm = fileterm)
    chrs <- chrs
    fullCov <- derfinder::fullCoverage(files = files, chrs = chrs)
    file.remove(files)
    return(fullCov)
}

#' splitPeaks
#'
#' @param pos_count
#' @param chrom
#' @param pos
#' @param L
#' @export
splitPeaks <- function(pos_count, chrom, pos, L) {
    chr <- rep(chrom, length(pos_count))
    throshold <- quantile(pos_count, 0.25)
    tempdata <- bumphunter::regionFinder(pos_count, chr = chr, pos = pos, cutoff = throshold, verbose = F)
    tempdata <- subset(tempdata, L > L)

    return(tempdata)
}

#' findPeaks
#'
#' @param fullCov
#' @param strand
#' @param L
#' @param maxwidth
#' @param cutoff
#' @export
findPeaks <- function(fullCov, Strand, L, maxwidth, cutoff = 0) {
    regionMat <- derfinder::regionMatrix(fullCov = fullCov, L = L, cutoff = cutoff)
    DefPeak <- GenomicRanges::GRanges()
    for (i in names(regionMat)) {
        index <- which(GenomicRanges::width(regionMat[[i]]$regions) > maxwidth)
        tempDataFram <- data.frame()
        for (j in index) {
            # browser()
            rawstart <- start(regionMat[[i]]$regions[j]) - 1
            temp <- regionMat[[i]]$bpCoverage[[j]]
            temp$pos <- c(1:dim(temp)[1])
            # 进行第一次切分,如果还存在peak大于1000,则继续切分
            first.split.peaks <- splitPeaks(temp$value, i, temp$pos)
            pre.split.peaks <- first.split.peaks
            split.peaks <- data.frame()
            iter <- 0
            while (TRUE) {
                if (iter >= 3) {
                  (break)()
                }
                Ind <- which(pre.split.peaks$L > maxwidth)
                cur.split.peaks <- data.frame()
                if (length(Ind) > 0) {
                  # 将小于1000的peak 存储到split.peaks
                  split.peaks <- rbind(split.peaks, pre.split.peaks[-Ind, ])
                  for (k in Ind) {
                    pos.temp <- temp[pre.split.peaks$start[k]:pre.split.peaks$end[k], ]
                    cur.split.peaks <- splitPeaks(pos.temp$value, i, pos.temp$pos, L)
                  }
                } else {
                  split.peaks <- rbind(split.peaks, pre.split.peaks)
                  (break)()
                }
                pre.split.peaks <- cur.split.peaks
                iter <- iter + 1

            }
            split.peaks$start <- split.peaks$start + rawstart
            split.peaks$end <- split.peaks$end + rawstart
            tempDataFram <- rbind(tempDataFram, split.peaks)
        }
        DefPeak <- c(DefPeak, regionMat[[i]]$regions[-index])
        if (length(tempDataFram) != 0) {
            DefPeak <- c(DefPeak, regioneR::toGRanges(tempDataFram))
        }
    }
    DefPeak <- DefPeak[-which(DefPeak$area == GenomicRanges::width(DefPeak))]
    DefPeak <- DefPeak[GenomicRanges::width(DefPeak) > L]
    strand(DefPeak) <- Strand
    if (Strand == "+") {
        DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
        DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = end)
    } else {
        DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
        DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = start)
    }

    return(DefPeak)
}

write.bed <- function(.x, f) {
    write.table(x = .x, file = f, sep = "\t", col.names = F, row.names = F, quote = F)
}

generateSAF <- function(forwardPeaks, reversePeaks, outputdir) {
    peaks <- rbind(forwardPeaks, reversePeaks)
    peaks$PeakID <- paste0("peak", "_", 1:length(peaks$chr))
    peaks.saf <- peaks[, c("PeakID", "chr", "start", "end", "strand")]

    if (!file.exists(outputdir)) {
        dir.create(outputdir)
    }

    output <- paste0(outputdir, "/", "peaks.saf")
    write.bed(peaks.saf, output)
}

mergePA <- function(bam, adjBp = 10) {
    peak <- GenomicRanges::GRanges(seq = Rle(bam$chr), ranges = IRanges(bam$coord, bam$coord + adjBp), strand = Rle(bam$strand))

    peak <- GenomicRanges::reduce(peak)
    peak <- as.data.frame(peak)
    peak$end <- peak$end - adjBp
    peak$width <- peak$end - peak$start + 1

    peak$coord <- 0
    peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
    peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]
    names(peak) <- c("chr", names(peak)[-1])
    return(peak)
}


.computePAcoord <- function(peak) {
    peak$coord <- 0
    peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
    peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]

    return(peak)
}


generateFinalBam <- function(featureCounts.path, samtools.path, input, peakfile, thread) {
    output.dir <- dirname(input)
    log <- paste0(output.dir, "/peak_assigned")
    command <- paste0(featureCounts.path, " -a ", peakfile, " -F SAF -t exon -s 1 -M --largestOverlap -o ", log, " -R BAM -T ",
        thread, " ", input)
    system(command = command, wait = T)

    input <- paste0(input, ".featureCounts.bam")
    output <- paste0(dirname(input), "/", "final.bam")
    command <- paste0(samtools.path, " sort -@ ", thread, " ", input, " -o ", output, " && ", samtools.path, " index -@ ",
        thread, " ", output)

    system(command = command, wait = T)
}

countPeaks <- function(umitools.path, input, outputdir, TenX = T) {
    output <- paste0(outputdir, "/counts.tsv.gz")
    if (TenX) {
        command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB --per-cell -I ",
            input, " -S ", output)
        system(command = command, wait = T)
    } else {
        command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --per-cell -I ",
            input, " -S ", output)
        system(command = command, wait = T)
    }
}

generatescExpMa <- function(countsfile, peaksfile, barcode, tails, min.cells = 2, min.count = 0) {

    browser()
    counts <- read.delim(countsfile, header = T)
    saf <- read.delim(peaksfile, header = F)
    names(saf) <- c("peakID", "chr", "start", "end", "strand")

    saf <- .computePAcoord(saf)
    saf <- .findIntersectBypeak(saf, tails)

    counts <- subset(counts, cell %in% barcode)

    peak <- group_by(counts, gene) %>% dplyr::summarize(cells = n(), count = sum(count))
    peak <- subset(peak, cells >= min.cells & count >= min.count)

    interPeak <- intersect(saf$peakID, peak$gene)
    saf <- subset(saf, peakID %in% interPeak)

    counts <- subset(counts, gene %in% interPeak)

    scExpMa <- reshape2::dcast(counts, gene ~ cell, value.var = "count")
    scExpMa[is.na(scExpMa)] <- 0


    scExpMa$gene <- factor(scExpMa$gene, levels = saf$peakID)
    scExpMa <- arrange(scExpMa, gene)
    rownames(scExpMa) <- scExpMa$gene
    scExpMa <- scExpMa[, -1]


    scExpMa <- cbind(saf, scExpMa)
    rownames(scExpMa) <- saf$peakID

    return(scExpMa)
}



extractBcAndUb <- function(umitools.path, pattern, input.seq, whitelist) {
    output1 <- paste0(gsub(".fastq$", "", input.seq[1]), ".extracted.fastq")
    output2 <- paste0(gsub(".fastq$", "", input.seq[2]), ".extracted.fastq")
    command <- paste0(umitools.path, " extract --bc-pattern=", pattern, " --stdin ", input.seq[1], " --stdout ", output1,
        " --read2-in ", input.seq[2], " --read2-out=", output2, " --filter-cell-barcode ", " --whitelist=", whitelist)

    system(command = command, wait = T)
}

generateRefIndex <- function(star.path, genome.fasta, indexdir, thread) {
    command <- paste0(star.path, " --runMode genomeGenerate --genomeDir ", indexdir, " --genomeFastaFiles ", genome.fasta,
        " --runThreadN ", thread)

    system(command = command, wait = T)
}

generateAlignBam <- function(star.path, indexdir, input.seq, prefix, thread) {
    command <- paste0(star.path, " --runThreadN ", thread, " --genomeDir ", indexdir, " --readFilesIn ", input.seq, " --outFilterMultimapNmax 1 ",
        " --outSAMtype BAM SortedByCoordinate ", " --outFileNamePrefix ", prefix)
    system(command = command, wait = T)
}
liutao199527/scAPAtrap1 documentation built on Aug. 16, 2020, 7:24 p.m.