# 2020/7/18 funclibs for scAPAtrap
#-----------------------------------Preprocess----------------------------------------------
#' findUniqueMap
#'
#' @param samtools.path
#' @param input
#' @param thread
#' @param sort
#' @param index
#' @export
findUniqueMap <- function(samtools.path, input, thread, sort = T, index = T) {
output <- paste0(gsub(".bam$", "", input), ".Uniq.bam")
command <- paste0(samtools.path, " view -@ ", thread, " -h -F 256 -bS ", input, " > ", output)
system(command = command, wait = T)
res <- output
if (sort) {
output.sorted <- paste0(gsub(".Uniq.bam$", "", output), ".UniqSorted.bam")
command <- paste0(samtools.path, " sort -@ ", thread, " -o ", output.sorted, " ", output)
system(command = command, wait = T)
res <- output.sorted
file.remove(output)
}
if (index) {
command <- paste0(samtools.path, " index -@ ", thread, " ", res)
system(command = command, wait = T)
}
return(res)
}
#-----umi_tools dedup----
#' dedupPos
#' @param umitools.path
#' @param input
#' @param TenX
#' @export
dedupByPos <- function(umitools.path, input, TenX = T) {
output <- paste0(gsub(".bam", "", input), ".dedup.bam")
if (TenX) {
dedup.command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB")
system(command = dedup.command, wait = T)
} else {
dedup.command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique")
system(command = dedup.command, wait = T)
}
file.remove(input)
file.remove(paste0(input, ".bai"))
return(output)
}
#' separateBamBystrand
#'
#'@param samtools.path
#'@param input
#'@param thread
#'@export
separateBamBystrand <- function(samtools.path, input, thread) {
outputF <- paste0(gsub(".bam", "", input), ".forward.bam")
outputR <- paste0(gsub(".bam", "", input), ".reverse.bam")
samtools.Fcommand <- paste0(samtools.path, " view -@ ", thread, " -h -F 0x10 -bS ", input, " > ", outputF)
system(command = samtools.Fcommand, wait = T)
samtools.Rcommand <- paste0(samtools.path, " view -@ ", thread, " -h -f 0x10 -bS ", input, " > ", outputR)
system(command = samtools.Rcommand, wait = T)
command <- paste0(samtools.path, " index -@ ", thread, " ", outputF)
system(command = command, wait = T)
command <- paste0(samtools.path, " index -@ ", thread, " ", outputR)
system(command = command, wait = T)
file.remove(input)
return(c(outputF, outputR))
}
#--------------------Calculate the peak----------------------------
#--------findTails-------------
.readChrInfo <- function(file) {
chr_length <- read.delim(file = file)
colnames(chr_length) <- c("flag", "chr", "coord")
chr_length$chr <- gsub("SN:", "", chr_length$chr)
chr_length$coord <- as.numeric(gsub("LN:", "", chr_length$coord))
chr_length <- subset(chr_length, chr != "MT")
chr.g <- with(chr_length, GRanges(seqnames = chr, ranges = IRanges(start = 1, end = coord)))
return(chr.g)
}
#' findChrTails
#'
#' @description 遍历每条染色体,通过末端的A-rich发现位点。需要注意的是BAM文件需要构建索引
#' @param bamfile bam文件的路径
#' @param which 参考Rsamtools的ScanBamParam的参数解释
#' @param what 参考Rsamtools的ScanBamParam的参数解释
#' @return
#' @export
findChrTails <- function(bamfile, which, what) {
library(dplyr)
param <- Rsamtools::ScanBamParam(what = what, which = which)
gal1 <- GenomicAlignments::readGAlignments(bamfile, use.names = TRUE, param = param)
s_1 <- (grepl("[0-9]*M[1-9]{2,}S", gal1@cigar) & as.vector(gal1@strand) == "+")
s_2 <- (grepl("[1-9]{2,}S[0-9]*M", gal1@cigar) & as.vector(gal1@strand) == "-")
bam1 <- gal1[s_1]
bam2 <- gal1[s_2]
bam1 <- bam1[grepl("(A{3,}[^A]{0,2})*A{6,}([^A]{0,2}A{3,})*.{0,2}?$", bam1@elementMetadata@listData$seq)]
bam2 <- bam2[grepl("^.{0,2}?(T{3,}[^T]{0,2})*T{6,}([^T]{0,2}T{3,})*", bam2@elementMetadata@listData$seq)]
final_bam1 <- data.frame(chr = as.vector(seqnames(bam1)), strand = as.vector(strand(bam1)), coord = end(bam1))
final_bam2 <- data.frame(chr = as.vector(seqnames(bam2)), strand = as.vector(strand(bam2)), coord = start(bam2))
bam <- rbind(final_bam1, final_bam2)
bam <- dplyr::group_by(bam, chr, strand, coord) %>% summarise(count = n())
return(bam)
}
#' findTails
#'
#' @param bamfile bam的文件路径
#' @param what 参考Rsamtools的ScanBamParam的参数解释
#' @export
findTails <- function(bamfile, write = T) {
library(dplyr)
what <- c("rname", "strand", "pos", "cigar", "seq")
param <- Rsamtools::ScanBamParam(what = what)
gal1 <- GenomicAlignments::readGAlignments(bamfile, use.names = TRUE, param = param)
s_1 <- (grepl("[0-9]*M[1-9]{2,}S", gal1@cigar) & as.vector(gal1@strand) == "+")
s_2 <- (grepl("[1-9]{2,}S[0-9]*M", gal1@cigar) & as.vector(gal1@strand) == "-")
bam1 <- gal1[s_1]
bam2 <- gal1[s_2]
bam1 <- bam1[grepl("(A{3,}[^A]{0,2})*A{6,}([^A]{0,2}A{3,})*.{0,2}?$", bam1@elementMetadata@listData$seq)]
bam2 <- bam2[grepl("^.{0,2}?(T{3,}[^T]{0,2})*T{6,}([^T]{0,2}T{3,})*", bam2@elementMetadata@listData$seq)]
final_bam1 <- data.frame(chr = as.vector(seqnames(bam1)), strand = as.vector(strand(bam1)), coord = end(bam1))
final_bam2 <- data.frame(chr = as.vector(seqnames(bam2)), strand = as.vector(strand(bam2)), coord = start(bam2))
bam <- rbind(final_bam1, final_bam2)
bam <- dplyr::group_by(bam, chr, strand, coord) %>% summarise(count = n())
chr <- colnames(bam)[1]
return(bam)
}
.findIntersectBypeak <- function(qrypeak, sbjpeak, d = 50) {
gr1 <- with(qrypeak, GRanges(seqnames = chr, ranges = IRanges(start = start, end = end), strand = strand))
gr2 <- with(sbjpeak, GRanges(seqnames = chr, ranges = IRanges(start = coord, end = coord), strand = strand))
ov = GenomicRanges::findOverlaps(gr1, gr2, maxgap = d, minoverlap = 0L, type = c("any"), select = "all", ignore.strand = FALSE)
return(qrypeak[unique(ov@from), ])
}
#' loadBpCoverages
#'
#' @param files
#' @chrs
#' @export
loadBpCoverages <- function(files, chrs) {
# files <- derfinder::rawFiles(dir_name,sampledirs = sampledir,fileterm = fileterm)
chrs <- chrs
fullCov <- derfinder::fullCoverage(files = files, chrs = chrs)
file.remove(files)
return(fullCov)
}
#' splitPeaks
#'
#' @param pos_count
#' @param chrom
#' @param pos
#' @param L
#' @export
splitPeaks <- function(pos_count, chrom, pos, L) {
chr <- rep(chrom, length(pos_count))
throshold <- quantile(pos_count, 0.25)
tempdata <- bumphunter::regionFinder(pos_count, chr = chr, pos = pos, cutoff = throshold, verbose = F)
tempdata <- subset(tempdata, L > L)
return(tempdata)
}
#' findPeaks
#'
#' @param fullCov
#' @param strand
#' @param L
#' @param maxwidth
#' @param cutoff
#' @export
findPeaks <- function(fullCov, Strand, L, maxwidth, cutoff = 0) {
regionMat <- derfinder::regionMatrix(fullCov = fullCov, L = L, cutoff = cutoff)
DefPeak <- GenomicRanges::GRanges()
for (i in names(regionMat)) {
index <- which(GenomicRanges::width(regionMat[[i]]$regions) > maxwidth)
tempDataFram <- data.frame()
for (j in index) {
# browser()
rawstart <- start(regionMat[[i]]$regions[j]) - 1
temp <- regionMat[[i]]$bpCoverage[[j]]
temp$pos <- c(1:dim(temp)[1])
# 进行第一次切分,如果还存在peak大于1000,则继续切分
first.split.peaks <- splitPeaks(temp$value, i, temp$pos)
pre.split.peaks <- first.split.peaks
split.peaks <- data.frame()
iter <- 0
while (TRUE) {
if (iter >= 3) {
(break)()
}
Ind <- which(pre.split.peaks$L > maxwidth)
cur.split.peaks <- data.frame()
if (length(Ind) > 0) {
# 将小于1000的peak 存储到split.peaks
split.peaks <- rbind(split.peaks, pre.split.peaks[-Ind, ])
for (k in Ind) {
pos.temp <- temp[pre.split.peaks$start[k]:pre.split.peaks$end[k], ]
cur.split.peaks <- splitPeaks(pos.temp$value, i, pos.temp$pos, L)
}
} else {
split.peaks <- rbind(split.peaks, pre.split.peaks)
(break)()
}
pre.split.peaks <- cur.split.peaks
iter <- iter + 1
}
split.peaks$start <- split.peaks$start + rawstart
split.peaks$end <- split.peaks$end + rawstart
tempDataFram <- rbind(tempDataFram, split.peaks)
}
DefPeak <- c(DefPeak, regionMat[[i]]$regions[-index])
if (length(tempDataFram) != 0) {
DefPeak <- c(DefPeak, regioneR::toGRanges(tempDataFram))
}
}
DefPeak <- DefPeak[-which(DefPeak$area == GenomicRanges::width(DefPeak))]
DefPeak <- DefPeak[GenomicRanges::width(DefPeak) > L]
strand(DefPeak) <- Strand
if (Strand == "+") {
DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = end)
} else {
DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = start)
}
return(DefPeak)
}
write.bed <- function(.x, f) {
write.table(x = .x, file = f, sep = "\t", col.names = F, row.names = F, quote = F)
}
generateSAF <- function(forwardPeaks, reversePeaks, outputdir) {
peaks <- rbind(forwardPeaks, reversePeaks)
peaks$PeakID <- paste0("peak", "_", 1:length(peaks$chr))
peaks.saf <- peaks[, c("PeakID", "chr", "start", "end", "strand")]
if (!file.exists(outputdir)) {
dir.create(outputdir)
}
output <- paste0(outputdir, "/", "peaks.saf")
write.bed(peaks.saf, output)
}
mergePA <- function(bam, adjBp = 10) {
peak <- GenomicRanges::GRanges(seq = Rle(bam$chr), ranges = IRanges(bam$coord, bam$coord + adjBp), strand = Rle(bam$strand))
peak <- GenomicRanges::reduce(peak)
peak <- as.data.frame(peak)
peak$end <- peak$end - adjBp
peak$width <- peak$end - peak$start + 1
peak$coord <- 0
peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]
names(peak) <- c("chr", names(peak)[-1])
return(peak)
}
.computePAcoord <- function(peak) {
peak$coord <- 0
peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]
return(peak)
}
generateFinalBam <- function(featureCounts.path, samtools.path, input, peakfile, thread) {
output.dir <- dirname(input)
log <- paste0(output.dir, "/peak_assigned")
command <- paste0(featureCounts.path, " -a ", peakfile, " -F SAF -t exon -s 1 -M --largestOverlap -o ", log, " -R BAM -T ",
thread, " ", input)
system(command = command, wait = T)
input <- paste0(input, ".featureCounts.bam")
output <- paste0(dirname(input), "/", "final.bam")
command <- paste0(samtools.path, " sort -@ ", thread, " ", input, " -o ", output, " && ", samtools.path, " index -@ ",
thread, " ", output)
system(command = command, wait = T)
}
countPeaks <- function(umitools.path, input, outputdir, TenX = T) {
output <- paste0(outputdir, "/counts.tsv.gz")
if (TenX) {
command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB --per-cell -I ",
input, " -S ", output)
system(command = command, wait = T)
} else {
command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --per-cell -I ",
input, " -S ", output)
system(command = command, wait = T)
}
}
generatescExpMa <- function(countsfile, peaksfile, barcode, tails, min.cells = 2, min.count = 0) {
browser()
counts <- read.delim(countsfile, header = T)
saf <- read.delim(peaksfile, header = F)
names(saf) <- c("peakID", "chr", "start", "end", "strand")
saf <- .computePAcoord(saf)
saf <- .findIntersectBypeak(saf, tails)
counts <- subset(counts, cell %in% barcode)
peak <- group_by(counts, gene) %>% dplyr::summarize(cells = n(), count = sum(count))
peak <- subset(peak, cells >= min.cells & count >= min.count)
interPeak <- intersect(saf$peakID, peak$gene)
saf <- subset(saf, peakID %in% interPeak)
counts <- subset(counts, gene %in% interPeak)
scExpMa <- reshape2::dcast(counts, gene ~ cell, value.var = "count")
scExpMa[is.na(scExpMa)] <- 0
scExpMa$gene <- factor(scExpMa$gene, levels = saf$peakID)
scExpMa <- arrange(scExpMa, gene)
rownames(scExpMa) <- scExpMa$gene
scExpMa <- scExpMa[, -1]
scExpMa <- cbind(saf, scExpMa)
rownames(scExpMa) <- saf$peakID
return(scExpMa)
}
extractBcAndUb <- function(umitools.path, pattern, input.seq, whitelist) {
output1 <- paste0(gsub(".fastq$", "", input.seq[1]), ".extracted.fastq")
output2 <- paste0(gsub(".fastq$", "", input.seq[2]), ".extracted.fastq")
command <- paste0(umitools.path, " extract --bc-pattern=", pattern, " --stdin ", input.seq[1], " --stdout ", output1,
" --read2-in ", input.seq[2], " --read2-out=", output2, " --filter-cell-barcode ", " --whitelist=", whitelist)
system(command = command, wait = T)
}
generateRefIndex <- function(star.path, genome.fasta, indexdir, thread) {
command <- paste0(star.path, " --runMode genomeGenerate --genomeDir ", indexdir, " --genomeFastaFiles ", genome.fasta,
" --runThreadN ", thread)
system(command = command, wait = T)
}
generateAlignBam <- function(star.path, indexdir, input.seq, prefix, thread) {
command <- paste0(star.path, " --runThreadN ", thread, " --genomeDir ", indexdir, " --readFilesIn ", input.seq, " --outFilterMultimapNmax 1 ",
" --outSAMtype BAM SortedByCoordinate ", " --outFileNamePrefix ", prefix)
system(command = command, wait = T)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.