#' Generate strand-specific two-dimensional coordinate files (prebed) of the BAM for clustering, and extract the cb list
#'
#' @param in_bamfile A indexed BAM file, and the index files (.bai) must be in the same directory of the indexed BAM
#' @param in_nSBfile A indexed soft-clipped BAM file, generated and indexed by samtools script
#' @param chr_info A data.table that contains the chrinfo, can be generated by ExtractChrinfo()
#' @param out_dir The directory where directories of two strands are created and the output files are stored
#' @param SA_cutoff The minimum number of soft-clipped nts (default 5)
#' @param min_median The minimum number of locations of the same umi when screening (default 5)
#' @param RCn The cut-off value that decides the minimum weight of output when processing the in_bamfile (default 1)
#'
#' @return NULL, write the prebeds (sa, rc_end, rc_uni) in strand dirs under the out_dir and the cb list in out_dir.
#' @export
#'
#' @import data.table
#' @rawNamespace import(IRanges, except = c(which, shift))
#' @importFrom GenomicAlignments readGAlignments
#'
#' @examples ExtractAPA("test.bam", "test.s.bam", ExtractChrinfo("test.bam"), "output/")
ExtractAPA <- function(in_bamfile, in_nSBfile, chr_info, out_dir, SA_cutoff = 5, min_median = 5, RCn = 1){
### check and create two directories to save the output
if (!dir.exists(out_dir)){
warning(out_dir, " does not exist")
return(message("ERROR: check the dir"))
}
if (dir.exists(paste0(out_dir, "positive_strand/"))){
warning(paste0(out_dir, "positive_strand/"), " already exists")
return(message("ERROR: check the dir."))
}
if (dir.exists(paste0(out_dir, "negative_strand/"))){
warning(paste0(out_dir, "negative_strand/"), " already exists")
return(message("ERROR: check the dir."))
}
print("ExtractAPA: creat the dir")
cat("ExtractAPA: creat the dir" , file = paste0(out_dir, "runstat.o"), append = TRUE)
dir.create(paste0(out_dir, "positive_strand/"))
dir.create(paste0(out_dir, "negative_strand/"))
# cellbarcode
cb_list <- list()
### chr loop
for (chrno in as.character(chr_info[, chr])) {
## load seq info of the chr and convert it into data.table
print(paste0("Load ", chrno, "to extract prebeds"))
cat(paste0("Load ", chrno, "to extract prebeds"), file = paste0(out_dir, "runstat.o"), append = TRUE)
which <- GenomicRanges::GRanges(seqnames = chrno, ranges = IRanges::IRanges(1, chr_info[chr == chrno, V1] ))
paramsx<- Rsamtools::ScanBamParam(tag = c("UB","CB"),which = which)
paramsy<- Rsamtools::ScanBamParam(tag = c("UB","CB"),what = "seq",which = which)
x <- GenomicAlignments::readGAlignments(in_bamfile, param = paramsx) # x, info of the origin bam file
y <- GenomicAlignments::readGAlignments(in_nSBfile, param = paramsy) # y, info of the soft-clipped bam file
aln_all <- data.table(Chr = as.character(seqnames(x)),
strand = as.character(strand(x)),
start = start(x),
end = end(x),
cigar = as.character(cigar(x)),
UB = mcols(x)[, 1],
CB = mcols(x)[, 2])
aln_ns <- data.table(Chr = as.character(seqnames(y)),
strand = as.character(strand(y)),
start = start(y),
end = end(y),
cigar = as.character(cigar(y)),
UB = mcols(y)[, 2],
CB = mcols(y)[, 3],
seq = as.character(mcols(y)[, 1]))
## key index
setkey(aln_all, Chr, start)
setorder(aln_all, Chr, start)
setkey(aln_ns, Chr, start)
setorder(aln_ns, Chr, start)
## check seq length
if(length(unique(qwidth(x))) != 1){
warning(paste0("Inconsistent length in ", chrno))
}
seqlength <- qwidth(x)[1]
## clean
rm(x)
rm(y)
rm(paramsx)
rm(paramsy)
rm(which)
gc()
## rm na
aln_all <- aln_all[complete.cases(UB, CB), ]
aln_all[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
aln_all[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
setorder(aln_all, Chr, start)
## filter rc
setcolorder(aln_all,c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
setkey(aln_all, Chr, start, end)
setorder(aln_all, Chr, start)
aln_ne <- aln_all[strand == "-", .SD[1], by = .(UB,CB)]
setorder(aln_all, Chr, -end)
aln_po <- aln_all[strand == "+", .SD[1], by = .(UB,CB)]
aln_all_end <- rbindlist(list(aln_po,aln_ne))
setcolorder(aln_all_end, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
setkey(aln_all_end, Chr, start, end)
setorder(aln_all_end, Chr, start)
## extract CBlist of the chr and clean
if(nrow(aln_all) != 0 ){
cb_list[[chrno]] <- aln_all[, .(unique(CB))]
}
rm(aln_ne)
rm(aln_po)
rm(aln_all)
gc()
## filter rc by uni
median_umi <- median(aln_all_end$uni) # then check
print(paste0("Median of ", chrno," is ", median_umi))
cat(paste0("Median of ", chrno," is ", median_umi, "\n") , file = paste0(out_dir, "runstat.o"), append = TRUE)
setorder(aln_all_end, Chr, start)
median_umi <- max(median_umi, min_median) #### bigger than 4
# max(loc) : no non-missing arguments to max; returning -Inf
aln_ne <- aln_all_end[strand == "-" & uni >= median_umi, .SD[1], by = .(UB,CB)]
setorder(aln_all_end, Chr, -end)
aln_po <- aln_all_end[strand == "+" & uni >= median_umi, .SD[1], by = .(UB,CB)]
aln_all_uni <- rbindlist(list(aln_po,aln_ne))
setcolorder(aln_all_uni, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
setkey(aln_all_uni, Chr, start, end)
setorder(aln_all_uni, Chr, start)
## filter sa
aln_ns <- aln_ns[complete.cases(UB, CB, seq), ]
aln_ns[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
aln_ns[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
setorder(aln_ns, Chr, start)
aln_ne <- aln_ns[strand == "-", .SD[1], by = .(UB,CB)]
setorder(aln_ns, Chr, -end)
aln_po <- aln_ns[strand == "+", .SD[1], by = .(UB,CB)]
aln_ns <- rbindlist(list(aln_po,aln_ne))
setcolorder(aln_ns, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "seq", "uni"))
setkey(aln_ns, Chr, start, end)
setorder(aln_ns, Chr, start)
# clean
rm(aln_po)
rm(aln_ne)
## write DT
# negative strand
if (nrow(aln_all_end[(strand == "-"), ]) != 0) {
# filtered by uni end
aln_pb_n <- aln_all_end[(strand == "-"), .(start)]
fwrite(aln_pb_n[, .(chrno, .N, strand = "-"), by = start][N >= RCn][, .(chrno, start, N, strand)],
paste0(out_dir, "negative_strand/","rc_end.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
rm(aln_pb_n)
# filtered by uni f end
aln_pb_n <- aln_all_uni[(strand == "-"), .(start)]
fwrite(aln_pb_n[, .(chrno, .N, strand = "-"), by = start][N >= RCn][, .(chrno, start, N, strand)],
paste0(out_dir, "negative_strand/","rc_uni.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
rm(aln_pb_n)
## extract negative strand SAfile of the chr
if(nrow(aln_ns[(strand == "-")]) != 0){
aln_ns_n <- aln_ns[(strand == "-"),.(start, cigar, seq)]
# catch the number of soft-clipped nucleotides
cigarn_n <- regexpr("^([1-9][0-9]|[3-9])S", aln_ns_n$cigar)
Sstart_n <- 1
Send_n <- attr(cigarn_n, "match.length") - 1
S_length_n <- as.numeric(substr(aln_ns_n$cigar, Sstart_n, Send_n))
aln_ns_n <- aln_ns_n[S_length_n >= SA_cutoff]
S_length_n <- S_length_n[which(S_length_n >= SA_cutoff)]
# split the seq to get the soft-clipped nucleotides
mo_loc <- aln_ns_n[, grep("TTTA[A|T]T", substr(seq, S_length_n + 10, S_length_n + 30))]
aln_ns_n[, hatseq := substr(seq, S_length_n + 1, S_length_n + 10)]
aln_ns_n[, seq := substr(seq, 1, S_length_n)]
# count the number and proportion of T
aln_ns_n[, AL := sapply(base::strsplit(seq, ""), function(x) length(grep("T", x)))]
aln_ns_n[, hatAL := sapply(base::strsplit(hatseq, ""), function(x) length(grep("T", x)))]
aln_ns_n[, AP := as.numeric(unlist(AL)/S_length_n)]
aln_ns_n[, hatAP := as.numeric(unlist(hatAL)/10)]
aln_mo_n <- aln_ns_n[mo_loc]
# write down
fwrite(aln_ns_n[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "-"), by = start][, .(chrno, start, N, strand)],
paste0(out_dir, "negative_strand/", "sa.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
# write down motif
fwrite(aln_mo_n[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = start][, .(chrno, start, N, strand)],
paste0(out_dir, "negative_strand/", "sa_motif.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
# clean
rm(aln_ns_n)
rm(S_length_n)
rm(Sstart_n)
rm(Send_n)
gc()
}
}
## positive strand
if (nrow(aln_all_end[(strand == "+"),]) != 0) {
# filtered by uni end
aln_pb_p <- aln_all_end[(strand == "+"), .(end)]
fwrite(aln_pb_p[, .(chrno, .N, strand = "+"), by = end][N >= RCn][, .(chrno, end, N, strand)],
paste0(out_dir, "positive_strand/","rc_end.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
rm(aln_pb_p)
# filtered by uni f end
aln_pb_p <- aln_all_uni[(strand == "+"), .(end)]
fwrite(aln_pb_p[, .(chrno, .N, strand = "+"), by = end][N >= RCn][, .(chrno, end, N, strand)],
paste0(out_dir, "positive_strand/","rc_uni.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
rm(aln_pb_p)
## extract positive strand SAfile of the chr
if(nrow(aln_ns[(strand == "+")]) != 0){
aln_ns_p <- aln_ns[(strand == "+"), .(end, cigar, seq)]
# catch the number of soft-clipped nucleotides
cigarn_p <- regexpr("([1-9][0-9]|[3-9])S$", aln_ns_p$cigar)
Sstart_p <- cigarn_p
Send_p <- cigarn_p + attr(cigarn_p, "match.length")-2
S_length_p <- as.numeric(substr(aln_ns_p$cigar, Sstart_p, Send_p))
aln_ns_p <- aln_ns_p[S_length_p >= SA_cutoff]
S_length_p <- S_length_p[which(S_length_p >= SA_cutoff)]
# split the seq to get the soft-clipped nucleotides
mo_loc <- aln_ns_p[, grep("A[A|T]TAAA", substr(seq, seqlength - S_length_p - 30, seqlength - S_length_p - 10))]
aln_ns_p[, hatseq := substr(seq, seqlength - S_length_p + 1 - 10, seqlength - S_length_p)]
aln_ns_p[, seq := substr(seq, seqlength - S_length_p + 1, seqlength)]
# count the number and proportion of A
aln_ns_p[, AL := sapply(base::strsplit(seq, ""),function(x) length(grep("A", x)))]
aln_ns_p[, hatAL := sapply(base::strsplit(hatseq, ""),function(x) length(grep("A", x)))]
aln_ns_p[, AP := as.numeric(unlist(AL)/S_length_p)]
aln_ns_p[, hatAP := as.numeric(unlist(hatAL)/10)]
aln_mo_p <- aln_ns_p[mo_loc]
# write down
fwrite(aln_ns_p[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = end][, .(chrno, end, N, strand)],
paste0(out_dir, "positive_strand/", "sa.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
# write down motif
fwrite(aln_mo_p[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = end][, .(chrno, end, N, strand)],
paste0(out_dir, "positive_strand/", "sa_motif.prebed"),
quote = FALSE, sep = "\t",
col.names = FALSE,
append = TRUE)
# clean
rm(aln_ns_p)
rm(S_length_p)
rm(Sstart_p)
rm(Send_p)
gc()
}
}
# clean
mem <- system(paste0("cat /proc/",Sys.getpid(),"/status | grep VmSize"), intern = TRUE)
cat(paste0(chrno, ": ", mem, "\n"), file = paste0(out_dir, "runstat.o"), append = TRUE)
rm(aln_all_end)
rm(aln_ns)
gc()
}
### write down cb_list
print("write down cblist")
cat("write down cblist", file = paste0(out_dir, "runstat.o"), append = TRUE)
cb_list_DT <- unique(rbindlist(cb_list))
setkey(cb_list_DT, V1)
setorder(cb_list_DT, V1)
fwrite(cb_list_DT, paste0(out_dir, "_CBlist.txt"),
quote = FALSE, sep = "\t", col.names = FALSE)
print(paste0("output files are in ",
out_dir, "positive_strand/", " , ",
out_dir, "negative_strand/", " and ",
out_dir, "_CBlist.txt"))
cat(paste0("output files are in ",
out_dir, "positive_strand/", " , ",
out_dir, "negative_strand/", " and ",
out_dir, "_CBlist.txt"),
file = paste0(out_dir, "runstat.o"), append = TRUE)
print("ExtractAPA: finish")
cat("ExtractAPA: finish", file = paste0(out_dir, "runstat.o"), append = TRUE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.