#' Create a APA_cluster-Cell_Barcode count matrix for Seurart analysis
#'
#' @param cluster_p Annotated APA clusters file of the positive strand, generated by AnnoAPAc()
#' @param cluster_n Annotated APA clusters file of the positive strand, generated by AnnoAPAc()
#' @param in_bamfile A indexed BAM file, and the index files (.bai) must be in the same directory of the indexed BAM
#' @param cb_file A cell barcodes list, can be generated by extractAPA() or cellranger
#' @param chr_info A data.table that contains the chrinfo, can be generated by ExtractChrinfo()
#' @param out_dir The directory where a directory named "matrix/" is created and the output files are stored
#' @param filter_method The way to filter reads (defult "end")
#' @param UBtag UB tag (defult "UB")
#'
#' @return NULL. writes output in the directory "matrix/" of the out_dir
#' @export
#'
#' @import data.table Matrix
#' @rawNamespace import(IRanges, except = c(which, shift))
#' @importFrom GenomicAlignments readGAlignments
#'
#' @examples CountAPAc("p/APA_map.out.anno", "n/APA_map.out.anno", "test.bam", "_CBlist.txt", ExtractChrinfo("test.s.bam"), "output/")
CountAPAc <- function(cluster_p, cluster_n, in_bamfile, cb_file, chr_info, out_dir, filter_method = "end", UBtag = "UB"){
### check and create two directories to save the output
print("CountAPAc: creat matrix dir")
cat("CountAPAc: creat matrix dir", file = paste0(out_dir, ".runstat.o"), append = TRUE)
if (dir.exists(paste0(out_dir, "matrix/"))){
warning(paste0(out_dir, "matrix/"), " already exists")
return(message("ERROR: check the dir"))
}
dir.create(paste0(out_dir, "matrix/"))
### read the files
bedfile_p <- fread(cluster_p, header=F)
bedfile_n <- fread(cluster_n, header=F)
cblist <- fread(cb_file, header=F)
## rewrite the cb_list
cblist <- cblist$V1
cb.dim <- length(cblist)
write.table(cblist, file=paste0(out_dir, "matrix/", "/barcodes.tsv"),
quote = FALSE, row.names = FALSE, col.names = FALSE)
## rewrite the apa_clusters
print("load and write apa features")
# positive strand
setcolorder(bedfile_p,
c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"))
bedfile_p[is.na(V4), V8 := bedfile_p[!is.na(V11), V11]]
setnames(bedfile_p,
c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"),
c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"))
bedfile_p[, c("V11") := NULL]
bedfile_p[, strand := "+"]
# negative strand
setcolorder(bedfile_n,
c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"))
bedfile_n[is.na(V4), V8 := bedfile_n[!is.na(V11), V11]]
setnames(bedfile_n,
c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"),
c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"))
bedfile_n[, c("V11") := NULL]
bedfile_n[, strand := "-"]
# rbind
bedfile <- rbindlist(list(bedfile_p, bedfile_n))
setkey(bedfile, V1, V2)
setorder(bedfile, V1, V2)
## add unique id to every clusters
bedfile[, unino := .I]
bed.dim <- nrow(bedfile)
bedfile[strand == "+", endid := c(1:.N), by = .(V4,V6)]
bedfile[strand == "-", endid := c(.N:1), by = .(V4,V6)]
bedfile[, nend := .N, by = .(V4,V6)]
## add relative position to every clusters
bedfile[, per :=
((V9 + V10)/2 - min((V9 + V10)/2))/(max((V9 + V10)/2) - min((V9 + V10)/2) + 1), by = .(V4,V6)]
bedfile[strand == "-", per := (1 - per)]
bedfile[nend == 1, per := 1]
bedfile[, per := round(per, 2)]
bedtowt <- bedfile[, .(paste0(unino, ":", V1, ":", V2, "-", V3, "-", V8, "-", per))]
bedtowt[!is.na(bedfile$V4),
V2 := bedfile[!is.na(V4), paste0(V4, "-", V5, ":", V8, ":", endid, "/", nend)]]
bedtowt[is.na(bedfile$V4),
V2 := bedfile[is.na(V4), paste0("-", V6, "-", V7, ":", V8, ":", endid, "/", nend)]]
## write down features_file and re_annotated_clusters_file
write.table(bedtowt, file=paste0(out_dir, "matrix/", "/genes.tsv"),
quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
fwrite(bedfile, paste0(out_dir, "apa_clusters.table"),
quote = FALSE, sep = "\t", col.names = FALSE)
rm(bedfile_n)
rm(bedfile_p)
rm(bedtowt)
gc()
### loop to get count matrix
print("start to get matrix")
all_MM <- list()
for (chrno in chr_info[, chr]) {
## load cb and ub into data.base
print(paste0(chrno," starts loading and counting"))
cat(paste0(chrno," starts loading and counting"), file = paste0(out_dir, ".runstat.o"), append = TRUE)
which <- GenomicRanges::GRanges(seqnames = chrno, ranges = IRanges::IRanges(1, chr_info[chr == chrno, V1] ))
paramsx<- Rsamtools::ScanBamParam(tag = c(UBtag, "CB"), which = which)
x <- GenomicAlignments::readGAlignments(in_bamfile, param=paramsx)
aln_all <- data.table(Chr = as.character(seqnames(x)),
strand=as.character(strand(x)),
start=start(x),
end=end(x),
cigar=as.character(cigar(x)),
UB=mcols(x)[,1],
CB=mcols(x)[,2])
setkey(aln_all, Chr, start)
setorder(aln_all, Chr, start)
## clean
rm(x)
rm(which)
rm(paramsx)
gc()
## deduplicate and rm na
aln_all <- aln_all[complete.cases(UB, CB), ]
## filter_method
if(filter_method == "auto"){
aln_all[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
aln_all[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
median_umi <- median(aln_all$uni)
setorder(aln_all, Chr, start)
aln_ne <- aln_all[strand == "-" & uni >= median_umi, .SD[1], by = .(UB,CB)]
setorder(aln_all, Chr, -end)
aln_po <- aln_all[strand == "+" & uni >= median_umi, .SD[1], by = .(UB,CB)]
aln_all <- rbindlist(list(aln_po,aln_ne))
aln_all[, uni := NULL]
}else if(filter_method == "end"){
setorder(aln_all, Chr, start)
aln_ne <- aln_all[strand == "-", .SD[1], by = .(UB,CB)]
setorder(aln_all, Chr, -end)
aln_po <- aln_all[strand == "+", .SD[1], by = .(UB,CB)]
aln_all <- rbindlist(list(aln_po,aln_ne))
}
## sort
setcolorder(aln_all,c("Chr","strand","start","end","cigar", "UB", "CB"))
setkey(aln_all, Chr, start, end)
setorder(aln_all, Chr, start)
rm(aln_po)
rm(aln_ne)
print("start computing")
# prepare input for + strand of the chr
bed_todo <- bedfile[(V1 == chrno)&(strand == "+"), .(V2, V3, unino)]
ubcb_todo <- aln_all[(strand == "+"), .(locleft = end, locright = end, cb = CB)]
setkey(bed_todo, V2, V3)
setkey(ubcb_todo, locleft, locright)
# findoverlaps & map
mapindex <- foverlaps(ubcb_todo, bed_todo, type = "within", which = TRUE, nomatch = 0)
mapcc <- data.table(clusterno = bed_todo[mapindex[["yid"]], unino],
cb = ubcb_todo[mapindex[["xid"]], cb])
countcc <- mapcc[, .N, by = .(clusterno, cbno=match(cb, cblist))]
rm(mapcc)
# prepare input for - strand of the chr
bed_todo <- bedfile[(V1 == chrno)&(strand == "-"), .(V2, V3, unino)]
ubcb_todo <- aln_all[(strand == "-"), .(locleft = start, locright = start, cb = CB)]
setkey(bed_todo, V2, V3)
setkey(ubcb_todo, locleft, locright)
# findoverlaps & map
mapindex <- foverlaps(ubcb_todo, bed_todo, type = "within", which = TRUE, nomatch = 0)
mapcc <- data.table(clusterno = bed_todo[mapindex[["yid"]], unino], cb = ubcb_todo[mapindex[["xid"]], cb])
#bind +&- of chrno
all_MM[[chrno]] <- rbindlist(list(countcc[complete.cases(cbno,N)],
mapcc[, .N,by = .(clusterno, cbno = match(cb, cblist))][complete.cases(cbno, N)]))
rm(aln_all)
rm(bed_todo)
rm(ubcb_todo)
rm(countcc)
rm(mapcc)
rm(mapindex)
gc()
}
## get the data.table of annotated APA clusters
all_MM_DT <- rbindlist(all_MM)
rm(all_MM)
rm(bedfile)
gc()
setindex(all_MM_DT, cbno)
setorder(all_MM_DT, cbno)
## make sparse matrix &
all_MM_sparse <- Matrix::sparseMatrix(all_MM_DT$clusterno, all_MM_DT$cbno, dims = c(bed.dim, cb.dim), x=all_MM_DT$N)
print("write down matrix")
Matrix::writeMM(all_MM_sparse, file = paste0(out_dir, "matrix/", "/matrix.mtx"))
print("CountAPAc: finish")
cat("CountAPAc: finish", file = paste0(out_dir, ".runstat.o"), append = TRUE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.