hatpal2: APA clusters calling and analysis

Documented in CountAPAc

#' Create a APA_cluster-Cell_Barcode count matrix for Seurart analysis
#'
#' @param cluster_p Annotated APA clusters file of the positive strand, generated by AnnoAPAc()
#' @param cluster_n Annotated APA clusters file of the positive strand, generated by AnnoAPAc()
#' @param in_bamfile A indexed BAM file, and the index files (.bai) must be in the same directory of the indexed BAM
#' @param cb_file A cell barcodes list, can be generated by extractAPA() or cellranger
#' @param chr_info A data.table that contains the chrinfo, can be generated by ExtractChrinfo()
#' @param out_dir The directory where a directory named "matrix/" is created and the output files are stored
#' @param filter_method The way to  filter reads (defult "end")
#' @param UBtag UB tag (defult "UB")
#'
#' @return NULL. writes output in the directory "matrix/" of the out_dir
#' @export
#'
#' @import data.table Matrix
#' @rawNamespace import(IRanges, except = c(which, shift))
#' @importFrom GenomicAlignments readGAlignments
#'
#' @examples CountAPAc("p/APA_map.out.anno", "n/APA_map.out.anno", "test.bam", "_CBlist.txt", ExtractChrinfo("test.s.bam"), "output/")
CountAPAc <- function(cluster_p, cluster_n, in_bamfile, cb_file, chr_info, out_dir, filter_method = "end", UBtag = "UB"){


  ### check and create two directories to save the output
  print("CountAPAc: creat matrix dir")
  cat("CountAPAc: creat matrix dir", file = paste0(out_dir, ".runstat.o"), append = TRUE)
  if (dir.exists(paste0(out_dir, "matrix/"))){
    warning(paste0(out_dir, "matrix/"), " already exists")
    return(message("ERROR: check the dir"))
  }
  dir.create(paste0(out_dir, "matrix/"))


  ### read the files
  bedfile_p <- fread(cluster_p, header=F)
  bedfile_n <- fread(cluster_n, header=F)
  cblist <- fread(cb_file, header=F)

  ## rewrite the cb_list
  cblist <- cblist$V1
  cb.dim <- length(cblist)
  write.table(cblist, file=paste0(out_dir, "matrix/",  "/barcodes.tsv"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)

  ## rewrite the apa_clusters
  print("load and write apa features")
  # positive strand
  setcolorder(bedfile_p,
              c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"))
  bedfile_p[is.na(V4), V8 := bedfile_p[!is.na(V11), V11]]
  setnames(bedfile_p,
           c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"),
           c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"))
  bedfile_p[, c("V11") := NULL]
  bedfile_p[, strand := "+"]
  # negative strand
  setcolorder(bedfile_n,
              c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"))
  bedfile_n[is.na(V4), V8 := bedfile_n[!is.na(V11), V11]]
  setnames(bedfile_n,
           c("V1", "V6", "V7", "V4", "V5", "V9", "V10", "V8", "V2", "V3", "V11"),
           c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"))
  bedfile_n[, c("V11") := NULL]
  bedfile_n[, strand := "-"]
  # rbind
  bedfile <- rbindlist(list(bedfile_p, bedfile_n))
  setkey(bedfile, V1, V2)
  setorder(bedfile, V1, V2)

  ## add unique id to every clusters
  bedfile[, unino := .I]
  bed.dim <- nrow(bedfile)
  bedfile[strand == "+", endid := c(1:.N), by = .(V4,V6)]
  bedfile[strand == "-", endid := c(.N:1), by = .(V4,V6)]
  bedfile[, nend := .N, by = .(V4,V6)]

  ## add relative position to every clusters
  bedfile[, per :=
            ((V9 + V10)/2 - min((V9 + V10)/2))/(max((V9 + V10)/2) - min((V9 + V10)/2) + 1), by = .(V4,V6)]
  bedfile[strand == "-", per := (1 - per)]
  bedfile[nend == 1, per := 1]
  bedfile[, per := round(per, 2)]
  bedtowt <- bedfile[, .(paste0(unino, ":", V1, ":", V2, "-", V3, "-", V8, "-", per))]
  bedtowt[!is.na(bedfile$V4),
          V2 := bedfile[!is.na(V4), paste0(V4, "-", V5, ":", V8, ":", endid, "/", nend)]]
  bedtowt[is.na(bedfile$V4),
          V2 := bedfile[is.na(V4), paste0("-", V6, "-", V7, ":", V8, ":", endid, "/", nend)]]

  ## write down features_file and re_annotated_clusters_file
  write.table(bedtowt, file=paste0(out_dir, "matrix/",  "/genes.tsv"),
              quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
  fwrite(bedfile, paste0(out_dir, "apa_clusters.table"),
         quote = FALSE, sep = "\t", col.names = FALSE)
  rm(bedfile_n)
  rm(bedfile_p)
  rm(bedtowt)
  gc()


  ### loop to get count matrix
  print("start to get matrix")
  all_MM <- list()
  for (chrno in chr_info[, chr]) {

    ## load cb and ub into data.base
    print(paste0(chrno," starts loading and counting"))
    cat(paste0(chrno," starts loading and counting"), file = paste0(out_dir, ".runstat.o"), append = TRUE)
    which <- GenomicRanges::GRanges(seqnames = chrno, ranges = IRanges::IRanges(1, chr_info[chr == chrno, V1] ))
    paramsx<- Rsamtools::ScanBamParam(tag = c(UBtag, "CB"), which = which)
    x <- GenomicAlignments::readGAlignments(in_bamfile, param=paramsx)
    aln_all <- data.table(Chr = as.character(seqnames(x)),
                          strand=as.character(strand(x)),
                          start=start(x),
                          end=end(x),
                          cigar=as.character(cigar(x)),
                          UB=mcols(x)[,1],
                          CB=mcols(x)[,2])
    setkey(aln_all, Chr, start)
    setorder(aln_all, Chr, start)

    ## clean
    rm(x)
    rm(which)
    rm(paramsx)
    gc()

    ## deduplicate and rm na
    aln_all <- aln_all[complete.cases(UB, CB), ]

    ## filter_method
    if(filter_method == "auto"){
      aln_all[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
      aln_all[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
      median_umi <- median(aln_all$uni)
      setorder(aln_all, Chr, start)
      aln_ne <- aln_all[strand == "-" & uni >= median_umi, .SD[1], by = .(UB,CB)]
      setorder(aln_all, Chr, -end)
      aln_po <- aln_all[strand == "+" & uni >= median_umi, .SD[1], by = .(UB,CB)]
      aln_all <- rbindlist(list(aln_po,aln_ne))
      aln_all[, uni := NULL]
    }else if(filter_method == "end"){
      setorder(aln_all, Chr, start)
      aln_ne <- aln_all[strand == "-", .SD[1], by = .(UB,CB)]
      setorder(aln_all, Chr, -end)
      aln_po <- aln_all[strand == "+", .SD[1], by = .(UB,CB)]
      aln_all <- rbindlist(list(aln_po,aln_ne))
    }

    ## sort
    setcolorder(aln_all,c("Chr","strand","start","end","cigar", "UB", "CB"))
    setkey(aln_all, Chr, start, end)
    setorder(aln_all, Chr, start)
    rm(aln_po)
    rm(aln_ne)
    print("start computing")


    # prepare input for + strand of the chr
    bed_todo <- bedfile[(V1 == chrno)&(strand == "+"), .(V2, V3, unino)]
    ubcb_todo <- aln_all[(strand == "+"), .(locleft = end, locright = end, cb = CB)]
    setkey(bed_todo, V2, V3)
    setkey(ubcb_todo, locleft, locright)

    # findoverlaps & map
    mapindex <- foverlaps(ubcb_todo, bed_todo, type = "within", which = TRUE, nomatch = 0)
    mapcc <- data.table(clusterno = bed_todo[mapindex[["yid"]], unino],
                        cb = ubcb_todo[mapindex[["xid"]], cb])
    countcc <- mapcc[, .N, by = .(clusterno, cbno=match(cb, cblist))]
    rm(mapcc)

    # prepare input for - strand of the chr
    bed_todo <- bedfile[(V1 == chrno)&(strand == "-"), .(V2, V3, unino)]
    ubcb_todo <- aln_all[(strand == "-"), .(locleft = start, locright = start, cb = CB)]
    setkey(bed_todo, V2, V3)
    setkey(ubcb_todo, locleft, locright)

    # findoverlaps & map
    mapindex <- foverlaps(ubcb_todo, bed_todo, type = "within", which = TRUE, nomatch = 0)
    mapcc <- data.table(clusterno = bed_todo[mapindex[["yid"]], unino], cb = ubcb_todo[mapindex[["xid"]], cb])

    #bind +&- of chrno
    all_MM[[chrno]] <- rbindlist(list(countcc[complete.cases(cbno,N)],
                                      mapcc[, .N,by = .(clusterno, cbno = match(cb, cblist))][complete.cases(cbno, N)]))
    rm(aln_all)
    rm(bed_todo)
    rm(ubcb_todo)
    rm(countcc)
    rm(mapcc)
    rm(mapindex)
    gc()
  }


  ## get the data.table of annotated APA clusters
  all_MM_DT <- rbindlist(all_MM)
  rm(all_MM)
  rm(bedfile)
  gc()
  setindex(all_MM_DT, cbno)
  setorder(all_MM_DT, cbno)

  ## make sparse matrix  &
  all_MM_sparse <- Matrix::sparseMatrix(all_MM_DT$clusterno, all_MM_DT$cbno, dims = c(bed.dim, cb.dim), x=all_MM_DT$N)
  print("write down matrix")
  Matrix::writeMM(all_MM_sparse, file = paste0(out_dir, "matrix/", "/matrix.mtx"))
  print("CountAPAc: finish")
  cat("CountAPAc: finish", file = paste0(out_dir, ".runstat.o"), append = TRUE)
}