hatpal2: APA clusters calling and analysis

Documented in ExtractAPA

#' Generate strand-specific two-dimensional coordinate files (prebed) of the BAM for clustering, and extract the cb list
#'
#' @param in_bamfile A indexed BAM file, and the index files (.bai) must be in the same directory of the indexed BAM
#' @param in_nSBfile A indexed soft-clipped BAM file, generated and indexed by samtools script
#' @param chr_info A data.table that contains the chrinfo, can be generated by ExtractChrinfo()
#' @param out_dir The directory where directories of two strands are created and the output files are stored
#' @param SA_cutoff The minimum number of soft-clipped nts (default 5)
#' @param min_median  The minimum number of locations of the same umi when screening (default 5)
#' @param RCn The cut-off value that decides the minimum weight of output when processing the in_bamfile (default 1)
#'
#' @return NULL, write the prebeds (sa, rc_end, rc_uni) in strand dirs under the out_dir and the cb list in out_dir.
#' @export
#'
#' @import data.table
#' @rawNamespace import(IRanges, except = c(which, shift))
#' @importFrom GenomicAlignments readGAlignments
#'
#' @examples ExtractAPA("test.bam", "test.s.bam", ExtractChrinfo("test.bam"), "output/")
ExtractAPA <- function(in_bamfile, in_nSBfile, chr_info, out_dir, SA_cutoff = 5, min_median = 5, RCn = 1){


  ### check and create two directories to save the output
  if (!dir.exists(out_dir)){
    warning(out_dir, " does not exist")
    return(message("ERROR: check the dir"))
  }
  if (dir.exists(paste0(out_dir, "positive_strand/"))){
    warning(paste0(out_dir, "positive_strand/"), " already exists")
    return(message("ERROR: check the dir."))
  }
  if (dir.exists(paste0(out_dir, "negative_strand/"))){
    warning(paste0(out_dir, "negative_strand/"), " already exists")
    return(message("ERROR: check the dir."))
  }
  print("ExtractAPA: creat the dir")
  cat("ExtractAPA: creat the dir" , file = paste0(out_dir, "runstat.o"), append = TRUE)
  dir.create(paste0(out_dir, "positive_strand/"))
  dir.create(paste0(out_dir, "negative_strand/"))
  # cellbarcode
  cb_list <- list()


  ### chr loop
  for (chrno in as.character(chr_info[, chr])) {

    ## load seq info of the chr and convert it into data.table
    print(paste0("Load ", chrno, "to extract prebeds"))
    cat(paste0("Load ", chrno, "to extract prebeds"), file = paste0(out_dir, "runstat.o"), append = TRUE)
    which <- GenomicRanges::GRanges(seqnames = chrno, ranges = IRanges::IRanges(1, chr_info[chr == chrno, V1] ))
    paramsx<- Rsamtools::ScanBamParam(tag = c("UB","CB"),which = which)
    paramsy<- Rsamtools::ScanBamParam(tag = c("UB","CB"),what = "seq",which = which)
    x <- GenomicAlignments::readGAlignments(in_bamfile, param = paramsx)  # x, info of the origin bam file
    y <- GenomicAlignments::readGAlignments(in_nSBfile, param = paramsy)  # y, info of the soft-clipped bam file
    aln_all <- data.table(Chr = as.character(seqnames(x)),
                          strand = as.character(strand(x)),
                          start = start(x),
                          end = end(x),
                          cigar = as.character(cigar(x)),
                          UB = mcols(x)[, 1],
                          CB = mcols(x)[, 2])
    aln_ns <- data.table(Chr = as.character(seqnames(y)),
                         strand = as.character(strand(y)),
                         start = start(y),
                         end = end(y),
                         cigar = as.character(cigar(y)),
                         UB = mcols(y)[, 2],
                         CB = mcols(y)[, 3],
                         seq = as.character(mcols(y)[, 1]))

    ## key index
    setkey(aln_all, Chr, start)
    setorder(aln_all, Chr, start)
    setkey(aln_ns, Chr, start)
    setorder(aln_ns, Chr, start)

    ## check seq length
    if(length(unique(qwidth(x))) != 1){
      warning(paste0("Inconsistent length in ", chrno))
    }
    seqlength <- qwidth(x)[1]

    ## clean
    rm(x)
    rm(y)
    rm(paramsx)
    rm(paramsy)
    rm(which)
    gc()

    ## rm na
    aln_all <- aln_all[complete.cases(UB, CB), ]
    aln_all[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
    aln_all[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
    setorder(aln_all, Chr, start)

    ## filter rc
    setcolorder(aln_all,c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
    setkey(aln_all, Chr, start, end)
    setorder(aln_all, Chr, start)
    aln_ne <- aln_all[strand == "-", .SD[1], by = .(UB,CB)]
    setorder(aln_all, Chr, -end)
    aln_po <- aln_all[strand == "+", .SD[1], by = .(UB,CB)]
    aln_all_end <- rbindlist(list(aln_po,aln_ne))
    setcolorder(aln_all_end, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
    setkey(aln_all_end, Chr, start, end)
    setorder(aln_all_end, Chr, start)

    ## extract CBlist of the chr and clean
    if(nrow(aln_all) != 0 ){
      cb_list[[chrno]] <- aln_all[, .(unique(CB))]
    }
    rm(aln_ne)
    rm(aln_po)
    rm(aln_all)
    gc()

    ## filter rc by uni
    median_umi <- median(aln_all_end$uni) # then check
    print(paste0("Median of ", chrno," is ", median_umi))
    cat(paste0("Median of ", chrno," is ", median_umi, "\n") , file = paste0(out_dir, "runstat.o"), append = TRUE)
    setorder(aln_all_end, Chr, start)
    median_umi <- max(median_umi, min_median) #### bigger than 4
    # max(loc) : no non-missing arguments to max; returning -Inf
    aln_ne <- aln_all_end[strand == "-" & uni >= median_umi, .SD[1], by = .(UB,CB)]
    setorder(aln_all_end, Chr, -end)
    aln_po <- aln_all_end[strand == "+" & uni >= median_umi, .SD[1], by = .(UB,CB)]
    aln_all_uni <- rbindlist(list(aln_po,aln_ne))
    setcolorder(aln_all_uni, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "uni"))
    setkey(aln_all_uni, Chr, start, end)
    setorder(aln_all_uni, Chr, start)

    ## filter sa
    aln_ns <- aln_ns[complete.cases(UB, CB, seq), ]
    aln_ns[strand == "+" , uni := length(unique(end)), by = .(UB,CB)]
    aln_ns[strand == "-" , uni := length(unique(start)), by = .(UB,CB)]
    setorder(aln_ns, Chr, start)
    aln_ne <- aln_ns[strand == "-", .SD[1], by = .(UB,CB)]
    setorder(aln_ns, Chr, -end)
    aln_po <- aln_ns[strand == "+", .SD[1], by = .(UB,CB)]
    aln_ns <- rbindlist(list(aln_po,aln_ne))
    setcolorder(aln_ns, c("Chr", "strand", "start", "end", "cigar", "UB", "CB", "seq", "uni"))
    setkey(aln_ns, Chr, start, end)
    setorder(aln_ns, Chr, start)
    # clean
    rm(aln_po)
    rm(aln_ne)

    ## write DT
    # negative strand
    if (nrow(aln_all_end[(strand == "-"), ]) != 0) {
      # filtered by uni end
      aln_pb_n <- aln_all_end[(strand == "-"), .(start)]
      fwrite(aln_pb_n[, .(chrno, .N, strand = "-"), by = start][N >= RCn][, .(chrno, start, N, strand)],
             paste0(out_dir, "negative_strand/","rc_end.prebed"),
             quote = FALSE, sep = "\t",
             col.names = FALSE,
             append = TRUE)
      rm(aln_pb_n)
      # filtered by uni f end
      aln_pb_n <- aln_all_uni[(strand == "-"), .(start)]
      fwrite(aln_pb_n[, .(chrno, .N, strand = "-"), by = start][N >= RCn][, .(chrno, start, N, strand)],
             paste0(out_dir, "negative_strand/","rc_uni.prebed"),
             quote = FALSE, sep = "\t",
             col.names = FALSE,
             append = TRUE)
      rm(aln_pb_n)

      ## extract negative strand SAfile of the chr
      if(nrow(aln_ns[(strand == "-")]) != 0){
        aln_ns_n <- aln_ns[(strand == "-"),.(start, cigar, seq)]
        # catch the number of soft-clipped nucleotides
        cigarn_n <- regexpr("^([1-9][0-9]|[3-9])S", aln_ns_n$cigar)
        Sstart_n <- 1
        Send_n <- attr(cigarn_n, "match.length") - 1
        S_length_n <- as.numeric(substr(aln_ns_n$cigar, Sstart_n, Send_n))
        aln_ns_n <- aln_ns_n[S_length_n >= SA_cutoff]
        S_length_n <- S_length_n[which(S_length_n >= SA_cutoff)]
        # split the seq to get the soft-clipped nucleotides
        mo_loc <- aln_ns_n[, grep("TTTA[A|T]T", substr(seq, S_length_n + 10, S_length_n + 30))]
        aln_ns_n[, hatseq := substr(seq, S_length_n + 1, S_length_n + 10)]
        aln_ns_n[, seq := substr(seq, 1, S_length_n)]
        # count the number and proportion of T
        aln_ns_n[, AL := sapply(base::strsplit(seq, ""), function(x) length(grep("T", x)))]
        aln_ns_n[, hatAL := sapply(base::strsplit(hatseq, ""), function(x) length(grep("T", x)))]
        aln_ns_n[, AP := as.numeric(unlist(AL)/S_length_n)]
        aln_ns_n[, hatAP := as.numeric(unlist(hatAL)/10)]
        aln_mo_n <- aln_ns_n[mo_loc]
        # write down
        fwrite(aln_ns_n[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "-"), by = start][, .(chrno, start, N, strand)],
               paste0(out_dir, "negative_strand/", "sa.prebed"),
               quote = FALSE, sep = "\t",
               col.names = FALSE,
               append = TRUE)
        # write down motif
        fwrite(aln_mo_n[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = start][, .(chrno, start, N, strand)],
               paste0(out_dir, "negative_strand/", "sa_motif.prebed"),
               quote = FALSE, sep = "\t",
               col.names = FALSE,
               append = TRUE)
        # clean
        rm(aln_ns_n)
        rm(S_length_n)
        rm(Sstart_n)
        rm(Send_n)
        gc()
      }
    }

    ##  positive strand
    if (nrow(aln_all_end[(strand == "+"),]) != 0) {
      # filtered by uni end
      aln_pb_p <- aln_all_end[(strand == "+"), .(end)]
      fwrite(aln_pb_p[, .(chrno, .N, strand = "+"), by = end][N >= RCn][, .(chrno, end, N, strand)],
             paste0(out_dir, "positive_strand/","rc_end.prebed"),
             quote = FALSE, sep = "\t",
             col.names = FALSE,
             append = TRUE)
      rm(aln_pb_p)
      # filtered by uni f end
      aln_pb_p <- aln_all_uni[(strand == "+"), .(end)]
      fwrite(aln_pb_p[, .(chrno, .N, strand = "+"), by = end][N >= RCn][, .(chrno, end, N, strand)],
             paste0(out_dir, "positive_strand/","rc_uni.prebed"),
             quote = FALSE, sep = "\t",
             col.names = FALSE,
             append = TRUE)
      rm(aln_pb_p)

      ## extract positive strand SAfile of the chr
      if(nrow(aln_ns[(strand == "+")]) != 0){
        aln_ns_p <- aln_ns[(strand == "+"), .(end, cigar, seq)]
        # catch the number of soft-clipped nucleotides
        cigarn_p <- regexpr("([1-9][0-9]|[3-9])S$", aln_ns_p$cigar)
        Sstart_p <- cigarn_p
        Send_p <- cigarn_p + attr(cigarn_p, "match.length")-2
        S_length_p <- as.numeric(substr(aln_ns_p$cigar, Sstart_p, Send_p))
        aln_ns_p <- aln_ns_p[S_length_p >= SA_cutoff]
        S_length_p <- S_length_p[which(S_length_p >= SA_cutoff)]
        # split the seq to get the soft-clipped nucleotides
        mo_loc <- aln_ns_p[, grep("A[A|T]TAAA", substr(seq, seqlength - S_length_p - 30, seqlength - S_length_p - 10))]
        aln_ns_p[, hatseq := substr(seq, seqlength - S_length_p + 1 - 10, seqlength - S_length_p)]
        aln_ns_p[, seq := substr(seq, seqlength - S_length_p + 1, seqlength)]
        # count the number and proportion of A
        aln_ns_p[, AL := sapply(base::strsplit(seq, ""),function(x) length(grep("A", x)))]
        aln_ns_p[, hatAL := sapply(base::strsplit(hatseq, ""),function(x) length(grep("A", x)))]
        aln_ns_p[, AP := as.numeric(unlist(AL)/S_length_p)]
        aln_ns_p[, hatAP := as.numeric(unlist(hatAL)/10)]
        aln_mo_p <- aln_ns_p[mo_loc]
        # write down
        fwrite(aln_ns_p[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = end][, .(chrno, end, N, strand)],
               paste0(out_dir, "positive_strand/", "sa.prebed"),
               quote = FALSE, sep = "\t",
               col.names = FALSE,
               append = TRUE)
        # write down motif
        fwrite(aln_mo_p[(AP >= 0.9) & (hatAP <= 0.5), .(chrno, .N, strand = "+"), by = end][, .(chrno, end, N, strand)],
               paste0(out_dir, "positive_strand/", "sa_motif.prebed"),
               quote = FALSE, sep = "\t",
               col.names = FALSE,
               append = TRUE)
        # clean
        rm(aln_ns_p)
        rm(S_length_p)
        rm(Sstart_p)
        rm(Send_p)
        gc()
      }
    }

    # clean
    mem <- system(paste0("cat /proc/",Sys.getpid(),"/status | grep VmSize"), intern = TRUE)
    cat(paste0(chrno, ": ", mem, "\n"), file = paste0(out_dir, "runstat.o"), append = TRUE)
    rm(aln_all_end)
    rm(aln_ns)
    gc()
  }


  ### write down cb_list
  print("write down cblist")
  cat("write down cblist", file = paste0(out_dir, "runstat.o"), append = TRUE)
  cb_list_DT <- unique(rbindlist(cb_list))
  setkey(cb_list_DT, V1)
  setorder(cb_list_DT, V1)
  fwrite(cb_list_DT, paste0(out_dir, "_CBlist.txt"),
         quote = FALSE, sep = "\t", col.names = FALSE)
  print(paste0("output files are in ",
               out_dir, "positive_strand/", " , ",
               out_dir, "negative_strand/", " and ",
               out_dir, "_CBlist.txt"))
  cat(paste0("output files are in ",
             out_dir, "positive_strand/", " , ",
             out_dir, "negative_strand/", " and ",
             out_dir, "_CBlist.txt"),
      file = paste0(out_dir, "runstat.o"), append = TRUE)
  print("ExtractAPA: finish")
  cat("ExtractAPA: finish", file = paste0(out_dir, "runstat.o"), append = TRUE)

}