R/import.R

Defines functions importBAM importDFAM importMSA importRMSK importUCSC importTEs import10x .readBamF

Documented in importBAM importDFAM importRMSK importTEs importUCSC

#' Parallel import of BAM files
#'
#' Imports reads stored in BAM files
#'
#' @param path Character. Path to file.
#' @param main_assembly Logical. If TRUE (the default), only returns TE intervals on the main chromosomes. 
#' @return Returns a GRanges object with TE intervals
#' @export
importBAM <- function(bams, 
                      paired       = NA, 
                      multi        = TRUE,
                      what         = NA,
                      tag          = NA,
                      mate         = NA,
                      anchor       = NULL,
                      meta         = NA,
                      granges      = FALSE,
                      barcode      = NA,
                      data.table   = TRUE,
                      sorted       = TRUE,
                      use_gcluster = FALSE)
{
  # construct data.frame
  if (sum(class(bams) == 'data.frame') > 0)
  {
    if (is.null(bams$barcode))  { bams$barcode <- barcode }
    if (is.null(bams$chunk))    { bams$chunk   <- 1       }
  } else {
    bams <-
      data.frame(paths   = bams,
                 paired  = rep(paired,  length(bams)),
                 mate    = rep(mate,    length(bams)),
                 barcode = rep(barcode, length(bams)),
                 meta    = rep(meta,    length(bams)),
                 chunk   = 1,
                 stringsAsFactors = FALSE)
  }
  
  # character to factor columns
  bams$paths   <- as.character(bams$paths)
  bams$barcode <- as.character(bams$barcode)
  
  # adjust read in fields according to barcode
  if (sum(nchar(na.omit(bams$barcode))) > 0)
  {
    tag <- unique(as.character(na.omit(c(tag, bams$barcode[nchar(bams$barcode) == 2]))))
  }
  
  # pre-checking
  if (is.na(paired) & is.na(bams$paired)) { stop ('Please specify paired status of bam files')  }
  if (is.na(mate)   & is.na(bams$mate))   { stop ('Please specify mate to read from bam files') }
  
  if (sum(dirname(bams$paths) == '.') > 0) { stop ('Please provide full path to files') }
  
  # import in parallel (nested)
  message ('Importing BAM file(s)')
  if (use_gcluster)
  {
     # commands for mafft alignment and conversion
    cmds <- list()
    for (i in unique(bams$chunk))
    {
      cmds[[i]] <- 
        glue("Reputils:::.readBamF(bams[bams$chunk == {i}, ], what = {what}, tag = {tag}, data.table = {data.table})")
    }
    # create new empty env and fill with relevant
    empty <- new.env(parent = emptyenv())
    empty$bams <- bams
    
    # distribute, compute and combine
    res <- 
      gcluster.run3(command_list = cmds,  
                    packages = c("data.table", "base", "stats"), 
                    job_names = names(cmds),
                    max.jobs = 50, 
                    envir = empty, 
                    io_saturation = FALSE)
    if (data.table)
    {
      reads <- rbindlist(lapply(res, function(x) x$retv))
    } else {
      reads <- unlist(GAlignmentsList(lapply(res, function(x) x$retv)))
    }
  } else {
    res <- listenv::listenv()
      for (i in unique(bams$chunk))
      {
        print(i)
        res[[i]] %<-% Reputils:::.readBamF(bams[bams$chunk == i, ], what = what, tag = tag, data.table = data.table) %packages% "data.table"
      }
    if (data.table)
    {
      reads <- rbindlist(as.list(res))
    } else {
      reads <- unlist(GAlignmentsList(as.list(res)))
    }
  }
  message ('Done')

  # throw multi mapping reads if desired
  if (!multi)
  {
    if (data.table)
    {
      reads <- reads[which(NH == 1), ]
    } else {
      reads <- reads[mcols(reads)$NH == 1]
    }
  }
  
  # anchor coordinates
  if (!is.null(anchor))
  {
    message ('Anchoring reads at ', anchor)
    if (anchor == 'fiveprime')
    {
      #reads <- mutate(anchor_5p(reads), width = 1)
      reads[which(strand == '+'), end   := start]
      reads[which(strand == '-'), start := end]
    }
    if (anchor == 'threeprime')
    {
      #reads <- mutate(anchor_3p(reads), width = 1)
      reads[which(strand == '+'), start := end]
      reads[which(strand == '-'), end   := start]
    }
  }
  
  if (!data.table)
  {
    if (GenomeInfoDb::seqlevelsStyle(reads) != 'UCSC')
    {
      GenomeInfoDb::seqlevelsStyle(reads) <- 'UCSC' 
      if (!is.null(mcols(reads)$mrnm))
      {
        mcols(reads)$mrnm <- paste0('chr', mcols(reads)$mrnm)
        mcols(reads)$mrnm <- gsub('^chrMT', 'chrM', mcols(reads)$mrnm)
      }
    }
  } # else {
    # if (GenomeInfoDb::seqlevelsStyle(as_granges(reads[sample(1:nrow(reads), min(nrow(reads), 1000)),])) != 'UCSC')
    # {
      # reads <- reads[, seqnames := paste0('chr', seqnames)]
      # reads <- reads[which(seqnames == 'chrMT'), seqnames := 'chrM']
      # if (!is.null(reads$mrnm))
      # {
        # reads <- reads[, mrnm := paste0('chr', mrnm)]
        # reads <- reads[which(mrnm == 'chrMT'), mrnm := 'chrM']
      # }
    # }  
  # }
  
  if (sorted)
  {
    if (data.table)
    {
      reads <- reads[order(seqnames, start, end), ]
    } else {
      reads <- sort(reads, ignore.strand = TRUE)
    }  
  }
  
  return(reads)
}

#' Import TE intervals in DFAM format
#'
#' Imports TE interval coordinates from DFAM database
#'
#' @param path Character. Path to file.
#' @param curate Logical. If TRUE, calls `curateTEs` to resolve overlapping TE intervals.
#' @param main_assembly Logical. If TRUE (the default), only returns TE intervals on the main chromosomes. 
#' @return Returns a GRanges object with TE intervals
#' @export
importDFAM <- function(path,
                       curate = FALSE,
                       main_assembly = TRUE)          
{
  dfam_hits <- fread(path)
  
  # mod column names
  colnames(dfam_hits) <- gsub('-', '_', colnames(dfam_hits))
  colnames(dfam_hits) <- gsub('#', '', colnames(dfam_hits))
  setnames(dfam_hits, c('hmm_st', 'hmm_en', 'ali_st', 'ali_en', 'seq_name', 'family_name'), c('position_start', 'position_end', 'start', 'end', 'seqnames', 'name'))
  
  dfam_hits[, id_unique := paste(name, 1:nrow(dfam_hits), sep = '|')]
  dfam_hits[, start_old := start][which(strand == '-'), ':=' (start = end, end = start_old)][, start_old := NULL]
  
  res <- as_granges(dfam_hits)
  
  if (main_assembly)
  {
    res <- GenomeInfoDb::keepStandardChromosomes(res, pruning.mode = 'coarse')
  }
  
  if (curate)
  {
    res <- curateTEs(res)
  }
    
  return(res)
}

importMSA <- function(fnames,
                      long = TRUE,
                      n_jobs = 500)
{
  # f_sizes <- 
    # tibble(fname = fnames, size = file.info(fnames)$size) %>%
    # arrange(size) %>%
    # mutate(chunk = cut(cumsum(size), breaks = seq(1, max(cumsum(size)) + 1e9, by = 1e9), labels = FALSE, include.lowest = TRUE))
  # test <- future.apply::future_by(f_sizes[1:5,], f_sizes$chunk[1:5], function(x) readDNAStringSet(x$fname))
  
  if (length(fnames) > 1)
  {
    # create commands for distribution
    cmds <- list()
    for (fname in fnames)
    {
      if (long)
      {
        cmds[[basename(fname)]] <- glue("Repexpr:::msaToLong(readDNAStringSet('{fname}'))")
      } else {
        cmds[[basename(fname)]] <- glue("readDNAStringSet('{fname}')")
      }
    }
  
    # create new empty env and fill with relevant
    empty <- new.env(parent = emptyenv())
    
    # distribute, compute and combine
    res_gclust <- 
      Reputils:::gcluster.run3(command_list = cmds,  
                    packages = c("Biostrings", "base", "stats"), 
                    job_names = names(cmds),
                    max.jobs = n_jobs, 
                    envir = empty, 
                    io_saturation = FALSE)
    
    res <- rbindlist(lapply(res_gclust, function(x) x$retv), use.names = FALSE, fill = FALSE, idcol = NULL)
  } else {
    if (long)
    {
      res <- msaToLong(readDNAStringSet(fnames))
    } else {
      res <- readDNAStringSet(fnames)
    }
  }
  
  return(res)
}

#' Import TE intervals in Repeatmasker format
#'
#' Imports TE interval coordinates from Repeatmasker
#'
#' @param path Character. Path to file.
#' @param curate Logical. If TRUE, calls `curateTEs` to resolve overlapping TE intervals.
#' @param main_assembly Logical. If TRUE (the default), only returns TE intervals on the main chromosomes. 
#' @param main_classes Logical. If TRUE (the default), only returns TE intervals unambiguously belonging to DNA, LINE, SINE, and LTR classes.
#' @param proper_alignments Logical. If TRUE (the default), only returns TE intervals with proper start and end coordinates.
#' @return Returns a GRanges object with TE intervals
#' @export  
importRMSK <- function(path, 
                       curate = FALSE,
                       main_assembly = TRUE,
                       main_classes = TRUE,
                       proper_alignments = TRUE)          
{
  rmsk_hits <- 
    fread(path, skip = 3, 
          fill = TRUE,
          header = FALSE,
          data.table = TRUE,
          col.names = c('sw_score', 'perc_div', 'perc_del', 'perc_ins', 'seqnames', 'start', 'end', 'left_chrom', 'strand', 'name', 'class_family', 
								        'position_start', 'position_end', 'left_rep', 'id_unique'))
  
  rmsk_hits[, c("class", "repfamily") := tstrsplit(class_family, "/", fixed=TRUE)]
  rmsk_hits[, strand := gsub('C', '-', strand, fixed = TRUE)]
  
  # keep only major classes
  if (main_classes)
  {
    rmsk_hits <- rmsk_hits[which(class %fin% c('DNA', 'SINE', 'LTR', 'LINE')), ]
  }

  # adjust position on alignment based on strand
  rmsk_hits <- rmsk_hits[which(strand == '-'), ':=' (position_start = as.character(position_end), position_end = as.integer(left_rep))][, !'left_rep']
  rmsk_hits[, position_start := as.integer(position_start)]
  
  if (proper_alignments)
  {
    rmsk_hits <- rmsk_hits[which(strand == '+' & position_start < position_end | strand == '-' & position_start > position_end), ]
  }
   
  res <- 
    rmsk_hits[, id_unique := paste(name, 1:nrow(rmsk_hits), sep = '|')
      ][, ':=' (left_chrom = NULL, class_family = NULL)
        ][order(seqnames, start, end), ]    
  res <- as_granges(res)
    
  if (main_assembly)
  {
    res <- GenomeInfoDb::keepStandardChromosomes(res, pruning.mode = 'coarse')
  }
  
  if (curate)
  {
    res <- curateTEs(res)
  }
    
  return(res)
}

#' Import TE intervals in UCSC table browser format
#'
#' Import TE intervals from UCSC table browser
#'
#' @param path Character. Path to file.
#' @param curate Logical. If TRUE, calls `curateTEs` to resolve overlapping TE intervals.
#' @param main_assembly Logical. If TRUE (the default), only returns TE intervals on the main chromosomes. 
#' @param main_classes Logical. If TRUE (the default), only returns TE intervals unambiguously belonging to DNA, LINE, SINE, and LTR classes.
#' @return Returns a GRanges object with TE intervals
#' @export  
importUCSC <- function(path = 'path to UCSC RMSK file',
                       curate = FALSE,
                       main_classes = TRUE,
                       main_assembly = TRUE)
{
  # Import
  ucsc_rmsk <-
    fread(path,
          data.table = TRUE,
          col.names = c('bin', 'sw_score', 'milli_div', 'milli_del', 'milli_ins', 'seqnames', 'start', 'end', 'geno_left', 'strand', 'name', 
								        'class', 'repfamily', 'position_start', 'position_end', 'left_rep', 'id_unique'))
  
  # adjust position on alignment based on strand
  ucsc_rmsk <- ucsc_rmsk[which(strand == '-'), position_start := (left_rep)][, !'left_rep']
  ucsc_rmsk[, position_start := as.integer(position_start)]
  
  # keep only major classes
  if (main_classes)
  {
    ucsc_rmsk <-  ucsc_rmsk[which(class %fin% c('DNA', 'SINE', 'LTR', 'LINE')), ]
  }
  
  res <- 
    ucsc_rmsk[, id_unique := paste(name, 1:nrow(ucsc_rmsk), sep = '|')
      ][order(seqnames, start, end), ]    
  res <- as_granges(res)
    
  if (main_assembly)
  {
    res <- GenomeInfoDb::keepStandardChromosomes(res, pruning.mode = 'coarse')
  }

  if (curate)
  {
    res <- curateTEs(res)
  }
  return(res)
}

#' Import TE intervals in common file formats
#' @export
importTEs <- function(path,
                      curate = FALSE,
                      main_classes = TRUE,
                      main_assembly = TRUE,
                      type = NULL,
                      keep_format = TRUE)
{
  if (is.null(type))     { type <- checkFormat(path) }
  if (type == 'unknown') { stop ('Source format is unknown, please define (e.g. Repeatmasker, Dfam, etc.)') }

  message ('Importing TE coordinates from ', type ,' source')
  if (type == 'rmsk') { res <- importRMSK(path, curate = curate, main_classes = main_classes, main_assembly = main_assembly, proper_alignments = TRUE) }
  if (type == 'ucsc') { res <- importUCSC(path, curate = curate, main_classes = main_classes, main_assembly = main_assembly)                           }
  if (type == 'dfam') { res <- importDFAM(path, curate = curate, main_assembly = main_assembly)                                                        }

  return(res)
}                      

import10x <- function(path,
                      long = TRUE)
{
  if (!requireNamespace('hdf5r', quietly = TRUE)) 
  {
    stop("hdf5r package required to read file, please install!")
  }
  if (!file.exists(path)) 
  {
    stop("File not found")
  }

  h5f      <- hdf5r::H5File$new(path, mode = 'r+')
  assembly <- names(x = h5f)
  
  if (length(assembly) > 1)
  {
    stop('More than 1 genome in file')
  }
  
  # adjust depending on cell ranger version
  if (h5f$attr_exists("PYTABLES_FORMAT_VERSION")) {
    gene_names <- 'gene_names'
    ensembl    <- 'genes'
  } else {
    gene_names <- 'features/name'
    ensembl    <- 'features/id'
  }
  
  counts   <- h5f[[paste0(assembly, '/data')]]
  indices  <- h5f[[paste0(assembly, '/indices')]]
  indptr   <- h5f[[paste0(assembly, '/indptr')]]
  dims     <- h5f[[paste0(assembly, '/shape')]]
  genes    <- h5f[[paste0(assembly, '/', gene_names)]][]
  ensembl  <- h5f[[paste0(assembly, '/', ensembl)]][]
  barcodes <- h5f[[paste0(assembly, '/barcodes')]][]
  smat <- 
    sparseMatrix(
                 i = indices[] + 1,
                 p = indptr[],
                 x = as.numeric(x = counts[]),
                 dims = dims[],
                 giveCsparse = TRUE
                )             
  
  # resolve ambigious gene names
  gene_ids_dt <- 
    data.table(genes = genes, 
               ensembl = ensembl, 
               duplicated = duplicated(genes))[which(duplicated), genes := paste(genes, ensembl, sep = '|')]

  rownames(smat) <- gene_ids_dt$genes
  colnames(smat) <- barcodes
  
  if (long)
  {
    res <- sparseToLong(smat)
  } else {
    res <- smat
  }
  
  h5f$close_all()
  return(res)
}
  
# actual reading function
  .readBamF <- function(df, what = NA, tag = NA, data.table = TRUE)
  {
    res <-
      future.apply::future_apply(df, 1, future.packages = 'data.table', function(x)
      {
        path    <- x['paths']
        paired  <- x['paired']
        barcode <- x['barcode']
        mate    <- x['mate']
        meta    <- x['meta']
        
        mate_1 <- mate == 'first'
        mate_2 <- mate == 'second' | mate == 'last'
        
        # check file type
        if (sum(substring(path, nchar(path) - 2, nchar(path)) %fin% c('bam', 'BAM', 'Bam')) > 0) { file_type <- 'bam' }
        if (sum(substring(path, nchar(path) - 2, nchar(path)) %fin% c('sam', 'SAM', 'Sam')) > 0) { file_type <- 'sam' }
        
        message(glue("Reading {path}"))

        # create param
        param <- ScanBamParam(flag = Rsamtools::scanBamFlag(isPaired          = as.logical(paired), 
                                                            isProperPair      = as.logical(paired), 
                                                            isFirstMateRead   = mate_1,
                                                            isSecondMateRead  = mate_2))

        if (!is.na(tag))
        {
          param@tag <- tag
        }
        
        if (!is.na(what))
        {
          param@what <- what
        } 
        
        # import reads from bam
        if (file_type == 'bam')
        {
          res <- 
            GenomicAlignments::readGAlignments(path, 
                                               #index = bam_index,
                                               use.names = FALSE,
                                               param = param)
        }
                                             
        # import reads from sam (MARS-seq specific)
        if (file_type == 'sam')
        {
          header <- .headerToSegInfo(fread(cmd       = glue('head -n 1000 {path} | grep "^@SQ"'), header = FALSE))
          
          reads <-
            fread(cmd       = glue('grep -v "^@" {path}'), # skip header
                  fill      = TRUE,
                  select    = c(1, 2, 3, 4, 6, 10),
                  col.names = c('qname', 'flag', 'seqnames', 'start', 'cigar', 'NH'))
                  
          # filter unmapped and barcode/UMI missing reads
          reads_f <- 
            reads[which(seqnames != '*'),     # throw unmapped reads
              ][, qname_nchar := nchar(qname)
                ][which(qname_nchar > 100), ]  # throw reads without barcode/umi suffix
              
          # add strand and barcode/UMI from read name
          strand              <- bamFlagAsBitMatrix(reads_f$flag)[, 'isMinusStrand']
          strand[strand == 1] <- '-'
          strand[strand == 0] <- '+'
          reads_f <- 
            reads_f[, ':=' (end = start, 
                            strand = strand, 
                            barcode = substring(qname, qname_nchar - 20, qname_nchar - 9), 
                            UR = substring(qname, qname_nchar - 7, qname_nchar), 
                            NH = as.integer(substring(NH, 6, 10)))][, !'qname_nchar']  
          
          # filter incomplete UMIs/barcodes
          res <- reads_f[-which(grepl('N', UR) | grepl('N', barcode)),]
          
          # convert to GAlignments
          if (!data.table)
          {
            res            <- as(as_granges(res), 'GAlignments')
            seqlevels(res) <- seqlevels(header)
            seqinfo(res)   <- header
          }
        }
        
        # transform to data.table and only keep relevant columns
        if (data.table)
        {
          res <- as.data.table(res)
          res <- res[, colnames(res) %fin% c('seqnames', 'strand', 'start', 'end', 'NH', 'UR', 'barcode', barcode, what, tag), with = FALSE]
        }
        
        # modify barcode column names
        if (!is.na(barcode) & file_type != 'sam') 
        {
          if (barcode == 'CB')
          {
            if (data.table)
            {
              res <- res[, barcode := CB][, !'CB']
            } else {
              colnames(mcols(res)) <- gsub('CB', 'barcode', colnames(mcols(res)))
            }
          } else {
            if (data.table)
            {
              res[, 'barcode' := barcode]
            } else {
              mcols(res)$barcode = barcode
            }
          }
        }

        if (!is.na(meta))
        {
          if (data.table)
          {
            res[, ':=' ('meta' = meta, 'barcode' = paste(barcode, meta, sep = '|'))]
          } else {
            mcols(res)$barcode <- paste(mcols(res)$barcode, meta, sep = '|')
          }
        }

        return(res)
      })
    
    if (data.table)
    {
      res <- rbindlist(res)
    } else {
      res <- unlist(GAlignmentsList(res))
    }
    return(res)
  }
tanaylab/reputils documentation built on Nov. 5, 2019, 9:58 a.m.