R/load_bams.R

Defines functions load_bams

Documented in load_bams

#' Load several bam files
#'
#' This function allows you to load several bam files, the result is a list of data tables
#' @param organism organism of the sample group, default is "human"
#' @param blacklist_folder folder where the blacklist files are, default is 'utils'
#' @keywords import
#' @import data.table tidyverse
#' @export
#' @examples
#' load_bams(organism)
load_bams <- function(organism = "human" , blacklist_folder = "utils") {

  if(organism %in% c("mouse", "mm", "mus musculus")){
    genome <- "mm10"
    # List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
    blacklist <- file.path(blacklist_folder, "mm10-blacklist.v2.bed.gz")
    ensembl_species <- "mmusculus_gene_ensembl"
    chromosomes = paste('chr', c(1:19, "X"), sep = "")
  }else if (organism %in% c("human", "hg", "homo sapiens")) {
    ensembl_species = "hsapiens_gene_ensembl"
    genome <- "hg38"
    # List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
    blacklist <<- file.path(blacklist_folder, "hg38-blacklist.bed.gz")
    chromosomes = paste('chr', c(1:22, "X"), sep = "")
  } else {
    print("the only organisms supported are 'mouse' and 'human'")
    stop()
  }
  ###data collection based on species
  sample_csv = "sample.csv"
  sample <- read.csv(file.path(sample_csv))
  path <- file.path("data", "input", "beds")
  sample$path <- file.path(path, paste(sample$SAMPLE.NAME, "_trimmed.mapped.filtered.sorted.deduplicated.bed", sep = ""))
  sampleTable = data.table(name = as.character(sample$short_name),
                           Batch = as.character(sample$run),
                           Treatment = as.character(sample$treatment),
                           Replicate = as.character(sample$run),
                           path= as.character(sample$path),
                           species = as.character(sample$species))
  sampleTable <- sampleTable %>% filter(species == organism)

  # Load the blacklist file
  os <- Sys.info()['sysname'][[1]]
  if (os=="Linux") {
    blacklist <<- fread(cmd=paste("gunzip -c", blacklist),
                        select=1:3, col.names=c("Chrom", "Start", "End"))
  } else if (os=="Windows") {
    blacklist <- fread(cmd=paste("tar -xvzf", blacklist),
                       select=1:3, col.names=c("Chrom", "Start", "End"))
  }
  # Transform chromosome names, and coordinates to 1-based
  #blacklist[, `:=`(Chrom = gsub("chr", "", Chrom),  Start = Start+1, End = End+1)]
  blacklist[, `:=`(Chrom = Chrom,  Start = Start+1, End = End+1)]
  setkeyv(blacklist, colnames(blacklist))
  #loading the bed files cmd=paste("gunzip -c", x),
  beds_dt <-  lapply(with(sampleTable, setNames(path, name)),
                   function(x){
                     tmp = fread(file = x, showProgress=FALSE,
                                 col.names=c("Chrom", "Start", "End", "score", "strand"),
                                 colClasses=c("character", "numeric", "numeric", "numeric", "character"))
                     setkeyv(tmp, c("Chrom", "Start", "End"))
                     tmp = tmp[Chrom%in%chromosomes,]
                     # Filter out DSBs falling within blacklisted regions
                     tmp2 = fsetdiff(tmp[, 1:3, with=FALSE], blacklist)
                     tmp2[tmp, score := score]
                     return(tmp2)
                   })
}
riccardo-trozzo/BlissR documentation built on Aug. 1, 2020, 12:23 a.m.