R/load_peaks.R

Defines functions load_annotated_peaks

Documented in load_annotated_peaks

#' Load several annotated peak files
#'
#' This function allows you to load several annotated peak files, the result is a list of data tables
#' @param organism organism of the sample group, default is 'human'
#' @param blacklist_folder folder where the blacklist files are, default is 'utils'
#' @keywords import
#' @import data.table TxDb.Mmusculus.UCSC.mm10.ensGene TxDb.Hsapiens.UCSC.hg38.knownGene tidyverse
#' @export
#' @examples
#' load_annotated_peaks(organism, blacklist_folder)
load_annotated_peaks <- function(organism = "human", blacklist_folder = "utils"){
  if(organism %in% c("mouse", "mm", "mus musculus")){
    txdb <<- TxDb.Mmusculus.UCSC.mm10.ensGene
    andb <<- "org.Mm.eg.db"
    genome <<- "mm10"
    cpg_annots <<- "mm10_cpg_islands"
    # List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
    blacklist <<- file.path(blacklist_folder, "mm10-blacklist.v2.bed.gz")
    ensembl_species <<- "mmusculus_gene_ensembl"
    chromosomes <<- paste('chr', c(1:19, "X"), sep = "")
  }else if (organism %in% c("human", "hg", "homo sapiens")) {
    txdb <<- TxDb.Hsapiens.UCSC.hg38.knownGene
    andb <<- "org.Hs.eg.db"
    ensembl_species <<- "hsapiens_gene_ensembl"
    genome <<- "hg38"
    cpg_annots <<- "hg38_cpg_islands"
    # List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
    blacklist <<- file.path(blacklist_folder, "hg38-blacklist.bed.gz")
    chromosomes <<- paste('chr', c(1:22, "X"), sep = "")
  } else {
    print("the only organisms supported are 'mouse' and 'human'")
    stop()
  }

  ###data collection based on species
  sample_csv = "sample.csv"
  sample <- read.csv(file.path(sample_csv))
  sample <- select(sample, SAMPLE.NAME, short_name, replicate, run, treatment, species)
  path <- file.path("data", "input", "peaks")
  sample$path <- file.path(path, paste(sample$SAMPLE.NAME, "_trimmed.mapped.filtered.sorted.deduplicated.peaks.bed", sep = ""))
  sampleTable <- data.table(name = as.character(sample$short_name),
                           Batch = as.character(sample$run),
                           Treatment = as.character(sample$treatment),
                           Replicate = as.character(sample$run),
                           path= as.character(sample$path),
                           species = as.character(sample$species))
  sampleTable <- sampleTable %>% filter(species == organism)

  # Load the blacklist file
  os <- Sys.info()['sysname'][[1]]
  if (os=="Linux") {
    blacklist <<- fread(cmd=paste("gunzip -c", blacklist),
                       select=1:3, col.names=c("chrom", "start", "end"))
  } else if (os=="Windows") {
    blacklist <- fread(cmd=paste("tar -xvzf", blacklist),
                       select=1:3, col.names=c("chrom", "start", "end"))
  }

  # Transform chromosome names, and coordinates to 1-based
  #blacklist[, `:=`(chrom = gsub("chr", "", chrom),  start = start+1, end = end+1)]
  blacklist[, `:=`(chrom = chrom,  start = start+1, end = end+1)]
  setkeyv(blacklist, colnames(blacklist))
  #loading the bed files cmd=paste("gunzip -c", x),
  peaks_dt <- lapply(with(sampleTable, setNames(path, name)), function(x){
    tmp <- fread(file=x, showProgress=FALSE,
                header = T)
    setkeyv(tmp, c("chrom", "start", "end"))
    tmp <- tmp[chrom%in%chromosomes,]
    # Filter out DSBs falling within blacklisted regions using chr, start, end
    tmp2 <- fsetdiff(tmp[, 1:3, with=FALSE], blacklist)
    #restoring all the other columns
    tmp2[tmp, PeakPvalue := PeakPvalue]
    tmp2[tmp, PeakCoverage := PeakCoverage]
    tmp2[tmp, PeakWidth := PeakWidth]
    tmp2[tmp, GenomicAnnotation := DetailedGenomicAnnotation]
    tmp2[tmp, GeneChrom := GeneChrom]
    tmp2[tmp, GeneStart := GeneStart]
    tmp2[tmp, GeneEnd := GeneEnd]
    tmp2[tmp, GeneLength := GeneLength]
    tmp2[tmp, GeneStrand := GeneStrand]
    tmp2[tmp, TranscriptID := TranscriptID]
    tmp2[tmp, DistanceToTSS := DistanceToTSS]
    tmp2[tmp, GeneID := EntrezID]
    tmp2[tmp, GeneName := GeneName]
    tmp2[tmp, GeneDesc := GeneDesc]
    return(tmp2)
  })
  return(peaks_dt)
}
riccardo-trozzo/BlissR documentation built on Aug. 1, 2020, 12:23 a.m.