#' Load several annotated peak files
#'
#' This function allows you to load several annotated peak files, the result is a list of data tables
#' @param organism organism of the sample group, default is 'human'
#' @param blacklist_folder folder where the blacklist files are, default is 'utils'
#' @keywords import
#' @import data.table TxDb.Mmusculus.UCSC.mm10.ensGene TxDb.Hsapiens.UCSC.hg38.knownGene tidyverse
#' @export
#' @examples
#' load_annotated_peaks(organism, blacklist_folder)
load_annotated_peaks <- function(organism = "human", blacklist_folder = "utils"){
if(organism %in% c("mouse", "mm", "mus musculus")){
txdb <<- TxDb.Mmusculus.UCSC.mm10.ensGene
andb <<- "org.Mm.eg.db"
genome <<- "mm10"
cpg_annots <<- "mm10_cpg_islands"
# List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
blacklist <<- file.path(blacklist_folder, "mm10-blacklist.v2.bed.gz")
ensembl_species <<- "mmusculus_gene_ensembl"
chromosomes <<- paste('chr', c(1:19, "X"), sep = "")
}else if (organism %in% c("human", "hg", "homo sapiens")) {
txdb <<- TxDb.Hsapiens.UCSC.hg38.knownGene
andb <<- "org.Hs.eg.db"
ensembl_species <<- "hsapiens_gene_ensembl"
genome <<- "hg38"
cpg_annots <<- "hg38_cpg_islands"
# List of blacklisted regions (obtained from 'https://github.com/Boyle-Lab/Blacklist/tree/master/lists')
blacklist <<- file.path(blacklist_folder, "hg38-blacklist.bed.gz")
chromosomes <<- paste('chr', c(1:22, "X"), sep = "")
} else {
print("the only organisms supported are 'mouse' and 'human'")
stop()
}
###data collection based on species
sample_csv = "sample.csv"
sample <- read.csv(file.path(sample_csv))
sample <- select(sample, SAMPLE.NAME, short_name, replicate, run, treatment, species)
path <- file.path("data", "input", "peaks")
sample$path <- file.path(path, paste(sample$SAMPLE.NAME, "_trimmed.mapped.filtered.sorted.deduplicated.peaks.bed", sep = ""))
sampleTable <- data.table(name = as.character(sample$short_name),
Batch = as.character(sample$run),
Treatment = as.character(sample$treatment),
Replicate = as.character(sample$run),
path= as.character(sample$path),
species = as.character(sample$species))
sampleTable <- sampleTable %>% filter(species == organism)
# Load the blacklist file
os <- Sys.info()['sysname'][[1]]
if (os=="Linux") {
blacklist <<- fread(cmd=paste("gunzip -c", blacklist),
select=1:3, col.names=c("chrom", "start", "end"))
} else if (os=="Windows") {
blacklist <- fread(cmd=paste("tar -xvzf", blacklist),
select=1:3, col.names=c("chrom", "start", "end"))
}
# Transform chromosome names, and coordinates to 1-based
#blacklist[, `:=`(chrom = gsub("chr", "", chrom), start = start+1, end = end+1)]
blacklist[, `:=`(chrom = chrom, start = start+1, end = end+1)]
setkeyv(blacklist, colnames(blacklist))
#loading the bed files cmd=paste("gunzip -c", x),
peaks_dt <- lapply(with(sampleTable, setNames(path, name)), function(x){
tmp <- fread(file=x, showProgress=FALSE,
header = T)
setkeyv(tmp, c("chrom", "start", "end"))
tmp <- tmp[chrom%in%chromosomes,]
# Filter out DSBs falling within blacklisted regions using chr, start, end
tmp2 <- fsetdiff(tmp[, 1:3, with=FALSE], blacklist)
#restoring all the other columns
tmp2[tmp, PeakPvalue := PeakPvalue]
tmp2[tmp, PeakCoverage := PeakCoverage]
tmp2[tmp, PeakWidth := PeakWidth]
tmp2[tmp, GenomicAnnotation := DetailedGenomicAnnotation]
tmp2[tmp, GeneChrom := GeneChrom]
tmp2[tmp, GeneStart := GeneStart]
tmp2[tmp, GeneEnd := GeneEnd]
tmp2[tmp, GeneLength := GeneLength]
tmp2[tmp, GeneStrand := GeneStrand]
tmp2[tmp, TranscriptID := TranscriptID]
tmp2[tmp, DistanceToTSS := DistanceToTSS]
tmp2[tmp, GeneID := EntrezID]
tmp2[tmp, GeneName := GeneName]
tmp2[tmp, GeneDesc := GeneDesc]
return(tmp2)
})
return(peaks_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.