R/BED_functions.R

Defines functions BedCheck BedGC.R BedGC.fasta.chr BedGC.fasta

BedGC.fasta <- function(binned.bed.file = NULL, genome = "hg19", fasta = NULL, na.to0 = TRUE, nt.add = c(0,50,100, 250, 500, 1000, 2500, 5000), out.dir = getwd(), return.data = FALSE, nthread = 1) {

  message("WARNING ! On hg19, this function consumes 3 GB per thread (to add to your current R session) !")
  message("When possible, prefer the BedGC.chr() function that performs the same task using sequences as single-chromosome FASTA files. This consumes only 0.3 GB of RAM per thread.")
  if (is.null(binned.bed.file)) stop("A BED file is required !", call. = FALSE)
  if (!file.exists(binned.bed.file)) stop("Could not find the BED file !", call. = FALSE)
  if (is.null(fasta)) stop("A FASTA file is required !", call. = FALSE)
  if (!file.exists(fasta)) stop("Could not find FASTA file !", call. = FALSE)
  if (!all(is.numeric(nt.add))) stop("nt.add must be a numeric vector !", call. = FALSE)
  if (is.null(out.dir)) message("NOTE : Checked / cleaned bed will be written in the same directory as source.")
  if (!file.exists(out.dir)) stop("Could not find the output directory !", call. = FALSE)
  if (!file.info(out.dir)$isdir) stop("out.dir is not a directory !", call. = FALSE)

  message("Loading genome information ...")
  data(list = genome, package = "chromosomes", envir = environment())

  message("Loading BED ...")
  bed.data <- read.table(file = binned.bed.file, header = FALSE, sep = "\t", comment.char = "#", stringsAsFactors = FALSE)
  if (ncol(bed.data) < 3) stop("BED file must contain at least 3 columns !", call. = FALSE)
  bed.data <- bed.data[,1:3]
  colnames(bed.data) <- c("chr", "start", "end")
  if (!all(unique(bed.data$chr) %in% cs$chromosomes$chrom)) stop(paste0("BED file contains chromosome(s) not defined in the ", genome, " genome !"), call. = FALSE)

  klen <- Biostrings::fasta.seqlengths(filepath = fasta)
  kdata <- Biostrings::readDNAStringSet(filepath = fasta, format = "fasta")

  message("Starting cluster ...")
  cl <- parallel::makeCluster(spec = nthread, type = "PSOCK")
  doParallel::registerDoParallel(cl)

  `%dopar%` <- foreach::"%dopar%"
  a <- 0
  Agcpc <- foreach::foreach(a = nt.add, .combine = "cbind") %dopar% {
    message(paste0("Computing GC for frame +", a, " bp"))

    querystart <- bed.data$start - a
    queryend <- bed.data$end + a
    querystart[querystart < 1] <- 1
    kmore.idx <- which(queryend > klen[bed.data$chr])
    if (length(kmore.idx) > 0) queryend[kmore.idx] <- klen[names(kmore.idx)]

    kbs.idx <- vapply(bed.data$chr, function(k) { which(kdata@ranges@NAMES == k) }, 1L)

    gcpc <- vapply(1:nrow(bed.data), function(x) {
      aFreq <- Biostrings::alphabetFrequency(Biostrings::subseq(kdata[kbs.idx[x]], querystart[x], queryend[x]), baseOnly = TRUE)
      acgt.count <- sum(aFreq[colnames(aFreq) != "other"])
      gc.count <- sum(aFreq[colnames(aFreq) %in% c("G", "C")])
      gcpc.x <- gc.count / acgt.count
      return(gcpc.x)
    }, .1)
    if (na.to0) gcpc[is.na(gcpc)] <- 0
    return(gcpc)
  }

  message("Stopping cluster ...")
  parallel::stopCluster(cl)

  ## Building the output object
  colnames(Agcpc) <- paste0("GC", nt.add, "b")
  out.df <- cbind(bed.data, Agcpc)
  outname <- paste0(out.dir, "/", sub(pattern = "\\.bed$", replacement = ".gc", x = basename(binned.bed.file), ignore.case = TRUE))
  write.table.fast(x = out.df, file = outname)
  message("Done.")
  if(return.data) return(out.df) else return(NULL)
}

BedGC.fasta.chr <- function(binned.bed.file = NULL, genome = "hg19", fasta.dir = NULL, na.to0 = TRUE, nt.add = c(0,50,100, 250, 500, 1000, 2500, 5000), out.dir = getwd(), return.data = FALSE, nthread = 2) {

  if (is.null(binned.bed.file)) stop("A BED file is required !", call. = FALSE)
  if (!file.exists(binned.bed.file)) stop("Could not find the BED file !", call. = FALSE)
  if (is.null(fasta.dir)) stop("A directory containing indexed FASTA file(s) for each chromosome is required !", call. = FALSE)
  if (!file.exists(fasta.dir)) stop("Could not find the FASTA directory !", call. = FALSE)
  if (!file.info(fasta.dir)$isdir) stop("fasta.dir is not a directory !", call. = FALSE)
  if (!all(is.numeric(nt.add))) stop("nt.add must be a numeric vector !", call. = FALSE)
  if (is.null(out.dir)) stop("An output directory is required !", call. = FALSE)
  if (!file.info(out.dir)$isdir) stop("out.dir is not a directory !", call. = FALSE)

  message("Loading genome information ...")
  data(list = genome, package = "chromosomes", envir = environment())

  message("Loading BED ...")
  bed.data <- read.table(file = binned.bed.file, header = FALSE, sep = "\t", comment.char = "#", stringsAsFactors = FALSE)
  if (ncol(bed.data) < 3) stop("BED file must contain at least 3 columns !", call. = FALSE)
  bed.data <- bed.data[,1:3]
  colnames(bed.data) <- c("chr", "start", "end")
  if (!all(unique(bed.data$chr) %in% cs$chromosomes$chrom)) stop(paste0("BED file contains chromosome(s) not defined in the ", genome, " genome !"), call. = FALSE)

  kcoords <- sapply(unique(bed.data$chr), function(k) { return(bed.data[bed.data$chr == k, ]) }, simplify = FALSE)

  message("Starting cluster ...")
  # require(foreach)
  # require(doParallel)
  cl <- parallel::makeCluster(spec = nthread, type = "PSOCK")
  doParallel::registerDoParallel(cl)

  a <- 0
  `%do%` <- foreach::"%do%"
  `%dopar%` <- foreach::"%dopar%"

  Agcpc <- foreach(a=nt.add, .combine = "cbind") %do% {
    message(paste0("Computing GC for frame +", a, " bp"))

    k <- 0
    gcpc <- foreach(k=kcoords, .combine = "c", .noexport = c("kcoords", "bed.data")) %dopar% {
      kchr <- unique(k$chr)
      kfafile <- paste0(fasta.dir, "/", kchr, ".fa")
      if (!file.exists(kfafile)) stop(paste0("Could ont find : ", kfafile), call. = FALSE)
      cat("Loading sequence for", kchr, "...\n")
      # require(Biostrings)
      klen <- Biostrings::fasta.seqlengths(filepath = kfafile)
      kdata <- Biostrings::readDNAStringSet(filepath = kfafile, format = "fasta")

      querystart <- k$start - a
      queryend <- k$end + a
      querystart[querystart < 1] <- 1
      queryend[queryend > klen] <- klen

      gcpc.k <- vapply(1:nrow(k), function(x) {
        aFreq <- Biostrings::alphabetFrequency(Biostrings::subseq(kdata, querystart[x], queryend[x]), baseOnly = TRUE)
        acgt.count <- sum(aFreq[colnames(aFreq) != "other"])
        gc.count <- sum(aFreq[colnames(aFreq) %in% c("G", "C")])
        gcpc.x <- gc.count / acgt.count
        return(gcpc.x)
      }, .1)
      return(gcpc.k)
    }
    if (na.to0) gcpc[is.na(gcpc)] <- 0
    return(gcpc)
  }
  message("Stopping cluster ...")
  parallel::stopCluster(cl)
  ## Building the output object
  # colnames(Agcpc) <- paste0("GC", nt.add, "b")
  colnames(Agcpc) <- paste0("GC", nt.add, "b")
  out.df <- cbind(bed.data, Agcpc)
  # outname <- paste0(out.dir,  "/", sub(pattern = "\\.bed$", replacement = ".gc", x = basename(binned.bed.file), ignore.case = TRUE))
  outname <- paste0(out.dir,  "/", sub(pattern = "\\.bed$", replacement = "_GC.RDS", x = basename(binned.bed.file), ignore.case = TRUE))
  saveRDS(out.df, file = outname, compress = "bzip2")
  if(return.data) return(out.df) else return(NULL)
  message("Done.")
  return(cbind(bed.data, Agcpc))
}

BedGC.R <- function(binned.bed.file = NULL, human.genome.build = "hg19", na.to0 = TRUE, nt.add = c(0,50,100, 250, 500, 1000, 2500, 5000), out.dir = getwd(), return.data = FALSE, nthread = 1) {

  message("WARNING ! On hg19, this function consumes 3 GB per thread (to add to your current R session) !")
  message("When possible, prefer the BedGC.chr() function that performs the same task using sequences as single-chromosome FASTA files. This consumes only 0.3 GB of RAM per thread.")
  if (is.null(binned.bed.file)) stop("A BED file is required !", call. = FALSE)
  if (is.null(human.genome.build)) stop("A genome name is required !", call. = FALSE)
  if (!file.exists(binned.bed.file)) stop("Could not find the BED file !", call. = FALSE)
  if (!all(is.numeric(nt.add))) stop("nt.add must be a numeric vector !", call. = FALSE)
  if (is.null(out.dir)) message("NOTE : Checked / cleaned bed will be written in the same directory as source.")
  if (!file.exists(out.dir)) stop("Could not find the output directory !", call. = FALSE)
  if (!file.info(out.dir)$isdir) stop("out.dir is not a directory !", call. = FALSE)

  message("Loading genome information ...")
  data(list = human.genome.build, package = "chromosomes", envir = environment())

  if (human.genome.build == "hg19") genome.package <- "BSgenome.Hsapiens.UCSC.hg19"
  if (human.genome.build == "hg38") genome.package <- "BSgenome.Hsapiens.UCSC.hg38"

  message("Loading BED ...")
  bed.data <- read.table(file = binned.bed.file, header = FALSE, sep = "\t", comment.char = "#", stringsAsFactors = FALSE)
  if (ncol(bed.data) < 3) stop("BED file must contain at least 3 columns !", call. = FALSE)
  bed.data <- bed.data[,1:3]
  colnames(bed.data) <- c("chr", "start", "end")
  if (!all(unique(bed.data$chr) %in% cs$chromosomes$chrom)) stop(paste0("BED file contains chromosome(s) not defined in the ", genome, " genome !"), call. = FALSE)

  require(package = genome.package, character.only = TRUE)
  organisms <- ls(paste0("package:",genome.package))
  orga.id <- organisms[length(organisms)]

  klen <- GenomeInfoDb::seqlengths(Hsapiens)

  message("Starting cluster ...")
  cl <- parallel::makeCluster(spec = nthread, type = "PSOCK")
  doParallel::registerDoParallel(cl)

  `%dopar%` <- foreach::"%dopar%"
  a <- 0
  Agcpc <- foreach::foreach(a = nt.add, .combine = "cbind") %dopar% {
    message(paste0("Computing GC for frame +", a, " bp"))

    Kgcpc <- foreach::foreach(k = unique(bed.data$chr), .combine = "rbind") %dopar% {
      kbed <- bed.data[bed.data$chr == k,]
      kbed$start <- kbed$start - a
      kbed$end <- kbed$end + a
      kbed$start[kbed$start < 1] <- 1
      kmore.idx <- which(kbed$end > klen[k])
      if (length(kmore.idx) > 0) kbed$end[kmore.idx] <- klen[k]

      kseq <- Hsapiens[[k]]

      gcpc <- vapply(1:nrow(kbed), function(x) {
        aFreq <- Biostrings::alphabetFrequency(Biostrings::subseq(kseq, kbed$start[x], kbed$end[x]), baseOnly = TRUE)
        acgt.count <- sum(aFreq[names(aFreq) != "other"])
        gc.count <- sum(aFreq[names(aFreq) %in% c("G", "C")])
        gcpc.x <- gc.count / acgt.count
        return(gcpc.x)
      }, .1)
      if (na.to0) gcpc[is.na(gcpc)] <- 0
      return(gcpc)
    }
    return(Kgcpc)
  }

  message("Stopping cluster ...")
  parallel::stopCluster(cl)

  ## Building the output object
  colnames(Agcpc) <- paste0("GC", nt.add, "b")
  out.df <- cbind(bed.data, Agcpc)
  outname <- paste0(out.dir, "/", sub(pattern = "\\.bed$", replacement = ".gc", x = basename(binned.bed.file), ignore.case = TRUE))
  write.table.fast(x = out.df, file = outname)
  message("Done.")
  if(return.data) return(out.df) else return(NULL)
}

BedCheck <- function(bed.file = NULL, genome.pkg = "BSgenome.Hsapiens.UCSC.hg19", out.dir = getwd(), return.data = FALSE) {

  if (is.null(bed.file)) stop("A BED file is required !", call. = FALSE)
  if (!file.exists(bed.file)) stop("Could not find the BED file !", call. = FALSE)
  if (is.null(out.dir)) message("NOTE : Checked / cleaned bed will be written in the same directory as source.") else {
    if (!file.exists(out.dir)) stop("Could not find the output directory !", call. = FALSE)
    if (!file.info(out.dir)$isdir) stop("out.dir is not a directory !", call. = FALSE)
  }
  message("Checking BED ...")
  # data(list = genome, package = "chromosomes", envir = environment())
  message(paste0("Loading ", genome.pkg, " ..."))
  suppressPackageStartupMessages(require(genome.pkg, character.only = TRUE))
  BSg.obj <- getExportedValue(genome.pkg, genome.pkg)
  genome <- BSgenome::providerVersion(BSg.obj)

  bed.data <- read.table(file = bed.file, header = FALSE, sep = "\t", comment.char = "#", stringsAsFactors = TRUE)
  if (ncol(bed.data) < 3) stop("BED file must contain at least 3 columns !", call. = FALSE)
  bed.data <- bed.data[,1:3]

  colnames(bed.data) <- c("chr", "start", "end")

  if (!all(unique(bed.data$chr) %in% BSgenome::seqnames(BSg.obj))) stop(paste0("BED file contains chromosome(s) not defined in the ", genome.pkg, " genome !"), call. = FALSE)

  myGR <- GenomicRanges::GRanges(bed.data)

  message(" Removing inproper coordinates (start >= end) ...")
  myGR <- myGR[!GenomicRanges::width(myGR) < 1,]

  message(" Removing replicates ...")
  myGR <- myGR[!GenomicRanges::duplicated(myGR),]

  message(" Looking for overlaps ...")
  myGR <- GenomicRanges::reduce(myGR)

  message(" Sorting ...")
  myGR <- GenomicRanges::sort(myGR)

  ## Converting GR back to df
  bed.data <- as.data.frame(myGR)
  # bed.data$seqnames <- as.character(bed.data$seqnames)
  bed.data <- bed.data[,1:3]

  if (!return.data) {
    message("Writing clean bed ...")
    colnames(bed.data)[1] <- "#chr"
    outname <- paste0(out.dir, "/", sub(pattern = "\\.bed$", replacement = paste0("_", genome, "_merged_sorted.bed"), x = basename(bed.file), ignore.case = TRUE))
    write.table.fast(x = bed.data, file = outname)
  }

  colnames(bed.data)[1] <- "chr"

  message("Done.")
  if(return.data) return(bed.data)
}
gustaveroussy/EaCoN documentation built on Oct. 20, 2021, 2:41 a.m.