R/tally.in.ranges.r

Defines functions writeToTallyFile tallyRangesBatch tallyRanges binGenome

Documented in binGenome tallyRanges tallyRangesBatch writeToTallyFile

binGenome <- function( reference, binsize = 1e6, chroms = seqnames(reference) ){
  stopifnot(class(reference) == "BSgenome")
  lengths <- seqlengths(reference)[chroms]
  ret <- do.call(rbind, lapply(names(lengths), function(chrom){
    res <- data.frame(
      Chrom = chrom,
      Start = seq(1, lengths[chrom], binsize)
      )
    if( lengths[chrom] > binsize){
      res$End <- c( seq(binsize, lengths[chrom], binsize), lengths[chrom] )
    }else{
      res$End <- c(lengths[chrom])
    }
    res
  })
  )
  ret <- GRanges(seqnames = ret$Chrom, ranges = IRanges(start = ret$Start, end = ret$End))
  seqlengths(ret) <- lengths
  return(ret)
}

tallyRanges <- function(
  bamfiles, ranges, reference, q=25, ncycles = 10, max.depth=1e6
){
  stopifnot(class(ranges) == "GRanges")
  strand(ranges) <- "*"
  stopifnot(class(reference) %in% c( "BSgenome" ))
  tmp = bplapply( seq_len(length(ranges)), function(rangeid){
    tallyBAM( bamfiles, chr=seqnames(ranges)[rangeid], start(ranges)[rangeid], end(ranges)[rangeid], q, ncycles, max.depth )
  })
  ref <- getSeq(reference, ranges)
  ret <- bplapply( seq_along(tmp), function(i){
    prepareForHDF5( tmp[[i]], ref[[i]] )
  }
  )
  return(ret)
}

tallyRangesBatch <- function(
  tallyFile, study, bamfiles, ranges, reference,
  q = 25, ncycles = 10, max.depth=1e6,
  regID = "Tally",
  res = list("ncpus" = 2, "memory" = 24000, "queue"="research-rh6"),
  written = c(), wrfile = "written.jobs.RDa", waitTime = Inf
  ) {
	stopifnot(class(ranges) == "GRanges")
  strand(ranges) <- "*"
  stopifnot(class(reference) == "BSgenome")
  if(!file.exists(tallyFile)){
    print("Preparing tally file since it does not exist.")
    lapply( unique(seqnames(ranges)), function(chrom) {
      prepareTallyFile(tallyFile, study, chrom, seqlengths(reference)[chrom], length(bamFiles))
    })
  }
  # require(BatchJobs)
  # check if registry is there
  if(file.exists(paste0( regID, "-files"))) {
    print("Registry already exists.")
    reg = loadRegistry(paste0( regID, "-files" ))
    toSubmit = setdiff(findJobs(reg), findDone(reg))
    submitJobs(reg, resources = res, ids=toSubmit)
  } else {
    print("Registry does not exist.")
    reg = makeRegistry(regID, seed = 23, packages = c("h5vc", "GenomicRanges"))
    reference = getSeq(reference, ranges)
    # submit jobs
	  batchFUN = function( rangeid, bamfiles, q, ncycles, max.depth, reference, ranges ) {
      #require(h5vc)
      #require(GenomicRanges)
      tallyBAM( bamfiles, chr=seqnames(ranges)[rangeid], start(ranges)[rangeid], end(ranges)[rangeid], q, ncycles, max.depth, reference = reference[[rangeid]] )
	  }
    batchMap(reg, fun = batchFUN, seq_along(ranges), more.args = list(bamfiles = bamfiles, q = q, ncycles = ncycles, max.depth = max.depth, reference = reference, ranges = ranges))
    submitJobs(reg, resources = res)
  }
	lastWrite = as.numeric(Sys.time())
  while(!all(findJobs(reg) %in% written)){
    toWrite = setdiff(findDone(reg), written)
    for(job in toWrite){
      if(verbose){
        print(paste("Processing job", job))
        print(paste("Written", length(written), "out of", length(findJobs(reg))))  
      }
      tally = loadResult(reg, job)
      writeToTallyFile(list("TheTaly" = tally), tallyFile, study, ranges[job], samples = NULL)
      if(verbose){
        print(paste("Written results from Job", job))
      }
      written = c(written, job)
      save(written, file=wrfile)
      lastWrite = as.numeric(Sys.time())
    }
    Sys.sleep(5)
    if(as.numeric(Sys.time())-lastWrite > waitTime & isEmpty(setdiff(findDone(reg), written))) {
      print(paste(length(setdiff(findJobs(reg), written)), "jobs were not written!")) 
      print("Exiting now!")
      break
    }
  }
}

writeToTallyFile <- function( theData, file, study, ranges, samples = NULL ){
  for(i in seq(length(theData))){
    chrom <- seqnames(ranges)[i]
    data <- theData[[i]]
    start <- start(ranges(ranges))[i]
    end <- end(ranges(ranges))[i]
    for(dataset in names(data)){
      if(is.null(dim(data[[dataset]]))){
        ndims <- 1
      }else{
        ndims <- length(dim(data[[dataset]]))
      }
      idxList <- vector(mode="list", length=ndims)
      if(!is.null(samples)){
        idxList[[length(idxList) - 1]] <- samples 
      }
      idxList[[length(idxList)]] <- start:end
      h5write(data[[dataset]], file, paste( study, chrom, dataset, sep = "/" ), index = idxList)  
    }
  }
  return(TRUE)
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.