R/h5apply.R

Defines functions .h5dapplyIRanges

.sampleDimMap <- list(
  "Counts" = 2,
  "Coverages" = 1,
  "Deletions" = 1,
  "Reference" = NULL
)

setGeneric("h5dapply", function(...,range) standardGeneric("h5dapply"))

setMethod("h5dapply", c("numeric"), function(..., blocksize, range){
  stopifnot( is.numeric(range), length(range) == 2, diff(range) > 0, range[1] >= 1 )
  blocksize  = min(blocksize, diff(range))
  retsize    = ceiling(diff(range)/blocksize)
  blockstart = seq( range[1], by = blocksize, length = retsize)
  blockend   = blockstart + blocksize - 1
  blockend[length(blockend)] = min(blockend[length(blockend)], range[2])
  range <- IRanges( start = blockstart, end = blockend )
  .h5dapplyIRanges(..., range = range)
})

setMethod("h5dapply", c("IRanges"), function(..., range){
  .h5dapplyIRanges(..., range = range)
})

setMethod("h5dapply", c("GRanges"), function(..., group, range){
  ret <- lapply(seqlevels(range), function(chrom){
    tmpranges <- ranges(keepSeqlevels(range, chrom, pruning.mode = "coarse"))
    tmpgroup <- paste(group, chrom, sep="/")
    .h5dapplyIRanges(..., group=tmpgroup, range = tmpranges)
  })
  names(ret) <- seqlevels(range)
  ret
})


.h5dapplyIRanges <- function( filename, group, blocksize, FUN = function(x) x, ... , names = c("Counts", "Coverages", "Deletions", "Reference"), dims, range, samples = NULL, sampleDimMap = .sampleDimMap, verbose = FALSE, BPPARAM = NULL ) {
  sampleData <- getSampleData( filename, group )
  if( !is.null(samples) ){
    stopifnot(all(samples %in% sampleData$Sample))
  }else{
    samples = sampleData$Sample  
  }
  selectedColumns <- subset( sampleData, Sample %in% samples )$Column
  f <- H5Fopen(filename, flags = "H5F_ACC_RDONLY")
  g <- H5Gopen(f, group)
  h5Ds <- lapply(names, function(n) H5Dopen(g, n))
  names(h5Ds) <- names
  sizes <- lapply( lapply( lapply(h5Ds, H5Dget_space), H5Sget_simple_extent_dims), function(x) x$size )
  datasets = data.frame(
    Name = names,
    DimCount = sapply( sizes, length ),
    stringsAsFactors = FALSE
  )
  if( missing(dims) ){
    if(!("Counts" %in% names )){
      dCount <- tryCatch(H5Dopen(g, "Counts"), error = function(e) e)
      if(class(dCount) != "H5IdComponent"){
        stop( "If dims is not specified the dataset 'Counts' must exist in the tallyFile" )  
      }else{
        sCount <- H5Sget_simple_extent_dims(H5Dget_space(dCount))$size
        datasets$PosDim = sapply(sizes, function(x) which(x == sCount[4]))
        H5Dclose(dCount)
      }
    }else{
      if(verbose){
        message("Guessing chromosome length from 'Counts' dataset, if one of the other dimensions happens to match this length there might be weird behaviour here. In that case, specify the dimensions explicitly.")
      }
      datasets$PosDim = sapply(sizes, function(x) which(x == sizes$Counts[4]))
    }
  }else{
    datasets$PosDim = dims
    if( !all(dims <= datasets$DimCount & dims >= 1) ) 
      stop( "The specified dimensions 'dims' do not match the structure of the data" )
  }
  
  chromlength <- sapply(datasets$Name, function(n) sizes[[n]][datasets$PosDim[datasets$Name == n]])
  if( length(unique(chromlength)) == 1 ){
    chromlength = chromlength[1]
  }else{
    stop( "The specified datasets do not have a common length in the specified dimensions!" )
  }
  
  if( missing(range) ){
    range = c( 1, chromlength )
  }
  lapply(h5Ds, H5Dclose)
  H5Gclose(g)
  H5Fclose(f)
  
  blockstart <- start(range)
  blockend <- end(range)
  stopifnot( blockstart[1] >= 1, blockend[length(blockend)] <= chromlength )
  blocks = paste( blockstart, blockend, sep=":" )
  
  applyFUN <- function( b ){
    if(verbose)
      message( "Processing Block #", b, ": ", blocks[b], sep="")
    vec = vector(mode="list", length=nrow(datasets))
    names(vec) = datasets$Name 
    for( i in seq_len(nrow(datasets)) ){
      idx_list = vector(mode="list", length=datasets$DimCount[i] )
      idx_list[[ datasets$PosDim[i] ]] <- blockstart[b]:blockend[b]
      sampleDim <- sampleDimMap[[datasets$Name[i]]]
      if(!is.null(sampleDim)){
        if( sampleDim == datasets$PosDim[i] & !all( sampleData$Sample %in% samples) ){
          stop("Specified binning dimension equals sample dimension - can't subset by sample")
        }
        idx_list[[ sampleDim ]] <- selectedColumns
      }
      vec[[i]] = h5read( filename, paste( group, datasets$Name[i], sep="/" ), index = idx_list )
      if(names(vec)[i] == "Reference"){ #BEGIN: Explicit casting to integer since this now comes out as raw for some reason
        vec[[i]] <- as.integer(vec[[i]])
      }                                 #END:   Explicit casting to integer since this now comes out as raw for some reason
    }
    vec$h5dapplyInfo <- list( Blockstart = blockstart[b], Blockend = blockend[b], Datasets = datasets, Group = group )
    FUN(vec, ... )
  }
  if(is.null(BPPARAM)){
    ret = lapply(
      seq(along=blocks),
      applyFUN
    )
  }else{
    ret = bplapply(seq(along=blocks),
                   function(b, blocks, datasets, ...) {
                     library(h5vc)
                     applyFUN(b)
                   }, blocks=blocks, datasets=datasets, BPPARAM = BPPARAM
    )
  }
  
  base::names(ret) = blocks
  return( ret )
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.