R/mergeTallyFiles.r

Defines functions mergeTallyFiles

Documented in mergeTallyFiles

mergeTallyFiles <- function( inputFiles, destFile, destGroup, blockSize = 1e6, sampleDims = c(), positionDims = c() ){
  sampleDims <- c("Counts" = 2, "Coverages" = 1, "Deletions" = 1, sampleDims)
  positionDims <- c("Counts" = 4, "Coverages" = 3, "Deletions" = 3, positionDims)
  #Checking for compatibility of dimensions 
  refLengths <- sapply(
    names(inputFiles),
    function(fileName){
      g <- h5ls( fileName )
      g <- subset(g, group == inputFiles[fileName])
      d <- g$dim[g$name == "Reference"]
      d
      }, USE.NAMES=TRUE)
  stopifnot(length(unique(refLengths)) == 1) # dimensions must match
  #SampleData
  sampleData <- lapply(
    names(inputFiles),
    function(fileName){
      getSampleData(fileName, inputFiles[fileName])[,c("Sample", "Patient", "Column", "Type")]
    })
  
  tmp <- sampleData
  offset <- 0
  for( i in seq(length(sampleData))){
    sampleData[[i]]$Column <- sampleData[[i]]$Column + offset
    sampleData[[i]]$SourceFile <- names(inputFiles)[i]
    offset <- offset + nrow(sampleData[[i]])
  }  
  destSampleData <- do.call( rbind, sampleData )
  sampleData <- tmp
  rm(tmp)

  dataSets <- Reduce(
    function(x,y) intersect(x,y),
    lapply(
      names(inputFiles),
      function(fileName){
        g <- h5ls( fileName )
        g <- subset(g, group == inputFiles[fileName])
        g$name
      })
  )
  print( paste( "Found common datasets:", paste(dataSets, collapse = ", " ) ) )
  destStudy <- strsplit(destGroup, split = "/")[[1]][1]
  destChrom <- strsplit(destGroup, split = "/")[[1]][2]
  prepareTallyFile( filename=destFile, study=destStudy, chrom=destChrom, chromlength=as.numeric(refLengths[1]), nsamples=nrow(destSampleData) )
  setSampleData(destFile, destGroup, destSampleData)
  blocks <- defineBlocks( 1, as.numeric(refLengths[1]), blockSize )
  for(blockID in blocks$ID){
    blockRange <- c(blocks$Start[blocks$ID == blockID], blocks$End[blocks$ID == blockID])
    wroteReference <- FALSE
    for( fileName in names(inputFiles) ){
      sliceSampleData <- subset(destSampleData, SourceFile == fileName)
      inputData <- h5readBlock( fileName, group=inputFiles[fileName], names=dataSets, range=blockRange )
      for(dataSet in dataSets){
          if( dataSet != "Reference" ){
              idx = lapply( seq(positionDims[dataSet]), function(x) return(NULL) ) #this could be more elegant for sure :D
              idx[[positionDims[dataSet]]] <- blockRange[1]:blockRange[2]
              idx[[sampleDims[dataSet]]] <- sliceSampleData$Column
              h5write( inputData[[dataSet]], destFile, paste( destGroup, dataSet, sep = "/" ), index = idx )
          }else{
              if( !wroteReference ){
                  idx <- list( blockRange[1]:blockRange[2] )
                  h5write( inputData[[dataSet]], destFile, paste( destGroup, dataSet, sep = "/" ), index = idx )
                  wroteReference <- TRUE
              }
          }
      }
    }
    print(paste("[", Sys.time(), "] Block #", blockID, "processed!"))
  }
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.