mzDataTable: Import and convert raw mass spectrometry data formats

Documented in mzML2diskFrame .mzML2diskFrameChunk

#' @title Convert mz data to disk.frame
#'
#' @description Convert .mzML file to disk.frame using mzR, data.table, and
#'   disk.frame. The scans are grouped into chunks and each chunk is imported
#'   and appended to the disk.frame in turn.
#'
#' @details The function calls \code{disk.frame::setup_disk.frame()} and sets
#'   \code{options(future.globals.maxSize = Inf)}. The option is intentionally
#'   left in place after the function returns. The connection opened on the
#'   mzML file is closed when the function exits, whether it succeeds or fails.
#'
#' @param path path to the mzML file
#' @param diskFramePath path and fileName.df specifying write location of .df
#'   directory. Must not already exist, otherwise an error is raised.
#' @param scans Optional parameter. Provide a numeric vector to import select
#'   scans and write to the disk.frame. If not provided, the whole file is
#'   converted. Default is NULL.
#' @param chunkSize number of scans to be extracted and written to the
#'   disk.frame at a time. Useful for breaking up large data files and
#'   converting to disk.frame. Default is 100. If NULL, imports all scans in
#'   single operation and you will probably run out of memory.
#'
#' @return Returns a disk.frame reference object.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' #read .mzML file from system path and write to disk.frame
#' #100 scans (default) at a time
#' mzML2diskFrame(path = path_2_mzML,
#'                diskFramePath = "dfPath.df")
#'
#' #read scans 100-200 from .mzML file specified by a system path
#' #and write to disk.frame 20 scans at a time
#' mzML2diskFrame(path = path_2_mzML,
#'                diskFramePath = "dfPath.df",
#'                scans = c(100:200),
#'                chunkSize = 20)
#' }
#'
mzML2diskFrame <- function(path, diskFramePath, scans = NULL, chunkSize = 100){
  #Refuse to write into an existing directory
  if(dir.exists(diskFramePath)){
    stop("diskFramePath leads to directory location that already exists")
  }

  #Link to the file
  file <- mzR::openMSfile(filename = path, verbose = TRUE)
  #Release the file handle on every exit path (normal return or error)
  on.exit(mzR::close(file), add = TRUE)

  #Construct index to group the scans into chunks
  scanChunks <- .scanChunker(scans = scans,
                             mzRfilePointer = file,
                             chunkSize = chunkSize)

  #Setup disk.frame backend
  disk.frame::setup_disk.frame()
  #NOTE: session-wide option, deliberately not restored on exit because later
  #operations on the returned disk.frame may rely on it
  options(future.globals.maxSize = Inf)

  #Create disk.frame
  diskF <- disk.frame::disk.frame(path = diskFramePath)

  #Get scans and write to disk.frame, one chunk per call; the per-chunk return
  #values carry no information, so they are not kept
  mapply(FUN = .mzML2diskFrameChunk,
         scans = scanChunks,
         MoreArgs = list(path = file,
                         diskFrame = diskF),
         SIMPLIFY = FALSE)

  #Cleanup: trigger garbage collection after the chunked import
  gc()

  diskF
}

#' @title Append one chunk of mzML/mzXML scans to a disk.frame
#'
#' @description Internal function. Imports the requested scans with
#'   mzML2dataTable(), prints a progress line, and adds the imported table to
#'   the supplied disk.frame as a new chunk.
#' @param path path or mzR pointer to the mzML file; passed unchanged to
#'   mzML2dataTable()
#' @param diskFrame disk.frame pointer object created by disk.frame()
#' @param scans a numeric vector of scans to import and write to the
#'   disk.frame - likely generated by .scanChunker(). If NULL, the progress
#'   line reports that all scans are being written.
#'
#' @return NULL (invisibly)
#'
.mzML2diskFrameChunk <- function(path, diskFrame, scans){

  #Import the scans for this chunk
  chunkTable <- mzML2dataTable(path = path, scans = scans)

  #Print Statements: WRITE
  #Build the progress line first, then print it once
  statusLine <- if(is.null(scans)){
    "Writing all scans to disk.frame"
  }else{
    paste("Writing scans from:", min(scans), "to", max(scans), "to disk.frame")
  }
  print(statusLine)

  #Write the results to a disk.frame; called only for its side effect
  disk.frame::add_chunk(df = diskFrame, chunk = chunkTable)

  invisible(NULL)
}