rMSI: MSI Tools

Documented in fixImzMLDuplicates import_imzML

#########################################################################
#     rMSI - R package for MSI data handling and visualization
#     Copyright (C) 2014 Pere Rafols Soler
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
############################################################################

#' fixImzMLDuplicates.
#' Delete duplicates and possible zero-drop errors (fixing Bruker's bugs in imzML).
#' 
#' @param mass spectrum mass axis.
#' @param intensity spectrum intensity.
#'
#' @return a list with non-duplicated mass and intensity vectors.
#'
fixImzMLDuplicates <- function(mass, intensity)
{
  idup <- which(duplicated(mass))
  if(length(idup) > 0)
  {
    for( i in 1:length(idup))
    {
      intensity[idup[i] - 1] <- max( intensity[idup[i]], intensity[idup[i] - 1])
    }
    mass <- mass[-idup]
    intensity <- intensity[-idup]
  }
  return(list(mass=mass, intensity=intensity))
}

#' import_imzML.
#'
#' @param imzML_File full path to .imzML file.
#' @param ibd_File path to the binary file (default the same as imzML file but with .ibd extension)
#' @param ramdisk_path where the ramdisk will be created.
#' @param fun_progress This is a callback function to update the progress of loading data. See details for more information.
#' @param fun_text This is a callback function to update the label widget of loading data. See details for more information.
#' @param close_signal function to be called if loading process is abored.
#' @param verifyChecksum if the binary file checksum must be verified, it is disabled by default for convenice with really big files.
#' @param subImg_rename alternative image name, a new ramdisk will be created with the given name.
#' @param subImg_Coords a Complex vector with the motors coordinates to be included in the ramdisk.
#' @param convertProcessed2Continuous if true (the default) an imzML file in processed mode will be converted to a continuous mode.
#'
#'  Imports an imzML image to an rMSI data object.
#'  It is recomanded to use rMSI::LoadMsiData directly instead of this function.
#'
#' @return an rMSI data object.
#' @export
#'
import_imzML <- function(imzML_File, ibd_File =  paste(sub("\\.[^.]*$", "", imzML_File), ".ibd", sep = "" ), ramdisk_path,  fun_progress = NULL, fun_text = NULL, close_signal = NULL, verifyChecksum = F, subImg_rename = NULL, subImg_Coords = NULL, convertProcessed2Continuous = T)
{
  setPbarValue<-function(progress)
  {
    setTxtProgressBar(pb, progress)
    return(T)
  }

  if(is.null(fun_progress))
  {
    pb<-txtProgressBar(min = 0, max = 100, style = 3 )
    fun_progress <- setPbarValue
    cat("\n")
  }
  else
  {
    pb<-NULL
  }

  #1- Parse XML data
  if(is.null(fun_text))
  {
    cat("Parsing XML data in imzML file...\n")
  }
  else
  {
    fun_text("Parsing XML data in imzML file...")
  }
  xmlRes <- CimzMLParse(path.expand(imzML_File))
  if( !is.null(xmlRes$Error))
  {
    .controlled_loadAbort(paste0(xmlRes$Error, "\n"), close_signal)
  }
  if(verifyChecksum)
  {
    if( xmlRes$SHA != "" )
    {
      cat("\nChecking binary data checksum using SHA-1 key... ")
      res <- toupper(digest::digest( ibd_File, algo = "sha1", file = T))
      if( res == xmlRes$SHA )
      {
        cat("OK\n")
      }
      else
      {
        cat(paste("NOK\nChecksums don't match\nXML key:", xmlRes$SHA, "\nBinary file key:", res,"\n"))
        #.controlled_loadAbort("ERROR: possible data corruption\n", close_signal)
        #Disableing the abort, just showing a warning here as it seams that there is bug in brukers checksum imzml file...
        cat("WARNING: MS data my be corrupt!\n")
      }
    }
    if( xmlRes$MD5 != "")
    {
      cat("Checking binary data checksum using MD5 key... ")
      res <- toupper(digest::digest( ibd_File, algo = "md5", file = T))
      if( res == xmlRes$MD5 )
      {
        cat("OK\n")
      }
      else
      {
        cat(paste("NOK\nChecksums don't match\nXML key:", xmlRes$MD5, "\nBinary file key:", res,"\n"))
        #.controlled_loadAbort("ERROR: possible data corruption\n", close_signal)
        #Disableing the abort, just showing a warning here as it seams that there is bug in brukers checksum imzml file...
        cat("WARNING: MS data my be corrupt!\n")
      }
    }  
  }
  else
  {
    cat("WARNING: Checksum validation is disabled, data may be corrupt!\n")
  }
  
  #Select specific pixels in the dataset
  if( !is.null(subImg_Coords))
  {
    keepIds <-  which( complex( real = xmlRes$run_data$x, imaginary = xmlRes$run_data$y) %in% subImg_Coords)

    if(length(keepIds) == 0)
    {
      .controlled_loadAbort("ERROR: no subImg_Coords found in current imzML data.\n", close_signal)
    }
    
    xmlRes$run_data <- xmlRes$run_data[keepIds,]
  }
  
  #2- Create a connection to read binary file
  bincon <- file(description = ibd_File, open = "rb")

  #3- Test the UUID in binary file (the first 16 bytes are always UUID (in XML file are in hex codes))
  binUUID <- paste(sprintf("%.2X", readBin(bincon, integer(), 16, size = 1, signed = F)), collapse = "")
  if(binUUID != xmlRes$UUID)
  {
    close(bincon)
    .controlled_loadAbort("ERROR: UUID in imzML file does not match UUID in ibd file\n", close_signal)
  }

  sizeInBytesFromDataType <- function(str_dataType)
  {
    if(str_dataType == "int" || str_dataType == "float")
    {
      return(4)
    }
    if(str_dataType == "long" || str_dataType == "double")
    {
      return(8)
    }
  }

  #5- Map imzML possible data types to ff packages available data types
  if(xmlRes$int_dataType == "int")
  {
    ffDataType <- "integer" #32 bit signed integer with NA.
    readDataTypeInt <- integer()
  }
  if(xmlRes$int_dataType == "long")
  {
    ffDataType <- "single" #32 bit signed integer is not available in ff so I map it to 32 bits float to allow enough range.
    readDataTypeInt <- integer()
  }
  if(xmlRes$int_dataType == "float")
  {
    ffDataType <- "single"
    readDataTypeInt <- numeric()
  }
  if(xmlRes$int_dataType == "double")
  {
    ffDataType <- "double"
    readDataTypeInt <- numeric()
  }
  bytes2ReadInt <- sizeInBytesFromDataType(xmlRes$int_dataType)
  
  #4- Obtain de m/z axis (this is only valid for imzML continuous mode)
  pt<-proc.time()
  if(xmlRes$mz_dataType == "int" || xmlRes$mz_dataType == "long")
  {
    readDataTypeMz <- integer()
  }
  if(xmlRes$mz_dataType == "float" || xmlRes$mz_dataType == "double")
  {
    readDataTypeMz <- numeric()
  }
  bytes2ReadMz <- sizeInBytesFromDataType(xmlRes$mz_dataType)
  
  bCreateRamdisk <- F #Start assuming data in processed and peak list
  bNoNeed2Resample <- T #Start assuming there is no need to resample the data
  if(xmlRes$continuous_mode)
  {
    mzAxis <- readBin(bincon, readDataTypeMz, xmlRes$run_data[1, "mzLength"], size = bytes2ReadMz, signed = T)
    bCreateRamdisk <- T
  }
  else
  {
    if(convertProcessed2Continuous)
    {
      #Processed mode, so a common mass axis must be calculated and stored in mzAxis var
      cat("Calculating the new mass axis...\n")
      if(!is.null(fun_text))
      {
        fun_text("Process mode, re-calculating mass axis...")
      }
      ppStep<-100/nrow(xmlRes$run_data)
      pp<-0
      #Update progress bar
      if( !fun_progress(pp) )
      {
        return(NULL) #progress bar function must return true if the loading process is to be continued.
      }
      
      icurr <- 1
      bLoad <- TRUE
      MergedSpc <- list()
      
      #Read only the first mass axis to compare if others are identical, this is the case for Bruker FTICR
      seek(bincon, rw = "read", where = xmlRes$run_data[1, "mzOffset"] )
      mzdd <- readBin(bincon, readDataTypeMz, xmlRes$run_data[1, "mzLength"], size = bytes2ReadMz, signed = T)
      
      #Read the intensity of the first mass axis 
      seek(bincon, rw = "read", where = xmlRes$run_data[1, "intOffset"] )
      dd <- readBin(bincon, readDataTypeInt, xmlRes$run_data[1, "intLength"], size = bytes2ReadInt, signed = T)
      
      #Fix duplicates and zero drops
      FirstSpectrumFixed <- fixImzMLDuplicates(mzdd, dd)
      
      #Calculate first mass axis bin size to avoid having to calculate it at each iteration
      firstMassAxis <- FirstSpectrumFixed$mass
      firstMassAxisBinSize <- rMSI::CalcMassAxisBinSize( firstMassAxis, FirstSpectrumFixed$intensity )
      rm(FirstSpectrumFixed)
      
      while(TRUE)
      {
        if(bLoad)
        {
          #Read mass axis for the current spectrum 
          seek(bincon, rw = "read", where = xmlRes$run_data[icurr, "mzOffset"] )
          mzdd <- readBin(bincon, readDataTypeMz, xmlRes$run_data[icurr, "mzLength"], size = bytes2ReadMz, signed = T)
          
          #Read intensity of current spectrum
          seek(bincon, rw = "read", where = xmlRes$run_data[icurr, "intOffset"] )
          dd <- readBin(bincon, readDataTypeInt, xmlRes$run_data[icurr, "intLength"], size = bytes2ReadInt, signed = T)
          
          #Fix duplicates and zero drops
          CurrSpectrumFixed <- fixImzMLDuplicates(mzdd, dd)
          
           #Get Bin size at peaks
          LoadMass <- CurrSpectrumFixed$mass
          if(identical(firstMassAxis, LoadMass))
          {
            bMassMerge <- F
            LoadBins <- firstMassAxisBinSize
          }
          else
          {
            bMassMerge <- T
            LoadBins <- rMSI::CalcMassAxisBinSize( LoadMass, CurrSpectrumFixed$intensity)  
          }
          bNoNeed2Resample <- bNoNeed2Resample & (!bMassMerge)
          
          #Shift register
          if(length(MergedSpc) > 0)
          {
            for(i in length(MergedSpc):1)
            {
              MergedSpc[[i+1]] <- MergedSpc[[i]]
            }
          }
          
          MergedSpc[[1]] <- list( level = 0, mass = LoadMass, bins =  LoadBins, merge = bMassMerge )
          icurr <- icurr + 1
          bLoad <- FALSE
          
          #Update progress bar
          pp_ant<-pp
          pp<-pp+ppStep
          if(!is.null(fun_progress) && (round(pp) > round(pp_ant)) )
          {
            #Update progress bar
            if( !fun_progress(pp) )
            {
              return(NULL) #progress bar function must return true if the loading process is to be continued.
            }
          }
        }
        
        if(length(MergedSpc) > 1)
        {
          if(MergedSpc[[1]]$level == MergedSpc[[2]]$level || icurr > nrow(xmlRes$run_data))
          { 
            #Merge!
            if( MergedSpc[[2]]$merge )
            {
              mam <- rMSI::MergeMassAxis(MergedSpc[[1]]$mass, MergedSpc[[1]]$bins, MergedSpc[[2]]$mass, MergedSpc[[2]]$bins )
            }
            else
            {
              #Both mass axes are identical so there is no need to merge them, this is the case for Bruker FTICR data
              mam <- list(mass = MergedSpc[[1]]$mass, bins = MergedSpc[[1]]$bins )
            }
            MergedSpc[[1]] <- list( level = MergedSpc[[1]]$level + 1, mass = mam$mass, bins =  mam$bins, merge = MergedSpc[[2]]$merge )
            
            #Shift register
            if(length(MergedSpc) > 2)
            {
              for(i in 2:(length(MergedSpc)-1))
              {
                MergedSpc[[i]] <- MergedSpc[[i+1]]
              }
            }
            MergedSpc[[length(MergedSpc)]] <- NULL #Delete the last element
            
            #End Condition
            if(icurr > nrow(xmlRes$run_data) && length(MergedSpc) == 1)
            {
              break
            }
          }
          else
          {
            bLoad <- TRUE
          }
        }
        else
        {
          bLoad <- TRUE
        }
      }

      bCreateRamdisk <- T
      mzAxis <-  MergedSpc[[1]]$mass
      rm(MergedSpc)
      gc()
    }
  }
  pt<-proc.time() - pt
  if(bCreateRamdisk)
  {
    cat(paste("\nMass axis calculation time:",round(pt["elapsed"], digits = 1),"seconds\n"))
    cat(paste("The re-sampled mass axis contains", length(mzAxis), "data points\n"))
  }
  
  #6- Create the ramdisk only if data is going to be coerced to continuous mode
  if( is.null(subImg_rename))
  {
    subImg_rename <-  basename(imzML_File)
  }
  else
  {
    ramdisk_path <- paste0(ramdisk_path, "_",subImg_rename )
  }
  if(bCreateRamdisk)
  {
    if(!is.null(fun_text))
    {
      fun_text("Loading data in the ramdisk...")
    }
    dir.create(ramdisk_path, recursive = T, showWarnings = F)
    datacube <- CreateEmptyImage(num_of_pixels = nrow(xmlRes$run_data), mass_axis = mzAxis, pixel_resolution = xmlRes$pixel_size_um,
                                 img_name = subImg_rename,
                                 ramdisk_folder = ramdisk_path,
                                 data_type = ffDataType,
                                 uuid = binUUID
                                 )
    #Ramdisk data accessors for full ff matrix acces (should be faster than accessing singles spectrum)
    currentDataCube <- 1
    currentDataIRow <- 1
    bufferMatrix <- matrix(nrow = nrow(datacube$data[[1]]), ncol = length(datacube$mass))
  }
  else
  {
    #No ramdisk is created then datacube will be a dummy list to store positions and peak lists
    datacube <- list(name = subImg_rename, 
                     size = c(NA,NA),
                     uuid = binUUID,
                     pos = matrix(NA, nrow = nrow(xmlRes$run_data), ncol = 2),
                     pixel_size_um = xmlRes$pixel_size_um,
                     peaks = list())
    colnames(datacube$pos) <- c("x", "y")
    names(datacube$size) <- c("x", "y")
  }
  #7- Read all spectra
  pt <- proc.time()
  cat("\nReading spectra from binary file...\n")
  ppStep<-100/nrow(xmlRes$run_data)
  pp<-0
  #Update progress bar
  if( !fun_progress(pp) )
  {
    return(NULL) #progress bar function must return true if the loading process is to be continued.
  }

  for(i in 1:nrow(xmlRes$run_data))
  {
    #Read intensity of current spectrum
    seek(bincon, rw = "read", where = xmlRes$run_data[i, "intOffset"] )
    dd <- readBin(bincon, readDataTypeInt, xmlRes$run_data[i, "intLength"], size = bytes2ReadInt, signed = T)
    
    if(!xmlRes$continuous_mode)
    {
      #Read mass axis for the current spectrum 
      seek(bincon, rw = "read", where = xmlRes$run_data[i, "mzOffset"] )
      mzdd <- readBin(bincon, readDataTypeMz, xmlRes$run_data[i, "mzLength"], size = bytes2ReadMz, signed = T)
      
      if(bCreateRamdisk)
      {
        #Fix duplicates and zero drops
        CurrSpectrumFixed <- fixImzMLDuplicates(mzdd, dd)
        mzdd  <- CurrSpectrumFixed$mass
        dd <- CurrSpectrumFixed$intensity

        #Apply re-sampling (only if needed...)
        if( !bNoNeed2Resample )
        {
          if(length(mzdd) == length(dd))
          {
            dd <- (approx( x = mzdd, y = dd, xout = mzAxis, ties = "ordered", yleft = 0, yright = 0))$y
            dd[which(is.na(dd))] <- 0 #Remove any possible NA
          }
          else
          {
            cat(paste0("WARNING: spectra at X = ", xmlRes$run_data$x[i], "  Y = ", xmlRes$run_data$y[i], " corrupt!\n" ))
            dd <- rep(0, length(mzAxis))  
          }
        } 
      }
    }
    
    if(bCreateRamdisk)
    {
      #Store the intensities to the ramdisk by blocks using a buffer matrix for better performance
      bufferMatrix[currentDataIRow, ] <- dd
      currentDataIRow <- currentDataIRow  + 1
      if( currentDataIRow > nrow(datacube$data[[currentDataCube]]))
      {
        datacube$data[[currentDataCube]][,] <- bufferMatrix
        currentDataCube <-  currentDataCube + 1
        currentDataIRow <- 1
        if(currentDataCube <= length(datacube$data))
        {
          bufferMatrix <- matrix(nrow = nrow(datacube$data[[currentDataCube]]), ncol = length(datacube$mass))
        }
      }
    }
    else
    {
      datacube$peaks[[i]]<-list(mass = mzdd, intensity = dd)
    }
    datacube$pos[i, "x"] <- xmlRes$run_data[i, "x"]
    datacube$pos[i, "y"] <- xmlRes$run_data[i, "y"]
    
    #Update progress bar
    pp_ant<-pp
    pp<-pp+ppStep
    if(!is.null(fun_progress) && (round(pp) > round(pp_ant)) )
    {
      #Update progress bar
      if( !fun_progress(pp) )
      {
        return(NULL) #progress bar function must return true if the loading process is to be continued.
      }
    }
  }
  pt<-proc.time() - pt
  cat(paste("\nBinary file reading time:",round(pt["elapsed"], digits = 1),"seconds\n"))

  #8- Close the bin file connection
  close(bincon)
  cat("\n")

  #9- Compute image size and arrange coords to avoid holes
  datacube$posMotors <- datacube$pos
  datacube$pos <- remap2ImageCoords(datacube$pos)
  datacube$size["x"] <- max(datacube$pos[,"x"])
  datacube$size["y"] <- max(datacube$pos[,"y"])

  if(bCreateRamdisk)
  {
    #10- Compute average spectrum
    datacube$mean <- AverageSpectrum(datacube)
  
    #11- Save the data cube for further fast access
    save(datacube, file = file.path(ramdisk_path, "datacube.RImg"))
    class(datacube) <- "rMSIObj"
  }
  else
  {
    class(datacube) <- "peakList"
  }
  
  #12- And it's done, just return de rMSI object
  if(!is.null(pb))
  {
    close(pb)
  }
  gc()
  if(!is.null(fun_text))
  {
    fun_text("Done")
  }
  
  return(datacube)
}