IFC: Tools for Imaging Flow Cytometry

################################################################################
# This file is released under the GNU General Public License, Version 3, GPL-3 #
# Copyright (C) 2020 Yohann Demont                                             #
#                                                                              #
# It is part of IFC package, please cite:                                      #
# -IFC: An R Package for Imaging Flow Cytometry                                #
# -YEAR: 2020                                                                  #
# -COPYRIGHT HOLDERS: Yohann Demont, Gautier Stoll, Guido Kroemer,             #
#                     Jean-Pierre Marolleau, Loïc Garçon,                      #
#                     INSERM, UPD, CHU Amiens                                  #
#                                                                              #
# DISCLAIMER:                                                                  #
# -You are using this package on your own risk!                                #
# -We do not guarantee privacy nor confidentiality.                            #
# -This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or        #
# FITNESS FOR A PARTICULAR PURPOSE. In no event shall the copyright holders or #
# contributors be liable for any direct, indirect, incidental, special,        #
# exemplary, or consequential damages (including, but not limited to,          #
# procurement of substitute goods or services; loss of use, data, or profits;  #
# or business interruption) however caused and on any theory of liability,     #
# whether in contract, strict liability, or tort (including negligence or      #
# otherwise) arising in any way out of the use of this software, even if       #
# advised of the possibility of such damage.                                   #
#                                                                              #
# You should have received a copy of the GNU General Public License            #
# along with IFC. If not, see <http://www.gnu.org/licenses/>.                  #
################################################################################

#' @title RIF/CIF File Merging
#' @description
#' Merges RIF or CIF files.
#' @param fileName paths of files to merge.
#' All files have to be either '.rif' or '.cif' files.
#' All files should have same channels.
#' @param write_to pattern used to export file.
#' Placeholders, like "\%d/\%s_fromR.\%e", will be substituted:\cr
#' -\%d: with full path directory of first element of 'fileName'\cr
#' -\%p: with first parent directory of first element of 'fileName'\cr
#' -\%e: with extension of 'fileName' (without leading .)\cr
#' -\%s: with shortname from of first element of 'fileName' (i.e. basename without extension).\cr
#' Exported file extension will be deduced from this pattern. It has to be the same as 'fileName', i.e. .cif or .rif.
#' @param extract_features whether to try to extract features. Default is FALSE. Not yet implemented.
#' @param endianness the endian-ness ("big" or "little") of the target system for the file. Default is .Platform$endian.\cr
#' Endianness describes the bytes order of data stored within the files. This parameter may not be modified.
#' @param verbose whether to display information (use for debugging purpose). Default is FALSE.
#' @param verbosity quantity of information displayed when verbose is TRUE; 1: normal, 2: rich. Default is 1.
#' @param overwrite whether to overwrite file or not. Default is FALSE.\cr
#' Note that if TRUE, it will overwrite exported file if paths of files in 'fileName' and deduced from 'write_to' arguments are different.
#' Otherwise, you will get an error saying that overwriting source file is not allowed.\cr
#' Note also that an original file, i.e. generated by IDEAS(R) or INSPIRE(R), will never be overwritten.
#' Otherwise, you will get an error saying that overwriting original file is not allowed.\cr
#' @param display_progress whether to display a progress bar. Default is TRUE.
#' @param add_tracking whether to register files' paths and objects' ids in the exported file. Default is TRUE.
#' @param ... other arguments to be passed.
#' @return It invisibly returns full path of exported file.
#' @keywords internal
mergeXIF <- function (fileName, write_to,
                      extract_features = FALSE, endianness = .Platform$endian, verbose = FALSE, verbosity = 1, 
                      overwrite = FALSE, display_progress = TRUE, 
                      add_tracking = TRUE, ...) {
  dots = list(...)
  # change locale
  locale_back <- setloc(c("LC_ALL" = "en_US.UTF-8"))
  enc_back <- options("encoding" = "UTF-8")
  on.exit(suspendInterrupts({setloc(locale_back); options(enc_back)}), add = TRUE)
  
  # check madatory param
  if(missing(fileName)) stop("'fileName' can't be missing")
  tmp = duplicated(fileName)
  if(any(tmp)) {
    warning(paste0("duplicated files have been removed from 'fileName': ","\n-", paste0(fileName[tmp],collapse="\n-")), call. = FALSE, immediate. = TRUE)
    fileName = fileName[!tmp]
  }
  if(length(fileName) < 2) stop("'fileName' should be at least of length 2 to create a merge")
  tmp = file.exists(fileName)
  if(!all(tmp)) stop(paste0(paste0("'fileName', can't find file",ifelse(sum(!tmp) > 1, "s:", ":")),"\n-", paste0(fileName[!tmp],collapse="\n-")))
  tmp = unique(getFileExt(fileName))
  if(length(tmp) != 1) stop("all files in 'fileName' should have same extension")
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  tryCatch({
    tmp = unique(sapply(fileName, cpp_checkTIFF))
  }, error = function(e) {
    stop(e$message, call. = FALSE)
  })
  if(length(unique(tmp)) != 1) stop("can't deal with files in 'fileName' with different endianness")
  XIF_test = unique(sapply(fileName, testXIF))
  if(length(XIF_test) != 1) stop("can't deal with files in 'fileName' with different image/mask storage")
  XIF_step = as.integer(XIF_test == 1) + 1L
  file_first = fileName[1]
  
  # no check on illumination
  # check on channels
  tmp = lapply(fileName, FUN = function(f) {
    IFD = getIFD(fileName = f, offsets = "first", verbose = verbose, verbosity = verbosity, bypass = TRUE)
    tmp_acq = read_xml(getFullTag(IFD = IFD, which = 1, tag = "33027", raw = TRUE), options=c("HUGE","RECOVER","NOENT","NOBLANKS","NSCLEAN"))
    as.logical(as.numeric(unlist(strsplit( xml_text(xml_find_first(tmp_acq, "//Imaging//ChannelInUseIndicators_0_11")), " ", useBytes = TRUE, fixed = TRUE))))
  })
  if(!do.call(what = "all.equal.list", args = tmp)) stop("files in 'fileName' have been acquired with different channels")
  
  if(missing(write_to)) stop("'write_to' can't be missing")
  extract_features = as.logical(extract_features); assert(extract_features, len = 1, alw = c(TRUE, FALSE))
  display_progress = as.logical(display_progress); assert(display_progress, len = 1, alw = c(TRUE, FALSE))
  verbose = as.logical(verbose); assert(verbose, len = 1, alw = c(TRUE, FALSE))
  add_tracking = as.logical(add_tracking); assert(add_tracking, len = 1, alw = c(TRUE, FALSE))
  if(verbose) {
    verbosity = as.integer(verbosity); assert(verbosity, len = 1, alw = c(1,2))
    VER = ifelse(verbose & (verbosity==2), TRUE, FALSE)
  } else {
    VER = FALSE
  }
  
  r_endian = cpp_checkTIFF(file_first)
  swap = r_endian != endianness
  f_Ext = getFileExt(file_first)
  assert(write_to, len = 1, typ = "character")
  splitf_obj = splitf(file_first)
  splitp_obj = splitp(write_to)
  write_to = formatn(splitp_obj, splitf_obj)
  e_Ext = getFileExt(write_to); assert(e_Ext, len = 1, alw = c("cif", "rif"))
  if(f_Ext != e_Ext) stop("'fileName' and 'write_to' should have same extension")
  if(any(splitp_obj$channel > 0)) message("'write_to' has %c argument but channel information can't be retrieved with mergeXIF()")
  if(any(splitp_obj$object > 0)) message("'write_to' has %o argument but channel information can't be retrieved with mergeXIF()")
  dir_name = dirname(write_to)
  if(!dir.exists(dir_name)) if(!dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)) stop(paste0("can't create\n", dir_name))
  
  overwritten = FALSE
  if(file.exists(write_to)) {
    write_to = normalizePath(write_to, winslash = "/", mustWork = FALSE)
    if(!overwrite) stop(paste0("file ", write_to, " already exists"))
    if(any(tolower(fileName) %in% tolower(write_to))) stop("you are trying to overwrite source file which is not allowed")
    tryCatch({
      IFD_export = getIFD(fileName = write_to, offsets = "first", force_trunc = TRUE, trunc_bytes = 4, verbose = verbose, verbosity = verbosity, bypass = TRUE)[[1]] 
    }, error = function(e) {
      stop(paste0(write_to, "\ndoes not seem to be well formatted:\n", e$message), call. = FALSE)
    })
    if(length(IFD_export$tags[["33090"]])==0) stop("you are trying to overwrite an original file which is not allowed")
    tmp_file = tempfile()
    overwritten = TRUE
  }
  file_w = ifelse(overwritten, tmp_file, write_to)
  title_progress = basename(write_to)
  
  # # extract features from files
  # feat = try(lapply(files, FUN = function(f) {
  #   suppressWarnings(ExtractFromXIF(fileName = f, extract_features = TRUE, extract_images = FALSE, 
  #                   extract_offsets = FALSE, extract_stats = TRUE, verbose = verbose, 
  #                   verbosity = verbosity, display_progress = TRUE,
  #                   fast = TRUE, recursive = TRUE)$features)
  # }))
  # str(feat)
  
  # unwanted tags
  # 33004 corresponds to file date
  # 33005 corresponds to user
  # 33018 corresponds to total object number
  # 33029 corresponds to merged flag for cif merged
  # 33030 corresponds to merged files in RIF
  # 33070 corresponds to total cell count 
  # 33081 appears in merged file, it has same val = 33080, but is of typ = 2 and map NULL 
  # 33082 corresponds to binary Features version, will be overwritten if features are found
  # 33083 corresponds to Features values in merged or subset
  # 33090, 33091, 33092, 33093, 33094 corresponds to tags we add to track objects origin
  unwanted = c(33004, 33005, 33018, 33029, 33030, 33070, 33081, 33082, 33083, 33090, 33091, 33092, 33093, 33094)
  
  # open connection for writing
  tryCatch(suppressWarnings({
    towrite = file(description = file_w, open = "wb")
  }), error = function(e) {
    stop(paste0(ifelse(overwritten,"temp ","'write_to' "), "file: ", file_w, "\ncan't be created: check name ?"))
  })
  write_to = normalizePath(write_to, winslash = "/", mustWork = FALSE)
  tryCatch(expr = {
    # magic number
    writeBin(object = c(cpp_uint32_to_raw(18761)[1:2], cpp_uint32_to_raw(42)[1:2]), con = towrite, endian = r_endian)
    # define writing position
    pos = 8
    tmp = cpp_uint32_to_raw(pos %% 4294967296)
    if(endianness != .Platform$endian) tmp = rev(tmp)
    writeBin(object = tmp, con = towrite, endian = endianness)
    
    # compute final object number
    final_obj = sum(sapply(fileName, FUN = function(f) getFullTag(IFD = getIFD(fileName = f, offsets = "first", trunc_bytes = 4, force_trunc = TRUE, verbose = verbose, verbosity = verbosity, bypass = TRUE), which = 1, tag = "33018")))
    
    # extract information from first IFD of first file
    IFD_first = getIFD(fileName = file_first, offsets = "first", trunc_bytes = 4, force_trunc = TRUE, verbose = verbose, verbosity = verbosity, bypass = TRUE)
    toread1 = file(description = file_first, open = "rb")
    if(display_progress) {
      pb1 = newPB(title = title_progress, label = "extracting 1st IFD", min = 0, max = length(IFD_first[[1]]$tags), initial = 0, style = 3)
      pb2 = newPB(title = title_progress, label = " ", min = 0, max = final_obj * XIF_step, initial = 0, style = 3)
      on.exit(endPB(pb2), add = TRUE)
    }
    tryCatch(expr = {
    IFD = cpp_fastTAGS(fname = file_first, offset = IFD_first[[1]]$curr_IFD_offset, swap = swap)
    
    # set additional IFD
    # 33029 or 33030 names of files that constitute the merge
    fname2 = collapse_raw(lapply(fileName, FUN = function(x) charToRaw(normalizePath(enc2native(x), mustWork = FALSE, winslash =  "\\"))), as.raw(0x7c))
    if(f_Ext == "rif") {
      feat_pos = IFD_first[[1]]$tags[["33080"]]$val + 4294967296 * (file.size(file_first) %/% 4294967296)
      seek(toread1, feat_pos)
      fv = readBin(toread1, what = "double", n = 2, size = 4)
      seek(toread1, feat_pos)
      ifd_merged = c(buildIFD(val = fname2, typ = 2, tag = 33030, endianness = r_endian),
                     buildIFD(val = 0, typ = 4, tag = 33070, endianness = r_endian),
                     buildIFD(val = feat_pos, typ = 4, tag = 33081, endianness = r_endian),
                     buildIFD(val = 1, typ = 4, tag = 33082, endianness = r_endian))
    } else {
      feat_pos = IFD_first[[1]]$tags[["33083"]]$val
      seek(toread1, feat_pos)
      fv = readBin(toread1, what = "double", n = 2, size = 4)
      seek(toread1, feat_pos)
      ifd_merged = c(buildIFD(val = 1, typ = 4, tag = 33029, endianness = r_endian),
                     buildIFD(val = NULL, typ = 2, tag = 33030, endianness = r_endian),
                     buildIFD(val = 0, typ = 4, tag = 33070, endianness = r_endian),
                     buildIFD(val = NULL, typ = 2, tag = 33081, endianness = r_endian),
                     buildIFD(val = 0, typ = 4, tag = 33082, endianness = r_endian))
    }
    if(identical(fv[1], IFD_first[[1]]$tags[["33018"]]$val)) {
      ifd_merged = c(ifd_merged,
                     buildIFD(val = readBin(toread1, what = "raw", n = 4 * (2 + prod(fv))), typ = 1, tag = 33083, endianness = r_endian))
    }
    # 33004 now time
    ifd_time = buildIFD(val = format(Sys.time(), "%d-%m-%Y %H:%M:%S %p"), typ = 2, tag = 33004, endianness = r_endian)
    # 33005 user
    ifd_user = buildIFD(val = "IFC package", typ = 2, tag = 33005, endianness = r_endian)
    # compute final object number
    ifd_obj = buildIFD(val = final_obj, typ = 4, tag = 33018, endianness = r_endian)
    # 33090 ifc pkg version
    ifd_version = buildIFD(val = paste0(unlist(packageVersion("IFC")), collapse = "."), typ = 2, tag = 33090, endianness = r_endian)
    if(add_tracking) {
      # 33091 names of files that constitute the merge
      ifd_files = buildIFD(val = collapse_raw(c(list(suppressWarnings(getFullTag(IFD = IFD_first, which = 1, tag = "33091", raw = TRUE))),
                                                list(fname2)),
                                              collapse = as.raw("0x3e")),
                           typ = 2, tag = 33091, endianness = r_endian)
      # 33092 checksum of each file constituting the merge
      ifd_checksum = buildIFD(val = collapse_raw(c(list(suppressWarnings(getFullTag(IFD = IFD_first, which = 1, tag = "33092", raw = TRUE))),
                                                   list(collapse_raw(lapply(fileName, FUN = function(x) charToRaw(num_to_string(cpp_checksum(x)))), as.raw(0x7c)))),
                                                 collapse = as.raw("0x3e")),
                              typ = 2, tag = 33092, endianness = r_endian)
    } else {
      ifd_files = list()
      ifd_checksum = list()
    }

    pos = writeIFD(ifd = IFD$tags[!(names(IFD$tags) %in% unwanted)],
                   r_con = toread1, w_con = towrite, pos = pos, 
                   extra = c(ifd_time, ifd_user, ifd_obj, ifd_merged, ifd_version, ifd_files, ifd_checksum),
                   endianness = r_endian)
    }, error = function(e) {
      stop(e$message)
    },
    finally = {
      if(display_progress) endPB(pb1)
      close(toread1)
    })
    
    # initialize object count
    off_obj = 0
    
    # repeat same process for img / msk data for all files
    for(f in fileName) {
      fname3 = charToRaw(f)
      IFD_first = getIFD(fileName = f, offsets = "first", trunc_bytes = 4, force_trunc = TRUE, verbose = verbose, verbosity = verbosity, bypass = TRUE)
      obj_count = XIF_step * getFullTag(IFD = IFD_first, which = 1, tag = "33018")
      IFD = IFD_first[[1]]
      
      label_progress = basename(f)
      # open connections for reading
      toread = file(description = f, open = "rb")
      tryCatch({
        OBJECT_ID = NULL
        for(i_obj in seq(1 - (XIF_test != 1), to = obj_count - (XIF_test != 1))) {
          cum_obj = i_obj + off_obj
          if(display_progress) setPB(pb = pb2, value = cum_obj, title = title_progress, label = paste0(label_progress, " - merging objects"))
          # extract IFD
          IFD = cpp_fastTAGS(fname = f, offset = IFD$next_IFD_offset, swap = swap)
          TYPE = IFD$tags[["33002"]]$val
          if(any(TYPE == 2)) OBJECT_ID = IFD$tags[["33003"]]$val
          ifd = IFD$tags[!(names(IFD$tags) %in% unwanted)]
          
          # modify object id
          tmp = cpp_uint32_to_raw(floor(cum_obj/XIF_step))
          if(endianness != r_endian) tmp = rev(tmp)
          if(length(ifd[["33003"]])!=0) ifd[["33003"]]$raw[9:12] <- tmp
          
          if(add_tracking) {
            extra = c(
              # register current object id in new tag to be able to track it
              buildIFD(val = collapse_raw(c(list(suppressWarnings(getFullTag(IFD = structure(list(IFD), class = "IFC_ifd_list", "fileName_image" = f), which = 1, tag = "33093", raw = TRUE))),
                                            list(charToRaw(num_to_string(OBJECT_ID)))),
                                          collapse = as.raw(0x3e)),
                       typ = 2, tag = 33093, endianness = r_endian),
              # add origin fileName to allow to track where exported objects are coming from
              buildIFD(val = collapse_raw(c(list(suppressWarnings(getFullTag(IFD = structure(list(IFD), class = "IFC_ifd_list", "fileName_image" = f), which = 1, tag = "33094", raw = TRUE))),
                                            list(fname3)),
                                          collapse = as.raw(0x3e)),
                       typ = 2, tag = 33094, endianness = r_endian))
          } else {
            extra = NULL
          }
          pos = writeIFD(ifd, r_con = toread, w_con = towrite, pos = pos, extra = extra, endianness = r_endian, last = (cum_obj == final_obj * 2))
        }
      }, error = function(e) {
        stop(e$message, call. = FALSE)
      }, finally = {
        close(toread)
      })
      off_obj = off_obj + obj_count
    }
  }, error = function(e) {
    stop(paste0("Can't create 'write_to' file.\n", write_to,
                ifelse(overwritten,"\nFile was not modified.\n","\n"),
                "See pre-file @\n", normalizePath(file_w, winslash = "/", mustWork = FALSE), "\n",
                e$message), call. = FALSE)
  }, finally = close(towrite))
  if(overwritten) {
    mess = paste0("\n######################\n", write_to, "\nhas been successfully overwritten\n")
    if(!suppressWarnings(file.rename(to = write_to, from = file_w))) { # try file renaming which is faster
      if(!file.copy(to = write_to, from = file_w, overwrite = TRUE)) { # try file.copy if renaming is not possible
        stop(paste0("Can't copy temp file@\n", normalizePath(file_w, winslash = "/"), "\n",
                    "Can't create 'write_to' file.\n", write_to,
                    "\nFile was not modified.\n"), call. = FALSE)
      } else {
        file.remove(file_w, showWarnings = FALSE)
      }
    }
  } else {
    mess = paste0("\n######################\n", write_to, "\nhas been successfully exported\n")
  }
  message(mess)
  return(invisible(write_to))
}