# R/fcs.R

################################################################################
# This file is released under the GNU General Public License, Version 3, GPL-3 #
# Copyright (C) 2021 Yohann Demont                                             #
#                                                                              #
# It is part of IFC package, please cite:                                      #
# -IFC: An R Package for Imaging Flow Cytometry                                #
# -YEAR: 2020                                                                  #
# -COPYRIGHT HOLDERS: Yohann Demont, Gautier Stoll, Guido Kroemer,             #
#                     Jean-Pierre Marolleau, Loïc Garçon,                      #
#                     INSERM, UPD, CHU Amiens                                  #
#                                                                              #
# DISCLAIMER:                                                                  #
# -You are using this package at your own risk!                                #
# -We do not guarantee privacy nor confidentiality.                            #
# -This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or        #
# FITNESS FOR A PARTICULAR PURPOSE. In no event shall the copyright holders or #
# contributors be liable for any direct, indirect, incidental, special,        #
# exemplary, or consequential damages (including, but not limited to,          #
# procurement of substitute goods or services; loss of use, data, or profits;  #
# or business interruption) however caused and on any theory of liability,     #
# whether in contract, strict liability, or tort (including negligence or      #
# otherwise) arising in any way out of the use of this software, even if       #
# advised of the possibility of such damage.                                   #
#                                                                              #
# You should have received a copy of the GNU General Public License            #
# along with IFC. If not, see <http://www.gnu.org/licenses/>.                  #
################################################################################

################################################################################
#             functions described hereunder are experimental                   #
#              inputs and outputs may change in the future                     #
################################################################################

#' @title Spillover Converter
#' @description
#' Converts a spillover matrix to a spillover keyword and vice versa
#' @param spillover either a spillover matrix or a spillover keyword
#' @return if 'spillover' is a matrix, it returns a string. If 'spillover' is a string, it returns a matrix. In all cases, if 'spillover' is of length 0, NULL is returned.
#' @keywords internal
convert_spillover <- function(spillover) {
  if(length(spillover) == 0) return(NULL)
  if(is.matrix(spillover)) {
    if(length(rownames(spillover)) == 0) stop("'spillover' should have rownames")
    feat_n = parseFCSname(rownames(spillover))
    return(paste(ncol(spillover), paste0(feat_n$PnN, collapse=","), paste0(as.vector(spillover), collapse=","), sep=","))
  } else {
    foo = strsplit(spillover, split=",", fixed=TRUE)[[1]]
    feat_l = as.integer(foo[1])
    feat_n = foo[2:(feat_l+1)]
    vals = foo[-(seq_len(feat_l+1))]
    if(length(vals) != feat_l^2) stop("'spillover' keyword does not fulfill fcs specifications")
    return(matrix(as.numeric(vals), ncol=feat_l, nrow=feat_l, dimnames=list(NULL, feat_n)))
  }
}
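
# Illustrative sketch of convert_spillover()'s keyword -> matrix direction
# (channel names are hypothetical; values are read back column-wise):
#   sp <- convert_spillover("2,FL1-A,FL2-A,1,0.1,0.05,1")
#   # sp is a 2x2 numeric matrix with colnames "FL1-A" and "FL2-A"
#   # the reverse direction requires a matrix whose rownames carry the PnN names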

#' @title FCS Keyword Checker
#' @description
#' Helper to check that FCS keyword-value pairs are compliant with specifications
#' @param text a named list of keywords values.
#' @param delimiter delimiter used to separate keyword-value pairs. /!\ NOTE that files with 0x00 'delimiter' can _NOT_ be parsed.
#' @param version version to check keywords compliance against. Default is 3.0.
#' @param encoding name of the encoding for raw to character conversion. Default is "UTF-8".
#' @param fun function to execute when mandatory parameters are not met. Default is "warning". Allowed are "stop","warning","message","return".
#' @param ... other arguments to be passed.
#' @keywords internal
FCS_check_keywords <- function(text, delimiter, version = 3.0, encoding = "UTF-8", fun = "warning", ...) {
  # prepare return message
  msg = c()
  
  # check inputs
  assert(text, typ = "list")
  assert(fun, len = 1, alw = c("stop","warning","message","return"))
  version = suppressWarnings(as.numeric(version)); version = na.omit(version); assert(version, len=1)
  encoding = na.omit(encoding); assert(encoding, len=1, typ="character")
  assert(delimiter, len=1, typ="character")
  raw_delimiter = attr(delimiter, "raw")
  if((length(raw_delimiter) != 1) || (typeof(raw_delimiter) != "raw")) stop("can't find valid \"raw\" attribute of 'delimiter'")
  
  # modify encoding: FCS keyword values should be UTF-8 and keyword names ASCII (a subset of UTF-8)
  old_enc <- options("encoding")
  on.exit(options(old_enc))
  options("encoding" = encoding)
  
  # set keywords to upper case
  names(text) = toupper(names(text))
  
  # check required keywords
  key_mandatory = c("$DATATYPE","$PAR","$MODE","$BYTEORD","$NEXTDATA",
                    "$BEGINSTEXT", "$ENDSTEXT",
                    "$BEGINANALYSIS", "$ENDANALYSIS",
                    "$BEGINDATA", "$ENDDATA", 
                    "$CYT", "$TOT")
  if(version == 2.0) key_mandatory = key_mandatory[1:5]
  if(version <  3.1) key_mandatory = setdiff(key_mandatory, c("$CYT"))
  if(version >= 3.2) key_mandatory = setdiff(key_mandatory, c("$MODE", "$BEGINSTEXT", "$ENDSTEXT", "$BEGINANALYSIS", "$ENDANALYSIS"))
  tmp = key_mandatory %in% names(text)
  if(!all(tmp)) msg = c(msg, paste0("`REQUIRED not found`:\n\t- ", paste0(key_mandatory[!tmp], collapse = "\n\t- ")))
  
  # check type/mode/byteord
  type = text[["$DATATYPE"]]
  byteord = text[["$BYTEORD"]]
  # isTRUE() guards against missing keywords so the checker reports instead of erroring
  if(!isTRUE(type %in% c("A","I","F","D"))) msg = c(msg, paste0("`non-compatible $DATATYPE[", type,"] (allowed are \"A\",\"I\",\"F\",\"D\")`"))
  if(version >= 3.1) if(isTRUE(type %in% c("A")))  msg = c(msg, paste0("`deprecated $DATATYPE[", type,"]`"))
  mode = text[["$MODE"]]
  if(version <= 3.1) {
    if(!isTRUE(mode %in% c("L","C","U"))) msg = c(msg, paste0("`non-compatible $MODE[",mode,"] (allowed are \"L\",\"C\",\"U\")`"))
    if(isTRUE(mode %in% c("C","U")))  msg = c(msg, paste0("`deprecated $MODE[", mode,"]`"))
  } else {
    if("$MODE" %in% names(text)) {
      msg = c(msg, paste0("`$MODE is a deprecated keyword`"))
      if(!isTRUE(mode %in% c("L"))) msg = c(msg, paste0("`non-compatible $MODE[",mode,"] (allowed is \"L\")`"))
    }
  }
  if(version >= 3.1) if(!isTRUE(byteord %in% c("1,2,3,4","4,3,2,1"))) msg = c(msg, paste0("`non-compatible $BYTEORD[",byteord,"] (allowed are \"1,2,3,4\",\"4,3,2,1\")`"))
  
  # check number of objects/parameters
  n_obj = na.omit(suppressWarnings(as.integer(text[["$TOT"]])))
  n_par = na.omit(suppressWarnings(as.integer(text[["$PAR"]]))); if(length(n_par) == 0) n_par = 0L # guard against missing $PAR so downstream seq_len() calls don't error
  if((length(n_obj) == 1) && (n_obj == 0)) msg = c(msg, "`$TOT is 0`")
  if((length(n_par) == 1) && (n_par == 0)) msg = c(msg, "`$PAR is 0`")
  if(version > 2.0) {
    foo = grep("^\\$P\\d+N$", names(text), value = TRUE, ignore.case = TRUE)
    if(length(foo) != n_par) msg = c(msg, paste0("`$PnN mismatch between found[",length(foo),"] vs expected[",n_par,"]`"))
  }
  
  # check uniqueness
  tmp = duplicated(names(text))
  if(any(tmp)) msg = c(msg, paste0("`non unique`:\n\t- ", paste0(names(text)[tmp], collapse = "\n\t- ")))
  
  # check uniqueness of PnN
  PnN = text[paste0("$P",seq_len(n_par),"N")]
  foo = PnN[sapply(PnN, length) != 0]
  tmp = duplicated(foo)
  if(any(tmp)) {
    bar = sapply(unique(foo[tmp]), FUN = function(x) paste0(names(which(x == foo)), collapse = ","))
    msg = c(msg, paste0("`non unique $PnN`:\n\t- ", paste0(paste0(bar, "[",unique(foo[tmp]), "]"), collapse = "\n\t- ")))
  }
  
  # check non empty
  tmp = sapply(text, length) == 0
  if(any(tmp)) msg = c(msg, paste0("`empty value",ifelse(sum(tmp)==1,"","s")," found`:\n\t- ", paste0(names(text)[tmp], collapse = "\n\t- ")))
  
  # check keywords names use only 0x20 - 0xFE characters
  if(version >= 3.0) {
    # starting with ver 3.0 the spec says: the TEXT part should not contain return (ASCII 13), line feed (ASCII 10) or other unprintable characters (unless they are values or delimiters)
    tmp = sapply(names(text), FUN = function(x) {v = charToRaw(x); all(v >= 0x20 & v <= 0xFE) })
    if(!all(tmp)) msg = c(msg, paste0("`bad BYTE in name allowed ASCII are [0x20-0xFE (32-254)]`:\n\t- ", paste0(names(text)[!tmp], collapse = "\n\t- ")))
  } else {
    # for ver 2.0 the spec mentions: the TEXT part should not contain 'carriage return' or 'line feed' characters (unless they are values or delimiters)
    tmp = sapply(names(text), FUN = function(x) {v = charToRaw(x); all(v != 0x0A & v != 0x0D) })
    if(!all(tmp)) msg = c(msg, paste0("`bad BYTE in name 0x0A and 0x0D are not allowed`:\n\t- ", paste0(names(text)[!tmp], collapse = "\n\t- ")))
  }
  
  # check numeric keywords are not padded with characters other than 0
  bar = TRUE
  old_loc = Sys.getlocale(category = "LC_ALL")
  suppressWarnings(Sys.setlocale(category = "LC_ALL", locale = "C"))
  tryCatch({
    foo = sapply(text, FUN = function(x) (length(x) !=0) && (!is.na(suppressWarnings(as.numeric(x)))))
    bar = sapply(text[foo], FUN = function(x) {
      xx = tolower(x)
      if(xx %in% c("true","false")) return(TRUE) # to handle TRUE/FALSE to num conversion
      xx = strsplit(x = xx, split = "", fixed = TRUE)[[1]]
      if(any(xx == "x")) return(TRUE) # to handle raw to num conversion
      all(xx %in% c("0","1","2","3", "4", "5", "6", "7", "8", "9", "+", "-",".","e"))
    })
  }, finally = suppressWarnings(Sys.setlocale(category = "LC_ALL", locale = old_loc)))
  if(!all(bar)) msg = c(msg, paste0("`padded numeric (only padding with 0 is allowed)`:\n\t- ", paste0(paste0(names(text)[foo][!bar], "[",text[foo][!bar], "]"), collapse = "\n\t- ")))
  
  # check keyword-value pairs do not start with delimiter
  # delimiter = rawToChar(delimiter)
  foo = sapply(names(text), FUN = function(x) {
    if(length(x) == 0) return(FALSE)
    substr(x, 1,1) == delimiter
  })
  bar = sapply(text, FUN = function(x) {
    if(length(x) == 0) return(FALSE)
    substr(x, 1,1) == delimiter
  })
  tmp = foo | bar
  if(any(tmp)) msg = c(msg, paste0("`start with delimiter`:\n\t- ", paste0(paste0(names(text)[tmp], "[",text[tmp], "]"), collapse = "\n\t- ")))
  
  # check range and amplification
  foo = sapply(seq_len(n_par), FUN = function(i) {
    msg = c()
    PnB = paste0("$P",i,"B")
    PnE = paste0("$P",i,"E")
    PnG = paste0("$P",i,"G")
    PnN = paste0("$P",i,"N")
    PnR = paste0("$P",i,"R")
    if(length(text[[PnB]]) == 0) {
      msg = c(msg, paste0(PnB, " not found (REQUIRED)"))
    } else {
      bit = na.omit(suppressWarnings(as.integer(text[[PnB]])))
      if(length(bit) == 0) {
        if(!(type == "A" && text[[PnB]] == "*")) msg = c(msg, paste0("invalid PnB [",text[[PnB]],"]"))
      } else {
        if(type == "F" && bit != 32) if(version >= 3.1) c(msg, paste0(PnB, "[",text[[PnB]],"] should be \"32\" with $DATATYPE[F]"))
        if(type == "D" && bit != 64) if(version >= 3.1) c(msg, paste0(PnB, "[",text[[PnB]],"] should be \"64\" with $DATATYPE[D]"))
      }
    }
    if(length(text[[PnR]]) == 0) {
      msg = c(msg, paste0(PnR, " not found (REQUIRED)"))
    } else {
      ran = na.omit(suppressWarnings(as.numeric(text[[PnR]])))
      if(length(ran) == 0) msg = c(msg, paste0(PnR,"[",text[[PnR]],"] is not valid"))
    }
    if(length(text[[PnN]]) == 0) {
      if(version > 2.0) msg = c(msg, paste0(PnN, " not found (REQUIRED)")) # $PnN is not listed as REQUIRED in FCS2.0
    } else {
      if(grepl(",", text[[PnN]], fixed = TRUE)) c(msg, paste0(PnN,"[",text[[PnN]],"] should not contain ','")) 
    }
    if(length(text[[PnE]]) == 0) {
      if(version > 2.0) msg = c(msg, paste0(PnE, " not found (REQUIRED)")) # $PnE is not listed as REQUIRED in FCS2.0
    } else {
      trans = na.omit(suppressWarnings(as.numeric(strsplit(text[[PnE]], split = ",", fixed = TRUE)[[1]])))
      if(length(trans) != 2) {
        msg = c(msg, paste0(PnE,"[",text[[PnE]],"] should be \"f1,f2\" with numeric f1 and f2"))
      } else {
        if((version >= 3.1) && (type == "D" || type == "F")) { # starting from FCS3.1 $PnE has to be "0,0" for DATATYPE D and F
          if((trans[1] != 0) || (trans[2] != 0)) msg = c(msg, paste0(PnE,"[",text[[PnE]],"] should be \"f1,f2\" where f1=0 AND f2=0 with $DATATYPE[",type,"]"))
        } else { # otherwise $PnE always (? for 1.0) had to be f1,f2 with  f1=f2=0 OR (f1!=0 AND f2!=0)
          if(!(all(trans == 0) || all(trans != 0))) msg = c(msg, paste0(PnE,"[",text[[PnE]],"] should be \"f1,f2\" where (f1=0 AND f2=0, for linear) OR (f1!=0 AND f2!=0, for log) with $DATATYPE[",type,"]"))
        }
      }
    }
    gain = na.omit(suppressWarnings(as.numeric(text[[PnG]])))
    if(length(gain) != 0) {
      if(length(text[[PnE]]) != 0) {
        trans = na.omit(suppressWarnings(as.numeric(strsplit(text[[PnE]], split = ",", fixed = TRUE)[[1]])))
        if((length(trans) == 2) && (gain != 1) && ((trans[1] != 0) || (trans[2] != 0))) msg = c(msg, paste0(PnG,"[",text[[PnG]],"] should not be used with logarithmic amplification ($PnE != \"0,0\")",PnE,"[",text[[PnE]],"]"))
      }
    }
    return(paste0(msg, collapse="|"))
  })
  bar = unlist(recursive = FALSE, use.names = FALSE, lapply(foo, FUN = function(x) x != ""))
  if(any(bar)) msg = c(msg, paste0("`bad PnB/PnE/PnG/PnN/PnR keywords`:\n\t- ", paste0(foo[bar], collapse="\n\t- ")))
  
  # check spillover
  if("$SPILLOVER" %in% names(text)) {
    tryCatch({
      sp = convert_spillover(text[["$SPILLOVER"]])
      if(!all(colnames(sp) %in% unlist(recursive = FALSE, use.names = FALSE, PnN))) stop("'spillover' is defined with non PnN names [", paste0(colnames(sp),collapse = ","),"]")
    }, error = function(e) {
      msg <<- c(msg, paste0("$SPILLOVER ", e$message))
    })
  }
  if(length(msg) != 0) msg = c(sprintf("non FCS%.1f compliant keywords", version), msg)
  msg = gsub("\r","",msg,fixed=TRUE)
  if(fun == "return") return(msg)
  if(length(msg) != 0) {
    args = list(paste0(msg, collapse = "\n"))
    if(fun == "warning") args = c(args, list(call. = FALSE, immediate. = TRUE))
    do.call(what = fun, args = args)
  }
}
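
# Minimal sketch (hypothetical keyword values) showing how compliance messages
# can be collected instead of emitted as a warning; note that 'delimiter' must
# carry its raw byte as a "raw" attribute:
#   delim <- structure("/", raw = charToRaw("/"))
#   FCS_check_keywords(text = list("$DATATYPE" = "F", "$MODE" = "L", "$PAR" = "1",
#                                  "$TOT" = "100", "$BYTEORD" = "1,2,3,4"),
#                      delimiter = delim, version = 3.0, fun = "return")
#   # returns a character vector listing e.g. the REQUIRED keywords that are missing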

#' @title FCS Header Parser
#' @description
#' Helper to parse header segment from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param header a list whose members define the "at" offset from header$start$at and the "n" number of bytes to extract:\cr
#' - start: where to start reading current FCS dataset.       Default is list(at = 0,  n = 6),\cr
#' - space: where to retrieve space.                          Default is list(at = 6,  n = 4),\cr
#' - text_beg: where to retrieve file text segment beginning. Default is list(at = 10, n = 8),\cr
#' - text_end: where to retrieve file text segment end.       Default is list(at = 18, n = 8),\cr
#' - data_beg: where to retrieve file data segment beginning. Default is list(at = 26, n = 8),\cr
#' - data_end: where to retrieve file data segment end.       Default is list(at = 34, n = 8).
#' @param encoding name of the encoding for raw to character conversion. Default is "UTF-8".
#' @param ... other arguments to be passed.
#' @keywords internal
readFCSheader <- function(fileName, header, encoding = "UTF-8", ...) {
  # prepare fileName
  if(missing(fileName)) stop("'fileName' can't be missing")
  assert(fileName, len = 1)
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  if(!file.exists(fileName)) stop(paste("can't find", fileName, sep=" "))
  fsize = file.size(fileName)
  encoding = na.omit(encoding); assert(encoding, len=1, typ="character")
  
  opt_default = eval(formals(readFCS)$options)
  if(missing(header)) {
    header = opt_default$header
  } else {
    for(i in c("start", "space", "text_beg", "text_end", "data_beg", "data_end")) {
      if(!(i %in% names(header))) header[[i]] <- opt_default$header[[i]]
    }
  }
  
  # ensure start$at is valid
  at = suppressWarnings(as.numeric(header[["start"]]$at[1]))
  at = na.omit(at[at >=0])
  if((length(at) == 0) || (at > file.size(fileName))) stop("HEADER segment: start$at[",at,"] points to outside of the file")
  assert(at, len=1)
  
  # create connection binary reading
  toread = file(description = fileName, open = "rb")
  on.exit(close(toread))
  # modify encoding: FCS keyword values should be UTF-8 and keyword names ASCII (a subset of UTF-8)
  old_enc <- options("encoding")
  options("encoding" = encoding) # FIXME should we read $UNICODE before parsing for FCS < 3.1
  tryCatch({
    # we will read offsets from options
    # FIXME, should we validate each header entry ?
    # e.g. header$start == FCSx.x
    # and so on ...
    header = sapply(names(header), simplify = FALSE, FUN = function(x) {
      goto = header[[x]]$at[1] + ifelse(x == "start", 0, at)
      if(goto > fsize) stop("HEADER segment: ",x,"[",goto,"] points to outside of the file")
      seek(toread, goto)
      if(x %in% c("start", "space")) {
        raw = rawToChar(readBin(toread, what = "raw", n = header[[x]]$n))
      } else {
        raw = trimws(x = rawToChar(readBin(toread, what = "raw", n = header[[x]]$n)))
        raw = suppressWarnings(na.omit(as.integer(raw) + at))
      }
      raw
    })
  }, finally = options(old_enc))
  return(structure(header, "offset" = at))
}
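
# Minimal usage sketch ("file.fcs" is a placeholder path):
#   hdr <- readFCSheader("file.fcs")
#   # hdr$start is e.g. "FCS3.0"; hdr$text_beg and hdr$text_end locate the TEXT
#   # segment, already shifted by attr(hdr, "offset") i.e. the dataset start position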

#' @title FCS Delimiter Reader
#' @description
#' Helper to extract delimiter from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param at offset of delimiter. Default is 58.
#' @param version version to check keywords compliance against. Default is 3.0.
#' @param encoding name of the encoding for raw to character conversion. Default is "UTF-8".
#' @param ... other arguments to be passed.
#' @keywords internal
readFCSdelimiter <- function(fileName, at = 58, version = 3.0, encoding = "UTF-8", ...) {
  # prepare fileName
  if(missing(fileName)) stop("'fileName' can't be missing")
  assert(fileName, len = 1)
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  if(!file.exists(fileName)) stop(paste("can't find", fileName, sep=" "))
  encoding = na.omit(encoding); assert(encoding, len=1, typ="character")
  version = suppressWarnings(as.numeric(version)); version = na.omit(version); assert(version, len=1)
  
  # ensure start$at is valid
  at = suppressWarnings(as.numeric(at))
  at = na.omit(at[at >=0])
  if((length(at) == 0) || (at > file.size(fileName))) stop("DELIMITER segment: at[",at,"] points to outside of the file")
  assert(at, len=1)
  
  # create connection binary reading
  toread = file(description = fileName, open = "rb")
  on.exit(close(toread))
  # modify encoding: FCS keyword values should be UTF-8 and keyword names ASCII (a subset of UTF-8)
  old_enc <- options("encoding")
  options("encoding" = encoding) # FIXME should we read $UNICODE before parsing for FCS < 3.1
  delimiter = character()
  raw_delimiter = raw()
  tryCatch({
    seek(toread, at)
    raw_delimiter = readBin(con = toread, what = "raw", n = 1)
    delimiter = rawToChar(raw_delimiter)
  }, finally = options(old_enc))
  
  if(length(raw_delimiter) == 1) {
    # for FCS2.0 delimiter is a BYTE
    # for FCS3.0 delimiter is an ASCII
    # starting from 3.1 delimiter is an ASCII that is not 0x00 nor 0xFF
    if(version == 3.0) if(raw_delimiter > 0x7F) warning("DELIMITER segment: should be any ASCII [0x00-0x7F (0-127)]",
                                                    call. = FALSE, immediate. = TRUE)
    if(version >= 3.1) if((raw_delimiter == 0x00) || (raw_delimiter > 0x7E)) warning("DELIMITER segment: should be a [0x01-0x7E (1-126)] ASCII character",
                                                                              call. = FALSE, immediate. = TRUE)
  }
  return(structure(delimiter, "raw" = raw_delimiter))
}
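
# Minimal usage sketch (placeholders: "file.fcs", 'hdr' as returned by readFCSheader):
#   delim <- readFCSdelimiter("file.fcs", at = hdr$text_beg, version = 3.0)
#   # delim is a 1-character string carrying its raw byte in attr(delim, "raw")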

#' @title FCS Text Parser
#' @description
#' Helper to parse text segment from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param delimiter delimiter used to separate keyword-value pairs. /!\ NOTE that files with 0x00 'delimiter' can _NOT_ be parsed.
#' @param start offset of text start. Default is 0.
#' @param end offset of text end. Default is 0.
#' @param encoding name of the encoding for raw to character conversion. Default is "UTF-8".
#' @param empty whether to allow empty values when parsing text segment. Default is FALSE.
#' @param trim remove whitespace in keywords names. Default is "none". Allowed are "both", "left", "right" and "none".
#' @param ... other arguments to be passed.
#' @keywords internal
readFCStext <- function(fileName, delimiter, start = 0, end = 0, encoding = "UTF-8", empty = FALSE, trim = "none", ...) {
  if(missing(fileName)) stop("'fileName' can't be missing")
  assert(fileName, len=1)
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  if(!file.exists(fileName)) stop(paste("can't find", fileName, sep=" "))
  assert(delimiter, len=1, typ="character")
  raw_delimiter = attr(delimiter, "raw")
  if((length(raw_delimiter) != 1) || (typeof(raw_delimiter) != "raw")) stop("can't find valid \"raw\" attribute of 'delimiter'")
  start = na.omit(as.numeric(start))
  end = na.omit(as.numeric(end))
  if((length(start) != 1) || (length(end) != 1) || (end <= start)) stop("bad TEXT segment offsets")
  encoding = na.omit(encoding); assert(encoding, len=1, typ="character")
  assert(empty, len=1, alw = c(TRUE, FALSE))
  assert(trim, len=1, alw = c("none","left","right","both"))
  if(raw_delimiter == raw(1)) stop("delimiter 0x00 is not supported")
  
  # modify encoding: FCS keyword values should be UTF-8 and keyword names ASCII (a subset of UTF-8)
  old_enc <- options("encoding")
  on.exit(options(old_enc))
  options("encoding" = encoding)
  
  # create connection binary reading
  toread = file(description = fileName, open = "rb")
  on.exit(close(toread), add = TRUE)
  last = raw()
  nend = end - 2
  while(!(raw_delimiter %in% last) && (abs(nend - end) <= 2)) {
    nend = nend + 1
    seek(toread, nend)
    last = readBin(toread, what = "raw", n = 1)
  }
  if(nend != end) {
    if(abs(nend - end) == 1) {
      warning("TEXT (or sup. TEXT) segment: offset is off by ", nend - end, " byte",
              call. = FALSE, immediate. = TRUE)
    } else {
      warning("TEXT (or sup. TEXT) segment: can't find final delimiter",
              call. = FALSE, immediate. = TRUE)
    }
  }
  
  seek(toread, start)
  text = rawToChar(readBin(toread, what = "raw", n = nend - start))
  
  # when the same character as the delimiter is used within a keyword-value pair it has to be escaped (repeated twice)
  # according to FCS spec 2.0:
  # -if the separator appears in a keyword or in a keyword value, it must be "quoted" by being repeated
  # -since null (zero length) keywords or keyword values are not permitted, two consecutive separators can never occur between a value and a keyword
  # we generate a 20-character random string 'delim_esc' that does not contain the delimiter
  # we also ensure that this string is not found elsewhere in the TEXT segment
  found = 1
  while(found) {
    # back compatible with old R versions, no need for accuracy since it is just about finding a non-existing string that allows parsing
    delim_esc = gen_altnames("foo", random_seed = list(seed=found,"Mersenne-Twister", "Inversion", "Rounding"))
    delim_esc = strsplit(x = delim_esc, split = delimiter, fixed = TRUE)[[1]]
    delim_esc = delim_esc[delim_esc!=""]
    delim_esc = paste0(delim_esc, collapse="")
    found = cpp_scanFirst(fileName, charToRaw(delim_esc), start = start, end = end)
  }
  # we 1st look at double delimiter instance and substitute it with delim_esc
  text = gsub(pattern = paste0(delimiter,delimiter), replacement = delim_esc, x = text, fixed = TRUE)
  # then text is split with delimiter
  text = strsplit(x = text, split = delimiter, fixed = TRUE)[[1]]
  # then escaped double delimiter is replaced with only one delimiter
  text = gsub(pattern = delim_esc, replacement = delimiter, x = text, fixed = TRUE)
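  # e.g. with delimiter "/" the TEXT chunk "$FIL/a//b.fcs" yields the pair
  # "$FIL" -> "a/b.fcs": the escaped "//" is first protected with delim_esc,
  # the split is applied, then delim_esc is turned back into a single "/"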
  # remove 1st empty value (this happen when 1st keyword starts with delimiter)
  is_1st_empty = FALSE
  while((length(text) >= 1) && (text[1] == "")) {
    if(!is_1st_empty) warning("TEXT (or sup. TEXT) segment: 1st keyword starts with delimiter",
                              call. = FALSE, immediate. = TRUE)
    is_1st_empty = TRUE
    text = text[-1]
  }
  # finally keyword-value pairs are converted to named list
  id_val = seq(from = 2, to = length(text), by = 2)
  id_key = id_val-1
  text = structure(as.list(text[id_val]), names = text[id_key])
  # try to fix empty values
  # it happens (see Bras A.E. and van der Velden V.H.J. at \doi{10.1002/cyto.a.24187}) that fcs writers produce non-compliant files
  # with empty values, so we try to detect and fix them, reasoning that the delimiter never occurs in keys (=keywords names)
  found = grepl(delimiter, names(text), fixed = TRUE)
  if(any(found)) {
    if(empty) {
      text_ok = text[!found]
      text_nok = text[found]
      text_nok = strsplit(names(text_nok), delimiter, fixed = TRUE)
      text_nok = unlist(recursive = FALSE, use.names = TRUE,
                        lapply(seq_along(text[found]), FUN = function(ii) {
                          L = length(text_nok[[ii]])
                          structure(lapply(seq_along(text_nok[[ii]]), FUN = function(kk) {
                            if(kk == L) return(text[found][[ii]])
                            character()
                          }), names = text_nok[[ii]])
                        }))
      text = c(text_ok, text_nok)
    } else {
      warning("TEXT (or sup. TEXT) segment: found delimiter in keywords, please consider using `options$text_empty` = TRUE",
              call. = FALSE, immediate. = TRUE)
    }
  }
  msg = paste0("TEXT (or sup. TEXT) segment: found standard keywords padded with whitespace(s), please consider using `options$text_trim` != \"",trim,"\"\n\t-")
  names(text) = switch(trim,
         "both" = { 
           trimws(names(text), which = "both")
         },
         "left" = { 
           found = grep("^\\$[[:alnum:]]+[[:space:]]+$", names(text), ignore.case = TRUE, value = TRUE)
           if(length(found) != 0) warning(paste0(msg, paste0(found, collapse = "\n\t-")), call. = FALSE, immediate. = TRUE)
           trimws(names(text), which = "left")
         },
         "right" = { 
           found = grep("^[[:space:]]+\\$[[:alnum:]]$", names(text), ignore.case = TRUE, value = TRUE)
           if(length(found) != 0) warning(paste0(msg, paste0(found, collapse = "\n\t-")), call. = FALSE, immediate. = TRUE)
           trimws(names(text), which = "right")
         },
         {
           found = c(grep("^\\$[[:alnum:]]+[[:space:]]+$", names(text), ignore.case = TRUE, value = TRUE),
                     grep("^[[:space:]]+\\$[[:alnum:]]+$", names(text), ignore.case = TRUE, value = TRUE))
           if(length(found) != 0) warning(paste0(msg, paste0(found, collapse = "\n\t-")), call. = FALSE, immediate. = TRUE)
           names(text)
         })
  return(text)
}
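
# Minimal usage sketch (placeholders: "file.fcs", 'hdr' and 'delim' from the helpers above):
#   txt <- readFCStext("file.fcs", delimiter = delim,
#                      start = 1 + hdr$text_beg, end = hdr$text_end)
#   # txt is a named list of keyword values, e.g. txt[["$DATATYPE"]], txt[["$PAR"]], ...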

#' @title FCS Data Parser
#' @description
#' Helper to parse data segment from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param text a named list of keywords values.
#' @param start offset of text start. Default is 0.
#' @param end offset of text end. Default is 0.
#' @param scale whether to apply data scaling. It only applies when fcs file is stored as DATATYPE "I". Default is TRUE.
#' @param display_progress whether to display a progress bar. Default is TRUE.
#' @param ... other arguments to be passed.
#' @keywords internal
readFCSdata <- function(fileName, text, start = 0, end = 0, scale = TRUE, display_progress = TRUE, ...) {
  if(missing(fileName)) stop("'fileName' can't be missing")
  assert(fileName, len=1)
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  if(!file.exists(fileName)) stop(paste("can't find", fileName, sep=" "))
  assert(text, typ = "list")
  start = na.omit(as.numeric(start))
  end = na.omit(as.numeric(end))
  if((length(start) != 1) || (length(end) != 1) || (end <= start)) stop("DATA segment: bad offsets")
  assert(scale, len=1, alw = c(TRUE,FALSE))
  title_progress = basename(fileName)
  assert(display_progress, len = 1, alw = c(TRUE,FALSE))
  
  # create connection binary reading
  toread = file(description = fileName, open = "rb")
  on.exit(close(toread))
  
  # force local
  old_loc = Sys.getlocale(category = "LC_ALL")
  suppressWarnings(Sys.setlocale(category = "LC_ALL", locale = "C"))
  on.exit(suppressWarnings(Sys.setlocale(category = "LC_ALL", locale = old_loc)), add = TRUE)
  
  # retrieve info to extract data
  type = text[["$DATATYPE"]]
  if((length(type) != 1) || !(type %in% c("A","I","F","D"))) stop("DATA segment: non-compatible $DATATYPE[",type,"]")
  n_obj = na.omit(suppressWarnings(as.integer(text[["$TOT"]])))
  n_par = na.omit(suppressWarnings(as.integer(text[["$PAR"]])))
  if(length(n_obj) == 0) stop("DATA segment: $TOT can't be found")
  if(length(n_par) == 0) stop("DATA segment: $PAR can't be found")
  features_names = grep("^\\$P\\d+N$", names(text), value = TRUE, ignore.case = TRUE)
  
  # hereafter we create several bit_* variables
  # bit_v : PnB, bits depth of the value
  # bit_r : PnR, bits range of the value
  # bit_n : number of bytes to read 
  # bit_d : bits depth to read ( = 8 * bit_n )
  # bit_o : bytes order to read
  # bit_m : bits mask, for instance if bit_v is 10 bits but the value is read from 16 bits then 6 bits are not used
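  # e.g. a parameter with $PnB = 10 and $PnR = 1024 read from 16 bits gives:
  # bit_n = 2, bit_d = 16, bit_r = 10, and bit_m masks the 10 lowest bits
  # so that the 6 unused upper bits are discarded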
  
  bit_v = unlist(recursive = FALSE, use.names = FALSE, lapply(text[paste0("$P",seq_len(n_par),"B")], FUN = function(x) suppressWarnings(as.integer(x))))
  msg = c()
  if(n_par != length(features_names)) msg = c(msg, paste0("DATA segment: mismatch between found[",length(features_names),"] vs expected[",n_par,"] number of $PnN"))
  if(n_par != length(bit_v)) msg = c(msg, paste0("DATA segment: mismatch between found[",length(bit_v),"] vs expected[",n_par,"] number of $PnB"))
  if(length(msg) != 0) stop(paste0(msg, collapse = "\n"))
  
  # define data_bytes and go to data start position
  data_bytes = end - start + 1
  seek(toread, start)
  # type "A" is deprecated in newer version of FCS specifications
  if(type == "A") {
    if((data_bytes + start) > file.size(fileName)) stop("DATA segment: points to outside of the file")
    if(length(unique(bit_v)) == 1) {
      if(text[["$P1B"]] == "*") {
        data = setdiff(strsplit(x = readBin(toread, what = "character", n = data_bytes),
                                split = paste0(sapply(as.raw(c("0x20","0x09","0x2C","0x0D","0x0A")), rawToChar), collapse = "|"))[[1]],
                       "")
      } else {
        if(is.na(bit_v[1])) stop("DATA segment: bad $PnB definition for $DATATYPE[A]")
        data = gsub(paste0("(.{",bit_v[1],"})"), "\\1 ", readBin(toread, what = "character", n = data_bytes))
      }
    } else {
      raw = readBin(toread, what = "raw", n = data_bytes)
      if(display_progress) {
        pb = newPB(min = 0, max = n_par, initial = 0, style = 3)
        on.exit(endPB(pb), add = TRUE)
        data = sapply(seq_len(n_par), FUN = function(i_par) {
          setPB(pb, value = i_par, title = title_progress, label = "$DATATYPE[A]: extracting values")
          bits = as.integer(text[[paste0("$P",i_par,"B")]]) # each PnB determines number of bytes to extract
          # FIXME it is not clear how to deal with a mix of PnB == * and PnB == integer ?
          if(is.na(bits)) stop("DATA segment: bad $P",i_par,"B definition for $DATATYPE[A] ", text[[paste0("$P",i_par,"B")]])
          off = (i_par - 1) * n_par
          if((off + n_obj) > data_bytes) stop("DATA segment: buffer overrun")
          sapply(seq_len(n_obj), FUN = function(i_obj) {
            as.numeric(readBin(con = raw[i_obj + off], what = "character", n = bits))
          })
        })
      } else {
        data = sapply(seq_len(n_par), FUN = function(i_par) {
          bits = as.integer(text[[paste0("$P",i_par,"B")]]) # each PnB determines number of bytes to extract
          off = (i_par - 1) * n_par
          if((off + n_obj) > data_bytes) stop("DATA segment: buffer overrun")
          sapply(seq_len(n_obj), FUN = function(i_obj) {
            as.numeric(readBin(con = raw[i_obj + off], what = "character", n = bits))
          })
        })
      }
    }
  } else {
    # some files register a wrong dataend offset resulting in an off-by-one byte
    # the following should correct it
    if((data_bytes %% 8) %% 2) data_bytes = data_bytes + 1 # go +1
    if((data_bytes %% 8) %% 2) data_bytes = data_bytes - 1 # go  0
    if((data_bytes %% 8) %% 2) data_bytes = data_bytes - 1 # go -1
    off_by = (end - start + 1) - data_bytes
    if(off_by != 0) warning("DATA segment: offset is off by ",off_by," byte",
                            call. = FALSE, immediate. = TRUE)
    if((data_bytes %% 8) %% 2) stop("DATA segment: number of bytes does not respect fcs specifications")
    
    # extract order, endianness
    b_ord = text[["$BYTEORD"]]
    bit_o = as.integer(strsplit(b_ord, split=",", fixed=TRUE)[[1]])
    b_ord = paste0(bit_o, collapse = ",")
    
    # determines endianness of the file
    endian = "unk"
    endian_l = paste0(seq_along(bit_o), collapse = ",")
    endian_b = paste0(rev(seq_along(bit_o)), collapse = ",")
    if(endian_l == b_ord) endian = "little"
    if(endian_b == b_ord) endian = "big"
    
    # register bit_v values
    bit_v_back = bit_v
    
    # try bit_v correction
    alw_b = 2^(1:6) # 2,4,8,16,32,64, sizes handled by R
    bit_corrected = FALSE
    if(any(!(bit_v %in% alw_b))) {
      bit_corrected = TRUE
      bit_v = sapply(bit_v, FUN = function(x) alw_b[x <= alw_b][1])
    }
    bit_n = unname(bit_v %/% 8)
    
    # try data_length correction and check accuracy of bit_v correction, if any
    if(sum(n_obj * bit_n) != data_bytes) { # data_length is not OK
      if(bit_corrected) {                  # bit_v was already corrected
        bit_v = bit_v_back
        bit_n = unname(bit_v %/% 8)
        bit_corrected = FALSE
        if(sum(n_obj*bit_n)!=data_bytes) { # bit_v is reverted but data_length still does not match
          stop("DATA segment: can't determine bit depth and DATA length")
        }
      } else {                             # bit_v was not corrected, we apply data_length correction
        data_bytes = sum(n_obj * bit_n)
        warning("DATA segment: number of bytes has been corrected",
                call. = FALSE, immediate. = TRUE)
      }
    } else {                               # data_length is OK
      if(bit_corrected)             {      # check if bit_v was corrected to show a warning
        warning("DATA segment: bits depth have been corrected to next allowed value",
                call. = FALSE, immediate. = TRUE)
      }
    }
    bit_d = unique(bit_v)
    
    if(type == "I") { 
      # FIXME it is not clear how to perform tightbit packing bit_p = xxxxx
      
      # setup readBin parameters for each channels
      args = sapply(bit_n, simplify = FALSE, USE.NAMES = TRUE, FUN = function(x)  list(what = "integer", size = x, signed = x > 2))
      
      # compute bit mask
      bit_r = sapply(text[paste0("$P",seq_len(n_par),"R")], FUN = function(x) suppressWarnings(ceiling(log2(as.numeric(x))))) # as.integer("4294967296") results in NA so we use as.numeric
      bit_m = lapply(seq_len(n_par), FUN = function(i_par) packBits(as.raw(sapply(seq_len(bit_v[i_par]), FUN = function(i) i <= min(bit_r[i_par],bit_v[i_par])))))
    } else {
      # fcs specifications mention that $PnB for types "F" and "D" has to be 32 or 64, respectively
      bit_d = ifelse(type == "F", 32L, 64L)
      
      # force bits depth, shall always be max allowed depth
      tmp = bit_v != bit_d
      if(any(tmp)) {
        nam = paste0("$P",seq_len(n_par),"B")[tmp] # $PnB names rebuilt from indices (bit_v carries no names)
        warning(paste0("DATA segment: $PnB keyword",ifelse(length(nam) == 1, " has", "s have")," been forced to ", bit_d, ":\n",
                       paste0(paste0("\t- ", nam), collapse = "\n")),
                call. = FALSE, immediate. = TRUE)
        for(i in nam) text[[i]] <- num_to_string(bit_d)
      }
      bit_v <- bit_d
      
      # fcs specifications mention:
      # `No bit mask shall be applied when reading type "F" or "D" data`
      # So, bit_r is set to bit_d to prevent masking
      # FIXME, however what to do with PnR in type "F" or "D", shall we truncate ?
      # fcs specifications mention:
      # `The actual value stored in the data set may exceed this range on both sides of 
      #  the interval. Specifically, there may be negative values as well as values greater than n1, e.g., as 
      #  a consequence of compensation``
      bit_r <- bit_d
      bit_m = NULL
      
      # setup readBin parameters for each channels
      args = sapply(bit_n, simplify = FALSE, USE.NAMES = TRUE, FUN = function(x)  list(what = "numeric", size = x))
    }
    # check data_length before starting reading
    if((data_bytes + start) > file.size(fileName)) stop("DATA segment: points to outside of the file")
    
    
    # fcs specifications mention that type "I" uses unsigned integers only
    # but readBin can only extract 8-bit and 16-bit unsigned integers. So,
    # for 32-bit and 64-bit integers we have to extract signed integers and
    # convert them afterwards with the cpp_v_intxx_to_uintxx functions
    if((endian != "unk") &&              # endianness is either "little" or "big" and can be passed to readBin without reordering
       (length(bit_d) == 1) &&           # every channels have same bits depth
       (bit_d %in% c(2,4,8,16,32,64))) { # bits depth is a size handled by R
      if(all(bit_r == bit_d)) {          # there is no masking to apply
        data = do.call(args = c(list(con = toread, 
                                     endian = endian, 
                                     n = data_bytes / args[[1]]$size),
                                args[[1]]),
                       what = readBin)
      } else { # we are forced to apply bit masking if a PnR is not equal to bit_d
        # extract order for the whole data
        ord_ = cpp_get_bytes_order(n_obj, bit_n, bit_o, .Platform$endian != "little") # FIXME
        # extract mask for the whole data
        msk_ = rep(unlist(recursive = FALSE, use.names = FALSE, bit_m), n_obj)
        # extract the whole data in "raw"
        data = readBin(con = toread, what = "raw", n = data_bytes)
        # process data according to args, order and mask and make the conversion
        data = do.call(args = c(list(con = data[ord_] & msk_,
                                     endian = "little", # FIXME
                                     n = data_bytes / args[[1]]$size),
                                args[[1]]),
                       what = readBin)
      }
      # convert to unsigned integers if needed
      if(type == "I") {
        if(args[[1]]$size == 4) data = cpp_v_int32_to_uint32(data)
        if(args[[1]]$size == 8) data = cpp_v_int64_to_uint64(data)
      }
      data = matrix(data, ncol = n_par, nrow = n_obj, byrow = TRUE)
    } else {
      # extract order for the whole data
      ord_ = cpp_get_bytes_order(n_obj, bit_n, bit_o, .Platform$endian != "little") # FIXME
      # extract mask for the whole data
      msk_ = rep(unlist(recursive = FALSE, use.names = FALSE, bit_m), n_obj)
      # define grouping parameter for the whole data
      spl_ = rep(unlist(recursive = FALSE, use.names = FALSE, mapply(FUN = rep, seq_along(bit_n), bit_n, SIMPLIFY = FALSE)), times = n_obj)
      
      if(display_progress) {
        lab = sprintf("$DATATYPE[%s]: extracting values", type)
        pb = newPB(min = 0, max = n_par, initial = 0, style = 3)
        on.exit(endPB(pb), add = TRUE)
      }
      
      # extract the whole data in "raw"
      data = readBin(con = toread, what = "raw", n = data_bytes)
      # process data according to args, order and mask and make the conversion
      data = unlist(recursive = FALSE, use.names = FALSE, by(list(v = data[ord_] & msk_, split = spl_), spl_, FUN = function(x) {
        i_par = x$split[1]
        if(display_progress) setPB(pb, value = i_par, title = title_progress, label = lab)
        if(args[[i_par]]$size %in% c(3,5,6,7)) { # for sizes not handled by R 
          M = c(1,2,4,8)[args[[i_par]]$size <= c(1,2,4,8)][1]
          m = M - args[[i_par]]$size
          bar = split(x$v,ceiling(seq_along(x$v)/args[[i_par]]$size))
          bar = lapply(bar, FUN = function(i) c(i, rep(as.raw(0x00), m)))
          foo = readBin(con = unlist(recursive = FALSE, use.names = FALSE, bar),
                        endian = "little", # FIXME ?
                        what = args[[i_par]]$what,
                        n = n_obj,
                        size = M,
                        signed = TRUE)
        } else { # size is handled by R
          foo = readBin(con = x$v,
                        endian = "little", # FIXME ?
                        what = args[[i_par]]$what,
                        n = n_obj,
                        size = args[[i_par]]$size,
                        signed = args[[i_par]]$signed)
        }
        # convert to unsigned integers if needed
        if(type == "I") {
          if(args[[i_par]]$size == 4) return(cpp_v_int32_to_uint32(foo))
          if(args[[i_par]]$size == 8) return(cpp_v_int64_to_uint64(foo))
        }
        foo
      }))
      data = matrix(data, ncol = n_par, nrow = n_obj, byrow = FALSE)
    }
  }
  
  # convert data to data.frame
  feat_names = unlist(recursive = FALSE, use.names = FALSE, lapply(seq_len(n_par), FUN = function(i) {
    N = text[[paste0("$P",i,"N")]]
    S = text[[paste0("$P",i,"S")]]
    if(length(S) != 0) return(paste(N , paste0("< ",S," >")))
    return(N)
  }))
  data = structure(data.frame(data, check.names = FALSE), names = feat_names)
  
  # scale data only for type "I"; the ISAC spec mentions:
  # `When linear scale is used, $PnE/0,0/ shall be entered if the floating point data type is used` i.e. "F" or "D",
  # meaning that no scaling shall be applied for types "F" and "D". Besides, type "A" is deprecated
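  # e.g. with $PnE = "4,1" and $PnR = "1024", a stored channel value of 512 is
  # rescaled to 1 * 10^(4 * 512 / 1024) = 100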
  if(scale) {
    if(type == "I") {
      for(i in seq_len(n_par)) { # log amplification scaling
        PnE = paste0("$P",i,"E")
        PnR = paste0("$P",i,"R")
        trans = text[[PnE]]
        ran = na.omit(suppressWarnings(as.numeric(text[[PnR]])))
        if((length(trans) == 0) || (length(ran) == 0)) next # no scaling info, SKIP
        trans = na.omit(suppressWarnings(as.numeric(strsplit(trans, split = ",", fixed = TRUE)[[1]])))
        # FIXME should we detect and force range to max range ? in case of mismatch with PnR and actual data range ?
        if(length(trans) != 2 || trans[1] == 0) next        # invalid PnE info, SKIP
        if(trans[2] == 0) trans[2] <- 1                     # invalid PnE info, but we apply correction
        data[,i] <- trans[2] * 10^(trans[1] * data[,i] / ran)
      }
    }
    for(i in seq_len(n_par)) {   # gain scaling
      PnE = paste0("$P",i,"E")
      PnG = paste0("$P",i,"G")
      gain = na.omit(suppressWarnings(as.numeric(text[[PnG]])))
      if(length(gain) == 0) next # no scaling info, SKIP
      trans = "0,0"
      if(length(text[[PnE]]) != 0) trans = text[[PnE]]
      trans = na.omit(suppressWarnings(as.numeric(strsplit(trans, split = ",", fixed = TRUE)[[1]])))
      # FIXME should we apply PnG when PnE is not valid i.e. PnE = f1 (missing f2)
      if((length(trans) == 2) && (trans[1] == 0) && (trans[2] == 0)) data[,i] <- data[,i] / gain
    }
  }
  return(data)
}
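
# Minimal usage sketch (placeholders: "file.fcs", 'txt' as returned by readFCStext,
# and DATA offsets 'off1'/'off2' from $BEGINDATA/$ENDDATA or from the header):
#   dat <- readFCSdata("file.fcs", text = txt, start = off1, end = off2)
#   # dat is a data.frame with one column per parameter, named after $PnN
#   # (suffixed with "< $PnS >" when $PnS is present)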

#' @title FCS Dataset Parser
#' @description
#' Helper to parse dataset from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param options list of options used to parse FCS file. It should contain (otherwise, it will be filled with the default values listed below):\cr
#' - header, a list whose members define the "at" offset from header$start$at and the "n" number of bytes to extract:\cr
#' -- start: where to start reading current FCS dataset.       Default is list(at = 0,  n = 6),\cr
#' -- space: where to retrieve space.                          Default is list(at = 6,  n = 4),\cr
#' -- text_beg: where to retrieve file text segment beginning. Default is list(at = 10, n = 8),\cr
#' -- text_end: where to retrieve file text segment end.       Default is list(at = 18, n = 8),\cr
#' -- data_beg: where to retrieve file data segment beginning. Default is list(at = 26, n = 8),\cr
#' -- data_end: where to retrieve file data segment end.       Default is list(at = 34, n = 8),\cr
#' - apply_scale, whether to apply data scaling. It only applies when fcs file is stored as DATATYPE "I". Default is TRUE.\cr
#' - dataset, (coerced to) an ordered vector of unique indices of desired dataset(s) to extract. Default is 1 to extract only the first dataset, whereas NULL allows all available datasets to be extracted.\cr
#' - force_header, whether to force the use of header to determine the position of data segment. Default is FALSE, for using positions found in "$BEGINDATA" and "$ENDDATA" keywords.\cr
#' - text_only, whether to only extract text segment. Default is FALSE.\cr
#' - text_check, whether to check if text segment is compliant with FCS specifications. Default is FALSE.\cr
#' - text_empty, whether to allow empty values when parsing text segment. Default is FALSE.\cr
#' - text_trim, remove whitespace in keywords names. Default is "none". Allowed are "both", "left", "right" and "none".
#' @param display_progress whether to display a progress bar. Default is TRUE.
#' @param ... other arguments to be passed.
#' @details 'options' may be tweaked according to file type, instrument and software used to generate it.\cr
#' Default 'options' should allow to read most files.\cr
#' 'options' members, with the exception of 'header', may be passed via '...'.
#' @return a list containing:\cr
#' - options, list of 'options' used\cr
#' - header, list of header information corresponding to 'options'\cr
#' - delimiter, unique character used to separate keyword-value pairs\cr
#' - text, list of keywords values,\cr
#' - data, data.frame of values.
#' @keywords internal
readFCSdataset <- function(fileName, options, display_progress = TRUE, ...) {
  dots = list(...)
  # prepare fileName
  if(missing(fileName)) stop("'fileName' can't be missing")
  assert(fileName, len = 1)
  fileName = enc2native(normalizePath(fileName, winslash = "/", mustWork = FALSE))
  if(!file.exists(fileName)) stop(paste("can't find", fileName, sep=" "))
  
  # fill options with default values when not provided
  opt_default = eval(formals(readFCS)$options)
  if(missing(options)) {
    options = opt_default
  } else {
    for(i in c("header", "apply_scale", "dataset", "force_header", "text_only", "text_check", "text_empty")) {
      if(!(i %in% names(options))) options[[i]] <- opt_default[[i]]
    }
    for(i in c("start", "space", "text_beg", "text_end", "data_beg", "data_end")) {
      if(!(i %in% names(options$header))) options$header[[i]] <- opt_default$header[[i]]
    }
  }
  # check if we can find options arguments in dots
  if("text_only" %in% names(dots)) options$text_only <- dots$text_only
  if("text_check" %in% names(dots)) options$text_check <- dots$text_check
  if("text_empty" %in% names(dots)) options$text_empty <- dots$text_empty
  if("text_trim" %in% names(dots)) options$text_trim <- dots$text_trim
  if("dataset" %in% names(dots)) options$dataset <- dots$dataset
  if("apply_scale" %in% names(dots)) options$apply_scale <- dots$apply_scale
  if("force_header" %in% names(dots)) options$force_header <- dots$force_header
  assert(options[["text_only"]], len = 1, alw = c(TRUE, FALSE))
  assert(options[["text_check"]], len = 1, alw = c(TRUE, FALSE))
  assert(options[["text_empty"]], len = 1, alw = c(TRUE, FALSE))
  assert(options[["text_trim"]], len = 1, alw = c("none", "both", "left", "right"))
  options[["dataset"]] = sort(unique(unname(options[["dataset"]])), na.last = FALSE)
  if(length(options[["dataset"]]) == 0) options[["dataset"]] = integer()
  assert(options[["apply_scale"]], len = 1, alw = c(TRUE, FALSE))
  assert(options[["force_header"]], len = 1, alw = c(TRUE, FALSE))
  
  # extract HEADER
  header = readFCSheader(fileName = fileName, header = options$header, encoding = "UTF-8")
  at = attr(header, "offset")
  version = suppressWarnings(na.omit(as.numeric(substr(header$start,4,6))))
  if(length(version) == 0) stop("can't determine FCS version: ", header$start)
  
  # extract DELIMITER
  delimiter = readFCSdelimiter(fileName = fileName, at = header$text_beg, version = version, encoding = "UTF-8")
  
  # extract TEXT segment,
  # the primary TEXT segment has to lie within bytes 58 - 99,999,999
  text = readFCStext(fileName = fileName, delimiter = delimiter, 
                     start = 1 + header$text_beg, end = header$text_end,
                     encoding = "UTF-8", empty = options$text_empty, trim = options$text_trim)
  NTEXT = names(text)
  names(text) = toupper(names(text))
  text_bck = text
  
  # extract supplemental TEXT segment
  # we will use text to extract supplemental TEXT segment offsets
  extra_off1 = suppressWarnings(na.omit(as.numeric(text[["$BEGINSTEXT"]]) + at))
  extra_off2 = suppressWarnings(na.omit(as.numeric(text[["$ENDSTEXT"]])   + at))
  if((length(extra_off1) != 0) &&
     (length(extra_off2) != 0) &&
     (extra_off1 != header$text_beg) && 
     (extra_off2 != header$text_end) &&
     (extra_off2 > extra_off1)) {
    tryCatch({
      extra_text = readFCStext(fileName = fileName, delimiter = delimiter, 
                               start = extra_off1, end = extra_off2,
                               encoding = "UTF-8", empty = options$text_empty, trim = options$text_trim)
      NEXTRA = names(extra_text)
      names(extra_text) = toupper(NEXTRA) # uppercase the names, not the values
      tmp = names(extra_text) %in% names(text)
      if(any(tmp)) warning("TEXT (or sup. TEXT) segment: supplemental TEXT segment contains keyword",ifelse(sum(tmp)==0,"","s")," already found in TEXT segment that will be discarded:\n\t- ",
                           paste0(NEXTRA[tmp], collapse = "\n\t- "),
                           call. = FALSE, immediate. = TRUE)
      text = c(text, extra_text[!tmp])
      NTEXT = c(NTEXT, NEXTRA[!tmp])
    }, error = function(e) {
      text <<- text_bck # assign in the enclosing frame, otherwise restoring has no effect
      warning("TEXT (or sup. TEXT) segment: supplemental TEXT segment is not readable:\n",
              e$message, call. = FALSE, immediate. = TRUE)
    })
  }
  
  # get cur fileName and dataset
  if(!any("$FIL" == names(text))) {
    text[["$FIL"]] <- fileName
    NTEXT = c(NTEXT, "$FIL")
  }
  data_fil <- text[["$FIL"]]
  data_set = num_to_string(length(options$header$start$at))
  
  # check that keywords fulfill the FCS spec
  if((options$text_check) &&
     (
       (length(options$dataset) == 0) ||
       (data_set %in% options$dataset)
     )) FCS_check_keywords(text = text, delimiter = delimiter, version = version, encoding = "UTF-8")
  
  # retrieve DATA segment offsets
  # $BEGINDATA and $ENDDATA are not REQUIRED keywords in old (<= 2.0) FCS spec
  # so we use HEADER segment for data offsets and silently set 'force_header' = TRUE,
  # otherwise, for >= 3.0, preferred location is within $BEGINDATA and $ENDDATA from TEXT segment
  off1 = numeric()
  off2 = numeric()
  has_been_forced = FALSE
  to_string = function(x) {if(length(x)==0) { return(NULL) } else { return(num_to_string(x))}}
  if(version >= 3.0) {
    off1 = suppressWarnings(na.omit(as.numeric(text[["$BEGINDATA"]]) + at))
    off2 = suppressWarnings(na.omit(as.numeric(text[["$ENDDATA"]])   + at))
    # if not found in text despite being mandatory, we will force_header
    if(any(c(off1, off2, length(off1), length(off2)) == 0)) {
      if(!options$force_header) warning("can't determine DATA offsets from TEXT keywords $BEGINDATA[",
                                        to_string(off1),"] or $ENDDATA[",to_string(off2),
                                        "], 'force_header' has been forced to TRUE",
                                        call. = FALSE, immediate. = TRUE)
      has_been_forced = TRUE
      options$force_header = TRUE
    }
    # we check consistency between HEADER and TEXT
    if((!has_been_forced) &&                                                     # are we forced to read data offsets from header only
       (!options$text_only) &&                                                   # does user want data to be extracted ?
       ((length(options$dataset) == 0) || (data_set %in% options$dataset)) &&    # should the current dataset be extracted ?
       ((!any(0 %in% header$data_beg)) && (!any(0 %in% header$data_end))) &&     # data offsets can be found in header and is not 0
       ((!any(off1 %in% header$data_beg)) || (!any(off2 %in% header$data_end)))) # data offsets from keywords can be found and differs from header
    {
      warning("discrepancies between HEADER[",
              to_string(header$data_beg),"-",to_string(header$data_end),
              "] and TEXT[",
              to_string(off1),"-",to_string(off2),
              "] segments for determining DATA offsets\n",
              "/!\\ you should manually validate results with 'force_header' set to TRUE or FALSE",
              call. = FALSE, immediate. = TRUE)
    }
  } else {
    options$force_header = TRUE
  }
  if(options$force_header) {
    if(any(c(header$data_beg, header$data_end, length(header$data_beg), length(header$data_end)) == 0)) {
      warning("can't 'force_header' because HEADER indicates 0 for DATA offset(s)",
              call. = FALSE, immediate. = TRUE)
    } else {
      off1 = header$data_beg
      off2 = header$data_end
    }
  }
  if(any(c(off1, off2, length(off1), length(off2)) == 0) || (off2 <= off1)) {
    warning("can't determine data offsets, 'text_only' has been forced to TRUE",
            call. = FALSE, immediate. = TRUE)
    options$text_only = TRUE
  }
  
  # check $MODE, we can only extract DATA segment in "L" mode
  mode = text[["$MODE"]]
  has_been_forced = FALSE
  if(version <= 3.1) { # $MODE is REQUIRED in FCS <= 3.1
    if((length(mode) != 1) || (mode != "L")) has_been_forced = TRUE
  } else {             # $MODE is DEPRECATED starting FCS >= 3.2, but if here it should be "L"
    if((length(mode) == 1) && (mode != "L")) has_been_forced = TRUE
  }
  if(has_been_forced) {
    options$text_only = TRUE
    warning("DATA stored in $MODE[",mode,"] are not supported, 'text_only' has been forced to TRUE", # mode "C" and "U" have been deprecated in FCS spe
            call. = FALSE, immediate. = TRUE)
  }
  
  # extract DATA segment
  data = data.frame()                                 # prepare default returned value for data
  if(!options$text_only &&                            # user only wants text segment
     (                                                
       (length(options$dataset) == 0) ||
       (data_set %in% options$dataset)  # no need to extract data if user doesn't need it
     )) data = readFCSdata(fileName = fileName, text = text, version = version,
                           start = off1, end = off2, 
                           scale = options$apply_scale, display_progress = display_progress)
  
  # TODO retrieve analysis segment ?
  # # we will use text to extract analysis segment offsets
  # off1 = suppressWarnings(as.integer(text[["$BEGINANALYSIS"]]))
  # off2 = suppressWarnings(as.integer(text[["$ENDANALYSIS"]]))
  # # if not found in text despite being mandatory, we will use header
  # if(length(off1) == 0) off1 = suppressWarnings(as.integer(header$data_beg))
  # if(length(off2) == 0) off2 = suppressWarnings(as.integer(header$data_end))
  # anal=raw()
  
  # recover original TEXT names (i.e. not forced to upper case)
  for(k in c("file", "fileName", "fileName_image", "date", "dataset", "version", "FCSversion")) {
    kk = paste0("@IFC_", k)
    foo = toupper(kk) == names(text)
    if(any(foo)) NTEXT[foo] <- kk
  }
  names(text) = NTEXT
  text[["@IFC_file"]] <- basename(data_fil) # internal filename if found, otherwise fileName
  text[["@IFC_fileName"]] <- fileName
  text[["@IFC_dataset"]] <- data_set
  text[["@IFC_version"]] <- paste0(unlist(recursive = FALSE, use.names = FALSE, packageVersion("IFC")), collapse = ".")
  text[["@IFC_FCSversion"]] <- header$start
  return(list(options=options,
              header=header,
              delimiter=delimiter,
              # anal=raw(),
              text=text, 
              data=data))
}

#' @title FCS File Parser
#' @description
#' Parse data from Flow Cytometry Standard (FCS) compliant files.
#' @param fileName path to file.
#' @param options list of options used to parse FCS file. It should contain (otherwise, it will be filled with the default values listed below):\cr
#' - header, a list whose members define the "at" offset from header$start$at and the "n" number of bytes to extract:\cr
#' -- start: where to start reading current FCS dataset.       Default is list(at = 0,  n = 6),\cr
#' -- space: where to retrieve space.                          Default is list(at = 6,  n = 4),\cr
#' -- text_beg: where to retrieve file text segment beginning. Default is list(at = 10, n = 8),\cr
#' -- text_end: where to retrieve file text segment end.       Default is list(at = 18, n = 8),\cr
#' -- data_beg: where to retrieve file data segment beginning. Default is list(at = 26, n = 8),\cr
#' -- data_end: where to retrieve file data segment end.       Default is list(at = 34, n = 8),\cr
#' - apply_scale, whether to apply data scaling. It only applies when fcs file is stored as DATATYPE "I". Default is TRUE.\cr
#' - dataset, (coerced to) an ordered vector of unique indices of desired dataset(s) to extract. Default is 1 to extract only the first dataset, whereas NULL extracts all available datasets.\cr
#' - force_header, whether to force the use of header to determine the position of data segment. Default is FALSE, for using positions found in "$BEGINDATA" and "$ENDDATA" keywords.\cr
#' - text_only, whether to only extract text segment. Default is FALSE.\cr
#' - text_check, whether to check that the text segment is compliant with FCS specifications. Default is FALSE.\cr
#' - text_empty, whether to allow empty values when parsing text segment. Default is FALSE.\cr
#' - text_trim, how to trim whitespace in keyword names. Default is "none". Allowed are "both", "left", "right" and "none".
#' @param display_progress whether to display a progress bar. Default is TRUE.
#' @param ... other arguments to be passed.
#' @details 'options' may be tweaked according to file type, instrument and software used to generate the file.\cr
#' Default 'options' should allow reading most files.\cr
#' All 'options' members, with the exception of 'header', may also be passed via '...' (see examples).
#' @source Data File Standard for Flow Cytometry, version FCS 3.1 from Spidlen J. et al. available at \doi{10.1002/cyto.a.20825}.
#' @return a list whose elements are sub-lists, one for each dataset stored within the file.\cr
#' each sub-list contains:\cr
#' - header, list of header information corresponding to 'options'\cr
#' - delimiter, unique character used to separate keyword-value pairs\cr
#' - text, list of keyword values,\cr
#' - data, data.frame of values.
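#' @examples
#' \dontrun{
#' # a minimal sketch, assuming "file.fcs" exists;
#' # read the TEXT segment only of the first dataset and inspect a keyword
#' fcs <- readFCS(fileName = "file.fcs", text_only = TRUE)
#' fcs[[1]]$text[["$CYT"]]
#' # read all datasets, passing 'options' members through '...'
#' fcs <- readFCS(fileName = "file.fcs", dataset = NULL)
#' head(fcs[[1]]$data)
#' }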
#' @export
readFCS <- function(fileName,
                    options = list(header = list(start = list(at = 0, n = 6),
                                                 space = list(at = 6, n = 4),
                                                 text_beg = list(at = 10, n = 8),
                                                 text_end = list(at = 18, n = 8),
                                                 data_beg = list(at = 26, n = 8),
                                                 data_end = list(at = 34, n = 8)),
                                   apply_scale = TRUE,
                                   dataset = 1,
                                   force_header = FALSE,
                                   text_only = FALSE,
                                   text_check = FALSE,
                                   text_empty = FALSE,
                                   text_trim = "none"),
                    display_progress = TRUE, ...) {
  # extract dataset(s)
  ans = list()
  more = 0L
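  # datasets are chained through the $NEXTDATA keyword: each dataset's TEXT
  # segment records where the next dataset begins (0 when there is none),
  # so we keep reading until no new start offset is found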
  while(length(more) != 0) {
    dat = readFCSdataset(fileName = fileName, options = options, display_progress = display_progress, ...)
    options <- dat$options
    ans = c(ans, list(dat[-1]))
    text = ans[[length(ans)]]$text
    names(text) = toupper(names(text))
    fileName = text[["@IFC_FILENAME"]]
    more = suppressWarnings(na.omit(as.numeric(text[["$NEXTDATA"]]) + options$header$start$at[1]))
    more = setdiff(more, options$header$start$at)
    if((length(options$dataset) != 0) && all(options$dataset %in% 1L)) more = numeric()
    if((length(more) != 0) && (more >= file.size(fileName))) {
      more = numeric()
      warning("can't extract all datasets: $NEXTDATA points to outside of the file",
              call. = FALSE, immediate. = TRUE)
    }
    options$header$start$at <- c(more, options$header$start$at)
  }
  if(length(options$dataset) == 0) options$dataset = seq_along(ans)
  tmp = options$dataset %in% seq_along(ans)
  if(!all(tmp)) warning("dataset",
                        ifelse(sum(!tmp)==1,"","s"),
                        " [",
                        paste0(options$dataset[!tmp], collapse = ",")
                        ,"] can't be found in\n",
                        fileName,
                        call. = FALSE, immediate. = TRUE)
  return(structure(ans[sapply(seq_along(ans), FUN = function(ii) ans[[ii]]$text[["@IFC_dataset"]] %in% options$dataset)],
                   class = "IFC_fcs", fileName = fileName))
}

#' @title FCS Object Data Sets Merging
#' @description
#' Merges FCS data object with various data sets.
#' @param fcs `IFC_fcs` object as extracted by readFCS().
#' @param ... other arguments to be passed.
#' @details data in 'fcs' can contain extra columns named 'import_file' and 'import_subfile', intended to allow file/dataset identification.
#' @return a list of lists, each containing:\cr
#' - header, list of header information corresponding to 'options'\cr
#' - delimiter, unique character used to separate keyword-value pairs\cr
#' - text, list of keyword values,\cr
#' - data, data.frame of values.
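#' @examples
#' \dontrun{
#' # a minimal sketch, assuming "file.fcs" stores several datasets;
#' # FCS_merge_dataset() is internal, hence the ::: accessor
#' fcs <- readFCS("file.fcs", dataset = NULL)
#' merged <- IFC:::FCS_merge_dataset(fcs)
#' head(merged[[1]]$data[, c("import_file", "import_subfile")])
#' }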
#' @keywords internal
FCS_merge_dataset <- function(fcs, ...) {
  dots = list(...)
  display_progress = dots$display_progress
  if(length(display_progress) == 0) display_progress = TRUE
  assert(display_progress, len=1, alw = c(TRUE, FALSE))
  
  L = length(fcs)
  if(L > 1) {
    if(display_progress) {
      pb = newPB(label = "FCS", min = 0, max = L)
      on.exit(endPB(pb))
    }
    features = Reduce(function(x, y) {
      Nx = names(x)
      Ny = names(y)
      com <- Nx[Nx %in% Ny]
      Nxx <- Nx[!Nx %in% Ny]
      Nyy <- Ny[!Ny %in% Nx]
      xx = x[, Nxx, drop = FALSE]
      xx = cbind.data.frame(xx, matrix(NA, nrow = nrow(xx), ncol = length(Nyy)))
      names(xx) = c(Nxx, Nyy)
      yy = y[, Nyy, drop = FALSE]
      yy = cbind.data.frame(yy, matrix(NA, nrow = nrow(yy), ncol = length(Nxx)))
      names(yy) = c(Nyy, Nxx)
      aa = rbind.data.frame(xx[, c(Nxx, Nyy), drop = FALSE], yy[, c(Nxx, Nyy), drop = FALSE], make.row.names = FALSE)
      if(length(com) != 0) {
        if(length(c(Nxx, Nyy)) == 0) {
          aa = structure(rbind.data.frame(x[, com, drop = FALSE],
                                          y[, com, drop = FALSE],
                                          make.row.names = FALSE),
                         names = com)
        } else {
          aa = structure(cbind.data.frame(aa, rbind.data.frame(x[, com, drop = FALSE],
                                                               y[, com, drop = FALSE],
                                                               make.row.names = FALSE)),
                         names = c(Nxx, Nyy, com))
        }
      }
      aa
    },
    lapply(seq_len(L), FUN = function(i) {
      if(display_progress) setPB(pb, value = i, title = "FCS", label = "Merging Data Sets")
      dat = fcs[[i]]$data
      if(!any("import_file" == names(dat))) {
        dat[,"import_file"]=rep(fcs[[i]]$text[["@IFC_file"]], nrow(dat))
        # dat = cbind.data.frame(dat, "import_file"=rep(fcs[[i]]$text[["@IFC_file"]], nrow(dat)))
      }
      if(!any("import_subfile" == names(dat))) {
        dat[,"import_subfile"]=rep(fcs[[i]]$text[["@IFC_dataset"]], nrow(dat))
        # dat = cbind.data.frame(dat, "import_subfile"=rep(fcs[[i]]$text[["@IFC_dataset"]], nrow(dat)))
      }
      dat
    }))
  } else {
    features = fcs[[1]]$data
    if(!any("import_file" == names(features))) {
      features[,"import_file"]=rep(fcs[[1]]$text[["@IFC_file"]], nrow(features))
      # features = cbind.data.frame(features, "import_file"=rep(fcs[[1]]$text[["@IFC_file"]], nrow(features)))
    }
    if(!any("import_subfile" == names(features))) {
      features[,"import_subfile"]=rep(1, nrow(features))
      # features = cbind.data.frame(features, "import_subfile"=rep(1, nrow(features)))
    }
  }
  
  ans = list(list(header=fcs[[1]]$header,
                  delimiter=fcs[[1]]$delimiter,
                  text=fcs[[1]]$text, 
                  data = features))
  class(ans) <- "IFC_fcs"
  attr(ans, "fileName") <- attr(fcs, "fileName")
  bar <- unique(features[, "import_file"])
  if(length(bar) > 1) attr(ans, "Merged_fcs") <- bar
  ans
}

#' @title FCS Object Samples Merging
#' @description
#' Merges FCS data object with various samples.
#' @param fcs `IFC_fcs` object as extracted by readFCS().
#' @param ... other arguments to be passed.
#' @details data in 'fcs' can contain extra columns named 'import_file' and 'import_subfile', intended to allow file/dataset identification.
#' @return a list of lists, each containing:\cr
#' - header, list of header information corresponding to 'options'\cr
#' - delimiter, unique character used to separate keyword-value pairs\cr
#' - text, list of keyword values,\cr
#' - data, data.frame of values.
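#' @examples
#' \dontrun{
#' # a minimal sketch, assuming "file1.fcs" and "file2.fcs" share the same
#' # parameters; FCS_merge_sample() is internal, hence the ::: accessor
#' fcs1 <- readFCS("file1.fcs")
#' fcs2 <- readFCS("file2.fcs")
#' merged <- IFC:::FCS_merge_sample(structure(c(fcs1, fcs2), class = "IFC_fcs",
#'                                            fileName = "file1.fcs"))
#' }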
#' @keywords internal
FCS_merge_sample <- function(fcs, ...) {
  dots = list(...)
  display_progress = dots$display_progress
  if(length(display_progress) == 0) display_progress = TRUE
  assert(display_progress, len=1, alw = c(TRUE, FALSE))
  
  L = length(fcs)
  if(L > 1) {
    if(display_progress) {
      pb = newPB(label = "FCS", min = 0, max = L)
      on.exit(endPB(pb))
    }
    features = Reduce(function(x, y) {
      Nx = names(x)
      Ny = names(y)
      # FIXME should we add a check_names argument to ensure that each fcs sample has exactly the same names ?
      if(sum(nchar(Nx)) > sum(nchar(Ny))) {N = Nx} else {N = Ny}
      names(x) = N
      names(y) = N
      rbind.data.frame(x, y, make.row.names = FALSE)
    },
    lapply(seq_len(L), FUN = function(i) {
      if(display_progress) setPB(pb, value = i, title = "FCS", label = "Merging Samples")
      dat = fcs[[i]]$data
      if(!any("import_file" == names(dat))) {
        dat[,"import_file"]=rep(fcs[[i]]$text[["@IFC_file"]], nrow(dat))
        # dat = cbind.data.frame(dat, "import_file"=rep(fcs[[i]]$text[["@IFC_file"]], nrow(dat)))
      }
      if(!any("import_subfile" == names(dat))) {
        dat[,"import_subfile"]=rep(fcs[[i]]$text[["@IFC_dataset"]], nrow(dat))
        # dat = cbind.data.frame(dat, "import_subfile"=rep(fcs[[i]]$text[["@IFC_dataset"]], nrow(dat)))
      }
      dat
    }))
  } else {
    features = fcs[[1]]$data
    if(!any("import_file" == names(features))) {
      features[,"import_file"]=rep(fcs[[1]]$text[["@IFC_file"]], nrow(features))
      # features = cbind.data.frame(features, "import_file"=rep(fcs[[1]]$text[["@IFC_file"]], nrow(features)))
    }
    if(!any("import_subfile" == names(features))) {
      features[,"import_subfile"]=rep(1, nrow(features))
      # features = cbind.data.frame(features, "import_subfile"=rep(1, nrow(features)))
    }
  }
  
  ans = list(list(header=fcs[[1]]$header,
                  delimiter=fcs[[1]]$delimiter,
                  text=fcs[[1]]$text, 
                  data = features))
  class(ans) <- "IFC_fcs"
  attr(ans, "fileName") <- attr(fcs, "fileName")
  bar <- unique(features[, "import_file"])
  if(length(bar) > 1) attr(ans, "Merged_fcs") <- bar
  ans
}

#' @title FCS Object Converter
#' @description
#' Converts FCS data object to `IFC_data` object.
#' @param fcs `IFC_fcs` object as extracted by readFCS().
#' @param ... other arguments to be passed.
#' @details data in 'fcs' can contain extra columns named 'import_file' and 'import_subfile', intended to allow file/dataset identification.
#' @return A named list of class `IFC_data`, whose members are:\cr
#' -description, a list of descriptive information,\cr
#' -Merged_fcs, character vector of paths of files used to create fcs, if input was merged,\cr
#' -Keywords, a named list of keyword values; only keywords from the 1st 'fcs' segment will be retrieved,\cr
#' -fileName, path of 'fileName' input,\cr
#' -fileName_image, path of the .cif image file 'fileName' refers to,\cr
#' -features, a data.frame of features,\cr
#' -features_def, a list describing how features are defined,\cr
#' -graphs, a list of graphical elements found,\cr
#' -pops, a list describing populations found,\cr
#' -regions, a list describing how regions are defined,\cr
#' -images, a data.frame describing information about images,\cr
#' -offsets, an integer vector of images and masks IFDs offsets,\cr
#' -stats, a data.frame of population counts and percentages relative to parent and total populations,\cr
#' -checksum, a checksum integer.
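#' @examples
#' \dontrun{
#' # a minimal sketch, assuming "file.fcs" exists; FCS_to_data() is internal,
#' # hence the ::: accessor
#' fcs <- readFCS("file.fcs")
#' obj <- IFC:::FCS_to_data(fcs)
#' class(obj)  # "IFC_data"
#' obj$description$ID
#' }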
#' @keywords internal
FCS_to_data <- function(fcs, ...) {
  # create structure
  dots = list(...)
  display_progress = dots$display_progress
  if(length(display_progress) == 0) display_progress = TRUE
  assert(display_progress, len=1, alw = c(TRUE, FALSE))
  min_data = list("description" = list("Assay" = data.frame("date" = NULL, "IDEAS_version" = NULL, "binaryfeatures" = NULL),
                                       "ID" = data.frame("file" = NULL, "creation" = NULL, "objcount" = NULL, "checksum" = NULL),
                                       "Images" = data.frame("name" = NULL, "color" = NULL, "physicalChannel" = NULL, "xmin" = NULL,
                                                             "xmax" = NULL, "xmid" = NULL, "ymid"= NULL, "scalemin"= NULL, "scalemax"= NULL,
                                                             "tokens"= NULL, "baseimage"= NULL, "function"= NULL),
                                       "masks" = data.frame(matrix(character(),nrow = 0, ncol = 3, dimnames = list(list(),list("type","name","def"))))),
                  "Merged_fcs" = character(),
                  "Keywords" = list(),
                  "fileName" = character(),
                  "fileName_image" = character(),
                  "features" = structure(.Data = list(), class = c("data.frame", "IFC_features")),
                  "features_def" = structure(.Data =  list(), class = c("IFC_features_def")),
                  "graphs" = structure(.Data =  list(), class = c("IFC_graphs")),
                  "pops" = structure(.Data =  list(), class = c("IFC_pops")),
                  "regions" = structure(.Data = list(), class = c("IFC_regions")),
                  "images" = structure(.Data = list(), class = c("data.frame", "IFC_images")),
                  "offsets" = structure(.Data = integer(), class = c("IFC_offsets")),
                  "stats" = data.frame(),
                  "checksum" = integer())
  class(min_data) = c("IFC_data")
  
  # define features categories which require no param
  No_Param = c("Time", "Object Number", "Raw Centroid X", "Raw Centroid Y",  "Flow Speed", "Camera Line Number", "Camera Timer", "Objects per mL", "Objects per sec")
  
  features = FCS_merge_dataset(fcs, ...)[[1]]$data
  
  identif = names(features) %in% c("import_file", "import_subfile")
  idx = features[, identif, drop = FALSE]
  if(!"import_file" %in% names(idx)) idx$import_file = fcs[[1]][["@IFC_file"]]
  if(!"import_subfile" %in% names(idx)) idx$import_subfile = 1
  obj_count = as.integer(nrow(features))
  
  multiple = prod(length(unique(idx[, 1])), length(unique(idx[, 2]))) > 1
  # if several files/datasets, create pops to identify which object came from which
  if(multiple) {
    idx$count = seq_len(obj_count)
    all_obj = rep(FALSE, obj_count)
    pops = by(idx, idx[, c("import_file", "import_subfile")], FUN = function(x) {
      if(length(unique(idx[, "import_subfile"])) == 1) {
        name = unique(x$import_file)
      } else {
        name = paste(unique(x$import_file), "dataset", unique(x$import_subfile), sep = "_")
      }
      obj = all_obj
      obj[x$count] <- TRUE
      buildPopulation(name = name, type = "T", color = "White", lightModeColor = "Black", obj = obj)
    })
    pops = pops[sapply(pops, FUN = function(p) length(p) != 0)]
  }
  features = subset(x = features, select = !identif)
  features_def = lapply(names(features), FUN = function(i_feat) {
    # TODO check if it correctly imports linear values
    if(i_feat %in% No_Param) return(buildFeature(name = gsub("LOG$", "LIN", i_feat, ignore.case = TRUE), val = features[, i_feat], def = i_feat))
    return(buildFeature(name = gsub("LOG$", "LIN", i_feat, ignore.case = TRUE), val = features[, i_feat]))
  })
  
  # fill min object
  instrument = sapply(fcs, FUN = function(x) {
    names(x$text) = toupper(names(x$text))
    tmp = x$text$`$CYT`
    if((length(tmp) == 0) || (tmp == "")) return("unk")
    return(tmp)
  })
  FCS_version = sapply(fcs, FUN = function(x) {
    tmp = x$text[["@IFC_FCSversion"]]
    if((length(tmp) == 0) || (tmp == "")) return("unk")
    return(tmp)
  })
  spillover = lapply(fcs, FUN = function(x) {
    names(x$text) = toupper(names(x$text))
    tmp = x$text[c("$SPILLOVER","SPILL","SPILLOVER")]
    tmp = tmp[sapply(tmp, length) != 0]
    if(length(tmp) == 0) return(NULL)
    return(tmp)
  })
  spillover = spillover[sapply(spillover, length) != 0]
  # unwrap twice: once for the list across datasets, once for the list of matching keywords
  if(is.list(spillover) && length(spillover) == 1) spillover = spillover[[1]]
  if(is.list(spillover) && length(spillover) == 1) spillover = spillover[[1]]
  if(!is.list(spillover) && length(spillover) == 1 && spillover == "") spillover = NULL
  if(length(spillover) != 0) {
    features_names = parseFCSname(names(features))
    spillover = try(convert_spillover(spillover), silent = TRUE)
    if(inherits(spillover, "try-error")) {
      spillover = NULL
    } else {
      rownames(spillover) <- names(features)[apply(sapply(colnames(spillover), FUN = function(x) {
        x==features_names$PnN
      }), 2, FUN = function(i) which(i)[1])]
    }
  }
  # checksum = sapply(fcs, FUN = function(x) {
  #   tmp = x$description[[1]]$`$ENDDATA`
  #   if((length(tmp) == 0) || (tmp == "")) return("unk")
  #   return(tmp)
  # })
  
  min_data$fileName = normalizePath(attr(fcs, "fileName"), winslash = "/", mustWork = FALSE)
  bar <- unique(idx[, "import_file"])
  if(length(bar) > 1) min_data$Merged_fcs <- bar
  min_data$Keywords <- fcs[[1]]$text
  min_data$description$Assay = data.frame(date = file.info(min_data$fileName)$mtime, FCS_version = paste0(FCS_version, collapse = ", "), stringsAsFactors = FALSE)
  min_data$description$ID = data.frame(file = min_data$fileName,
                                       creation = format(file.info(min_data$fileName)$ctime, format = "%d-%b-%y %H:%M:%S"),
                                       objcount = obj_count,
                                       stringsAsFactors = FALSE)
  min_data$description$FCS = min_data$description$ID
  min_data$checksum = integer()
  min_data$features = structure(data.frame("Object Number" = 0:(obj_count-1), check.names = FALSE), class = c("data.frame", "IFC_features"))
  min_data$features_def = structure(list(buildFeature(name = "Object Number", val = 0:(obj_count-1), def = "Object Number")[1:4]), names = "Object Number", class = c("list", "IFC_features_def"))
  # foo = grep("^\\$P|^\\@P|^\\$D|^@D|^flowCore", names(fcs@description), value = TRUE, invert = TRUE)
  # min_data$info = fcs@description[foo]
  min_data$description$FCS = c(min_data$description$ID, list(instrument = paste0(instrument, collapse = ", "), spillover = spillover))
  min_data = suppressWarnings(IFC::data_add_features(obj = min_data, features = features_def))
  min_data = IFC::data_add_pops(obj = min_data,
                                pops = list(buildPopulation(name = "All", type = "B",
                                                            color = "White", lightModeColor = "Black",
                                                            obj = rep(TRUE, obj_count))),
                                display_progress = display_progress)
  # min_data$features_comp = min_data$features[, grep("^FS.*$|^SS.*$|LOG|^Object Number$|TIME", names(min_data$features), value = TRUE, invert = TRUE, ignore.case = TRUE)]
  if(multiple) {
    min_data = IFC::data_add_pops(obj = min_data, pops = pops, display_progress = display_progress)
  }
  K = class(min_data$pops)
  min_data$pops = lapply(min_data$pops, FUN = function(p) {
    attr(p, "reserved") <- TRUE
    return(p)
  })
  class(min_data$pops) <- K
  min_data$stats = get_pops_stats(min_data$pops, obj_count)
  return(min_data)
}

#' @title FCS File Reader
#' @description
#' Extracts data from Flow Cytometry Standard (FCS) Files.
#' @param fileName path(s) of file(s). If multiple files are provided they will be merged and
#' populations will be created to identify each single file within the returned `IFC_data` object.
#' @source Data File Standard for Flow Cytometry, version FCS 3.1 from Spidlen J. et al. available at \doi{10.1002/cyto.a.20825}.
#' @param ... other arguments to be passed to readFCS function, with the exception of 'options$text_only'.
#' @return A named list of class `IFC_data`, whose members are:\cr
#' -description, a list of descriptive information,\cr
#' -Merged_fcs, character vector of paths of files used to create fcs, if input was merged,\cr
#' -Keywords, a named list of keyword values; only keywords from the 1st 'fcs' segment will be retrieved,\cr
#' -fileName, path of 'fileName' input,\cr
#' -fileName_image, path of the .cif image file 'fileName' refers to,\cr
#' -features, a data.frame of features,\cr
#' -features_def, a list describing how features are defined,\cr
#' -graphs, a list of graphical elements found,\cr
#' -pops, a list describing populations found,\cr
#' -regions, a list describing how regions are defined,\cr
#' -images, a data.frame describing information about images,\cr
#' -offsets, an integer vector of images and masks IFDs offsets,\cr
#' -stats, a data.frame of population counts and percentages relative to parent and total populations,\cr
#' -checksum, a checksum integer.
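#' @examples
#' \dontrun{
#' # a minimal sketch, assuming "file1.fcs" and "file2.fcs" exist; both files
#' # are merged and populations are created to identify each input file
#' obj <- ExtractFromFCS(fileName = c("file1.fcs", "file2.fcs"))
#' obj$stats
#' }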
#' @export
ExtractFromFCS <- function(fileName, ...) {
  # create structure
  dots = list(...)
  dots = dots[!(names(dots) %in% c("fcs","text_only"))]
  display_progress = dots$display_progress
  if(length(display_progress) == 0) display_progress = TRUE
  assert(display_progress, len=1, alw = c(TRUE, FALSE))
  fileName = enc2native(normalizePath(path = fileName, winslash = "/", mustWork = TRUE))
  
  # read the fcs file and extract features and description
  L = length(fileName)
  if(display_progress) {
    pb = newPB(label = "reading files", min = 0, max = L)
    on.exit(endPB(pb))
  }
  fcs = lapply(seq_len(L), FUN = function(i_file) {
    if(display_progress) setPB(pb, value = i_file, title = "Extracting FCS", label = "reading files")
    do.call(what = FCS_merge_dataset, args = c(dots, list(fcs = quote(do.call(what = readFCS,  args=c(dots, list(fileName = fileName[[i_file]])))))))[[1]]
  })
  attr(fcs, "fileName") <- fileName[1]
  do.call(what = FCS_to_data, args = c(dots, list(fcs = quote(fcs))))
}

#' @title FCS File Writer
#' @description
#' Writes an `IFC_data` object to a Flow Cytometry Standard (FCS) file.
#' @param obj an `IFC_data` object with features extracted.
#' @param write_to pattern used to export file.
#' Placeholders, like "\%d/\%s_fromR.\%e", will be substituted:\cr
#' -\%d: with full path directory of 'obj$fileName'\cr
#' -\%p: with first parent directory of 'obj$fileName'\cr
#' -\%e: with extension of 'obj$fileName' (without leading .)\cr
#' -\%s: with shortname from 'obj$fileName' (i.e. basename without extension).\cr
#' Exported file extension will be deduced from this pattern. Note that it has to be .fcs.
#' @param overwrite whether to overwrite file or not. Default is FALSE.
#' Note that if TRUE, file will only be overwritten when the path deduced from 'write_to' differs from 'obj$fileName'.
#' Otherwise, you will get an error saying that overwriting source file is not allowed.\cr
#' Note also that an original file will never be overwritten.
#' @param delimiter an ASCII character to separate the FCS keyword-value pairs. Default is "/".
#' @param cytometer string, if provided it will be used to fill the $CYT keyword.\cr
#' However, when missing, $CYT will be filled with obj$description$FCS$instrument if found, or "Image Stream" otherwise.
#' @param ... other arguments to be passed. Keyword-value pairs can be passed here (see examples).
#' @return invisibly returns full path of exported file.
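#' @examples
#' \dontrun{
#' # a minimal sketch, assuming 'obj' is an `IFC_data` object with features
#' # extracted; exports next to the source file with a "_fromR" suffix while
#' # passing an extra keyword-value pair through '...'
#' ExportToFCS(obj = obj, write_to = "%d/%s_fromR.fcs", "$SRC" = "IFC package")
#' }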
#' @export
ExportToFCS <- function(obj, write_to, overwrite = FALSE, delimiter="/", cytometer = "Image Stream", ...) {
  dots = list(...)
  # change locale
  locale_back = Sys.getlocale("LC_ALL")
  on.exit(suppressWarnings(Sys.setlocale("LC_ALL", locale = locale_back)))
  suppressWarnings(Sys.setlocale("LC_ALL", locale = "English"))
  now = format(Sys.time(), format = "%d-%b-%y %H:%M:%S")
  
  old_enc <- options("encoding")
  on.exit(options(old_enc), add = TRUE)
  options("encoding" = "UTF-8")
  
  # check mandatory param
  assert(obj, cla = "IFC_data")
  if(length(obj$pops)==0) stop("please use argument 'extract_features' = TRUE with ExtractFromDAF() or ExtractFromXIF() and ensure that features were correctly extracted")
  if(missing(write_to)) stop("'write_to' can't be missing")
  assert(write_to, len = 1, typ = "character")
  assert(overwrite, len = 1, alw = c(TRUE, FALSE))
  assert(cytometer, len = 1, typ = "character")
  # assert(display_progress, c(TRUE, FALSE))
  raw_delimiter = charToRaw(delimiter)
  if(length(raw_delimiter) != 1) stop("'delimiter' should be of size 1")
  if((raw_delimiter == 0x00) || (raw_delimiter > 0x7E)) stop("'delimiter' should be an [0x01-0x7E (1-126)] ASCII character")
  delimiter_esc = paste0(delimiter, delimiter)
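  # per FCS specifications, a literal delimiter occurring inside a keyword or
  # value is escaped by doubling it, e.g. with delimiter "/" the value "a/b"
  # is written as "a//b"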
  
  # tests if file can be written
  fileName = normalizePath(obj$fileName, winslash = "/", mustWork = FALSE)
  title_progress = basename(fileName)
  splitf_obj = splitf(fileName)
  splitp_obj = splitp(write_to)
  write_to = formatn(splitp_obj, splitf_obj)
  file_extension = getFileExt(write_to)
  assert(file_extension, len = 1, alw = "fcs")
  if(any(splitp_obj$channel > 0)) message("'write_to' has %c argument but channel information can't be retrieved with ExportToFCS()")
  if(any(splitp_obj$object > 0)) message("'write_to' has %o argument but object information can't be retrieved with ExportToFCS()")
  
  overwritten = FALSE
  if(file.exists(write_to)) {
    write_to = normalizePath(write_to, winslash = "/", mustWork = FALSE)
    if(!overwrite) stop(paste0("file ",write_to," already exists"))
    if(tolower(fileName) == tolower(write_to)) stop("you are trying to overwrite source file which is not allowed")
    tryCatch({
      fcs = readFCS(fileName = write_to, text_only = TRUE)
    }, error = function(e) {
      stop(paste0(write_to, "\ndoes not seem to be well formatted")) 
    })
    if(length(fcs[[1]]$text[["@IFC_version"]]) == 0) stop("you are trying to overwrite an original file which is not allowed")
    tmp_file = normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
    overwritten = TRUE
  }
  
  dir_name = dirname(write_to)
  if(!dir.exists(dir_name)) if(!dir.create(dir_name, recursive = TRUE, showWarnings = FALSE)) stop(paste0("can't create\n", dir_name))
  file_w = ifelse(overwritten, tmp_file, write_to)
  tryCatch(suppressWarnings({
    towrite = file(description = file_w, open = "wb")
  }), error = function(e) {
    stop(paste0(ifelse(overwritten,"temp ","'write_to' "), "file: ", file_w, "\ncan't be created: check name ?"))
  })
  close(towrite)
  write_to = normalizePath(write_to, winslash = "/", mustWork = FALSE)
  
  # defines some variables
  pkg_ver = paste0(unlist(recursive = FALSE, use.names = FALSE, packageVersion("IFC")), collapse = ".")
  # is_fcs = length(obj$description$FCS)!=0
  
  # init header
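  # per FCS 3.0, the HEADER is the 6-byte version string, 4 spaces, then six
  # 8-character right-justified ASCII offsets: TEXT beg/end, DATA beg/end,
  # ANALYSIS beg/end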
  header = list(version  = "FCS3.0",
                space    = "    ",
                text_beg = "      58",
                text_end = "        ",
                data_beg = "        ",
                data_end = "        ",
                anal_beg = "       0",
                anal_end = "       0")
  
  # we modify features to add populations
  features = fastCbind(obj$features[, setdiff(names(obj$features), names(obj$pops)), drop = FALSE],
                       sapply(names(obj$pops), simplify = FALSE, FUN = function(p) obj$pops[[p]]$obj))
  # need to replace non finite values by something; IDEAS is using 0 so we use 0 also
  # TODO maybe replace -Inf by features min and +Inf by features max ?
  features = as.data.frame(apply(features, 2, cpp_replace_non_finite), stringsAsFactors = TRUE)
  
  # determines length of text_segment2
  # comma (ASCII 0x2C) is not allowed in feature names according to FCS specifications so it is replaced by a space
  feat_names = parseFCSname(gsub(","," ",names(features),fixed=TRUE))
  N = feat_names$PnN
  tmp = duplicated(toupper(N))
  if(any(tmp)) stop("$PnN should be unique\n\t- ", paste0(N[tmp], collapse="\n\t- "))
  S = feat_names$PnS
  # S[S == ""] <- names(features)[S == ""] # should we add PnS when not here ? IDEAS does not export $PnS
  L = length(N)
  text2_length = 0
  text_segment2 = lapply(seq_len(L), FUN = function(i) {
    # each time we write /$PnN/<feature_name>/$PnB/32/$PnE/0, 0/$PnG/1/$PnR/<feature_max_value> and /$PnS/<feature_alt-name> if PnS is not empty
    # bar = 1 + diff(range(features[, i], na.rm = TRUE))
    bar = ceiling(max(features[, i], na.rm = TRUE)) # FIXME since we write type "F" this has no importance, shall we use 262144, as it is commonly used ?
    foo = c(paste0("$P",i,"N"), N[i],
            paste0("$P",i,"B"), "32",
            paste0("$P",i,"E"), "0, 0",
            paste0("$P",i,"G"), "1",
            paste0("$P",i,"R"), bar)
    if(S[i] != "") foo = c(foo, paste0("$P",i,"S"), S[i])
    if(any(sapply(foo, FUN = function(x) substr(x,1,1) == delimiter))) stop("keyword-value pairs should not start with 'delimiter'[",delimiter,"]:\n\t- ",
                                                                            paste0(paste0(foo[rep_len(c(TRUE, FALSE), length(foo))], "[", foo[rep_len(c(FALSE, TRUE), length(foo))], "]"), collapse="\n\t- "))
    foo = gsub(pattern = delimiter, x = foo, replacement = delimiter_esc, fixed=TRUE)
    foo = charToRaw(paste(c("", foo), collapse = delimiter))
    text2_length <<- text2_length + length(foo)
    return(foo)
  })
  cyt = obj$description$FCS$instrument
  if((length(cyt) == 0 ) || (cyt == "")) cyt = "Image Stream"
  if(!missing(cytometer)) cyt = cytometer
  if(length(obj$fileName_image) == 0) obj$fileName_image = ""
  
  # init text segment with mandatory + custom parameters #* = mandatory
  text_segment1 = list("$BEGINSTEXT" = "0",                                                      #*
                       "$ENDSTEXT" = "0",                                                        #*
                       "$BEGINANALYSIS" = "0",                                                   #*
                       "$ENDANALYSIS" = "0",                                                     #*
                       "$BYTEORD" = c("4,3,2,1", "1,2,3,4")[(.Platform$endian == "little") + 1], #*
                       "$DATATYPE" = "F",                                                        #*
                       "$MODE" = "L",                                                            #*
                       "$NEXTDATA" = "0",                                                        #*
                       "$TOT" = num_to_string(obj$description$ID$objcount),                      #*
                       "$PAR" = L,                                                               #*
                       #* BEGINDATA and ENDDATA are also mandatory and will be added afterwards
                       #* PnB, PnE, PnN, and PnR are also mandatory and are part of text_segment2
                       #* PnG is not mandatory but will be filled in part 2
                       "$CYT" = cyt,
                       "@IFC_fileName" = obj$fileName,
                       "@IFC_fileName_image" = obj$fileName_image,
                       "@IFC_version" = pkg_ver,
                       "@IFC_date" = now
  )
  
  # gather keywords in priority order 
  text_segment1 = c(text_segment1, dots, obj$Keywords)
  # removes keywords whose values are NULL
  text_segment1 = text_segment1[sapply(text_segment1, length) != 0]
  # removes duplicated keywords (priority order is important here)
  text_segment1 = text_segment1[!duplicated(toupper(names(text_segment1)))]
  # removes not allowed keywords (e.g. in text_segment2 ($PnN, $PnS, $PnB, $PnE, $PnG, $PnR) or "")
  text_segment1 = text_segment1[setdiff(names(text_segment1),c(""))]
  text_segment1 = text_segment1[!grepl("^\\$P\\d+[N|S|B|E|G|R]$", names(text_segment1), ignore.case = TRUE)]
  
  # determines length of DATA segment: 4 bytes per value since $DATATYPE is "F" and $PnB is 32
  data_length = 4 * L * nrow(features)
  
  # determines length of mandatory param
  N = names(text_segment1)
  text1_length = sum(c(nchar("$BEGINDATA"), 2, # 2 for additional delimiters
                       nchar("$ENDDATA"),   2, # 2 for additional delimiters, there is already one at the beg of text2
                       sapply(seq_along(text_segment1), FUN = function(i) {
                         foo = c(N[i], text_segment1[[i]])
                         if(any(sapply(foo, FUN = function(x) substr(x,1,1) == delimiter))) stop("keyword-value pairs should not start with 'delimiter' [",delimiter,"]:\n\t- ",
                                                                                                 paste0(foo[1],"[",foo[2],"]"))
                         v = charToRaw(gsub(delimiter, delimiter_esc, N[i], fixed=TRUE))
                         if(any(v < 0x20 | v >= 0x7F)) stop("keyword contains invalid ASCII character, valid are [0x20-0x7E (32-126)]\n\t- ",
                                                            N[i], "[",paste0(paste0("0x",v), collapse = ","),"]")
                         length(charToRaw(gsub(delimiter, delimiter_esc, text_segment1[[i]], fixed=TRUE))) +
                         length(v) + 2 # 2 for additional delimiters
                       }), text2_length,
                       nchar(paste0(header, collapse = ""))
  ))
  
  # compute missing offsets 
  # TEXT segment end
  # determining text_end is tricky since it depends on its own length:
  # TEXT contains $BEGINDATA/$ENDDATA whose digit counts depend on where TEXT
  # ends, so we use a recursive function to reach a fixed point
  f = function(x, text_length) {
    data_beg = x + 1                      # data starts just after TEXT segment end
    data_end = data_beg + data_length - 1 # mirrors the BEGINDATA / ENDDATA computation below
    ans = text_length + nchar(num_to_string(data_beg)) + nchar(num_to_string(data_end))
    if(ans != x) ans = f(x = ans, text_length = text_length)
    return(ans)
  }
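  # a hypothetical walk-through: with text1_length = 1000 and data_length = 4000,
  # f(1000, 1000) tries data_beg = 1001 / data_end = 5000, i.e. 1000 + 4 + 4 = 1008;
  # retrying with x = 1008 gives data_beg = 1009 / data_end = 5008, still 4 + 4
  # digits, so ans == x and 1008 is returned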
  text_end = f(x = text1_length, text_length = text1_length)
  if(text_end >= 1e8) stop("primary TEXT segment is too big")
  header$text_end = sprintf("%8i", text_end)
  
  # BEGINDATA / ENDDATA
  data_beg = text_end + 1               # +1 because data start just after text segment end
  data_end = data_beg + data_length - 1 # -1 because last data byte is at minus one from total length
  if((data_beg >= 1e8) || (data_end >= 1e8)) {
    header$data_beg = sprintf("%8i", 0)
    header$data_end = sprintf("%8i", 0)
  } else {
    header$data_end = sprintf("%8i", data_end)
    header$data_beg = sprintf("%8i", data_beg)
  }
  text_segment1 = c(text_segment1, list("$BEGINDATA" = num_to_string(data_beg)))
  text_segment1 = c(text_segment1, list("$ENDDATA"   = num_to_string(data_end)))
  
  towrite = file(description = file_w, open = "wb")
  tryCatch({
    # writes header
    writeBin(object = charToRaw(paste0(header, collapse="")), con = towrite)
    
    # writes text segment1
    N = names(text_segment1)
    lapply(seq_along(text_segment1), FUN = function(i) {
      writeBin(object = charToRaw(paste(c("", 
                                          gsub(delimiter, delimiter_esc, N[i], fixed=TRUE),
                                          gsub(delimiter, delimiter_esc, text_segment1[[i]], fixed=TRUE)),
                                        collapse = delimiter)), con = towrite)
    })
    
    # writes text segment2
    lapply(seq_along(text_segment2), FUN = function(i) {
      writeBin(object = text_segment2[[i]], con = towrite)
    })
    writeBin(object = charToRaw(delimiter), con = towrite) # we add final delimiter after the last keyword-value
    
    # export features values in data segment
    apply(features, 1, FUN = function(x) writeBin(x, con = towrite, size = 4))
    
    # FIXME compute a real CRC; per FCS specifications, eight ASCII "0" characters are allowed when CRC is not computed
    writeBin(object = rep(as.raw(0x30), 8), con = towrite)
    
  }, error = function(e) {
    stop(paste0("Can't create 'write_to' file.\n", write_to,
                ifelse(overwritten,"\nFile was not modified.\n","\n"),
                "See pre-file @\n", normalizePath(file_w, winslash = "/"), "\n",
                e$message), call. = FALSE)
  }, finally = close(towrite))
  if(overwritten) {
    mess = paste0("\n######################\n", write_to, "\nhas been successfully overwritten\n")
    if(!suppressWarnings(file.rename(to = write_to, from = file_w))) { # try file renaming which is faster
      if(!file.copy(to = write_to, from = file_w, overwrite = TRUE)) { # try file.copy if renaming is not possible
        stop(paste0("Can't copy temp file@\n", normalizePath(file_w, winslash = "/"), "\n",
                    "Can't create 'write_to' file.\n", write_to,
                    "\nFile was not modified.\n"), call. = FALSE)
      } else {
        suppressWarnings(file.remove(file_w))
      }
    }
  } else {
    mess = paste0("\n######################\n", write_to, "\nhas been successfully exported\n")
  }
  message(mess)
  return(invisible(write_to))
}
