point: Reading, Processing, and Analysing Raw Ion Count Data

Documented in read_IC read_meta

#' Read raw ion count data
#'
#' \code{read_IC()} is designed to obtain the numerical data associated with ion
#' counts and minimal set meta-data.
#' \code{read_meta()} can be used to specifically retrieve the meta date
#' associated with ion count data analysis, thereby loading specifications
#' related to the optics, the primary and secondary ion beams, and the mass
#' spectrometer.
#'
#' Ion count data consists of time-incremented integer values. These functions
#' are currently only supported for data generated by a NanoSIMS50L. Raw ion
#' count data and accompanying is extracted and collated into a single tibble
#' from text files with the extensions \emph{.is_txt} \emph{.chk_is} and
#' \emph{.stat}, respectively. These files can be found in the directories
#' associated with the SIMS measurements.
#'
#' @param directory A path or connection to a directory containing raw ion count
#' data txt  files.
#' @param meta Logical indicating whether to include full meta-data.
#' @param hide Logical indicating whether metadata is included as columns
#'  \code{FALSE} or as an attribute of the tibble \code{TRUE}.
#'
#'
#' @return A \code{tibble::\link[tibble:tibble]{tibble}} containing raw ion
#' count data and metadata.
#' @export
#' @examples
#' # Use point_example() to access the examples bundled with this package
#'
#' read_IC(point_example("2018-01-19-GLENDON"))
read_IC <- function(directory, meta = FALSE, hide = TRUE){

  # List files
  ls_IC <- read_validator(directory, "is_txt")[["is_txt"]]

  # Collecting measurement data
  tb_IC <- vroom::vroom(
    ls_IC,
    comment = "B",
    delim = "\t",
    skip = 1,
    col_types = "-dd",
    col_select = c("file.nm", t.nm = .data$X, N.rw = .data$Y),
    na = c("X", "Y"),
    id = "file.nm",
    .name_repair = "minimal"
  ) %>%
    tidyr::drop_na() %>%
    dplyr::mutate(
      file.nm =
        dplyr::recode(.data$file.nm, !!! rlang::set_names(names(ls_IC), ls_IC))
    )

  # meta data names according to Cameca
  point_nms <- dplyr::filter(
    point::names_cameca,
    .data$extension == ".is_txt",
    .data$use == "meta"
  )

  tb_meta <- point_lines(ls_IC, pattern = "B", sep = "\\=", id = "num.mt") %>%
    # meta data names
    dplyr::rename(rlang::set_names(point_nms$cameca, nm = point_nms$point)) %>%
    dplyr::mutate(
      species.nm = stringr::str_extract(.data$mass.mt, "(?<=\\().+?(?=\\))"),
      tc.mt = readr::parse_number(.data$tc.mt)
     )

  # vector of detector numbering
  vc_num <- rep(
    # number of detectors
    1:max(tb_meta$num.mt),
    # number of measurements per detector per analysis
    each = nrow(tb_IC) / length(ls_IC) / max(tb_meta$num.mt),
    # total number of measurements
    length.out = nrow(tb_IC)
  )

  tb_rw <- dplyr::left_join(
    tibble::add_column(tb_IC, num.mt = vc_num),
    tb_meta,
    by = c("file.nm", "num.mt")
  )

  # extended meta-data
  if (isTRUE(meta)) {
    suppressMessages(
      tb_rw <- rlang::list2(tb_rw, !!! read_meta(directory)) %>%
        purrr::reduce(dplyr::left_join)
    )
    # Add block number
    tb_rw <- dplyr::group_by(tb_rw, .data$file.nm, .data$species.nm)%>%
      dplyr::mutate(bl.nm = dplyr::ntile(n = as.numeric(.data$bl_num.mt))) %>%
      dplyr::ungroup()
  }
  # hide meta-data
  if (isTRUE(hide)) tb_rw <- fold(tb_rw, type = ".mt")
  tb_rw
  }
#' @rdname read_IC
#'
#' @export
read_meta <- function(directory) {

  # Check validity of directory
  ls_files <- read_validator(directory)
  # vector of cameca variable names
  vc_meta <- dplyr::filter(
    point::names_cameca,
    .data$extension == ".chk_is",
    .data$format == "line"
  )

  # optics set-up
  meta_join <- function(meta) {
    tidyr::pivot_longer(meta, -c(.data$file.nm, .data$id), names_to = "meta")
  }
  tb_ll <- purrr::map(
    vc_meta$cameca,
    ~point_lines(
      ls_files[["chk_is"]],
      pattern = .x,
      sep = "\\:",
      delim = "/"
      )
    ) %>%
    purrr::compact() %>%
    purrr::map(meta_join) %>%
    dplyr::bind_rows() %>%
    dplyr::distinct(.data$file.nm, .data$meta, .data$value, .keep_all = TRUE) %>%
    dplyr::select(-.data$id) %>%
    tidyr::pivot_wider(names_from = "meta")

  # PHD
  tb_phd <- point_table(
    directory,
    pattern_begin = "Phd Centering Results",
    pattern_end = "E0S Centering Results",
    file_type = "chk_is",
    col_names = c("", "num.mt", "M_PHD.mt", "SD_PHD.mt", "EMHV.mt"),
    col_types = "-cddd",
    nudge_top = 1,
    nudge_tail = -3
  ) %>%
    dplyr::mutate(num.mt = readr::parse_number(.data$num.mt))

  dplyr::rename(
    tb_ll,
    dplyr::any_of(rlang::set_names(vc_meta$cameca, nm = vc_meta$point))
  ) %>%
    dplyr::mutate(
      # Add measurement number
      n.rw = as.numeric(.data$`bl_num.mt`) * as.numeric(.data$`meas_bl.mt`),
      # Add electron detector type (EM or FC)
      det_type.mt =  dplyr::if_else("FC_start.mt" %in% colnames(.), "FC", "EM")
    ) %>%
    list(tb_phd)
  }

#' Get path to point example
#'
#' This function comes from the package `readr`, and has been modified to access
#' the bundled datasets in directory `inst/extdata` of `point`. This
#' function make them easy to access. This function is modified from
#' \code{\link[readr:readr_example]{readr_example}} of the package
#' \code{\link[readr]{readr}}.
#'
#' @param path Name of file. If `NULL`, the example files will be listed.
#' @export
#' @examples
#' point_example()
#' point_example("2018-01-19-GLENDON")
point_example <- function(path = NULL) {
  if (is.null(path)) {
    dir(system.file("extdata", package = "point"))
  } else {
    system.file("extdata", path, package = "point", mustWork = TRUE)
  }
}

#' Check if directory is suitable for point
#'
#' This function checks whether the necessary files for the `point` read
#' functions are included in the directory.
#'
#' @param directory A path or connection to a directory containing raw ion count
#' data files.
#' @param types Regular expression for the required file extensions. Default
#' searches for files ending with .is_txt, .chk_is, and .stat
#'
#' @return A logical indicating whether the directory is suitable for `point`
#' @export
#' @examples
#' ICdir_chk(point_example("2018-01-19-GLENDON"))
ICdir_chk <-function(directory, types = c("is_txt", "chk_is", "stat")){
  # types <- paste0(".", types)
  # check if type is valid
  sys_types <- c("is_txt", "chk_is", "stat") %>% rlang::set_names()
  if (any(types %in% sys_types)) {
    types <- sys_types[sys_types %in% types]
  } else {
    stop("Unknown extension.", call. = FALSE)
  }
  # directory name if also file name
  dir_nm <- stringr::str_extract(
    directory,
    stringr::str_c("(?<=", dirname(directory), "/)(.)+")
  )
  ls_files <- fs::dir_ls(directory)%>%
    purrr::keep(stringr::str_detect(., pattern = dir_nm))
  ls_names <- unique(fs::path_ext_remove(fs::path_file(ls_files))) %>%
    purrr::keep(stringr::str_detect(., pattern = "(_[:digit:]+_[:digit:]+)$"))
  ls_types <- purrr::cross(list(directory, ls_names, ext = types)) %>%
    purrr::map_chr(purrr::lift(fs::path)) %>%
    rlang::set_names(nm = rep(ls_names, dplyr::n_distinct(types)))

  if (length(ls_types > 0) & all(ls_types %in% ls_files)) {
    # makes grouped list
    split(ls_types, rep(names(types), each = dplyr::n_distinct(ls_names)))
  } else {
    FALSE
  }
}

#' Access and hide IC metadata
#'
#' \code{unfold()} helps unpack metadata associated with ion count
#' data loaded with \code{read_IC()}. \code{fold()} does the opposite an hides
#' the metadata as attribute of the tibble.
#'
#' @param df A tibble containing ion count data along any point of the point-
#' workflow
#' @param type A character string identifying the metadata (default:
#' \code{"metadata"})
#' @param merge Logical dictating whether metadata is joined to the tibble or
#' returned as a separate file.
#' @param meta Additional tibble containing the metadata for storage along the
#' main IC data.
#'
#' @return A tibble with metadata as an attribute, columns or as a seperate
#' tibble.
#' @export
#' @examples
#' tb_rw <- read_IC(point_example("2018-01-19-GLENDON"), hide = TRUE)
#'
#' # Unfold metadata
#' unfold(tb_rw, merge = FALSE)
unfold <- function(df, type = "metadata", merge = TRUE) {
  # no attribute of name type return unchanged data
  if (is.null(attr(df, type))) {
    warning("Attribute unavailable.", call. = FALSE)
    return(df)
  }
  meta <- attr(df, type)
  vars <- dplyr::select(meta, dplyr::ends_with(".nm")) %>%
    colnames()
  vars <- vars[which(vars %in% colnames(df))]
  if (merge) return(dplyr::left_join(df, meta, by = vars)) else return(meta)
}
#' @rdname unfold
#'
#' @export
fold <- function(df, type, meta = NULL) {

  vc_type <- c(`metadata` = ".mt", `rawdata` = ".rw", `modeldata` = ".ml")
  vc_type <- vc_type[vc_type %in% type]

  if (is.null(meta)){
    tb <- dplyr::select(df, -c(dplyr::ends_with(type)))
    ls_tb <- purrr::map(
      vc_type,
      ~dplyr::select(df, dplyr::ends_with(".nm") | dplyr::ends_with(.x))
    )
    ls_tb[[length(vc_type) + 1]] <- (tb)
  } else {
    ls_tb <- rlang::list2(metadata = meta, df)
  }
  purrr::reduce2(rev(ls_tb), rev(names(vc_type)), write_attr)
}

#-------------------------------------------------------------------------------
# Function for testing and validation (NOT EXPORTET)
#-------------------------------------------------------------------------------
# Validation function to check for empty files or files with empty columns
read_validator <- function(directory, types = c("is_txt", "chk_is", "stat")){

  # Argument class check
  stopifnot(fs::is_dir(directory))

  # Check if directory contains files
  if (length(dir(directory)) == 0) {
    stop("`directory` does not contain any files.", call. = FALSE)
  }
  # Check if directory contains specified file types
  if (isFALSE(ICdir_chk(directory, types))) {
    stop(
      paste0("`directory` does not contain required filetypes: .is_txt,",
             " .chk_is, and .stat."),
      call. = FALSE
    )
  } else {
    ls_files <- ICdir_chk(directory, types)
  }
  # Length check of txt files
  if ("is_txt" %in% types & any(missing_text(ls_files[["is_txt"]]) == 0)) {
   good <- missing_text(ls_files[["is_txt"]]) > 0
   ls_files[["is_txt"]] <- ls_files[["is_txt"]][good]
   warning("Empty txt file removed.", call. = FALSE)
  } else {
  return(ls_files)
  }
}

# Row scanner determine number of rows
row_scanner <- function(ls, reg_expr, return_line = FALSE, nudge = 0) {
  lines <- vroom::vroom_lines(ls)
  pos_line <- stringr::str_which(lines, reg_expr)
  ext_line <- lines[pos_line + nudge]
  # Are these lines identical ?
  if (isTRUE(return_line)) {
    if (length(unique(ext_line)) > 1) {
      warning("Column names are not equal.", call. = FALSE)
    }
    col_nms <- stringr::str_split(
      unique(ext_line),
      "\\s(?=[[:upper:]])"
    )[[1]] %>%
      stringr::str_trim()
    # empty strings
    nm_empty <- stringi::stri_isempty(col_nms)
    col_nms[nm_empty] <- paste0("X", seq_along(nm_empty))[nm_empty]
    list(pos_line, col_nms)
  } else {
    pos_line
  }
}

# File validator. Empty text files
missing_text <- function(files) {
  purrr::map_dbl(files, ~length(vroom::vroom_lines(.x, n_max = 2)))
}

write_attr <- function(df1, df2, nm) {
  attr(df1, nm) <- df2
  df1
}

# extracting single lines from cameca
point_lines <- function(files, pattern = NULL, position = NULL, sep = NULL,
                        delim = ":", id = "id") {
  # names files
  file_nms <- names(files)
  # load all lines
  files <- vroom::vroom_lines(files)

  # filter lines
  if (!is.null(position)) {
    files <- files[position]
  } else if (!is.null(pattern)) {
    files <- stringr::str_subset(
      files,
      pattern =
        stringr::str_c("\\Q", pattern, "\\E", "\\s*", sep, collapse = "|"))
  }

  # short cut if pattern does not exist
  if (length(files) == 0) return(NULL)

  # line numbers
  file_num <- rep(1: (length(files) / length(file_nms)),  length(file_nms))

  # update names if multiple rows are extracted per file
  if (length(files) > length(file_nms)) {
    file_nms <- rep(file_nms, each = length(files) %/% length(file_nms))
  }

  # replace NAs
  files <- stringr::str_replace_all(
    files,
    pattern = "N\\/A",
    replacement = "NA"
  )

  # extract column names with regex
  col_nms <- stringr::str_extract_all(
    files,
    paste0("(?<=(\\", delim, "|^))(.)+?(?=(", sep ,"|$))")
    ) %>%
    purrr::flatten_chr() %>%
    stringr::str_trim() %>%
    unique()

  # regex column names
  remove_reg <- stringr::str_c(
    "(\\Q", col_nms, "\\E\\s*", sep, ")",
    collapse = "|"
  )
  # extract column names regex from output to obtain values
  vals <- stringr::str_remove_all(files, paste0(remove_reg, "|\\s"))
  # create appropriate value separators
  separators <- rep(
    c(rep(delim, length(pattern) - 1), "\n"),
    length(files) / length(pattern)
  )
  vals <- stringr::str_c(vals, separators, collapse = "")
  vroom::vroom(
    I(vals),
    delim = delim,
    # default to character
    col_types = vroom::cols(.default = vroom::col_character()),
    col_names = col_nms,
    show_col_types = FALSE
  ) %>%
    tibble::add_column(
      file.nm = file_nms,
      {{id}} := file_num,
      .before = col_nms[1]
    )

  }

# function to read CAMECA output to validate point output
point_table <- function(directory, pattern_begin, table_depth,
                        pattern_end = NULL, file_type, col_types = NULL,
                        col_names = NULL, nudge_top = 0, nudge_tail = 0,
                        table_dups = NULL) {

  ls_files <- ICdir_chk(directory,  file_type)
  # top position table
  min_row <- purrr::map(
    ls_files[[file_type]],
    ~row_scanner(
      .x,
      pattern_begin,
      return_line = TRUE,
      nudge = nudge_top
    )
  ) %>%
    purrr::transpose()

  # max depth of table
  if (!is.null(pattern_end)) {
    max_row <- purrr::map(
      ls_files[[file_type]],
      ~row_scanner(
        .x,
        pattern_end,
        nudge = nudge_tail
        )
      )
    table_row <- purrr::map2(max_row, min_row[[1]], ~ .x - .y + nudge_tail)
  } else if (!is.null(table_depth)) {
    table_row <- purrr::map(min_row[[1]], ~rep(table_depth, length(.x)))
  }


  # execute reading functions
  purrr::imap_dfr(
    ls_files[[file_type]],
    ~read_point_table(
      .x,
      min_row[[1]][.y],
      table_row[.y],
      if (is.null(col_names)) min_row[[2]][[.y]] else col_names,
      col_types,
      table_dups
      ),
    .id = "file.nm"
  )
}

# reading tables in Cameca meta data
read_point_table <- function(files, skip_rows, table_rows, var_names,
                              col_types, table_dups = NULL) {
  # Na aliases
  NA_aliases <- c(mapply(strrep,"X", 1:10, USE.NAMES = FALSE), "1.#R")
  # execute read function
  purrr::map2_dfr(
    skip_rows,
    table_rows,
    ~vroom::vroom_fwf(
      files,
      vroom::fwf_empty(
        files,
        skip = .x + 1,
        n = .y,
        col_names = var_names
      ),
      skip = .x + 1,
      n_max = .y,
      col_types = col_types,
      na = NA_aliases,
      .name_repair = "minimal"
    ),
    .id = table_dups
  )
}