pbratios: Processing Pb isotope ratio data

#' Extracting raw Pb isotope ratio.
#'
#' Extracts raw 208Pb/207Pb, 206Pb/207Pb, 208Pb/206Pb and 207Pb/206Pb data from the raw .csv file
#' produced by Perkin Elmer Elan ICP-MS.
#'
#' @import data.table
#'
#' @param filename A data set saved in csv format with "," as field separator and "." as decimal
#'   separator. Such file can be obtained from the .rep report provided by the instrument by opening
#'   it with a spreadsheet editor and saving it as .csv file using a local setting = EN-US or
#'   equivalent. The function identifies standard samples for bracketing when labelled as “SRM” and
#'   substitutes their names with “SRM981_X” where X is a progressive integer. Similarly, QC samples
#'   “CRM” are renamed as “CRM482_X”.
#'
#' @param report A string referring to the type of report generated by the instrument Accepted
#'   values are \code{"short"} or \code{"long"}. \itemize{ \item{\bold{Short}}{ reports have only
#'   information about the sample ID, data and time of the measure, a summary with average
#'   intensities and finally, the intensity values for each replicate.} \item{\bold{Long}}{ reports
#'   contain also information of the "short" counterpart and also some information about the
#'   measurement conditions. Additionally, the summary section is reported after the intensity
#'   values for each replicates. These reports are tipically produced when working in isotope ratio
#'   mode.} }
#'
#' @return A data.table containing raw intensities and raw Pb isotope ratios for each sample.
#'   Provided isotope ratios are 208Pb/207Pb, 206Pb/207Pb, 208Pb/206Pb and 207Pb/206Pb. Note that
#'   these ratios have still to be inspected for outliers, summarised and corrected for mass bias.
#'
#' @examples
#' file.short <- system.file("extdata", "sednya_2015.csv", package = "pbratios")
#' sednya.2015 <- extract_data(file.short, report = "short")
#'
#' file.long <- system.file("extdata", "spmnya_2012.csv", package = "pbratios")
#' spmnya.2012 <- extract_data(file.long, report = "long")
#'
#' @export
#'
extract_data <- function(filename, report = c("short", "long")) {
  report <- match.arg(report)

  if (report != "short" & report != "long") {
    stop('Error: report must be either equal to "short" or "long".')
  }

  # Loading the file
  data <- fread(filename, header = FALSE, na.strings = "")

  # Extracting the sample name
  sampleid <- data[grepl("Sample ID", V1), V2]

  # Fixing names for SRMs and CRMs
  srm_id <- grepl("SRM", sampleid) & !grepl("test", sampleid)
  crm_id <- grepl("CRM", sampleid)

  sampleid[srm_id] <- paste("SRM981", 1:length(sampleid[srm_id]), sep = "_")
  sampleid[crm_id] <- paste("CRM482", 1:length(sampleid[crm_id]), sep = "_")

  if (report == "short"){

    # Row number for "Replicates" and "Sample ID"
    rep.row <- data[, row.id := 1:.N][grepl("Replicates", V1), row.id]
    sample.row <- data[grepl("Sample ID", V1), row.id][-1]
    sample.row[length(sample.row) + 1] <- data[.N, row.id + 1]
    # Row numbers with data for each sample
    n.row <- sample.row - rep.row - 1

    # Select row with data
    data <- data[do.call(c, mapply(seq, rep.row + 1, sample.row - 1, SIMPLIFY = FALSE)), ][,
    # Assigning sample names
                      sample := rep(sampleid, n.row)][
    # Remove rows with "Analyte"
                      -grep("Analyte", V1)]

    data <- setnames(
      data,
      old = c("V1", "V2", "V3"),
      new = c("element", "isotope", "intensity")
    )
  }

  if (report == "long"){

    # Row number for "Replicates" and "Summary"
    rep.row <- data[, row.id := 1:.N][grepl("^Replicates$", V1), row.id]
    summary.row <- data[grepl("^Summary$", V1), row.id]

    # Row numbers with data for each sample
    n.row <- summary.row - rep.row - 1


    # Select row with data
    data <- data[do.call(c, mapply(seq, rep.row + 1, summary.row - 1, SIMPLIFY = FALSE)), ][,
    # Assigning sample names
                sample := rep(sampleid, n.row)][
    # Remove rows with "Replicate"
                -grep("Replicate", V1)][
    # Remove rows with "Analyte"
                -grep("Analyte", V2)]

  ## Rename the columns
  data <- setnames(
          data,
          old = c("V2", "V3", "V4"),
          new = c("element", "isotope", "intensity")
  )

  }

  # Merge columns "element" and "isotope" to the new column "nuclide"
  data <- data[, nuclide := paste0(element, isotope)][,
               .(sample, nuclide, intensity)]

  ## Assigning each column to its right class
  data.factor <- c("sample", "nuclide")
  data[, (data.factor) := lapply(.SD, as.factor),
       .SDcols = data.factor][, intensity := as.double(intensity)] # S values <- NAs

  # Put the dataset in wide format
  lvls <- as.character(unique(data$sample))
  data <- dcast(data[, id := 1:.N, by = .(sample, nuclide)],
                      id + sample ~ nuclide, value.var = 'intensity')[,
                      sample := factor(sample, levels = lvls)][order(sample)]

  # Calculate the raw ratios
  data[, c("Pb208207",
           "Pb206207",
           "Pb208206",
           "Pb207206") :=
           .(Pb208 / Pb207,
             Pb206 / Pb207,
             Pb208 / Pb206,
             Pb207 / Pb206)][, -"id", with = FALSE]
}
# End of function