#' Import a label-free proteomics dataset from Peaks
#'
#' @param filename a features.csv file exported by Peaks
#' @param collapse_peptide_by if multiple data points are available for a peptide in a sample, at what level should these be combined? options: "sequence_modified" (recommended default), "sequence_plain", or "" to not collapse (each modified-sequence * charge combination is kept as a distinct peptide)
#'
#' @importFrom tidyr pivot_longer
#' @export
import_dataset_peaks = function(filename, collapse_peptide_by = "sequence_modified") {
reset_log()
append_log("reading features.csv generated by Peaks", type = "info")
if(!(collapse_peptide_by %in% c("sequence_plain", "sequence_modified", ""))) {
append_log('collapse_peptide_by parameter must be one of: "sequence_plain", "sequence_modified", ""', type = "error")
}
# first, check if input file exists
check_parameter_is_string(filename)
# checks for presence of the file and, if it doesn't exist, for a .gz/.zip variant; throws an error if neither exists
filename = path_exists(filename, NULL, try_compressed = TRUE)
# validate the input is peaks (recycle generic function which includes robust matching and error messages)
attributes_required = list(sequence_modified = "Peptide",
protein_id = "Accession",
charge = "z")
# read the first line from the file; parse 1 line into a data.frame without column names (so the first row holds all values), then convert to a character array
# this leverages data.table to infer what character was used to delimit columns
headers = as.character( read_textfile_compressed(filename, as_table = T, nrow = 1, header = F, data.table = F) )
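# e.g. a hypothetical Peaks export could yield headers like; "Peptide", "Accession", "z", "sampleA RT mean", "sampleA Area", "sampleB RT mean", "sampleB Area"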
# guesstimate the sample names from column headers
regex_rt = "[ .]rt[ .]mean$"
regex_intensity = "[ .](normalized[ .]){0,1}area$"
map_sample_rt = grep(regex_rt, headers, ignore.case = T)
sample_id = gsub(regex_rt, "", headers[map_sample_rt])
cols_int = which(gsub(regex_intensity, "", headers, ignore.case = T) %in% sample_id)
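# continuing the hypothetical headers above; sample_id would become c("sampleA", "sampleB") and cols_int would point at the respective 'Area' columns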
# input validation
if(length(sample_id) != length(cols_int) || length(cols_int) != length(map_sample_rt)) {
append_log(sprintf("cannot locate a column for both RT and intensity for all samples! RT columns: [%s] intensity columns: [%s]",
paste(headers[map_sample_rt], collapse = ", "), paste(headers[cols_int], collapse = ", ")), type = "error")
}
# list of all columns we want
attributes_composed = attributes_required
for(n in headers[map_sample_rt]) attributes_composed[[n]] = n
for(n in headers[cols_int]) attributes_composed[[n]] = n
# read the CSV/TSV table from file and map column names to the expected names.
# the (complicated) downstream code handles compressed files, efficient parsing of only the requested columns, etc.
tibw = read_table_by_header_spec(filename, attributes_required = attributes_composed, as_tibble_type = TRUE)
# filter invalid entries; empty value in any column-of-interest
tibw = tibw %>%
filter_if(is.numeric, all_vars(is.finite(.) & . > 0)) %>%
filter_if(is.character, all_vars(. != ""))
### convert from wide-to-long format
# peptide description
tibw$sequence_plain = gsub("(\\[[^]]*\\])|(\\([^)]*\\))", "", tibw$sequence_modified)
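# e.g. a hypothetical modified sequence "AM(+15.99)DEK" or "AM[Oxidation]DEK" reduces to plain sequence "AMDEK"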
tibw$peptide_id = paste0(tibw$sequence_modified, "_", tibw$charge)
## rename RT and intensity columns
# first, replace illegal characters; replace all # with _
colnames(tibw) = gsub("#+", "_", colnames(tibw))
# slightly over-complicated regex to account for scenarios where sample names contain multiple spaces (bad practice, but a user could name a sample e.g. "sample cond1 rep2")
colnames(tibw) = gsub(paste0("^ *(\\S.*\\S) *", regex_rt), "rt###\\1", colnames(tibw), ignore.case = T)
colnames(tibw) = gsub(paste0("^ *(\\S.*\\S) *", regex_intensity), "intensity###\\1", colnames(tibw), ignore.case = T)
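# after renaming, hypothetical columns "rt###sampleA" and "intensity###sampleA" are split by pivot_longer on the "###" separator into value columns 'rt' and 'intensity' plus a 'sample_id' column holding "sampleA"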
tib = pivot_longer(tibw,
cols = -c("peptide_id", "sequence_plain", names(attributes_required)),
names_to = c(".value", "sample_id"),
names_sep = "###")
# compatibility with downstream pipeline (standard in every import function)
tib$confidence = NA
tib$detect = T
tib$intensity[!is.finite(tib$intensity) | tib$intensity < 0] = NA
tib$intensity = log2(tib$intensity)
tib$intensity[!is.na(tib$intensity) & tib$intensity < 0] = 0 # note; we already removed zero intensity values when importing. here, we threshold extremely low values
tib$isdecoy = F
tib = tib %>% filter(!is.na(intensity))
# collapse peptides by plain or modified sequence (e.g. a peptide can be observed in a sample both with and without modifications, at multiple charges, etc.)
if(collapse_peptide_by == "") {
# if 'no collapse' is set, at least merge by modified sequence and charge; e.g. there may be multiple peaks for a peptide at the same charge throughout retention time
tib = peptides_collapse_by_sequence(tib, prop_peptide = "peptide_id")
append_log("NOT collapsing peptides by plain/modified-sequence, thus 2 observations of the same sequence with different charge are considered a distinct peptide_id. This is not recommended for DDA!", type = "warning")
} else {
tib = peptides_collapse_by_sequence(tib, prop_peptide = collapse_peptide_by) # alternatively, collapse by modified sequence; prop_peptide = "sequence_modified"
}
log_peptide_tibble_pep_prot_counts(tib)
return(list(peptides=tibble_peptides_reorder(tib), proteins=empty_protein_tibble(tib), acquisition_mode = "dda"))
}
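
## minimal usage sketch; the file path and result object name below are hypothetical
# peaks = import_dataset_peaks(filename = "C:/data/my_project/features.csv", collapse_peptide_by = "sequence_modified")
# peaks$peptides          # long-format peptide tibble
# peaks$proteins          # placeholder protein tibble (empty_protein_tibble)
# peaks$acquisition_mode  # "dda"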