#' Import a label-free proteomics dataset from Peaks
#'
#' @param filename a features.csv file exported by Peaks
#' @param collapse_peptide_by if multiple data points are available for a peptide in a sample, at what level should these be combined? options: "sequence_modified" (recommended default), "sequence_plain", or "" to not collapse (each modified-sequence * charge combination is kept as a distinct peptide)
#'
#' @importFrom tidyr pivot_longer
#' @export
import_dataset_peaks = function(filename, collapse_peptide_by = "sequence_modified") {
reset_log()
append_log("reading features.csv generated by Peaks", type = "info")
if(!(collapse_peptide_by %in% c("sequence_plain", "sequence_modified", ""))) {
append_log('collapse_peptide_by parameter must be one of: "sequence_plain", "sequence_modified", ""', type = "error")
}
# first, check if input file exists
check_parameter_is_string(filename)
# checks for presence of the file and, if it doesn't exist, for a .gz/.zip variant; throws an error if neither exists
filename = path_exists(filename, NULL, try_compressed = TRUE)
# validate the input is peaks (recycle generic function which includes robust matching and error messages)
attributes_required = list(sequence_modified = "Peptide",
protein_id = "Accession",
charge = "z")
# read the first line from the file; parse 1 line into a data.frame without column names (so the first row holds all values), then convert to a character array
# this leverages data.table to infer what character was used to delimit columns
headers = as.character( read_textfile_compressed(filename, as_table = T, nrow = 1, header = F, data.table = F) )
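# e.g. a hypothetical Peaks export could yield headers like; "Peptide", "Accession", "z", "sampleA RT mean", "sampleA Area", "sampleB RT mean", "sampleB Area"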
# guesstimate the sample names from column headers
regex_rt = "[ .]rt[ .]mean$"
regex_intensity = "[ .](normalized[ .]){0,1}area$"
map_sample_rt = grep(regex_rt, headers, ignore.case = T)
sample_id = gsub(regex_rt, "", headers[map_sample_rt])
cols_int = which(gsub(regex_intensity, "", headers, ignore.case = T) %in% sample_id)
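# continuing the hypothetical headers above; sample_id would become c("sampleA", "sampleB") and cols_int would point at the respective 'Area' columns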
# input validation
if(length(sample_id) != length(cols_int) || length(cols_int) != length(map_sample_rt)) {
append_log(sprintf("cannot locate a column for both RT and intensity for all samples! RT columns: [%s] intensity columns: [%s]",
paste(headers[map_sample_rt], collapse = ", "), paste(headers[cols_int], collapse = ", ")), type = "error")
}
# list of all columns we want
attributes_composed = attributes_required
for(n in headers[map_sample_rt]) attributes_composed[[n]] = n
for(n in headers[cols_int]) attributes_composed[[n]] = n
# read the CSV/TSV table from file and map column names to the expected names.
# the (complicated) downstream code handles compressed files, efficient parsing of only the requested columns, etc.
tibw = read_table_by_header_spec(filename, attributes_required = attributes_composed, as_tibble_type = TRUE)
# filter invalid entries; empty value in any column-of-interest
tibw = tibw %>%
filter_if(is.numeric, all_vars(is.finite(.) & . > 0)) %>%
filter_if(is.character, all_vars(. != ""))
### convert from wide-to-long format
# peptide description
tibw$sequence_plain = gsub("(\\[[^]]*\\])|(\\([^)]*\\))", "", tibw$sequence_modified)
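# e.g. a hypothetical modified sequence "AM(+15.99)DEK" or "AM[Oxidation]DEK" reduces to plain sequence "AMDEK"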
tibw$peptide_id = paste0(tibw$sequence_modified, "_", tibw$charge)
## rename RT and intensity columns
# first, replace illegal characters; replace all # with _
colnames(tibw) = gsub("#+", "_", colnames(tibw))
# slightly over-complicated regex to account for scenarios where sample names contain multiple spaces (bad practice, but a user could name a sample e.g. "sample cond1 rep2")
colnames(tibw) = gsub(paste0("^ *(\\S.*\\S) *", regex_rt), "rt###\\1", colnames(tibw), ignore.case = T)
colnames(tibw) = gsub(paste0("^ *(\\S.*\\S) *", regex_intensity), "intensity###\\1", colnames(tibw), ignore.case = T)
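# after renaming, hypothetical columns "rt###sampleA" and "intensity###sampleA" are split by pivot_longer on the "###" separator into value columns 'rt' and 'intensity' plus a 'sample_id' column holding "sampleA"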
tib = pivot_longer(tibw,
cols = -c("peptide_id", "sequence_plain", names(attributes_required)),
names_to = c(".value", "sample_id"),
names_sep = "###")
# compatibility with downstream pipeline (standard in every import function)
tib$confidence = NA
tib$detect = T
tib$intensity[!is.finite(tib$intensity) | tib$intensity < 0] = NA
tib$intensity = log2(tib$intensity)
tib$intensity[!is.na(tib$intensity) & tib$intensity < 0] = 0 # note; we already removed zero intensity values when importing. here, we threshold extremely low values
tib$isdecoy = F
tib = tib %>% filter(!is.na(intensity))
# collapse peptides by plain or modified sequence (e.g. a peptide can be observed in a sample both with and without modifications, at multiple charges, etc.)
if(collapse_peptide_by == "") {
# if 'no collapse' is set, at least merge by modified sequence and charge; e.g. there may be multiple peaks for a peptide at the same charge throughout retention time
tib = peptides_collapse_by_sequence(tib, prop_peptide = "peptide_id")
append_log("NOT collapsing peptides by plain/modified-sequence, thus 2 observations of the same sequence with different charge are considered a distinct peptide_id. This is not recommended for DDA!", type = "warning")
} else {
tib = peptides_collapse_by_sequence(tib, prop_peptide = collapse_peptide_by) # alternatively, collapse by modified sequence; prop_peptide = "sequence_modified"
}
log_peptide_tibble_pep_prot_counts(tib)
return(list(peptides=tibble_peptides_reorder(tib), proteins=empty_protein_tibble(tib), acquisition_mode = "dda"))
}
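
## minimal usage sketch; the file path and result object name below are hypothetical
# peaks = import_dataset_peaks(filename = "C:/data/my_project/features.csv", collapse_peptide_by = "sequence_modified")
# peaks$peptides          # long-format peptide tibble
# peaks$proteins          # placeholder protein tibble (empty_protein_tibble)
# peaks$acquisition_mode  # "dda"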