#' Converting MaxQuant Output file proteinGroups.txt to CLARION format
#' by creating a headline of metadata for each column
#'
#' List with columns of reduced version (see config.json file)
#' If you only want the samples of a specific keyword write: column;exp
#' For example:
#' You got:
#' Intensity
#' Intensity 'experiment_name'
#' Do you want both add "Intensity" to the list.
#' Do you only want the sample add "Intensity;exp" to the list
#' Anything else like 'Intensity;ex' or 'Intensity;' results in writing both.
#' Only works if there are samples of that type. If not, column does not show up in file
#'
#' @author Rene Wiegandt
#' @param proteinGroups_in path of proteinGroup.txt file
#' @param summary_in path of belonging summary.txt file
#' @param outfile path of full CLARION output file
#' @param outfile_reduced path of reduced CLARION output file
#' @param config path of config file (containing information about metadata)
#' @param delimiter delimiter (Default = ;)
#' @param format pre-header information about format (optional)
#' @param version pre-header information about version (optional)
#' @param experiment_id pre-header information about experiment id (optional)
#'
#' @return TRUE on success
#'
#' @export
parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduced, config = system.file("extdata", "parser_MaxQuant_config.json", package = "wilson"), delimiter = ";", format = NULL, version = NULL, experiment_id = NULL){
if (missing(proteinGroups_in)) {
stop("The proteinGroups file was not given")
}
if (missing(summary_in)) {
stop("The summary file was not given")
}
if (missing(outfile)) {
stop("The output file was not given")
}
if (missing(outfile_reduced)) {
stop("The output_reduced file was not given")
}
# parsers json config file
# @param config path of config file
# @return data.table with metadata
get_meta_from_config <- function(meta_config) {
dr <- lapply(meta_config$meta, function(r) {
data.table::data.table("col_name" = r$col_name,
"level" = r$level,
"type" = r$type,
"label" = r$label,
"sublabel" = r$sublabel
)
})
do.call("rbind", dr)
}
# Get the type of the sample
# Cutting of the experiment name and checking in given lists if left column name is in one of those lists
# Depending on in which list it is returns the type
# @param exp_name experiment name
# @param scores,ratios,category,ary vectors with Strings
# @return String type of given column
get_sample_type <- function(name, scores, ratios, prob, category, ary) {
name_split <- strsplit(name, " ")
if (grepl("[[:digit:]]{1,2}", utils::tail(name_split[[1]], 1))) {
name <- paste(utils::head(name_split[[1]], -1), collapse = " ")
}
if (name %in% scores) return("score")
if (name %in% ratios) return("ratio")
if (name %in% prob) return("probability")
if (name %in% category) return("category")
if (name %in% ary) return("array")
return("unknown")
}
# Get the level of the sample column
# Checking for keywords inside of the column name
# Each keyword is given one level
# @param col_head column head
# @return String level of given column
get_sample_level <- function(col_head, isSample, full_list) {
# Get the level of all 'sample' columns.
# Default: level is "sample"
if (grepl("Ratio", col_head, perl = TRUE)) {
if (grepl("type", col_head, perl = TRUE)) return("feature")
return("contrast")
}
if (grepl("type", col_head, perl = TRUE)) return("feature")
if (grepl("Fraction [[:digit:]]{1,2}", col_head, perl = TRUE)) return("condition")
if (isSample) return("sample")
if (col_head %in% full_list) return("contrast")
return("unknown")
}
# Get label and sublabel of remaining columns
# remaining columns <- columns which are not in meta or a sample column
# @param col_head column head
# @return list with label and sublabel
get_remaining_labeling <- function(col_head) {
col_head_split <- strsplit(col_head, " ")
if (length(col_head_split) > 1) {
sublabel <- col_head_split[[1]][length(col_head_split[[1]])]
if (grepl(sublabel, "[%]", fixed = TRUE)) {
label <- paste(utils::head(col_head_split[[1]], -2), collapse = " ")
sublabel <- paste(utils::tail(col_head_split[[1]], 2), collapse = " ")
} else {
label <- paste(utils::head(col_head_split[[1]], -1), collapse = " ")
sublabel <- col_head_split[[1]][length(col_head_split[[1]])]
}
return(list(label, sublabel))
}
return(list(col_head, ""))
}
# Get list of all column names with experiment name, which are in the reduced version
# @param meta_full data table with metadata
# @param reduced_list raw list of all columns of every quantification method, which need to in the reduced version
# @param exp_names list of experiment names
# @param col_names list of all column names
# @return list: 1 <- list of all reduced column names, 2 <- columns which are in the raw list but not in the final list
get_reduced_version <- function(meta_full, reduced_list, exp_names, col_names) {
red_split <- strsplit(reduced_list, ";")
red_exp_list <- unlist(lapply(red_split, function(split_name) {
# if length == 2 entry of reduced list is column_name;exp
# only adding column_name + experiment name to the list
if (length(split_name) == 2) {
red_col_name <- lapply(exp_names, function(exp_name) {
rcn <- paste(split_name[1], exp_name)
# special case for Sequence Coverage exp_name [%]
if (!(rcn %in% col_names) && (paste(rcn, "[%]") %in% col_names)) {
rcn <- paste(rcn, "[%]")
}
return(rcn)
})
} else {
# adding both column name + experiment_name and column name without experiment name to list
red_col_name <- lapply(1:(length(exp_names) + 1), function(i) {
if (i <= length(exp_names)) {
if (paste(split_name, exp_names[i]) %in% col_names) {
return(paste(split_name, exp_names[i]))
}
} else {
return(split_name)
}
})
}
}))
overlap <- setdiff(red_exp_list, col_names)
red_exp_list <- red_exp_list[!(red_exp_list %in% overlap)]
return(list(red_exp_list, overlap))
}
# writing clarion file
# @param meta data table with meta data
# @param out output file
# @param format format
# @param version version number
# @param exp_id experiment id
# @param pGroups data table protein groups file
write_clarion_file <- function(meta, out, format, version, exp_id, pGroups, delimiter) {
to_append <- FALSE
if (!missing(format)) {
write(paste0("!format=", format), file = out, append = to_append)
to_append <- TRUE
}
if (!missing(version)) {
write(paste0("!version=", version), file = out, append = to_append)
to_append <- TRUE
}
if (!missing(exp_id)) {
write(paste0("!experiment_id=", exp_id), file = out, append = to_append)
to_append <- TRUE
}
write(paste0("!delimiter=", delimiter), file = out, append = to_append)
write("#key\tfactor1\tlevel\ttype\tlabel\tsub_label", file = out, append = TRUE)
data.table::fwrite(meta, file = out, sep = "\t", append = TRUE, col.names = FALSE, quote = FALSE)
data.table::fwrite(pGroups, file = out, sep = "\t", append = TRUE, col.names = TRUE, quote = FALSE)
}
# reading files in data tables
proteinGroups <- data.table::fread(proteinGroups_in, header = TRUE, quote = "")
summary_file <- data.table::fread(summary_in, header = TRUE)
meta_config <- tryCatch({
rjson::fromJSON(file = config)
}, error = function(cond) {
stop("Could not read config file")
}, warning = function(w) {
stop("Could not read config file")
})
# getting experiment names
if ("Experiment" %in% colnames(summary_file)) {
exp_names <- unique(summary_file[Experiment != "", Experiment])
} else {
stop("wrong format on summary file: column \'Experiment\' misssing")
}
meta <- get_meta_from_config(meta_config = meta_config)
sample_scores <- meta_config$type_scores
sample_ratios <- meta_config$type_ratios
sample_probability <- meta_config$type_probability
sample_category <- meta_config$type_category
sample_ary <- meta_config$type_array
reduced_list <- meta_config$reduced_list
full_sample_list <- c(sample_scores, sample_ratios, sample_probability, sample_category, sample_ary)
if (is.null(reduced_list)) {
stop("reduced_list is missing in config file")
}
# get column names
col_names <- colnames(proteinGroups)
# delete rows from meta, which are not in col_names
meta_trim <- meta[col_name %in% col_names]
# get metadata for each sample column
# append rows to data table with metadata
samples_list <- lapply(col_names, function(col_head) {
unlist(lapply(exp_names, function(name) {
name_brackets <- paste0("\\Q", name)
exp_regex <- paste0("\\Q ", name)
sample_description <- strsplit(col_head, exp_regex)
if (length(grep(name_brackets, col_head, perl = TRUE)) == 1 ) { # Does column name contains experiment name?
de <- data.table::data.table("col_name" = c(col_head),
"level" = c(get_sample_level(col_head = col_head, isSample = TRUE, full_list = full_sample_list)),
"type" = c(get_sample_type(name = sample_description[[1]][1], scores = sample_scores,
ratios = sample_ratios, prob = sample_probability, category = sample_category,
ary = sample_ary)),
"label" = c(name),
"sublabel" = c(trimws(gsub(name_brackets, "", col_head), "r"))
)
return(de)
}
}))
})
samples <- do.call("rbind", Filter(Negate(is.null), samples_list))
meta_half <- rbind(meta_trim, samples)
# get metadata for each remaining column(columsn which are specific for a certain quantification methode)
# append rows to data table with metadata
remaining_list <- lapply(col_names, function(col_head) {
if (!(col_head %in% meta_half[["col_name"]])) {
label_sublabel <- get_remaining_labeling(col_head = col_head)
de2 <- data.table::data.table("col_name" = c(col_head),
"level" = c(get_sample_level(col_head = col_head, isSample = FALSE, full_list = full_sample_list)),
"type" = c(get_sample_type(name = col_head, scores = sample_scores, ratios = sample_ratios,
prob = sample_probability, category = sample_category, ary = sample_ary)),
"label" = c(label_sublabel[1]),
"sublabel" = c(label_sublabel[2])
)
return(de2)
}
})
remaining <- do.call("rbind", Filter(Negate(is.null), remaining_list))
meta_full <- rbind(meta_half, remaining)
meta_full$col_name <- sub("^", "#", meta_full$col_name)
# add column factor 1 and reorder the columns
meta_full$factor1 <- ""
meta_full <- meta_full[, c("col_name", "factor1", "level", "type", "label", "sublabel")]
# get data.table with reduced metadata
reduced <- get_reduced_version(meta_full = meta_full, reduced_list = reduced_list,
exp_names = exp_names, col_names = col_names)
meta_reduced <- meta_full[meta_full$col_name %in% sub("^", "#", reduced[[1]]), ]
# if there are unknown columns the user gets a warning with all unknown columns
# unknown columns wont be writen in the meta data header
if (nrow(meta_full[level == "unknown"]) > 0) {
meta_warn <- gsub("#", "", meta_full[level == "unknown", col_name])
warning("Following columns are unknown and have been removed (Check JSON config file: 'meta'): ", paste(meta_warn, collapse = ", "))
meta_full <- meta_full[level != "unknown"]
}
# if there are unknown column types the user gets a warning with all columns with unknown type
# columns with unknown types wont be writen in the meta data header
if (nrow(meta_full[type == "unknown"]) > 0) {
meta_warn <- gsub("#", "", meta_full[type == "unknown", col_name])
warning("Due to unknown type follwing columns were removed (Check JSON config file: 'type_X'): ", paste(meta_warn, collapse = ", "))
meta_full <- meta_full[type != "unknown"]
}
# writing advanced CLARION file
write_clarion_file(meta = meta_full, out = outfile, format = format,
version = version, exp_id = experiment_id, pGroups = proteinGroups, delimiter = delimiter)
# writing reduced CLARION file
write_clarion_file(meta = meta_reduced, out = outfile_reduced, format = format,
version = version, exp_id = experiment_id, pGroups = proteinGroups, delimiter = delimiter)
return(TRUE)
}
#' Method to parse input file.
#'
#' @param file Path to file that needs parsing.
#' @param dec The decimal separator. See \code{\link[data.table]{fread}}.
#'
#' @return Clarion object. See \code{\link[wilson]{Clarion}}
#'
#' @import data.table
#'
#' @export
parser <- function(file, dec = ".") {
message(paste("Parsing file:", file))
# number of rows for each part
con <- file(file, open = "r")
num_header <- 0
num_metadata <- 0
tryCatch(expr = {
while (TRUE) {
line <- readLines(con = con, n = 1)
if (grepl("^!", line, perl = TRUE)) {
num_header <- num_header + 1
} else if (grepl("^#", line, perl = TRUE)) {
num_metadata <- num_metadata + 1
} else {
break
}
}
}, finally = {
close(con = con)
})
### parse header
if (num_header > 0) {
header <- data.table::fread(input = file, fill = TRUE, header = FALSE, dec = dec, nrows = num_header, integer64 = "double")
# cut of leading !
header <- as.list(gsub("^!", "", header[[1]]))
# make named list
header_names <- gsub("=.*$", "", header, perl = TRUE)
header <- as.list(gsub("^.*?=", "", header, perl = TRUE))
names(header) <- header_names
# remove quotes from delimiter
if (!is.null(header$delimiter) && grepl(header$delimiter, pattern = '^".*"$', perl = TRUE)) {
header$delimiter <- substr(header$delimiter, start = 2, stop = nchar(header$delimiter) - 1)
}
} else {
header <- NULL
}
### parse metadata
metadata <- data.table::fread(input = file, skip = num_header, header = FALSE, nrows = num_metadata, fill = TRUE, dec = dec, integer64 = "double")
# cut off leading #
metadata[, names(metadata)[1] := gsub("^#", "", metadata[[1]])]
# set first line as header
names(metadata) <- as.character(metadata[1])
# remove empty columns
empty_cols <- union(
which(colSums(is.na(metadata)) == nrow(metadata)), # indices of full na columns
which(colSums(metadata == "") == nrow(metadata)) # indices of full "" columns
)
if (length(empty_cols) > 0) {
metadata[, (empty_cols) := NULL]
}
# delete first line
metadata <- metadata[-1]
### parse data
data <- data.table::fread(input = file, header = TRUE, skip = num_header + num_metadata, fill = FALSE, dec = dec, integer64 = "double")
data.table::setindexv(metadata, names(metadata)[1])
data.table::setindexv(data, names(data)[1])
return(Clarion$new(header = header, metadata = metadata, data = data))
}
#' TOBIAS TFBS table to clarion parser
#'
#' Click \href{https://github.molgen.mpg.de/loosolab/TOBIAS}{here} for more information about TOBIAS.
#'
#' @param input Path to input table
#' @param output Output path.
#' @param filter_columns Either a vector of columnnames or a file containing one columnname per row.
#' @param filter_pattern Keep columns matching the given pattern. Uses parameter filter_columns for matching if set. In the case of no matches a warning will be issued and all columns will be used.
#' @param config Json file containing metadata information for all columns. Will use first occurence for duplicate column names.
#' @param omit_NA Logical whether all rows containing NA should be removed.
#' @param condition_names Vector of condition names. Default = NULL. Used to classify columns not provided in config.
#' @param condition_pattern Used to identify condition names by matching and removing given pattern with \code{\link[base]{grep}}. Ignored when condition_names is set.
#' @param in_field_delimiter Delimiter for multi value fields. Default = ','.
#' @param dec Decimal separator. Used in file reading and writing.
#' @param ... Used as header information.
#'
#' @details During conversion the parser will try to use the given config (if provided) to create the \href{https://github.molgen.mpg.de/loosolab/wilson-apps/wiki/Local-usage:Input-format/}{Clarion} metadata. In the case of insufficient config information it will try to approximate by referencing condition names issuing warnings in the process.
#' @details As the format requires an unique id the parser will create one if necessary.
#' @details Factor grouping (metadata factor columns) is currently not implemented!
#'
#' @export
tobias_parser <- function(input, output, filter_columns = NULL, filter_pattern = NULL, config = system.file("extdata", "tobias_config.json", package = "wilson"), omit_NA = FALSE, condition_names = NULL, condition_pattern = "_bound$", in_field_delimiter = ",", dec = ".", ...) {
## filter data columns
# check if filter columns is a file or a vector
if (!is.null(filter_columns) && file.exists(filter_columns)) {
select_columns <- scan(file = filter_columns, what = character())
} else {
select_columns <- filter_columns
}
# filter pattern
if (!is.null(filter_pattern)) {
# use file header if filter_columns is empty
if (is.null(select_columns)) {
# only read header
select_columns <- names(data.table::fread(input = input, header = TRUE, nrows = 0))
}
select_columns <- grep(pattern = filter_pattern, x = select_columns, value = TRUE)
if (identical(select_columns, character(0))) {
warning("No column matches for given filter pattern! Proceeding with all columns.")
}
}
##### data
data <- data.table::fread(input, dec = dec, select = select_columns, header = TRUE)
# omit na rows
if (omit_NA) {
data <- stats::na.omit(data)
}
##### metadata
metadata <- data.table::data.table(names(data))
# load config
if (!is.null(config)) {
config_file <- RJSONIO::fromJSON(config)
# get column names from config file
col_names <- vapply(X = config_file$meta, FUN.VALUE = character(1), FUN = function(x) {
x[["col_name"]]
})
} else {
config_file <- NULL
}
# identify conditions
if (is.null(condition_names)) {
conditions <- gsub(pattern = condition_pattern, replacement = "", x = grep(pattern = condition_pattern, x = metadata[[1]], value = TRUE))
} else {
conditions <- condition_names
}
## create metadata row by row
unique_id_fallback <- NULL
condition_pattern <- paste0(conditions, collapse = "|")
approx_columns <- NULL
meta_rows <- lapply(metadata[[1]], function(x) {
# is column information provided in config?
if (!is.null(config_file) && is.element(x, col_names)) {
# uses first appearance in config file in case of duplicates
return(config_file$meta[[which.max(col_names == x)]][-1])
}
# if no information is provided via config try to guess meta-information
# utilize condition names to do so
approx_columns <<- c(approx_columns, x)
## level
# get distance of all conditions matched to x
# count number of exact substring matches
match_dist <- utils::adist(conditions, x) - nchar(x) + nchar(conditions)
count_matches <- sum(match_dist == 0)
if (count_matches == 1) {
level <- "condition"
} else if (count_matches > 1) {
level <- "contrast"
} else {
level <- "feature"
}
## type
if (level == "feature") {
if (any(grepl(pattern = in_field_delimiter, x = data[[x]], fixed = TRUE))) {
type <- "array"
} else {
# define fallback unique_id in case none is defined through config
# this will redefine first unique feature with type category as unqiue_id
if (is.null(unique_id_fallback) && anyDuplicated(data[[x]]) == 0) {
unique_id_fallback <<- x
}
type <- "category"
}
} else {
if (!is.numeric(data[[x]])) {
type <- "array"
} else if (grepl(pattern = "fc|fold[\\._\\- ]?change", x = x, perl = TRUE, ignore.case = TRUE)) {
type <- "ratio"
} else if (grepl(pattern = "p[\\._\\- ]?val|p[\\._\\- ]?adj", x = x, perl = TRUE, ignore.case = TRUE)) {
type <- "probability"
} else {
type <- "score"
}
}
## label/ sub_label
label <- sub_label <- ""
label_parts <- unlist(strsplit(x = x, split = "_"))
if (length(label_parts) == 1) {
label <- label_parts
} else if (length(label_parts) == 2) {
label <- label_parts[1]
sub_label <- label_parts[2]
} else if (length(label_parts) >= 3 && level == "contrast") {
# replace '_' with whitespace
condition_pattern <- gsub(pattern = "_", replacement = " ", x = condition_pattern, fixed = TRUE)
x <- gsub(pattern = "_", replacement = " ", x = x, fixed = TRUE)
# get first condition using all identified conditions as pattern
first_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = x)
# strip first condition
stripped_condition <- substring(x, first = nchar(first_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace
# get new first condition
second_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = stripped_condition)
label <- paste0(first_condition, "|", second_condition)
sub_label <- substring(stripped_condition, first = nchar(second_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace
} else {
label <- paste0(label_parts[-length(label_parts)], collapse = " ")
sub_label <- label_parts[length(label_parts)]
}
return(c(level, type, label, sub_label))
})
# meta approximation warning
if (!is.null(approx_columns)) {
warning("Missing information in config! Tried guessing meta-information for ", paste0(approx_columns, collapse = ", "))
}
# list of vectors (rows) to matrix
meta_matrix <- do.call(rbind, meta_rows)
metadata <- cbind(metadata, meta_matrix)
names(metadata) <- c("key", "level", "type", "label", "sub_label")
# set unique_id fallback
if (!any(metadata[["type"]] == "unique_id")) {
if (!is.null(unique_id_fallback)) {
metadata[key == unique_id_fallback, "type"] <- "unique_id"
} else {
# setup unique_id column if there is neither a defined column nor a fallback
# create id column
data[, "id" := seq_len(nrow(data))]
# move id column to first position
new_order <- c("id", names(data)[ names(data) != "id"])
data <- data[, new_order, with = FALSE]
id_row <- data.table::data.table("id", level = "feature", type = "unique_id", label = "id", sub_label = "")
names(id_row)[1] <- "key"
# add meta entry
metadata <- rbind(id_row, metadata)
}
}
##### header
header <- c(
list(
format = "Clarion",
version = "1.0"),
list(...)
)
# add delimiter if necessary
if (any(metadata[["type"]] == "array")) {
header <- append(x = header, values = list(delimiter = in_field_delimiter), after = 2)
}
##### validate
# create clarion object for validation
clarion <- Clarion$new(header = header, metadata = metadata, data = data)
##### write
# TODO implement and use clarion write function
# write clarion
# header
flat_header <- data.table::data.table(paste0("!", names(clarion$header), "=", clarion$header))
data.table::fwrite(x = flat_header, file = output, col.names = FALSE, sep = "\t", dec = dec)
# metadata
# add '#'
names(clarion$metadata)[1] <- paste0("#", names(clarion$metadata)[1])
clarion$metadata[, names(clarion$metadata)[1] := paste0("#", clarion$metadata[[1]])]
data.table::fwrite(x = clarion$metadata, file = output, col.names = TRUE, sep = "\t", append = TRUE, quote = FALSE, dec = dec)
# data
data.table::fwrite(x = clarion$data, file = output, col.names = TRUE, sep = "\t", append = TRUE, dec = dec)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.