#'
#' Get the longest common prefix from a set of strings.
#'
#' Input is converted to character (e.g. from factor) first.
#'
#'
#' @param strings Vector of strings
#' @return Single string - might be empty ("")
#'
#' @export
#'
#' @examples
#' longestCommonPrefix(c("CBA.321", "CBA.77654", "")) ## ""
#' longestCommonPrefix(c("CBA.321", "CBA.77654", "CB")) ## "CB"
#' longestCommonPrefix(c("ABC.123", "ABC.456")) ## "ABC."
#' longestCommonPrefix(c("nothing", "in", "common")) ## ""
#'
#'
longestCommonPrefix = function(strings) {
  strings = as.character(strings)
  if (length(strings) == 0) return("")
  if (length(strings) == 1) return(strings[1])
  min_len = min(nchar(strings))
  if (min_len == 0) return("")
  for (i in 1:(min_len + 1)) ## +1 is required to gauge a match for the last char
  {
    if (i > min_len) next; ## passed the end
    chars = substring(strings, i, i)
    if (length(unique(chars)) > 1) {
      ## mismatch at position i
      break;
    }
  }
  if (i == 1) return("")
  return(substr(strings[1], 1, i - 1))
}
#'
#' Like longestCommonPrefix(), but on the suffix.
#'
#' @param strings Vector of strings
#' @return Single string - might be empty ("")
#'
#' @export
#'
#' @examples
#'
#' longestCommonSuffix(c("123.ABC", "45677.ABC", "BC")) ## "BC"
#' longestCommonSuffix(c("123.ABC", "", "BC")) ## ""
#' longestCommonSuffix(c("123.ABC", "45677.ABC")) ## ".ABC"
#' longestCommonSuffix(c("nothing", "in", "common")) ## ""
#'
longestCommonSuffix = function(strings)
{
  ## reverse each string, compute the common prefix, then reverse the result
  strings_rev = sapply(lapply(strsplit(strings, NULL), rev), paste, collapse = "")
  r = longestCommonPrefix(strings_rev)
  r_rev = sapply(lapply(strsplit(r, NULL), rev), paste, collapse = "")
  return(r_rev)
}
#' Removes the longest common prefix (LCP) from a vector of strings.
#'
#' You should provide only unique strings (to increase speed).
#' If only a single string is given, the empty string will be returned unless \code{min_out_length} is set.
#'
#' @param x Vector of strings with common prefix
#' @param min_out_length Minimal length of the shortest element of x after LCP removal [default: 0, i.e. the empty string is allowed]. If the output would be shorter, the last part of the LCP is kept.
#' @param add_dots Prepend output with '..' if shortening was done.
#' @return Shortened vector of strings
#'
#' @examples
#' delLCP(c("TK12345_H1"), min_out_length=0)
#' ## ""
#'
#' delLCP(c("TK12345_H1"), min_out_length=4)
#' ## "5_H1"
#'
#' delLCP(c("TK12345_H1"), min_out_length=4, add_dots = TRUE)
#' ## "..5_H1"
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=4)
#' ## "5_H1" "5_H2"
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=4, add_dots = TRUE)
#' ## "..5_H1" "..5_H2"
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=8)
#' ## "12345_H1", "12345_H2"
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=8, add_dots = TRUE)
#' ## "TK12345_H1", "TK12345_H2" (unchanged, since '..' would add another two)
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=60)
#' ## "TK12345_H1", "TK12345_H2" (unchanged)
#'
#' delLCP(c("TK12345_H1", "TK12345_H2"), min_out_length=60, add_dots = TRUE)
#' ## "TK12345_H1", "TK12345_H2" (unchanged)
#'
#'
#' @export
delLCP <- function(x, min_out_length = 0, add_dots = FALSE)
{
  x = as.character(x)
  lcp = nchar(longestCommonPrefix(x)) ## number of prefix chars to remove
  if (min_out_length > 0)
  {
    minLen = min(nchar(x))
    if (lcp >= minLen - min_out_length) ## removing the full LCP would shorten the string too much
    {
      lcp = max(minLen - min_out_length, 0)
      if (lcp <= 2 & add_dots) return(x) ## we can shorten by at most 2 chars, which '..' would add right back
    }
  }
  x = sapply(x, substring, lcp + 1)
  if (add_dots) x = paste0("..", x)
  return(x)
}
#' Removes the longest common suffix (LCS) from a vector of strings.
#'
#' @param x Vector of strings with common suffix
#' @return Shortened vector of strings
#'
#' @examples
#' delLCS(c("TK12345_H1")) ## ""
#' delLCS(c("TK12345_H1", "TK12345_H2")) ## "TK12345_H1" "TK12345_H2"
#' delLCS(c("TK12345_H1", "TK12!45_H1")) ## "TK123" "TK12!"
#'
#' @export
delLCS <- function(x)
{
  lcs = nchar(longestCommonSuffix(x)) ## number of suffix chars to remove
  x = sapply(x, function(v) substr(v, 1, nchar(v) - lcs))
  return(x)
}
#' Count the number of chars of the longest common prefix
#'
#' @param x Vector of strings with common prefix
#' @return Length of LCP
#'
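#' @examples
#' ## a minimal usage sketch:
#' lcpCount(c("TK12345_H1", "TK12345_H2")) ## 9, i.e. nchar("TK12345_H")
#'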
#' @export
lcpCount <- function(x)
{
  return(nchar(longestCommonPrefix(x))) ## number of common prefix chars
}
#' Count the number of chars of the longest common suffix
#'
#' @param x Vector of strings with common suffix
#' @return Length of LCS
#'
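#' @examples
#' ## a minimal usage sketch:
#' lcsCount(c("TK12345_H1", "TK12!45_H1")) ## 5, i.e. nchar("45_H1")
#'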
#' @export
lcsCount <- function(x)
{
  return(nchar(longestCommonSuffix(x))) ## number of common suffix chars
}
#' Compute longest common substring of two strings.
#'
#' Implementation is very inefficient (dynamic programming in R)
#' --> use only on small instances
#'
#' @param s1 String one
#' @param s2 String two
#' @return String containing the longest common substring
#'
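#' @examples
#' ## a minimal usage sketch:
#' LCS("ABCDE", "XBCDY") ## "BCD"
#'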
#' @export
LCS <- function(s1, s2)
{
  if (nchar(s1) == 0 || nchar(s2) == 0) return("")
  v1 <- unlist(strsplit(s1, split = ""))
  v2 <- unlist(strsplit(s2, split = ""))
  ## num[i,j] holds the length of the common substring ending at s1[i] and s2[j]
  num <- matrix(0, nchar(s1), nchar(s2))
  maxlen <- 0
  pstart = 0
  for (i in 1:nchar(s1)) {
    for (j in 1:nchar(s2)) {
      if (v1[i] == v2[j]) {
        if ((i == 1) || (j == 1)) {
          num[i, j] <- 1
        } else {
          num[i, j] <- 1 + num[i - 1, j - 1]
        }
        if (num[i, j] > maxlen) {
          maxlen <- num[i, j]
          pstart = i - maxlen + 1
        }
      }
    }
  }
  ## return the longest substring found
  return(substr(s1, pstart, pstart + maxlen - 1))
}
#' Find longest common substring from 'n' strings.
#'
#' Warning: greedy heuristic! This is not guaranteed to find the best solution (or any solution at all), since it is done pairwise with the shortest input string as reference.
#'
#' @param strings A vector of strings in which to search for LCS
#' @param min_LCS_length Minimum length expected. Empty string is returned if the result is shorter
#' @return longest common substring (or "" if shorter than \code{min_LCS_length})
#'
#' @examples
#' LCSn(c("1_abcde...",
#' "2_abcd...",
#' "x_abc...")) ## --> "_abc"
#' LCSn(c("16_IMU008_CISPLA_E5_R11",
#' "48_IMU008_CISPLA_P4_E7_R31",
#' "60_IMU008_CISPLA_E7_R11"), 3) ## -->"_IMU008_CISPLA_"
#' LCSn(c("AAAAACBBBBB",
#' "AAAAADBBBBB",
#' "AAAABBBBBEF",
#' "AAABBBBBDGH")) ## --> "BBBBB"
#' LCSn(c("AAAXXBBB",
#' "BBBXXDDD",
#' "XXAAADDD")) ## --> fails due to greedy approach; should be "XX"
#'
#' @export
#'
LCSn = function(strings, min_LCS_length = 0)
{
  ## abort if there is no chance of finding a suitably long substring
  if (min(nchar(strings)) < min_LCS_length) return("");
  if (length(strings) <= 1) return(strings)
  ## apply LCS to all strings, using the shortest as reference
  idx_ref = which(nchar(strings) == min(nchar(strings)))[1]
  strings_other = strings[-idx_ref]
  candidates = unique(sapply(strings_other, LCS, strings[idx_ref]))
  ## if only one candidate remains, we're done
  if (length(candidates) == 1)
  {
    if (nchar(candidates[1]) < min_LCS_length) return("") else return(candidates[1])
  }
  ## if there are more, recurse until a solution is found;
  ## continue with the shortest candidates (since only they can be common to all)
  cand_short = candidates[nchar(candidates) == min(nchar(candidates))]
  solutions = unique(sapply(cand_short, function(cand) LCSn(c(cand, strings_other))))
  ## return the longest solution
  idx_sol = which(nchar(solutions) == max(nchar(solutions)))[1];
  return(solutions[idx_sol])
}
#'
#' Removes common substrings (infixes) in a set of strings.
#'
#' Usually handy for plots, where condition names should be as concise as possible.
#' E.g. you do not want names like
#' 'TK20130501_H2M1_010_IMU008_CISPLA_E3_R1.raw' and
#' 'TK20130501_H2M1_026_IMU008_CISPLA_E7_R2.raw'
#' but rather 'TK.._010_I.._E3_R1.raw' and
#' 'TK.._026_I.._E7_R2.raw'
#'
#' If multiple such substrings exist, the algorithm will remove the longest first and iterate
#' a number of times (two by default) to find the second/third etc. longest common substring.
#' Each substring must fulfill a minimum length requirement - if it's shorter, it's not considered worth removing
#' and the iteration is aborted.
#'
#' @param strings A vector of strings which are to be shortened
#' @param infix_iterations Number of successive rounds of substring removal
#' @param min_LCS_length Minimum length of the longest common substring (default:7, minimum: 6)
#' @param min_out_length Minimum length of shortest element of output (no shortening will be done which causes output to be shorter than this threshold)
#' @return A list of shortened strings, with the same length as the input
#'
#' @examples
#' #library(PTXQC)
#' simplifyNames(c('TK20130501_H2M1_010_IMU008_CISPLA_E3_R1.raw',
#'                 'TK20130501_H2M1_026_IMU008_CISPLA_E7_R2.raw'), infix_iterations = 2)
#' # --> "TK.._010_I.._E3_R1.raw","TK.._026_I.._E7_R2.raw"
#'
#' try(simplifyNames(c("bla", "foo"), min_LCS_length=5))
#' # --> error, since min_LCS_length must be >=6
#'
#' @export
#'
simplifyNames = function(strings, infix_iterations = 2, min_LCS_length = 7, min_out_length = 7)
{
  if (min_LCS_length < 6) stop("simplifyNames(): param 'min_LCS_length' must be at least 6.")
  for (it in 1:infix_iterations)
  {
    lcs = LCSn(strings, min_LCS_length = min_LCS_length)
    if (nchar(lcs) == 0) return(strings) ## no infix of minimum length found
    ## replace the infix with 'ab..yz' (escaping any regex metacharacters in the infix)
    strings_i = sub(paste0("(.*)", gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", lcs), "(.*)"),
                    paste0("\\1"
                           , ifelse(substring(lcs, 1, 2) == "..", "", substring(lcs, 1, 2)) ## don't keep ".."
                           , ".."
                           , ifelse(substring(lcs, nchar(lcs) - 1) == "..", "", substring(lcs, nchar(lcs) - 1)) ## don't keep ".."
                           , "\\2"),
                    strings)
    if (min(nchar(strings_i)) < min_out_length) return(strings) ## result got too short...
    strings = strings_i
  }
  return(strings)
}
#' Shorten a string to a maximum length and indicate shortening by appending '..'
#'
#' Axis labels are sometimes just too long and printing them will either
#' squeeze the actual plot (ggplot) or make the labels disappear beyond the margins (graphics::plot).
#' One ad-hoc way of avoiding this is to shorten the names, hoping they are still meaningful to the viewer.
#'
#' This function should be applied AFTER you tried more gentle methods, such as \code{\link{delLCP}} or \code{\link{simplifyNames}}.
#'
#' @param x Vector of input strings
#' @param max_len Maximum length allowed
#' @param verbose Print which strings were shortened
#' @param allow_duplicates If shortened strings are no longer discernible, either accept the short version as valid (TRUE; not the default) or return the full strings unchanged (FALSE; the default --> no-op)
#' @return A vector of shortened strings
#'
#' @examples
#' r = shortenStrings(c("gamg_101", "gamg_101230100451", "jurkat_06_100731121305", "jurkat_06_1"))
#' all(r == c("gamg_101", "gamg_101230100..", "jurkat_06_1007..", "jurkat_06_1"))
#'
#' @seealso \code{\link{delLCP}}, \code{\link{simplifyNames}}
#'
#' @export
#'
shortenStrings = function(x, max_len = 20, verbose = TRUE, allow_duplicates = FALSE)
{
  if (any(duplicated(x)) & !allow_duplicates) stop("Duplicated input given to 'shortenStrings'!\n ", paste(x, collapse="\n "))
  idx = nchar(x) > max_len
  xr = x
  xr[idx] = paste0(substr(x[idx], 1, max_len - 2), "..")
  if (!allow_duplicates & any(duplicated(xr)))
  {
    ## try cutting the prefix instead of the suffix...
    xr[idx] = paste0("..", sapply(x[idx], function(s) substr(s, nchar(s) - max_len + 2, nchar(s))))
    if (any(duplicated(xr)))
    {
      warning("Duplicated output produced by 'shortenStrings'!\n ", paste(xr, collapse="\n "), "\nPlease rename your items (make them shorter and less similar)")
      ## fail generously and return the original input
      return(x)
    }
  }
  if (verbose & any(idx))
  {
    cat("The following labels will be shortened to ease plotting:\n")
    cat(paste0(" ", paste(x[idx], collapse="\n ")))
    cat("\n")
  }
  return(xr)
}
#' Compute the shortest prefix length which makes all strings in a vector uniquely identifiable.
#'
#' If there is no unique prefix (e.g. if a string is contained twice), then the length
#' of the longest string is returned, i.e. if the return value is used in a call to substr, nothing happens:
#' e.g. substr(x, 1, supCount(x)) == x
#'
#' @param x Vector of strings
#' @param prefix_l Starting prefix length, which is incremented in steps of 1 until all prefixes are unique (or maximum string length is reached)
#' @return Integer with minimal prefix length required
#'
#' @examples
#' supCount(c("abcde...", "abcd...", "abc...")) ## 5
#'
#' x = c("doubled", "doubled", "aLongDummyString")
#' all( substr(x, 1, supCount(x)) == x )
#' ## TRUE (no unique prefix due to duplicated entries)
#'
#' @export
#'
supCount <- function(x, prefix_l = 1)
{
  max_length = max(nchar(x))
  while (prefix_l < max_length)
  {
    dups = duplicated(sapply(x, substr, 1, prefix_l), incomparables = NA)
    if (sum(dups) == 0)
    {
      break;
    }
    prefix_l = prefix_l + 1
  }
  return(prefix_l)
}
#' Re-estimate a new set size to split a number of items into equally sized sets.
#'
#' This is useful for plotting large datasets where multiple pages are needed.
#' E.g. you know that you need 101 barplots, but you only want to fit about 25 per page.
#' Naively one would now do five plots, with the last one only containing a single barplot.
#' Using this function with correctSetSize(101, 25) would tell you to use 26 barplots per page,
#' so you end up with four plots, all roughly equally filled.
#' It also works the other extreme case, where your initial size is chosen slightly too high, e.g.
#' Sets of size 5 for just 8 items is too much, because we can reduce the set size to 4 and still
#' need two sets but now they are much more equally filled (correctSetSize(8, 5) == 4).
#'
#' We allow set sizes of up to 150\% of the default, to avoid the last set being sparse (we remove it and distribute its items to the other bins).
#' 150\% oversize is the extreme case, which only happens with sets of size two. With more sets the overhead is much smaller (1/X).
#' Once the number of sets is fixed, we distribute all items equally.
#'
#' E.g. 6 items with initial_set_size = 5 would result in 2 bins (5 items + 1 item), but we'd rather have one bin of 6 items;
#' 8 items with initial_set_size = 5 would result in 2 bins (5 + 3 items), since the last set is more than half full, but we'd rather have 4 + 4.
#'
#' @param item_count Known number of items which need to assigned to sets
#' @param initial_set_size Desired number of items a single set should hold
#' @return re-estimated set size which a set should hold in order to avoid underfilled sets
#'
#' @examples
#' stopifnot(
#' correctSetSize(8, 5) == 4
#' )
#' stopifnot(
#' correctSetSize(101, 25) == 26
#' )
#'
#' @export
#'
correctSetSize = function(item_count, initial_set_size)
{
  blocks = seq(from = 1, to = item_count, by = initial_set_size)
  blockcount = length(blocks)
  lastblocksize = item_count - tail(blocks, n = 1) + 1
  if (lastblocksize < initial_set_size * 0.5 & blockcount > 1)
  { ## last block is less than half full: reduce the block count (if there is more than one block)
    blockcount = blockcount - 1
  }
  ## distribute items equally among the fixed number of sets
  set_size = ceiling(item_count / blockcount)
  return(set_size)
}
#' Calls FUN on a subset of data in blocks of size 'subset_size' of unique indices.
#'
#' One subset consists of 'subset_size' unique groups and thus of all rows
#' in 'data' which belong to any of these groups.
#' The last subset might have fewer groups, if the number of unique groups is not divisible by subset_size.
#'
#' FUN is applied on each subset.
#'
#' @param data Data.frame whose subsets to use on FUN
#' @param indices Vector of group assignments, same length as nrow(data)
#' @param subset_size Number of groups to use in one subset
#' @param FUN Function applied to subsets of data
#' @param sort_indices Sort groups (by their sorted character(!) names) before building subsets
#' @param ... More arguments to FUN
#'
#' @return list of function result (one entry for each subset)
#'
#' @examples
#' byX(data.frame(d=1:10), 1:10, 2, sum)
#'
#' @export
#'
byX <- function(data, indices, subset_size = 5, FUN, sort_indices = TRUE, ...)
{
  stopifnot(subset_size > 0)
  stopifnot(nrow(data) == length(indices))
  groups = unique(indices)
  if (sort_indices)
  {
    groups = sort(groups)
  }
  blocks = seq(from = 1, to = length(groups), by = subset_size)
  result = lapply(blocks, function(x) {
    range = x:(min(x + subset_size - 1, length(groups)))
    lns = indices %in% groups[range];
    subset = data[lns, , drop = FALSE]
    rownames(subset) = rownames(data)[lns]
    r = FUN(subset, ...)
    flush.console()
    return(r)
  })
  return(result)
}
#'
#' Same as \code{\link{byX}}, but with a more flexible group size, to avoid the last group containing only a few entries (<50\% of the desired size).
#'
#' The 'subset_size' param is internally optimized using \code{\link{correctSetSize}} and
#' then \code{\link{byX}} is called.
#'
#' @param data Data.frame whose subset to use on FUN
#' @param indices Vector of group assignments, same length as nrow(data)
#' @param subset_size Ideal number of groups to use in one subset -- this can be adjusted internally, to within roughly 75\%-150\% of the requested size
#' @param FUN function Applied to subsets of data
#' @param sort_indices Groups are formed by their sorted character(!) names
#' @param ... More arguments to FUN
#'
#' @return list of function result (one entry for each subset)
#'
#' @examples
#' stopifnot(
#' byXflex(data.frame(d=1:10), 1:10, 2, sum, sort_indices = FALSE) ==
#' c(3, 7, 11, 15, 19)
#' )
#'
#' @export
#'
byXflex <- function(data, indices, subset_size = 5, FUN, sort_indices = TRUE, ...)
{
  stopifnot(subset_size > 0)
  groups = unique(indices)
  subset_size = correctSetSize(length(groups), subset_size)
  return(byX(data, indices, subset_size, FUN, sort_indices, ...))
}
#' Assign set numbers to a vector of values.
#'
#' Each set has size set_size (internally optimized using \code{\link{correctSetSize}}), holding values from 'values'.
#' This gives n such sets and the return value is just the set index for each value.
#'
#' @param values Vector of values
#' @param set_size Number of distinct values allowed in a set
#' @param sort_values Before assigning values to sets, sort the values?
#'
#' @return Vector (same length as input) with set numbers
#'
#' @examples
#' #library(PTXQC)
#' assignBlocks(c(1:11, 1), set_size = 3, sort_values = FALSE)
#' ## --> 1 1 1 2 2 2 3 3 3 4 4 1
#'
#' @export
#'
assignBlocks = function(values, set_size = 5, sort_values = TRUE)
{
  groups = unique(values)
  ## correct the set size to avoid underfilled sets
  set_size = correctSetSize(length(groups), set_size)
  ## sort
  if (sort_values)
  {
    groups = sort(groups)
  }
  blocks = seq(from = 1, to = length(groups), by = set_size)
  result = rep(NA, length(values))
  for (x in 1:length(blocks))
  {
    range = blocks[x]:(min(blocks[x] + set_size - 1, length(groups)))
    lns = values %in% groups[range];
    result[lns] = x
  }
  return(result)
}
#'
#' Grep with values returned instead of indices.
#'
#' The parameter 'value' should not be passed to this function since it is
#' passed internally already.
#'
#' @param reg regex param
#' @param data container
#' @param ... other params forwarded to grep()
#'
#' @return values of data which matched the regex
#'
#' @examples
#' grepv("x", c("abc", "xyz"))
#' ## --> "xyz"
#'
#' @export
#'
grepv = function(reg, data, ...)
{
  return(grep(reg, data, value = TRUE, ...))
}
#' paste with tab as separator
#'
#' @param ... Arguments forwarded to paste()
#' @return return value of paste()
#'
#' @examples
#' pastet("tab","separated")
#' ## --> "tab\tseparated"
#'
#' @export
#'
pastet = function(...)
{
  paste(..., sep = "\t")
}
#' paste with newline as separator
#'
#' @param ... Arguments forwarded to paste()
#' @return return value of paste()
#'
#' @examples
#' pasten("newline","separated")
#' ## --> "newline\nseparated"
#'
#' @export
#'
pasten = function(...)
{
  paste(..., sep = "\n")
}
#' Coefficient of variation (CV)
#'
#' Computes sd(x) / mean(x)
#'
#' @param x Vector of numeric values
#' @return CV
CV = function(x){sd(x) / mean(x)}
#' Relative standard deviation (RSD)
#'
#' Simply \code{\link{CV}}*100
#'
#' @param x Vector of numeric values
#' @return RSD
#'
RSD = function(x){CV(x)*100}
#' Replace 0 with NA in a vector
#'
#' @param x A numeric vector
#' @return Vector of same size as 'x', with 0's replaced by NA
#'
del0 = function(x)
{
  x[x == 0] = NA
  return(x)
}
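## Usage sketches for the small numeric helpers above (illustrative, kept as
## comments so they do not run on package load):
##   CV(c(9, 10, 11))   ## sd/mean = 1/10 = 0.1
##   RSD(c(9, 10, 11))  ## 10 (percent)
##   del0(c(0, 1, 2))   ## NA  1  2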
#'
#' Repeat each element x_i in X, n_i times.
#'
#' @param x Values to be repeated
#' @param n Number of repeats for each x_i (same length as x)
#' @return Vector with each x_i repeated n_i times
#'
#' @examples
#'
#' repEach(1:3, 1:3) ## 1, 2, 2, 3, 3, 3
#'
#' @export
#'
repEach = function(x, n)
{
  stopifnot(length(x) == length(n))
  ## equivalent to rep(x, times = n), but with an explicit length check
  r = unlist(sapply(1:length(x), function(i) rep(x[i], each = n[i])))
  return(r)
}
#' Find the local maxima in a vector of numbers.
#'
#' A vector of booleans is returned with the same length as input
#' which contains TRUE when there is a maximum.
#' Simply sum up the vector to get the number of maxima.
#'
#' @param x Vector of numbers
#' @param thresh_rel Minimum relative intensity to maximum intensity of 'x' required
#' to be a maximum (i.e., a noise threshold). Default is 20\%.
#' @return Vector of booleans, where TRUE indicates a local maximum.
#'
#' @examples
#' r = getMaxima(c(1,0,3,4,5,0))
#' all(r == c(TRUE,FALSE,FALSE,FALSE,TRUE,FALSE))
#'
#' getMaxima(c(1, NA, 3, 2, 3, NA, 4, 2, 5))
#'
#' @export
#'
getMaxima = function(x, thresh_rel = 0.2)
{
  pos = rep(FALSE, length(x))
  thresh_abs = max(x, na.rm = TRUE) * thresh_rel
  x_noNA = na.omit(x)
  if (length(x_noNA) == 0) return(pos)
  last = x_noNA[1]
  ## find index of the first non-NA in 'x'
  i = which(last == x)[1]
  last_nonNA_i = i
  up = TRUE ## we start going up (if the next point is lower, the first point is a maximum)
  while (i < length(x))
  {
    i = i + 1;
    if (is.na(x[i])) next;
    if (last < x[i] && !up)
    { ## ascending in down mode
      up = !up
    } else if (last > x[i] && up)
    { ## descending in up mode: the previous point was a local maximum
      up = !up
      if (x[last_nonNA_i] >= thresh_abs) pos[last_nonNA_i] = TRUE
    }
    last = x[i]
    last_nonNA_i = i
  }
  if (up) pos[last_nonNA_i] = TRUE ## if we ended going up, the last point is a maximum
  return(pos)
}
#'
#' Prepare a Mosaic plot of two columns in long format.
#'
#' Found at http://stackoverflow.com/questions/19233365/how-to-create-a-marimekko-mosaic-plot-in-ggplot2
#' Modified (e.g. to pass R check)
#'
#' Returns a data frame, which can be used for plotting and has the following columns:
#' 'Var1' - marginalized values from 1st input column
#' 'Var2' - marginalized values from 2nd input column
#' 'Freq' - relative frequency of the combination given in [Var1, Var2]
#' 'margin_var1' - frequency of the value given in Var1
#' 'var2_height' - frequency of the value given in Var2, relative to Var1
#' 'var1_center' - X-position when plotting (large sets get a larger share)
#'
#' @param data A data.frame with exactly two columns
#' @return Data.frame
#'
#' @examples
#' data = data.frame(raw.file = c(rep('file A', 100), rep('file B', 40)),
#' charge = c(rep(2, 60), rep(3, 30), rep(4, 10),
#' rep(2, 30), rep(3, 7), rep(4, 3)))
#' mosaicize(data)
#'
#' @export
#'
mosaicize = function(data)
{
  stopifnot(ncol(data) == 2)
  lev_var1 = length(unique(data[, 1]))
  ct.data = as.data.frame(prop.table(table(data[, 1], data[, 2])))
  ct.data$Margin_var1 = prop.table(table(data[, 1]))
  ct.data$Var2_height = ct.data$Freq / ct.data$Margin_var1
  ct.data$Var1_center = c(0, cumsum(ct.data$Margin_var1)[(1:lev_var1) - 1]) + ct.data$Margin_var1 / 2
  return(ct.data)
}
#' A string concatenation function, more readable than 'paste()'.
#'
#' @param a Char vector
#' @param b Char vector
#' @return Concatenated string (no separator)
#'
`%+%` = function(a, b)
{
  return(paste(a, b, sep = ""))
}
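## Usage sketch (illustrative):
##   "file_" %+% 1:3 ## "file_1" "file_2" "file_3"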
#'
#' Given a vector of (short/long) filenames, translate to the (long/short) version
#'
#' @param f_names Vector of filenames
#' @param mapping A data.frame with from,to columns
#' @return A vector of translated file names as factor (ordered by mapping!)
#'
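#' @examples
#' ## a minimal sketch: map long Raw file names to short aliases and back
#' mapping = data.frame(from = c("longRawName1", "longRawName2"),
#'                      to = c("f1", "f2"))
#' renameFile(c("longRawName2", "longRawName1"), mapping)
#' ## --> factor with entries "f2", "f1" (levels taken from mapping$to)
#'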
#' @export
#'
renameFile = function(f_names, mapping)
{
  if (all(f_names %in% mapping$from)) { ## from --> to
    f_new = mapping$to[match(f_names, mapping$from)]
    f_new = factor(f_new, levels = mapping$to)
  } else if (all(f_names %in% mapping$to)) { ## to --> from
    f_new = mapping$from[match(f_names, mapping$to)]
    f_new = factor(f_new, levels = mapping$from)
  } else {
    stop("Error in renameFile: filenames cannot be found in mapping!")
  }
  return(f_new)
}
#'
#' Add the value of a variable to an environment (fast append)
#'
#' The environment must exist, and its name must be given as string literal in 'env_name'!
#' The value of the variable 'v' will be stored under the name given in 'v_name'.
#' If 'v_name' is not given, a variable name will be created by incrementing an internal counter
#' and using its value padded with zeros as name (i.e., "0001", "0002" etc).
#'
#' @param env_name String of the environment variable
#' @param v Value to be inserted
#' @param v_name String used as variable name. Automatically generated if omitted.
#' @return Always TRUE
#'
#'
appendEnv = function(env_name, v, v_name = NULL)
{
  e = eval.parent(as.name(env_name), n = 3)
  e$.counter <- e$.counter + 1
  ## pad with 0's, to ensure correct order when calling ls() on the environment
  if (is.null(v_name)) v_name = sprintf("%04d", e$.counter)
  e[[v_name]] <- v
  return(TRUE)
}
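## Usage sketch (illustrative; works when called from the global environment,
## since eval.parent() then resolves 'myEnv' there):
##   myEnv = new.env(); myEnv$.counter = 0
##   appendEnv("myEnv", 42)          ## stored as myEnv$`0001`
##   appendEnv("myEnv", 43, "mine")  ## stored as myEnv$mine
##   ls(myEnv)                       ## "0001" "mine"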
#'
#' Thin out a data.frame by removing rows with similar numerical values in a certain column.
#'
#' All values in the numerical column 'filterColname' are assigned to bins of width 'binsize'.
#' Only one value per bin is retained. All other rows are removed and the reduced
#' data.frame with all its columns is returned.
#'
#' @param data The data.frame to be filtered
#' @param filterColname Name of the filter column as string
#' @param binsize Width of a bin
#' @return Data.frame with reduced rows, but identical input columns
#'
#'
thinOut = function(data, filterColname, binsize)
{
  ## first remove duplicates (they cannot possibly pass the filter)
  data = data[!duplicated(data[, filterColname]), ]
  ## sort (expensive)
  if (is.unsorted(data[, filterColname])) data = data[order(data[, filterColname]), ]
  ## binning: keep only the first value per bin of width 'binsize'
  local_deltas = c(0, diff(data[, filterColname]))
  ld_cs = cumsum(local_deltas)
  data = data[!duplicated(round(ld_cs / binsize)), ]
  return(data)
}
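## Usage sketch (illustrative):
##   d = data.frame(rt = c(1.00, 1.01, 1.02, 5.0, 9.5), val = 1:5)
##   thinOut(d, "rt", binsize = 1) ## keeps roughly one row per RT bin of width 1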
#'
#' Apply 'thinOut' on all subsets of a data.frame, split by a batch column
#'
#' The binsize is computed from the global data range of the filter column by dividing the range
#' into binCount bins.
#'
#' @param data The data.frame to be split and filtered(thinned)
#' @param filterColname Name of the filter column as string
#' @param batchColname Name of the split column as string
#' @param binCount Number of bins in the 'filterColname' dimension.
#' @return Data.frame with reduced rows, but identical input columns
#'
thinOutBatch = function(data, filterColname, batchColname, binCount = 1000)
{
  binsize = (max(data[, filterColname], na.rm = TRUE) - min(data[, filterColname], na.rm = TRUE)) / binCount
  r = plyr::ddply(data, batchColname, thinOut, filterColname, binsize)
  return(r)
}
#'
#' Assign a relative abundance class to a set of (log10) abundance values
#'
#' Abundances (should be logged already) are grouped into different levels,
#' starting from the smallest values ("low") to the highest values ("high").
#' Intermediate abundances are assigned as "mid", or "low-mid"/"mid-high" for four classes.
#' If the range is too large, only "low" and "high" are assigned, and the intermediate values
#' are just numbers.
#'
#' Example:
#' getAbundanceClass(c(12.4, 17.1, 14.9, 12.3)) ## --> factor(c("low", "high", "mid", "low"))
#'
#'
#' @param x Vector of numeric values (in log10)
#' @return Vector of factors corresponding to input with abundance class names (e.g. low, high)
#'
getAbundanceClass = function(x) {
  r = data.frame(x = x)
  r$x_diff = round(r$x - min(r$x))
  r$x_diff_fac = as.numeric(factor(r$x_diff)) ## make a factor, to get an ordering (lowAbd = 1, ...)
  ## assign names to abundance classes
  cc = length(unique(r$x_diff))
  if (cc == 1) lvls = "mid"
  if (cc == 2) lvls = c("low", "high")
  if (cc == 3) lvls = c("low", "mid", "high")
  if (cc == 4) lvls = c("low", "low-mid", "mid-high", "high")
  if (cc > 4) {
    lvls = 1:max(r$x_diff_fac)
    lvls[1] = "low"
    lvls[length(lvls)] = "high"
  }
  r$x_cl = lvls[r$x_diff_fac]
  r$x_cl = factor(r$x_cl, levels = lvls, ordered = TRUE)
  return(r$x_cl)
}
#'
#' Assembles a list of output file names, which will be created during reporting.
#'
#' You can combine **report_name_has_folder** (and **mzTab_filename** for mzTab files) to obtain report filenames which are even more
#' robust to moving around (since they contain infixes of the mzTab filename and the folder),
#' e.g. `report_HEK293-study_myProjects.html`, where the input
#' was `mzTab_filename = 'HEK293-study.mzTab'` and `folder = 'c:/somePath/myProjects/'`.
#'
#' @param folder Directory where the MaxQuant output (txt folder) or the mzTab file resides
#' @param report_name_has_folder Boolean: Should the report files (html, pdf) contain the name
#' of the deepest(=last) subdirectory in **txt_folder** which is not `txt`?
#' Useful for discerning different reports in a PDF viewer.
#' E.g. when flag is FALSE: `report_v0.91.0.html`; and `report_v0.91.0_bloodStudy.html` when flag is TRUE (and the
#' txt folder is `.../bloodStudy/txt/` or `...bloodStudy/`)
#' @param mzTab_filename If input is an mzTab, specify its name, so that the filenames can use its basename as infix
#' E.g. when `mzTab_filename = 'HEK293-study.mzTab'` then the output will be
#' `report_HEK293-study.html`.
#' This allows getting reports for multiple mzTab files in the same folder without overwriting report results.
#'
#' @return List of output file names (just names, no file is created)
#' with list entries:
#' **yaml_file**, **heatmap_values_file**, **R_plots_file**, **filename_sorting**, **mzQC_file**, **log_file**, **report_file_prefix**, **report_file_PDF**, **report_file_HTML**
#'
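#' @examples
#' ## a minimal sketch; only names are assembled, no files are created
#' fh = getReportFilenames(".", report_name_has_folder = FALSE)
#' fh$report_file_HTML ## e.g. "./report_v<PTXQC version>.html"
#'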
#' @export
#'
getReportFilenames = function(folder, report_name_has_folder = TRUE, mzTab_filename = NULL)
{
  ## package version: added to the output filename
  pv = try(utils::packageVersion("PTXQC"))
  if (inherits(pv, "try-error")) pv = "_unknown"
  report_version = paste0("v", pv)
  ## amend the report filename with the name of the folder it resides in, to ease discerning different reports in a PDF viewer
  extra_folderRef = ""
  folders = rev(unlist(strsplit(normalizePath(folder, winslash = .Platform$file.sep), split = .Platform$file.sep)))
  while (length(folders)) {
    if (!grepl(":", folders[1]) & folders[1] != "txt") {
      extra_folderRef = paste0("_", folders[1])
      break;
    } else folders = folders[-1]
  }
  ## complete path + report_vXY
  report_file_simple = paste0(folder, .Platform$file.sep, "report_", report_version)
  ## .. + myMzTab [optional]
  if (!is.null(mzTab_filename) && nchar(mzTab_filename) > 0) {
    report_file_simple = paste0(report_file_simple, "_", gsub("\\.mzTab$", "", basename(mzTab_filename), ignore.case = TRUE))
  }
  ## include the folder name at the end
  if (report_name_has_folder)
    report_file_prefix = paste0(report_file_simple, extra_folderRef)
  else
    report_file_prefix = report_file_simple
  fh = list(yaml_file = paste0(report_file_prefix, ".yaml"),
            heatmap_values_file = paste0(report_file_prefix, "_heatmap.txt"),
            R_plots_file = paste0(report_file_prefix, "_plots.Rdata"),
            filename_sorting = paste0(report_file_prefix, "_filename_sort.txt"),
            mzQC_file = paste0(report_file_prefix, ".mzQC"),
            log_file = paste0(report_file_prefix, ".log"),
            report_file_prefix = report_file_prefix,
            report_file_PDF = paste0(report_file_prefix, ".pdf"),
            report_file_HTML = paste0(report_file_prefix, ".html"))
  return(fh)
}
#'
#' Extract the number of protein groups observed per Raw file
#' from an evidence table.
#'
#' Required columns are "protein.group.ids", "fc.raw.file" and "is.transferred".
#'
#' If match-between-runs was enabled during the MaxQuant run,
#' the data.frame returned will contain separate values for 'transferred' evidence
#' plus an 'MBRgain' column, which will give the extra MBR evidence in percent.
#'
#' @param df_evd Data.frame of evidence.txt as read by MQDataReader
#' @return Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain'
#'
#'
getProteinCounts = function(df_evd) {
  required_cols = c("protein.group.ids", "fc.raw.file", "is.transferred")
  if (!all(required_cols %in% colnames(df_evd))) {
    stop("getProteinCounts(): Missing columns!")
  }
  ## report match-between-runs data only if it was enabled
  reportMTD = any(df_evd$is.transferred)
  prot_counts = plyr::ddply(df_evd, "fc.raw.file", .fun = function(x, reportMTD)
  {
    ## proteins
    x$group_mtdinfo = paste(x$protein.group.ids, x$is.transferred, sep = "_")
    ## remove duplicates (since strsplit below is expensive) -- has no effect on double counting of PROTEINS(!), since it honors MTD
    xpro = x[!duplicated(x$group_mtdinfo), ]
    ## split groups
    p_groups = lapply(as.character(xpro$protein.group.ids), function(ids) {
      return(strsplit(ids, split = ";", fixed = TRUE))
    })
    ## get the number of unique proteins
    pg_set_genuineUnique = unique(unlist(p_groups[!xpro$is.transferred]))
    ## MBR will contribute peptides of already known proteins ...
    pg_set_allMBRunique = unique(unlist(p_groups[xpro$is.transferred]))
    pg_count_GenAndMBR = length(intersect(pg_set_allMBRunique, pg_set_genuineUnique))
    ## ... and peptides of proteins which are new in this Raw file (few)
    pg_count_newMBR = length(pg_set_allMBRunique) - pg_count_GenAndMBR
    ## ... genuine-only proteins:
    pg_count_onlyGenuine = length(pg_set_genuineUnique) - pg_count_GenAndMBR
    ## return values in MELTED (long) format
    if (reportMTD)
    {
      df = data.frame(counts = c(pg_count_onlyGenuine, pg_count_GenAndMBR, pg_count_newMBR),
                      category = c("genuine (exclusive)", "genuine + transferred", "transferred (exclusive)"),
                      MBRgain = c(NA, NA, pg_count_newMBR / (pg_count_onlyGenuine + pg_count_GenAndMBR) * 100))
    } else {
      df = data.frame(counts = c(pg_count_onlyGenuine),
                      category = c("genuine"),
                      MBRgain = NA)
    }
    return(df)
  }, reportMTD = reportMTD)
  return(prot_counts)
}
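## Usage sketch (illustrative; a toy evidence table with the three required columns):
##   evd = data.frame(protein.group.ids = c("1", "1;2", "3", "1"),
##                    fc.raw.file = c("f1", "f1", "f1", "f2"),
##                    is.transferred = c(FALSE, FALSE, TRUE, FALSE))
##   getProteinCounts(evd)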
#'
#' Extract the number of peptides observed per Raw file
#' from an evidence table.
#'
#' Required columns are "fc.raw.file", "modified.sequence" and "is.transferred".
#'
#' If match-between-runs was enabled during the MaxQuant run,
#' the data.frame returned will contain separate values for 'transferred' evidence
#' plus an 'MBRgain' column, which will give the extra MBR evidence in percent.
#'
#' @param df_evd Data.frame of evidence.txt as read by MQDataReader
#' @return Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain'
#'
#'
getPeptideCounts = function(df_evd) {
  required_cols = c("fc.raw.file", "modified.sequence", "is.transferred")
  if (!all(required_cols %in% colnames(df_evd))) {
    stop("getPeptideCounts(): Missing columns!")
  }
  ## report match-between-runs data only if it was enabled
  reportMTD = any(df_evd$is.transferred)
  pep_counts = plyr::ddply(df_evd, "fc.raw.file", .fun = function(x, reportMTD)
  {
    pep_set_genuineUnique = unique(x$modified.sequence[!x$is.transferred]) ## unique sequences (discarding PTMs)
    pep_set_allMBRunique = unique(x$modified.sequence[x$is.transferred])
    pep_count_GenAndMBR = length(intersect(pep_set_genuineUnique, pep_set_allMBRunique)) ## redundant, i.e. both genuine and MBR
    pep_count_newMBR = length(pep_set_allMBRunique) - pep_count_GenAndMBR ## new MBR peptides
    pep_count_onlyGenuine = length(pep_set_genuineUnique) - pep_count_GenAndMBR ## genuine-only peptides
    ## return values in MELTED (long) format
    if (reportMTD)
    {
      df = data.frame(counts = c(pep_count_onlyGenuine, pep_count_GenAndMBR, pep_count_newMBR),
                      category = c("genuine (exclusive)", "genuine + transferred", "transferred (exclusive)"),
                      MBRgain = c(NA, NA, pep_count_newMBR / (pep_count_onlyGenuine + pep_count_GenAndMBR) * 100))
    } else {
      df = data.frame(counts = c(pep_count_onlyGenuine),
                      category = c("genuine"),
                      MBRgain = NA)
    }
    return(df)
  }, reportMTD = reportMTD)
  return(pep_counts)
}
#'
#' Check if a file is writable and blocks an interactive session, waiting for user input.
#'
#' This function gives the user a chance to make the output file writable before a write attempt
#' is actually made by R, to avoid having to run the whole program again upon write failure.
#'
#' Note: The file will not be overwritten or changed by this function.
#'
#' @param filename The file to test for writable
#' @param prompt_text If not writable, show this prompt text to the user
#' @param abort_answer If the user enters this string into the prompt, this function will stop()
#' @return TRUE if writable, FALSE if aborted by user or (not-writeable and non-interactive)
#'
wait_for_writable = function(filename,
                             prompt_text = paste0("The file '", filename, "' is not writable. Please close all applications using this file. Press '", abort_answer, "' to abort!"),
                             abort_answer = "n")
{
  while (TRUE)
  {
    is_writeable = try({
      f = file(filename, open = "a"); ## test for append, which will not overwrite the file
      close(f)
    }, silent = TRUE)
    if (inherits(is_writeable, "try-error")) {
      if (interactive()) {
        r = invisible(readline(prompt = prompt_text))
        if (r == abort_answer)
        { ## manual abort by user
          return(FALSE)
        }
      } else
      { ## not writable and non-interactive .. we cannot wait
        return(FALSE)
      }
    } else {
      return(TRUE)
    }
  }
  ## unreachable
  return(FALSE)
}
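## Usage sketch (only useful in interactive sessions):
##   if (!wait_for_writable("report.pdf")) stop("Aborted: 'report.pdf' is locked.")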
#'
#' Estimate the empirical cumulative distribution (ECDF) and return it, sampled at the given quantile positions.
#'
#' @param samples Vector of input values (samples from the distribution)
#' @param y_eval Vector of probabilities at which the inverse ECDF is evaluated (each percentile by default)
#' @return Data.frame with columns 'x', 'y'
#'
#' @examples
#' plot(getECDF(rnorm(1e4)))
#'
#' @export
#'
getECDF = function(samples, y_eval = (1:100)/100)
{
  ## internal little helper: invert an ecdf() result via linear interpolation
  inv_ecdf <- function(f) {
    x <- environment(f)$x
    y <- environment(f)$y
    approxfun(y, x)
  }
  ## compute the ECDF
  ec = ecdf(samples)
  ## sample it at the requested y-positions
  iec = inv_ecdf(ec)
  x_eval = iec(y_eval)
  r = data.frame(x = x_eval, y = y_eval)
  return(r)
}
#'
#' Discretize RT peak widths by taking the median per time bin.
#'
#' Should be applied for each Raw file individually.
#'
#' Returns a data.frame, where 'bin' gives the index of each bin, 'RT' is
#' the start of each bin and 'peakWidth' is the median peak width per bin.
#'
#' @param data Data.frame with columns 'retention.time' and 'retention.length'
#' @param RT_bin_width Bin size in minutes
#' @return Data.frame with columns 'bin', 'RT', 'peakWidth'
#'
#' @export
#'
#' @examples
#' data = data.frame(retention.time = seq(30,200, by=0.001)) ## one MS/MS per 0.1 sec
#' data$retention.length = seq(0.3, 0.6, length.out = nrow(data)) + rnorm(nrow(data), 0, 0.1)
#' d = peakWidthOverTime(data)
#' plot(d$RT, d$peakWidth)
#'
#'
peakWidthOverTime = function(data, RT_bin_width = 2)
{
  r = range(data$retention.time, na.rm = TRUE)
  brs = seq(from = r[1], to = r[2] + RT_bin_width, by = RT_bin_width)
  data$bin = findInterval(data$retention.time, brs, all.inside = TRUE) ## faster than cut(..., labels = FALSE)
  retLStats = plyr::ddply(data, "bin", .fun = function(xb) {
    data.frame(RT = brs[xb$bin[1]], peakWidth = median(xb$retention.length, na.rm = TRUE))
  })
  return(retLStats)
}
#' Get all currently available metrics
#'
#' @param DEBUG_PTXQC Use qc objects from the package (FALSE) or from environment (TRUE/DEBUG)
#' @return List of metric objects
#'
getMetricsObjects = function(DEBUG_PTXQC = FALSE)
{
  if (DEBUG_PTXQC) {
    lst_qcMetrics_str = ls(sys.frame(which = 0), pattern = "qcMetric_") ## executed outside of the package, i.e. not loaded...
  } else {
    lst_qcMetrics_str = ls(name = getNamespace("PTXQC"), pattern = "qcMetric_")
  }
  if (length(lst_qcMetrics_str) == 0) stop("getMetricsObjects(): No metrics found! Very weird!")
  lst_qcMetrics = sapply(lst_qcMetrics_str, function(m) {
    q = get(m)
    if (inherits(q, "refObjectGenerator")) {
      return(q$new())
    }
    return(NULL)
  })
  return(lst_qcMetrics)
}
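## Usage sketch (requires the PTXQC namespace with its qcMetric_* generators):
##   mm = getMetricsObjects()
##   names(mm) ## names of all qcMetric_* objects found in the namespace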
#'
#' Make a color (given as name or in RGB) darker by a factor in [0 = black, 1 = unchanged]
#' @param color A color as understood by col2rgb
#' @param factor Between 0 (make black) and 1 (leave color as is)
#' @return darkened color
#'
darken = function(color, factor = 0.8) {
  dark_col = grDevices::rgb(t(grDevices::col2rgb(color) * factor), maxColorValue = 255)
  return(dark_col)
}
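## Usage sketch:
##   darken("red")           ## "#CC0000" (80% of full red)
##   darken("#808080", 0.5)  ## "#404040"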
#'
#' When MaxQuant is run with a wrong locale (i.e. the decimal separator is not a '.', but a ','),
#' then MaxQuant results are plainly wrong and broken. This can be detected by, e.g., checking for negative charge annotations.
#'
#' @param df_evd Evidence table from which we only need the 'charge' column
#' @return FALSE if negative charges are found, TRUE otherwise (also if the 'charge' column is missing)
#'
checkEnglishLocale = function(df_evd) {
  if (!checkInput(c("charge"), df_evd)) return(TRUE) ## cannot check; assume the locale is fine
  if (any(df_evd$charge < 0)) {
    return(FALSE)
  }
  return(TRUE)
}
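## Usage sketch (illustrative; negative charges indicate a broken locale):
##   checkEnglishLocale(data.frame(charge = c(2, 3, -2))) ## FALSE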