##
## jamba-string.r
##
## igrepHas
## rbindList
## makeNames
## nameVector
## nameVectorN
##
#' vector contains any case-insensitive grep match
#'
#' vector contains any case-insensitive grep match
#'
#' This function checks the input vector for any elements matching the
#' grep pattern. The grep is performed case-insensitive (igrep). This function
#' is particularly useful when checking function arguments or object class,
#' where the class(a) might return multiple values, or where the name of
#' the class might be slightly different than expected, e.g. data.frame,
#' data_frame, DataFrame.
#'
#' @param pattern the grep pattern to use with `base::grep()`
#' @param x vector to use in the grep
#' @param ignore.case logical default TRUE, meaning the grep will be performed
#' in case-insensitive mode.
#' @param minCount integer minimum number of matches required to return TRUE.
#' @param naToBlank logical whether to convert NA to blank, instead of
#' allowing grep to handle NA values as-is.
#'
#' @return logical indicating whether the grep match criteria were met,
#' TRUE indicates the grep pattern was present in minCount or more
#' number of entries.
#'
#' @seealso `base::grep()`
#'
#' @examples
#' a <- c("data.frame","data_frame","tibble","tbl");
#' igrepHas("Data.*Frame", a);
#' igrepHas("matrix", a);
#'
#' @family jam grep functions
#'
#' @export
igrepHas <- function
(pattern,
x=NULL,
ignore.case=TRUE,
minCount=1,
naToBlank=FALSE,
...)
{
## Purpose is a quick check for greppable substring, for if() statements
##
## naToBlank=TRUE will convert NA values to "" prior to running grep
##
## The special case where minCount is negative (minCount == -1) or larger
## than length(x), it will be set to length(x) and therefore
## requires all elements of x to meet the grep criteria
if (minCount < 0 || minCount > length(x)) {
minCount <- length(x);
}
if (length(x) == 0) {
return(FALSE);
} else {
if (naToBlank && any(is.na(x))) {
x[is.na(x)] <- "";
}
length(grep(pattern=pattern,
x=x,
ignore.case=ignore.case,
...)) >= as.integer(minCount);
}
}
#' case-insensitive grep, returning values
#'
#' case-insensitive grep, returning values
#'
#' This function is a simple wrapper around `base::grep()` which
#' runs in case-insensitive mode, and returns matching values. It is
#' particularly helpful when grabbing values from a vector.
#'
#' @param ...,value,ignore.case parameters sent to `base::grep()`
#'
#' @return vector of matching values
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' vigrep("d", V);
#'
#' @family jam grep functions
#'
#' @export
vigrep <- function
(..., value=TRUE, ignore.case=TRUE)
{
## Purpose is simple to provide quicker wrapper to igrep, returning values
igrep(..., value=value, ignore.case=ignore.case);
}
#' grep, returning values
#'
#' grep, returning values
#'
#' This function is a simple wrapper around `base::grep()` which
#' returns matching values. It is
#' particularly helpful when grabbing values from a vector, but where the
#' case (uppercase or lowercase) is known.
#'
#' @param ...,value,ignore.case parameters sent to `base::grep()`
#'
#' @return vector of matching values
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' vgrep("D", V);
#' vgrep("d", V);
#' vigrep("d", V);
#'
#' @family jam grep functions
#'
#' @export
vgrep <- function
(..., value=TRUE, ignore.case=FALSE)
{
## Purpose is simple to provide quicker wrapper to grep, returning values
grep(..., value=value, ignore.case=ignore.case);
}
#' case-insensitive grep
#'
#' case-insensitive grep
#'
#' This function is a simple wrapper around `base::grep()` which
#' runs in case-insensitive mode. It is mainly used to save keystrokes,
#' but is consistently named alongside \code{\link{vgrep}} and
#' \code{\link{vigrep}}.
#'
#' @param ...,ignore.case parameters sent to `base::grep()`
#'
#' @return vector of matching indices
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' igrep("D", V);
#' igrep("d", V);
#' vigrep("d", V);
#'
#' @family jam grep functions
#'
#' @export
igrep <- function
(..., ignore.case=TRUE)
{
## Purpose is simply to provide quick wrapper for case-insensitive grep()
grep(ignore.case=ignore.case, ...);
}
#' case-insensitive logical grepl
#'
#' case-insensitive logical grepl
#'
#' This function is a simple wrapper around `base::grepl()` which
#' runs in case-insensitive mode simply by adding default `ignore.case=TRUE`.
#' It is mainly used for convenience.
#'
#' @param ...,ignore.case parameters sent to `base::grep()`
#'
#' @return `logical` vector indicating pattern match
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' ig1 <- grepl("D", V);
#' ig2 <- igrepl("D", V);
#' ig3 <- grepl("d", V);
#' ig4 <- igrepl("d", V);
#' data.frame(V,
#' grepl_D=ig1,
#' grepl_d=ig3,
#' igrepl_D=ig2,
#' igrepl_d=ig4);
#'
#' @family jam grep functions
#'
#' @export
igrepl <- function
(...,
ignore.case=TRUE)
{
## Purpose is simply to provide quick wrapper for case-insensitive grep()
grepl(ignore.case=ignore.case,
...);
}
#' case-insensitive grep, returning unmatched indices
#'
#' case-insensitive grep, returning unmatched indices
#'
#' This function is a simple wrapper around `base::grep()` which
#' runs in case-insensitive mode, and returns unmatched entries.
#' It is mainly used to save keystrokes,
#' but is consistently named alongside \code{\link{vgrep}} and
#' \code{\link{vigrep}}, and quite helpful for writing concise code.
#'
#' @param ...,ignore.case,invert parameters sent to `base::grep()`
#'
#' @return vector of non-matching indices
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' unigrep("D", V);
#' igrep("D", V);
#'
#' @family jam grep functions
#'
#' @export
unigrep <- function
(..., ignore.case=TRUE, invert=TRUE)
{
## purpose is to un-grep, return non-hits in case-insensitive fashion
igrep(..., ignore.case=ignore.case, invert=invert);
}
#' case-insensitive grep, returning unmatched values
#'
#' case-insensitive grep, returning unmatched values
#'
#' This function is a simple wrapper around `base::grep()` which
#' runs in case-insensitive mode, and returns unmatched values.
#' It is mainly used to save keystrokes,
#' but is consistently named alongside \code{\link{vgrep}} and
#' \code{\link{vigrep}}, and quite helpful for writing concise code.
#' It is particularly useful for removing unwanted entries from a long
#' vector, for example removing accession numbers from a long
#' vector of gene symbols in order to review gene annotations.
#'
#' @param ...,ignore.case,value,invert parameters sent to `base::grep()`
#'
#' @return vector of non-matching indices
#'
#' @examples
#' V <- paste0(LETTERS[1:5], LETTERS[4:8]);
#' unigrep("D", V);
#' igrep("D", V);
#'
#' @family jam grep functions
#'
#' @export
unvigrep <- function
(..., ignore.case=TRUE, value=TRUE, invert=TRUE)
{
## purpose is to un-grep, return non-hits in case-insensitive fashion
grep(..., ignore.case=ignore.case, value=value, invert=invert);
}
#' provigrep: progressive case-insensitive value-grep
#'
#' case-insensitive value-grep for a vector of patterns
#'
#' Purpose is to provide "progressive vigrep()",which is value-returning,
#' case-insensitive grep, starting with an ordered vector of grep patterns.
#' For example, it returns entries in the order they are matched, by the
#' progressive use of grep patterns.
#'
#' It is particularly good when using multiple grep patterns, since
#' `grep()` does not accept multiple patterns as input. This function
#' also only returns the unique matches in the order they were matched,
#' which alleviates the need to run a series of `grep()` functions
#' and collating their results.
#'
#' It is mainly to allow for prioritized ordering of matching entries, where
#' one would like certain matching entries first, followed by another
#' set of matching entries, without duplication. For example,
#' one might grep for a few patterns, but want certain pattern hits to be
#' listed first.
#'
#' @param patterns `character` vector of regular expression patterns,
#' ultimately passed to `base::grep()`.
#' @param x `character` vector that is the subject of `base::grep()`.
#' @param maxValues `integer` or NULL, the maximum matching entries to
#' return per grep pattern. Note that each grep pattern may match multiple
#' values, and values are only returned at most once each, so restricting
#' items returned by one grep pattern may allow an item to be matched
#' by subsequent patterns, see examples. This argument is most commonly
#' used with `maxValues=1` which returns only the first matching entry
#' per pattern.
#' @param sortFunc `function` or NULL, used to sort entries within each set of
#' matching entries. Use NULL to avoid sorting entries.
#' @param rev `logical` whether to reverse the order of matching entries. Use
#' TRUE if you would like entries matching the patterns to be placed last,
#' and entries not matching the grep patterns to be placed first. This
#' technique is effective at placing "noise names" at the end of a long
#' vector, for example.
#' @param returnType `character` indicating whether to return a vector or list.
#' A list will be in order of the grep patterns, using empty elements to
#' indicate when no entries matched each pattern. This output is useful
#' when you would like to know which patterns matched specific entries.
#' @param ignore.case `logical` parameter sent to `base::grep()`, TRUE
#' runs in case-insensitive mode, as by default.
#' @param value `logical` indicating whether to return the matched value,
#' or when `value=FALSE` the index position is returned.
#' @param ... additional arguments are passed to `vigrep()`.
#'
#' @examples
#' # a rather comical example
#' # set up a test set with labels containing several substrings
#' set.seed(1);
#' testTerms <- c("robot","tree","dog","mailbox","pizza","noob");
#' testWords <- pasteByRow(t(combn(testTerms,3)));
#'
#' # now pull out entries matching substrings in order
#' provigrep(c("pizza", "dog", "noob", "."), testWords);
#' # more detail about the sort order is shown with returnType="list"
#' provigrep(c("pizza", "dog", "noob", "."), testWords, returnType="list");
#' # rev=TRUE will reverse the order of the list
#' provigrep(c("pizza", "dog", "noob", "."), testWords, returnType="list", rev=TRUE);
#' provigrep(c("pizza", "dog", "noob", "."), testWords, rev=TRUE);
#'
#' # another example showing ordering of duplicated entries
#' set.seed(1);
#' x <- paste0(
#' sample(letters[c(1,2,2,3,3,3,4,4,4,4)]),
#' sample(1:5));
#' x;
#' # sort by letter
#' provigrep(letters[1:4], x)
#'
#' # show more detail about how the sort is performed
#' provigrep(letters[1:4], x, returnType="list")
#'
#' # rev=TRUE will reverse the order of pattern matching
#' # which is most useful when "." is the last pattern:
#' provigrep(c(letters[1:3], "."), x, returnType="list")
#' provigrep(c(letters[1:3], "."), x, returnType="list", rev=TRUE)
#'
#' # example demonstrating maxValues
#' # return in list format
#' provigrep(c("[ABCD]", "[CDEF]", "[FGHI]"), LETTERS, returnType="list")
#'
#' # maxValues=1
#' provigrep(c("[ABCD]", "[CDEF]", "[FGHI]"), LETTERS, returnType="list", maxValues=1)
#' provigrep(c("[ABCD]", "[CDEF]", "[FGHI]"), LETTERS, returnType="list", maxValues=1, value=FALSE)
#' proigrep(c("[ABCD]", "[CDEF]", "[FGHI]"), LETTERS, maxValues=1)
#'
#' @family jam grep functions
#'
#' @export
provigrep <- function
(patterns,
x,
maxValues=NULL,
sortFunc=c,
rev=FALSE,
returnType=c("vector", "list"),
ignore.case=TRUE,
value=TRUE,
...)
{
## Purpose is to provide "progressive vigrep()" (which is value-returning,
## case-insensitive) mainly to allow for prioritized ordering of matching
## entries.
## For example, one might grep for a few patterns, but want certain pattern
## hits to come back before others.
##
## rev will return the entries in reverse order, which is effective if
## using set of patterns to down-prioritize.
##
## sortFunc is intended to allow for sorting each set of matched entries
## along the way, particularly useful when using a non-standard sort
## function.
##
## returnType="vector" returns the vector of matching entries
## returnType="list" returns a named list of matching entries, using
## the grep patterns as list names
##
returnType <- match.arg(returnType);
x_unique <- make.unique(as.character(x),
sep="_v");
## Iterate each grep pattern
valueSetL <- lapply(patterns, function(i){
z <- vigrep(pattern=i,
x=x,
value=TRUE,
ignore.case=ignore.case,
...);
if (length(sortFunc) > 0 && !identical(c, sortFunc)) {
## If sortFunc is not c(), then run it
z <- sortFunc(z);
}
## Here the values are converted to index positions
match(make.unique(as.character(z), sep="_v"),
x_unique);
});
## Apply maxValues
if (length(maxValues) > 0) {
valueSetL <- jamba::heads(valueSetL,
n=maxValues);
}
## Make each item only represented once across the list
if (length(names(patterns)) == 0) {
names(valueSetL) <- makeNames(patterns);
} else {
names(valueSetL) <- names(patterns);
}
f1 <- factor(names(valueSetL),
levels=names(valueSetL));
m1 <- match(
make.unique(sep="_v",
as.character(unlist(valueSetL))),
as.character(seq_along(x)));
r1 <- rep(f1, lengths(valueSetL));
if (value && "list" %in% returnType) {
valueSetL <- split(x[m1[!is.na(m1)]], r1[!is.na(m1)]);
} else {
valueSetL <- split(m1[!is.na(m1)], r1[!is.na(m1)]);
}
## Optionally reverse the list
if (rev) {
valueSetL <- rev(valueSetL);
}
## Optionally return the list format
if ("list" %in% returnType) {
return(valueSetL);
}
if (value) {
valueSet <- x[unique(unlist(valueSetL))];
} else {
valueSet <- unique(unlist(valueSetL));
}
return(valueSet);
}
#' proigrep: progressive case-insensitive grep
#'
#' case-insensitive grep for a vector of patterns
#'
#' @rdname provigrep
#'
#' @export
proigrep <- function
(...,
value=FALSE)
{
provigrep(...,
value=value);
}
#' rbind a list of vectors into matrix or data.frame
#'
#' rbind a list of vectors into matrix or data.frame
#'
#' The purpose of this function is to emulate `do.call(rbind, x)` on a list
#' of vectors, while specifically handling when there are different
#' numbers of entries per vector. The output `matrix` number of columns
#' will be the longest vector (or largest number of columns) in the
#' input list `x`.
#'
#' Instead of recycling values in each row to fill the target number
#' of columns, this function fills cells with blank fields,
#' with default argument `fixBlanks=TRUE`.
#'
#' In extensive timings tests at the time this function was created,
#' this technique was notably faster than alternatives.
#' It runs `do.call(rbind, x)` then subsequently replaces recycled values
#' with blank entries, in a manner that is notably faster than
#' alternative approaches such as pre-processing the input data.
#'
#' @return `matrix` unless `returnDF=TRUE` in which the output is coerced
#' to a `data.frame`.
#' The rownames by default are derived from the list names,
#' but the colnames are not derived from the vector names.
#' If input `x` contains `data.frame` or `matrix` objects, the output
#' will retain those values.
#'
#' @param x `list` of atomic `vector`, `matrix`, or `data.frame`
#' objects.
#' @param emptyValue `character` value to use to represent missing values,
#' whenever a blank cell is introduced into the resulting matrix
#' @param nullValue optional value used to replace NULL entries in
#' the input list, useful especially when the data was produced
#' by `strsplit()` with `""`. Use `nullValue=""` to replace `NULL`
#' with `""` and preserve the original list length. Otherwise when
#' `nullValue=NULL` any empty entries will be silently dropped.
#' @param keepListNames `logical` whether to use list names as rownames
#' in the resulting matrix or data.frame.
#' @param newColnames NULL or `character` vector of colnames to use for the
#' resulting matrix or data.frame.
#' @param newRownames NULL or `character` vector of rownames to use for the
#' resulting matrix or data.frame. If supplied, this value overrides the
#' keepListNames=TRUE use of list names as rownames.
#' @param fixBlanks `logical` whether to use blank values instead of repeating
#' each vector to the length of the maximum vector length when filling
#' each row of the matrix or data.frame.
#' @param returnDF `logical` whether to return a data.frame, by default FALSE,
#' a matrix is returned.
#' @param verbose `logical` whether to print verbose output during processing.
#'
#' @examples
#' L <- list(a=LETTERS[1:4], b=letters[1:3]);
#' rbindList(L);
#' rbindList(L, returnDF=TRUE);
#'
#' @family jam list functions
#'
#' @export
rbindList <- function
(x,
emptyValue="",
nullValue=NULL,
keepListNames=TRUE,
newColnames=NULL,
newRownames=NULL,
fixBlanks=TRUE,
returnDF=FALSE,
verbose=FALSE,
...)
{
## Purpose is to emulate do.call(rbind, x) on a list with variable
## numbers of entries but instead of repeating the values to fill the
## number of resulting columns, it goes back and empties out the fields
## which should have no value. In extensive timings tests at the time
## this function was created, this method was notably faster than
## alternatives.
##
## keepListNames=TRUE is default, the way R prepends list names to
## the element names when unlisting an object.
## keepListNames=FALSE will not use the list names, but keep the element
## names without changing them.
##
## newColnames and newRownames are optionally intended for assigning colnames
## and rownames to the resulting matrix, respectively.
##
## fixBlanks=FALSE will turn off the fixing of duplicated entries
##
## returnDF=TRUE will convert the output to data.frame, when usually it
## would be a matrix after calling rbind(). This step checks for non-unique
## rownames, and if present, calls makeNames(rownames(x), ...). The "..."
## can be used to customize how duplicate rownames are assigned.
##
xLslen <- lengths(x);
if (any(xLslen %in% 0)) {
if (length(nullValue) > 0) {
x[xLslen == 0] <- head(nullValue, 1);
xLslen[xLslen == 0] <- 1;
}
if (all(xLslen == 0)) {
return(NULL);
}
x <- x[!xLslen == 0];
xLslen <- xLslen[!xLslen == 0];
#x[xLslen == 0] <- "";
}
## Now run the normal do.call(rbind, ...) which duplicates entries
## but we will clean them up
xDF <- suppressWarnings(do.call(rbind, x));
if (!keepListNames) {
rownames(xDF) <- unlist(lapply(x, function(ix){
if (igrepHas("data.*frame|matrix", class(ix))) {
rownames(ix);
} else {
names(ix);
}}));
}
## Mark the row entries to fix
if (fixBlanks) {
xLfix <- unique(xLslen[xLslen < ncol(xDF)]);
if (length(xLfix) > 0) {
## Iterate chunks of rows which share the same lengths
## and blank out their subsequent column values
for(i in xLfix) {
whichRows <- which(xLslen == i);
xDF[whichRows,((i+1):ncol(xDF))] <- "";
}
}
}
if (!is.null(newColnames)) {
colnames(xDF) <- makeNames(rep(newColnames, length.out=ncol(xDF)));
}
if (!is.null(newRownames)) {
rownames(xDF) <- makeNames(rep(newRownames, length.out=nrow(xDF)));
}
if (returnDF) {
## Check for duplicated rownames, allowed in matrices but not
## in data.frames
if (length(tcount(rownames(xDF), minCount=2)) > 0) {
rownames(xDF) <- makeNames(rownames(xDF), ...);
}
#xDF <- unlistDataFrame(as.data.frame(xDF), verbose=verbose, ...);
xDF <- data.frame(check.names=FALSE,
stringsAsFactors=FALSE,
xDF);
}
return(xDF);
}
#' make unique vector names
#'
#' make unique vector names
#'
#' This function extends the basic goal from \code{\link[base]{make.names}}
#' which is intended to make syntactically valid names from a character vector.
#' This makeNames function makes names unique, and offers configurable methods
#' to handle duplicate names. By default, any duplicated entries receive a
#' suffix _v# where # is s running count of entries observed, starting at 1.
#' The \code{\link[base]{make.names}} function, by contrast, renames the
#' second observed entry starting at .1, leaving the original entry
#' unchanged. Optionally, makeNames can rename all entries with a numeric
#' suffix, for consistency.
#'
#' For example:
#' \code{A, A, A, B, B, C}
#' becomes:
#' \code{A_v1, A_v2, A_v3, B_v1, B_v2, C}
#'
#' Also, makeNames always allows "_".
#'
#' This makeNames function is similar to \code{\link[base]{make.unique}}
#' which also converts a vector into a unique vector by adding suffix values,
#' however the \code{\link[base]{make.unique}} function intends to allow
#' repeated operations which recognize duplicated entries and continually
#' increment the suffix number. This makeNames function currently does not
#' handle repeat operations. The recommended approach to workaround having
#' pre-existing versioned names would be to remove suffix values prior to
#' running this function. One small distinction from
#' \code{\link[base]{make.unique}} is that makeNames does version the first
#' entry in a set.
#'
#' @return character vector of unique names
#'
#' @family jam string functions
#'
#' @param x character vector to be used when defining names. All other
#' vector types will be coerced to character prior to use.
#' @param unique argument which is ignored, included only for
#' compatibility with `base::make.names`. All results from
#' `makeNames()` are unique.
#' @param suffix character separator between the original entry and the
#' version, if necessary.
#' @param renameOnes logical whether to rename single, unduplicated, entries.
#' @param doPadInteger logical whether to pad integer values to a consistent
#' number of digits, based upon all suffix values needed. This output
#' allows for more consistent sorting of names. To define a fixed number
#' of digits, use the useNchar parameter.
#' @param useNchar integer or NULL, number of digits to use when padding
#' integer values with leading zero, only relevant when usePadInteger=TRUE.
#' @param startN integer number used when numberStyle is "number", this integer
#' is used for the first entry to be renamed. You can use this value to
#' make zero-based suffix values, for example.
#' @param numberStyle character style for version numbering
#' \describe{
#' \item{"number"}{Use integer numbers to represent each duplicated
#' entry.}
#' \item{"letters"}{Use lowercase letters to represent each duplicated
#' entry. The 27th entry uses the pattern "aa" to represent two
#' 26-base digits. When doPadInteger=TRUE, a zero is still used
#' to pad the resulting version numbers, again to allow easy sorting
#' of text values, but also because there is no letter equivalent
#' for the number zero.
#' It is usually best to change the suffix to "_" or "" when using
#' "letters".}
#' \item{"LETTERS"}{Use uppercase letters to represent each duplicated
#' entry, with the same rules as applied to "letters".}
#' }
#' @param renameFirst logical whether to rename the first entry in a set of
#' duplicated entries. If FALSE then the first entry in a set will not
#' be versioned, even when renameOnes=TRUE.
#' @param keepNA logical whether to retain NA values using the string "NA".
#' If keepNA is FALSE, then NA values will remain NA, thus causing some
#' names to become `<NA>`, which can cause problems with some downstream
#' functions which assume all names are either NULL or non-NA.
#'
#' @examples
#' V <- rep(LETTERS[1:3], c(2,3,1));
#' makeNames(V);
#' makeNames(V, renameOnes=TRUE);
#' makeNames(V, renameFirst=FALSE);
#' exons <- makeNames(rep("exon", 3), suffix="");
#' makeNames(rep(exons, c(2,3,1)), numberStyle="letters", suffix="");
#'
#' @export
makeNames <- function
(x,
unique=TRUE,
suffix="_v",
renameOnes=FALSE,
doPadInteger=FALSE,
startN=1,
numberStyle=c("number","letters","LETTERS"),
useNchar=NULL,
renameFirst=TRUE,
keepNA=TRUE,
...)
{
## Purpose is to make unique names without the R mangling that comes
## with make.names().
## By default, unique entries are not renamed, and entries with two or
## more replicates are renamed to NAME_v1, NAME_v2, NAME_v3, etc.
##
## if renameOnes=TRUE, it will rename singlets to NAME_v1 even if there
## is only one entry.
##
## renameFirst=TRUE will rename each duplicated entry NAME_v1, NAME_v2,
## NAME_v3, etc.
## renameFirst=FALSE will not rename the first in a set of duplicated
## entries, e.g. NAME, NAME_v1, NAME_v2, etc.
##
## The distinction between renameOnes and renameFirst:
## renameOnes=TRUE will rename all singlets and duplicated entries,
## starting with the first entry.
## renameOnes=FALSE will not rename singlet entries.
## renameFirst=TRUE will only rename duplicated entries, starting with
## the first entry.
## renameFirst=FALSE will not rename the first entry in a set of
## duplicated entries.
##
## the suffix can be changed, e.g. "_r" will name names NAME_r1,
## NAME_r2, NAME_r3, etc.
##
## numberStyle="number" uses integers as the suffix
## numberStyle="letters" uses lowercase letters as digits, similar to Excel column names
## numberStyle="LETTERS" uses uppercase letters as digits, similar to Excel column names
## Be aware that letters can only go to roughly 18,000 entries, given the current implementation
## of colNum2excelName
##
## When useNchar is numeric, it sets doPadInteger=TRUE, and will use at least
## that many digits in padding the integer.
##
##
## TODO:
## Update logic to be analogous to using make.unique(), which intends
## to maintain previous versioning of names without appending deeper
## suffices as appropriate.
## E.g. c("", "", "", "_v1", "_v2", "_v3")
## becomes c("_v4", "_v5", "_v6", "_v1", "_v2", "_v3")
## instead of
## c("_v1", "_v2", "_v3", "_v1", "_v2", "_v3")
## or
## c("_v1_v1", "_v2_v1", "_v3_v1", "_v1_v2", "_v2_v2", "_v3_v2")
##
if (length(x) == 0) {
return(x);
}
numberStyle <- match.arg(numberStyle);
if (!is.null(useNchar)) {
useNchar <- as.integer(useNchar);
doPadInteger=TRUE;
}
if (any(c("factor", "ordered") %in% class(x))) {
x <- as.character(x);
}
if (keepNA && any(is.na(x))) {
x <- rmNA(x,
naValue="NA");
}
## First check for duplicates using anyDuplicated()
## version 0.0.35.900, this change speeds assignment
## in large vectors when most entries are not duplicated.
dupes <- duplicated(x);
## Convert entries to a named count of occurences of each entry
if (any(dupes)) {
xSubDupes <- table(x[dupes]) + 1;
maxCt <- max(c(1,xSubDupes));
xSubOnes <- setNames(rep(1, sum(!dupes)), x[!dupes]);
xSub <- c(xSubOnes, xSubDupes);
} else {
xSub <- setNames(rep(1, sum(!dupes)), x[!dupes]);
maxCt <- 1;
}
## version 0.0.34.900 and previous used the method below
#xSub <- table(as.character(x));
## Vector of counts to be used
versionsV <- as.integer(renameFirst):maxCt + startN - 1;
## If using letters, define the set of letter upfront to save processing
if (igrepHas("letters", numberStyle)) {
if (numberStyle %in% "letters") {
useLetters <- letters[1:26];
zeroVal <- "A";
} else {
useLetters <- LETTERS[1:26];
zeroVal <- "a";
}
num2letters <- colNum2excelName(versionsV,
useLetters=useLetters,
zeroVal=zeroVal,
...);
versionsV <- num2letters;
}
if (doPadInteger) {
versionsV <- padInteger(versionsV,
useNchar=useNchar,
...);
}
## If no duplicated entries
if (max(xSub) %in% c(-Inf,1)) {
## If not renaming the singlet entries, send the same list back
if (!renameOnes) {
return(x);
} else {
## If renaming singlets, simply paste the suffix and first entry
return(paste0(x, suffix, head(versionsV, 1)));
}
}
if (renameOnes) {
xUse <- 1:length(x);
} else {
xUse <- (x %in% names(xSub)[xSub > 1]);
}
xSub1 <- x[xUse];
## Preserve the original order
names(xSub1) <- padInteger(seq_along(xSub1));
## Split the vector into a list of vectors, by name
xSub2 <- split(xSub1, xSub1);
names(xSub2) <- NULL;
## Optionally pad the integer to facilitate sorting
## Note: This padding only pads integers within each name,
## not across all names.
xSub3 <- lapply(xSub2, function(i){
versionsV[seq_along(i)];
});
## Now simply paste the value, the suffix, and the new version
xSub1v <- paste0(unlist(xSub2), suffix, unlist(xSub3));
## Re-order the vector using the original order
names(xSub1v) <- names(unlist(xSub2));
xSub1v2 <- xSub1v[names(xSub1)];
## Assign only the entries we versioned
x[xUse] <- xSub1v2;
## Last check for renameFirst=FALSE, in which case we remove the first
## versioned entry
if (!renameFirst) {
escapeRegex <- function(string){
gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", string);
}
firstVer <- paste0(escapeRegex(paste0(suffix, head(versionsV, 1))), "$");
x[xUse] <- gsub(firstVer, "", x[xUse]);
}
return(x);
}
#' assign unique names for a vector
#'
#' assign unique names for a vector
#'
#' This function assigns unique names to a vector, if necessary it runs
#' \code{\link{makeNames}} to create unique names. It differs from
#' \code{\link[stats]{setNames}} in that it ensures names are unique,
#' and when no names are supplied, it uses the vector itself to define
#' names. It is helpful to run this function inside an \code{\link[base]{lapply}}
#' function call, which by default maintains names, but does not assign
#' names if the input data did not already have them.
#'
#' When used with a data.frame, it is particularly convenient to pull out
#' a named vector of values. For example, log2 fold changes by gene, where
#' the gene symbols are the name of the vector.
#'
#' \code{nameVector(genedata[,c("Gene","log2FC")])}
#'
#' @return vector with names defined
#'
#' @family jam string functions
#'
#' @param x vector input, or data.frame, matrix, or tibble with two columns,
#' the second column is used to name values in the first column.
#' @param y NULL or character vector of names. If NULL then x is used.
#' Note that y is recycled to the length of x, prior to being sent
#' to the makeNamesFunc.
#' In fringe cases, y can be a matrix, data.frame, or tibble, in which
#' case \code{\link{pasteByRow}} will be used to create a character string
#' to be used for vector names. Note this case is activated only when x
#' is not a two column matrix, data.frame, or tibble.
#' @param makeNamesFunc function to make names unique, by default
#' \code{\link{makeNames}} which ensures names are unique.
#' @param ... passed to \code{\link{makeNamesFunc}}, or to
#' \code{\link{pasteByRow}} if y is a two column data.frame, matrix, or
#' tibble. Thus, \code{sep} can be defined here as a delimiter between
#' column values.
#'
#' @examples
#' # it generally just creates names from the vector values
#' nameVector(LETTERS[1:5]);
#'
#' # if values are replicated, the makeNames() function makes them unique
#' V <- rep(LETTERS[1:5], each=3);
#' nameVector(V);
#'
#' # for a two-column data.frame, it creates a named vector using
#' # the values in the first column, and names in the second column.
#' df <- data.frame(seq_along(V), V);
#' df;
#' nameVector(df);
#'
#' # Lastly, admittedly a fringe case, it can take a multi-column data.frame
#' # to generate labels:
#' nameVector(V, df);
#'
#' @export
nameVector <- function
(x,
y=NULL,
makeNamesFunc=makeNames,
...)
{
## Purpose is to name a vector with its own values,
## useful for lapply which only names output if the input
## vector has names.
##
## A neat trick is to use the _v# naming scheme in makeNames to
## create unique names based upon a single label, e.g.
## set1colors <- nameVector(brewer.pal(15, "Set1"), "Set1");
## Set1_v1 Set1_v2 Set1_v3 Set1_v4 Set1_v5 Set1_v6 Set1_v7 Set1_v8 Set1_v9
## "#E41A1C" "#377EB8" "#4DAF4A" "#984EA3" "#FF7F00" "#FFFF33" "#A65628" "#F781BF" "#999999"
##
## Added bonus, if given a 2-column table, it'll use them as x and y
if (igrepHas("dataframe", class(x))) {
x <- as.data.frame(x);
}
if (igrepHas("data.frame", class(x)) && ncol(x) == 2) {
y <- x[[2]];
x <- x[[1]];
} else if (igrepHas("matrix", class(x)) && ncol(x) == 2) {
y <- x[,2];
x <- x[,1];
}
if (length(y) > 0) {
if (igrepHas("data.frame|matrix", class(y))) {
## If given a data.frame use pasteByRow() to create a string
y <- pasteByRow(y, ...);
}
names(x) <- makeNamesFunc(rep(y, length.out=length(x)), ...);
} else {
names(x) <- makeNamesFunc(x, ...);
}
return(x);
}
#' define a named vector using vector names
#'
#' define a named vector using vector names
#'
#' This function creates a vector from the names of the input vector,
#' then assigns the same as names. The utility is mainly for
#' \code{\link[base]{lapply}} functions which maintain the name of a vector
#' in its output. The reason to run \code{\link[base]{lapply}} using names
#' is so the lapply function is operating only on the name and not the
#' data it references, which can be convenient when the name of the element
#' is useful to known inside the function body. The reason to name the names,
#' is so the list object returned by \code{\link[base]{lapply}} is also named
#' with these same consistent names.
#'
#' Consider a list of data.frames, each of which represents stats results
#' from a contrast and fold change. The data.frame may not indicate the name
#' of the contrast, while the list itself may be named by the contrast.
#' One would \code{lapply(nameVectorN(listDF), function(iName)iName)} which
#' allows the internal function access to the name of each list element. This
#' could for example be added to the data.frame.
#'
#' @return vector of names, whose names are uniquely assigned using
#' \code{\link{makeNames}} using the values of the vector.
#'
#' @family jam string functions
#'
#' @param x vector or any object which has names available via \code{names(x)}
#' @param makeNamesFunc function used to create unique names, in the event that
#' the names(x) are not unique.
#'
#' @examples
#' # a simple integer vector with character names
#' L <- nameVector(1:5, LETTERS[1:5]);
#' L;
#'
#' # we can make a vector of names, retaining the names
#' nameVectorN(L);
#'
#' # Now consider a named list, where the name is important
#' # to keep for downstream work.
#' K <- list(A=(1:3)^3, B=7:10, C=(1:4)^2);
#' K;
#' # Typical lapply-style work does not operate on the name,
#' # making it difficult to use the name inside the function.
#' # Here, we just add the name to the colnames, but anything
#' # could be useful.
#' lapply(K, function(i){
#' data.frame(mean=mean(i), median=median(i));
#' });
#'
#' # So the next step is to run lapply() on the names
#' lapply(names(K), function(i){
#' iDF <- data.frame(mean=mean(K[[i]]), median=median(K[[i]]));
#' colnames(iDF) <- paste(c("mean", "median"), i);
#' iDF;
#' })
#' # The result is good, but the list is no longer named.
#' # The nameVectorN() function is helpful for maintaining the names.
#'
#' # So we run lapply() on the named-names, which keeps the names in
#' # the resulting list, and sends it into the function.
#' lapply(nameVectorN(K), function(i){
#' iDF <- data.frame(mean=mean(K[[i]]), median=median(K[[i]]));
#' colnames(iDF) <- paste(c("mean", "median"), i);
#' iDF;
#' });
#'
#' @export
nameVectorN <- function
(x,
makeNamesFunc=makeNames,
...)
{
## Purpose is to extend nameVector to create a named vector of
## names(x), useful when you want to run lapply() on the names
## of a vector, and return a list whose names are names(x)
## Use instead of lapply(nameVector(names(x)), function(i)c)
if (is.null(names(x))) {
names(x) <- seq_along(x);
}
nameVector(names(x), makeNamesFunc=makeNamesFunc, ...);
}
#' remove NULL entries from list
#'
#' remove NULL entries from list
#'
#' This function is a simple helper function to remove NULL from a list,
#' optionally replacing it with another value
#'
#' @return list with NULL entries either removed, or replaced with nullValue.
#' This function is typically called so it removed list elements which are
#' NULL, resulting in a list that contains non-NULL entries. This function
#' can also be useful when NULL values should be changed to something else,
#' perhaps a character value "NULL" to be used as a label.
#'
#' @family jam practical functions
#'
#' @param x list or other object which may contain NULL.
#'
#' @examples
#' x <- list(A=1:6, B=NULL, C=letters[11:16]);
#' rmNULL(x)
#' rmNULL(x, nullValue=NA)
#'
#' @export
rmNULL <- function
(x, nullValue=NULL,
...)
{
## Purpose is similar to rmNA() which can also perform this function,
## to replace NULL with a non-NULL value. It operates effectively
## on lists, which may contain some NULL elements
#isNULL <- sapply(x, is.null);
if (length(x) == 0) {
x <- nullValue;
} else {
isNULL <- sapply(x, function(i){
length(i) == 0;
});
if (length(nullValue) == 0) {
x <- x[!isNULL];
} else {
x[isNULL] <- nullValue;
}
}
x;
}
#' remove NA values
#'
#' remove NA values
#'
#' This function removes NA values, by default shortening a vector as a result,
#' but optionally replacing NA and Infinite values with fixed values.
#'
#' @return vector with NA entries either removed, or replaced with naValue,
#' and NULL entries either removed or replaced by nullValue.
#'
#' @family jam practical functions
#'
#' @param x vector input
#' @param naValue NULL or single replacement value for NA entries. If NULL,
#' then NA entries are removed from the result.
#' @param rmNULL `logical` whether to replace NULL entries with `nullValue`
#' @param nullValue NULL or single replacement value for NULL entries. If NULL,
#' then NULL entries are removed from the result.
#' @param rmInfinite `logical` whether to replace Infinite values with
#' infiniteValue
#' @param infiniteValue value to use when rmInfinite==TRUE to replace
#' entries which are Inf or -Inf.
#' @param rmNAnames `logical` whether to remove entries which have NA as the
#' name, regardless whether the entry itself is NA.
#' @param verbose `logical` whether to print verbose output
#' @param ... additional arguments are ignored.
#'
#' @export
rmNA <- function
(x,
naValue=NULL,
rmNULL=FALSE,
nullValue=naValue,
rmInfinite=TRUE,
infiniteValue=NULL,
rmNAnames=FALSE,
verbose=FALSE,
...)
{
## Purpose is simply to remove NA entries from a vector.
##
## It will assign NA to some number (e.g. zero)
## if naValue is not NULL.
##
## rmNULL=TRUE will convert NULL into the naValue
##
## If rmInfinite is TRUE, it will remove infinite values.
## If infiniteValue is not NULL, infinite values will be assigned that value
## (e.g. to apply a max value and not remove those datapoints)
##
## rmNAnames=TRUE will remove NA names only when names exist
if (length(x) == 0) {
if (rmNULL) {
x <- nullValue;
}
return(x);
}
# No change for now, version 0.0.87.900
# Call rmNAs() when input x is list?
# if (is.list(x)) {
# return(rmNAs(x,
# naValue=naValue,
# rmNULL=rmNULL,
# nullValue=nullValue,
# rmInfinite=rmInfinite,
# infiniteValue=infiniteValue,
# rmNAnames=rmNAnames,
# verbose=verbose));
# }
if (!"list" %in% class(x) && rmInfinite && any(is.infinite(x))) {
x <- rmInfinite(x,
infiniteValue=infiniteValue);
}
if (igrepHas("factor", class(x))) {
if (any(is.na(as.character(x)))) {
if (verbose) {
printDebug("rmNA(): ",
"NA is present in factor x.");
}
## If x is a factor, only modify it if there is an NA,
## which we must check by converting to character, in case
## NA is not one of the factor levels but is present in x
if (!is.null(naValue)) {
## If we are replacing NA, then add it as a factor then change it
x <- addNA(x, ifany=TRUE);
levels(x) <- rmNA(levels(x), naValue=naValue);
} else {
## We not replacing NA, then remove NA from levels(x)
notNA <- which(!is.na(as.character(x)));
x <- factor(x[notNA], exclude=NA);
}
}
} else if (any(is.na(x))) {
whichNA <- which(is.na(x));
if (length(naValue) > 0) {
x[whichNA] <- naValue;
} else {
x <- x[-c(whichNA)];
}
}
## Note: NULL should only occur in lists, not vectors
if (rmNULL) {
isNULL <- sapply(x, is.null);
if (any(isNULL)) {
x[isNULL] <- naValue;
}
}
## Optionally remove entries with NA names
if (rmNAnames && !is.null(names(x)) && any(is.na(names(x)))) {
naNames <- which(!is.na(names(x)));
x <- x[naNames];
}
x;
}
#' remove Infinite values
#'
#' remove Infinite values
#'
#' This function removes any positive or negative infinite numerical
#' values, optionally replacing them with a given value or NA.
#'
#' @return numeric vector with infinite values either removed, or
#' replaced with the supplied value.
#'
#' @family jam practical functions
#'
#' @param x vector input
#' @param infiniteValue NULL to remove Infinite values, or a replacement value
#' @param ... additional parameters are ignored
#'
#' @export
rmInfinite <- function
(x,
infiniteValue=NULL,
...)
{
## Purpose is to remove infinite values from a vector.
## If infiniteValue is not NULL, infinite values will be assigned that value
## (e.g. to apply a max value and not remove those datapoints)
if (!is.null(infiniteValue)) {
if (igrepHas("character", class(infiniteValue))) {
x[is.infinite(x)] <- paste0(gsub("1", "",
sign(x[is.infinite(x)])), infiniteValue);
} else {
x[is.infinite(x)] <- infiniteValue * sign(x[is.infinite(x)]);
}
} else {
x <- x[!is.infinite(x)];
}
return(x);
}
#' apply unique to each element of a list
#'
#' Apply unique to each element of a list, usually a list of vectors
#'
#' This function will attempt to use `S4Vectors::unique()` which is
#' substantially faster than any `apply` family function, especially
#' for very long lists. However, when `S4Vectors` is not installed,
#' it applies uniqueness to the `unlist`ed vector of values, which is
#' also substantially faster than the `apply` family functions for
#' long lists, but which may still be less efficient than the
#' C implementation provided by `S4Vectors`.
#'
#' @return `list` with unique values in each list element.
#'
#' @param x input list of vectors
#' @param keepNames boolean indicating whether to keep the list element
#' names in the returned results.
#' @param incomparables see [unique()] for details, this value is only
#' sent to `S4Vectors::unique()` when the Bioconductor package
#' `S4Vectors` is installed, and is ignored otherwise for efficiency.
#' @param useBioc boolean indicating whether this function should try
#' to use `S4Vectors::unique()` when the Bioconductor package
#' `S4Vectors` is installed, otherwise it will use a somewhat less
#' efficient bulk operation.
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @examples
#' L1 <- list(CA=nameVector(LETTERS[c(1:4,2,7,4,6)]),
#' B=letters[c(7:11,9,3)],
#' D=nameVector(LETTERS[4]));
#' L1;
#' uniques(L1);
#'
#' if (1 == 1) {
#' if (suppressWarnings(suppressPackageStartupMessages(require(IRanges)))) {
#' printDebug("Bioc CompressedList:");
#' print(system.time(uniques(rep(L1, 10000), useBioc=TRUE)));
#' }
#' if (suppressWarnings(suppressPackageStartupMessages(require(S4Vectors)))) {
#' printDebug("Bioc SimpleList:");
#' print(system.time(uniques(rep(L1, 10000), useSimpleBioc=TRUE)));
#' }
#' printDebug("Simple list, keepNames=FALSE:");
#' print(system.time(uniques(rep(L1, 10000), useBioc=FALSE, keepNames=FALSE)));
#' printDebug("Simple list, keepNames=TRUE:");
#' print(system.time(uniques(rep(L1, 10000), useBioc=FALSE, keepNames=TRUE)));
#' }
#'
#' @export
uniques <- function
(x,
keepNames=TRUE,
incomparables=FALSE,
useBioc=TRUE,
useSimpleBioc=FALSE,
xclass=NULL,
...)
{
## Purpose is to take a list of vectors and return unique members
## for each vector in the list.
##
## keepNames=TRUE will keep the first name for the each duplicated entry
if (useBioc || useSimpleBioc) {
# if (!suppressWarnings(suppressPackageStartupMessages(require(S4Vectors)))) {
if (!check_pkg_installed("S4Vectors")) {
useSimpleBioc <- FALSE;
useBioc <- FALSE;
}
}
if (useBioc) {
# if (!suppressWarnings(suppressPackageStartupMessages(require(IRanges)))) {
if (!check_pkg_installed("IRanges")) {
useBioc <- FALSE;
}
}
xNames <- names(x);
if (useSimpleBioc) {
## Former method used List(x)
## which reverted to SimpleList for simple list input
## and SimpleList does not have the amazing optimization
as.list(
unique(S4Vectors::List(x),
incomparables=incomparables,
...));
} else if (useBioc) {
## Pro tip: use specific class to invoke optimized functions
## otherwise they revert to base lapply(x, unique)
if (length(xclass) == 0) {
xclass <- sclass(x);
}
if (is.list(xclass)) {
xclass <- cPaste(xclass,
checkClass=FALSE,
useBioc=useBioc,
...)
}
if (any(grepl(",", xclass))) {
xclass <- gsub(",.*$", "", xclass);
}
xlist <- list();
xclassesu <- unique(xclass);
for (xclassu in xclassesu) {
xclassidx <- which(xclass %in% xclassu);
if ("character" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::CharacterList(x[xclassidx]),
incomparables=incomparables))
} else if ("factor" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::FactorList(x[xclassidx]),
incomparables=incomparables))
} else if ("integer" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::IntegerList(x[xclassidx]),
incomparables=incomparables))
} else if ("logical" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::LogicalList(x[xclassidx]),
incomparables=incomparables))
} else if ("raw" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::RawList(x[xclassidx]),
incomparables=incomparables))
} else if ("Rle" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::RleList(x[xclassidx]),
incomparables=incomparables))
} else if ("complex" %in% xclassu) {
xlist[xclassidx] <- as.list(unique(IRanges::ComplexList(x[xclassidx]),
incomparables=incomparables))
} else if ("GRanges" %in% xclassu) {
xlist[xclassidx] <- lapply(unique(GenomicRanges::GRangesList(x[xclassidx])), function(gr){gr});
} else {
xlist[xclassidx] <- lapply(x[xclassidx], unique);
}
}
names(xlist) <- xNames;
return(xlist);
} else if (!keepNames) {
lapply(x, unique);
} else {
xu <- unlist(unname(x),
use.names=TRUE);
if (length(xNames) == 0) {
names(x) <- seq_along(x);
} else {
names(x) <- makeNames(names(x));
}
xn <- factor(rep(names(x), rlengths(x)),
levels=names(x));
## Concatenate name with value so uniqueness requires both
xun <- paste0(xn, "!!", xu);
xmatch <- match(unique(xun), xun);
xuse <- xu[xmatch];
xnuse <- xn[xmatch];
if (!keepNames) {
xuse <- unname(xuse);
}
xlist <- split(xuse, xnuse);
names(xlist) <- xNames;
xlist;
}
}
#' paste a list into a delimited vector
#'
#' Paste a list of vectors into a character vector, with values
#' delimited by default with a comma.
#'
#' This function is essentially a wrapper for [S4Vectors::unstrsplit()]
#' except that it also optionally applies uniqueness to each vector
#' in the list, and sorts values in each vector using [mixedOrder()].
#'
#' The sorting and uniqueness is applied to the `unlist`ed vector of
#' values, which is substantially faster than any `apply` family function
#' equivalent. The uniqueness is performed by [uniques()], which itself
#' will use `S4Vectors::unique()` if available.
#'
#' @return character vector with the same names and in the same order
#' as the input list `x`.
#'
#' @param x input `list` of vectors
#' @param sep `character` delimiter used to paste multiple values together
#' @param doSort `logical` indicating whether to sort each vector
#' using [mixedOrder()].
#' @param makeUnique `logical` indicating whether to make each vector in
#' the input list unique before pasting its values together.
#' @param na.rm boolean indicating whether to remove NA values from
#' each vector in the input list. When `na.rm` is `TRUE` and a
#' list element contains only `NA` values, the resulting string
#' will be `""`.
#' @param keepFactors `logical` only used when `useLegacy=TRUE` and
#' `doSort=TRUE`; indicating whether to preserve factors,
#' keeping factor level order. When
#' `keepFactors=TRUE`, if any list element is a `factor`, all elements
#' are converted to factors. Note that this step combines overall
#' factor levels, and non-factors will be ordered using
#' `base::order()` instead of `jamba::mixedOrder()` (for now.)
#' @param useBioc `logical` indicating whether this function should try
#' to use `S4Vectors::unstrsplit()` when the Bioconductor package
#' `S4Vectors` is installed, otherwise it will use a less
#' efficient `mapply()` operation.
#' @param useLegacy `logical` indicating whether to enable to previous
#' legacy process used by `cPaste()`.
#' @param honorFactor `logical` passed to `mixedSorts()`, whether any
#' `factor` vector should be sorted in factor level order.
#' When `honorFactor=FALSE` then even `factor` vectors are sorted
#' as if they were `character` vectors, ignoring the factor levels.
#' @param ... additional arguments are passed to `mixedOrder()` when
#' `doSort=TRUE`.
#'
#' @examples
#' L1 <- list(CA=LETTERS[c(1:4,2,7,4,6)], B=letters[c(7:11,9,3)]);
#'
#' cPaste(L1);
#' # CA B
#' # "A,B,C,D,B,G,D,F" "g,h,i,j,k,i,c"
#'
#' cPaste(L1, doSort=TRUE);
#' # CA B
#' # "A,B,B,C,D,D,F,G" "c,g,h,i,i,j,k"
#'
#' ## The sort can be done with convenience function cPasteS()
#' cPasteS(L1);
#' # CA B
#' # "A,B,B,C,D,D,F,G" "c,g,h,i,i,j,k"
#'
#' ## Similarly, makeUnique=TRUE and cPasteU() are the same
#' cPaste(L1, makeUnique=TRUE);
#' cPasteU(L1);
#' # CA B
#' # "A,B,C,D,G,F" "g,h,i,j,k,c"
#'
#' ## Change the delimiter
#' cPasteSU(L1, sep="; ")
#' # CA B
#' # "A; B; C; D; F; G" "c; g; h; i; j; k"
#'
#' # test mix of factor and non-factor
#' L2 <- c(
#' list(D=factor(letters[1:12],
#' levels=letters[12:1])),
#' L1);
#' L2;
#' cPasteSU(L2, keepFactors=TRUE);
#'
#' # tricky example with mix of character and factor
#' # and factor levels are inconsistent
#' # end result: factor levels are defined in order they appear
#' L <- list(entryA=c("miR-112", "miR-12", "miR-112"),
#' entryB=factor(c("A","B","A","B"),
#' levels=c("B","A")),
#' entryC=factor(c("C","A","B","B","C"),
#' levels=c("A","B","C")),
#' entryNULL=NULL)
#' L;
#' cPaste(L);
#' cPasteU(L);
#'
#' # by default keepFactors=FALSE, which means factors are sorted as characters
#' cPasteS(L);
#' cPasteSU(L);
#' # keepFactors=TRUE will keep unique factor levels in the order they appear
#' # this is the same behavior as unlist(L[c(2,3)]) on a list of factors
#' cPasteSU(L, keepFactors=TRUE);
#' levels(unlist(L[c(2,3)]))
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @export
cPaste <- function
(x,
sep=",",
doSort=FALSE,
makeUnique=FALSE,
na.rm=FALSE,
keepFactors=FALSE,
checkClass=TRUE,
useBioc=TRUE,
useLegacy=FALSE,
honorFactor=TRUE,
verbose=FALSE,
...)
{
## Purpose is to utilize the vectorized function unstrsplit() from the S4Vectors package
if (length(x) == 0) {
return("");
}
if (!suppressWarnings(suppressPackageStartupMessages(require(S4Vectors)))) {
warn("cPaste() is substantially faster when Bioconductor package S4Vectors is installed.");
#stop("The IRanges package is required by cPaste() for the CharacterList class.");
useBioc <- FALSE;
}
xNames <- names(x);
if (!igrepHas("list", class(x))) {
x <- list(x);
xNames <- NULL;
}
## Assign temporary names if none are present
if (length(names(x)) == 0) {
names(x) <- seq_along(x);
} else {
names(x) <- makeNames(names(x));
}
## For speed, we sort and/or convert to character class as a vector,
## rather than a bunch of tiny vectors inside a list. Vector sorting
## is fast; splitting vector into a list is fast.
if (checkClass) {
xclass <- sclass(x);
if (is.list(xclass)) {
xclass <- cPaste(xclass,
checkClass=FALSE);
}
} else {
xclass <- rep("character",
length.out=length(x));
}
if (verbose) {
if (length(x) > 100) {
printDebug("cPaste(): ",
"head(xclass, 100):",
head(xclass, 100));
} else {
printDebug("cPaste(): ",
"xclass:",
xclass);
}
}
## Optionally make vectors unique
if (makeUnique) {
x <- uniques(x,
useBioc=useBioc,
xclass=xclass,
...);
}
## Optionally sort vectors
if (doSort) {
x <- mixedSorts(x,
xclass=xclass,
honorFactor=honorFactor,
...);
}
## Legacy processing below
if (useLegacy) {
## Handle potential mix of factor and non-factor
if (any(grepl("factor", xclass))) {
if (any(!grepl("factor", xclass))) {
# x contains mix of factor and non-factor
if (verbose) {
printDebug("cPaste(): ",
"mix of factor and non-factor");
}
if (doSort && keepFactors) {
if (verbose) {
printDebug("cPaste(): ",
"converting non-factor to factor with mixedSort()");
}
# if sorting we must convert non-factor to factor using mixedSort
xnonfactor <- which(!grepl("factor", xclass));
xnonfactorlevels <- mixedSort(unique(unlist(x[xnonfactor])));
## carefully split using factor split so empty entries are not lost
x[xnonfactor] <- split(
factor(unlist(x[xnonfactor]),
levels=xnonfactorlevels),
factor(
rep(xnonfactor, lengths(x[xnonfactor])),
levels=xnonfactor)
);
} else {
if (verbose) {
printDebug("cPaste(): ",
"converting factor to character");
}
# if not sorting, or not keeping factor levels
# convert factor to character
xisfactor <- which(grepl("factor", xclass));
x[xisfactor] <- lapply(x[xisfactor], as.character);
}
} else {
# all values are factors
if (verbose) {
printDebug("cPaste(): ",
"all values are factor");
}
}
} else {
# no values are factors, leave as-is
if (verbose) {
printDebug("cPaste(): ",
"no values are factor");
}
}
xu <- unlist(x);
if (igrepHas("factor", class(xu)) && doSort && !keepFactors) {
# if sorting AND if xu is factor AND we do not want to keep factor levels
# then convert to character
if (verbose) {
printDebug("cPaste(): ",
"converting factor to character to drop factor levels during sort");
}
xu <- as.character(xu);
}
## We define a vector of names as a factor, so the
## order of the factor levels will maintain the
## original order of input data during the
## split() which occurs later.
## Using a factor also preserves empty levels,
## in the case that NA values are removed.
xn <- factor(rep(names(x), rlengths(x)),
levels=names(x));
if (doSort && length(xu) > 1) {
if (igrepHas("factor", class(xu))) {
xuOrder <- order(xu, ...);
} else {
xuOrder <- mixedOrder(xu, ...);
}
xu <- xu[xuOrder];
xn <- xn[xuOrder];
}
## Optionally remove NA values
if (na.rm && length(xu) > 0 && any(is.na(xu))) {
whichNotNA <- which(!is.na(xu));
xu <- xu[whichNotNA];
xn <- xn[whichNotNA];
}
## split() using a factor keeps the data in original order
x <- split(
as.character(unname(xu)),
xn);
}
# specifically enforce na.rm=TRUE
if (length(na.rm) > 0 && TRUE %in% na.rm) {
x <- rmNAs(x,
naValue=NULL);
}
if (useBioc) {
## Note: The explicit conversion to class CharacterList is required
## in order to avoid errors with single list elements of NA when
## na.rm=FALSE. Specifically, unstrsplit() requires all elements in
## the list to be "character" class, and a single NA is class "logical"
## and causes an error.
if (verbose) {
printDebug("cPaste(): ",
"Using Bioc unstrsplit().")
}
# if "factor" and non-factor classes are present, convert them to one class
if (any(grepl("factor", ignore.case=TRUE, xclass)) &&
length(unique(xclass)) > 1) {
xfactor <- grepl("factor", ignore.case=TRUE, xclass);
x[xfactor] <- lapply(x[xfactor], as.character);
}
xNew <- S4Vectors::unstrsplit(
IRanges::CharacterList(x),
sep=sep);
} else {
if (verbose) {
printDebug("cPaste(): ",
"Using mapply().")
}
xNew <- mapply(paste,
x,
collapse=sep);
}
## Revert names(x) to their original state
names(xNew) <- xNames;
return(xNew);
}
#' paste a list into a delimited vector using sorted values
#'
#' Paste a list of vectors into a character vector, with values sorted
#' then delimited by default with a comma.
#'
#' This function is convenient a wrapper for `cPaste(.., doSort=TRUE)`.
#'
#' @inheritParams cPaste
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @export
cPasteS <- function
(x,
sep=",",
doSort=TRUE,
makeUnique=FALSE,
na.rm=FALSE,
keepFactors=FALSE,
checkClass=TRUE,
useBioc=TRUE,
...)
{
## Purpose is to call cPaste with doSort=TRUE
cPaste(x=x,
sep=sep,
doSort=doSort,
makeUnique=makeUnique,
na.rm=na.rm,
keepFactors=keepFactors,
checkClass=checkClass,
useBioc=useBioc,
...);
}
#' paste a list into a delimited vector using sorted, unique values
#'
#' Paste a list of vectors into a character vector, with unique values
#' sorted then delimited by default with a comma.
#'
#' This function is convenient a wrapper for `cPaste(.., doSort=TRUE, makeUnique=TRUE)`.
#'
#' @inheritParams cPaste
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @export
cPasteSU <- function
(x,
sep=",",
doSort=TRUE,
makeUnique=TRUE,
na.rm=FALSE,
keepFactors=FALSE,
checkClass=TRUE,
useBioc=TRUE,
...)
{
## Purpose is to call cPaste with doSort=TRUE and makeUnique=TRUE
cPaste(x=x,
sep=sep,
doSort=doSort,
makeUnique=makeUnique,
na.rm=na.rm,
keepFactors=keepFactors,
checkClass=checkClass,
useBioc=useBioc,
...);
}
#' paste a list into a delimited vector using unique values
#'
#' Paste a list of vectors into a character vector of unique values,
#' usually delimited by a comma.
#'
#' This function is convenient a wrapper for `cPaste(.., makeUnique=TRUE)`.
#'
#' @inheritParams cPaste
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @export
cPasteUnique <- function
(x,
sep=",",
doSort=FALSE,
makeUnique=TRUE,
na.rm=FALSE,
keepFactors=FALSE,
checkClass=TRUE,
useBioc=TRUE,
...)
{
## Purpose is to call cPaste with makeUnique=TRUE
cPaste(x=x,
sep=sep,
doSort=doSort,
makeUnique=makeUnique,
na.rm=na.rm,
keepFactors=keepFactors,
checkClass=checkClass,
useBioc=useBioc,
...);
}
#' paste a list into a delimited vector using unique values
#'
#' Paste a list of vectors into a character vector of unique values,
#' usually delimited by a comma.
#'
#' This function is convenient a wrapper for `cPaste(.., makeUnique=TRUE)`.
#'
#' @inheritParams cPaste
#'
#' @family jam string functions
#' @family jam list functions
#'
#' @export
cPasteU <- function
(x,
sep=",",
doSort=FALSE,
makeUnique=TRUE,
na.rm=FALSE,
keepFactors=FALSE,
checkClass=TRUE,
useBioc=TRUE,
...)
{
## Purpose is to call cPaste with makeUnique=TRUE
cPaste(x=x,
sep=sep,
doSort=doSort,
makeUnique=makeUnique,
na.rm=na.rm,
keepFactors=keepFactors,
checkClass=checkClass,
useBioc=useBioc,
...);
}
#' Rename columns in a data.frame, matrix, tibble, or GRanges object
#'
#' Rename columns in a data.frame, matrix, tibble, or GRanges object
#'
#' This function is intended to rename one or more columns in a
#' `data.frame`, `matrix`, tibble, or `GRanges` related object.
#' It will gracefully ignore columns which do not match,
#' in order to make it possible to call the
#' function again without problem.
#'
#' This function will also recognize input objects `GRanges`,
#' `ucscData`, and `IRanges`, which store annotation in `DataFrame`
#' accessible via `IRanges::values()`. Note the `IRanges` package
#' is required, for its generic function `values()`.
#'
#' The values supplied in `to` and `from` are converted from `factor`
#' to `character` to avoid coersion by R to `integer`, which was
#' noted in output prior to jamba version `0.0.72.900`.
#'
#'
#' @return `data.frame` or object equivalent to the input `x`,
#' with columns `from` renamed to values in `to`. For genomic
#' ranges objects such as `GRanges` and `IRanges`, the colnames
#' are updated in `IRanges::values(x)`.
#'
#' @family jam practical functions
#'
#' @param x `data.frame`, `matrix`, `tbl`, or `GRanges` equivalent
#' object. It will work on any object for which `colnames()`
#' is defined.
#' @param from `character` vector of colnames expected to be in `x`.
#' Any values that do not match `colnames(x)` are ignored.
#' @param to `character` vector with `length(to) == length(from)`
#' corresponding to the target name for any colnames that
#' match `from`.
#'
#' @examples
#' df <- data.frame(A=1:5, B=6:10, C=11:15);
#' df;
#' df2 <- renameColumn(df,
#' from=c("A","C"),
#' to=c("a_new", "c_new"));
#' df2;
#' df3 <- renameColumn(df2,
#' from=c("A","C","B"),
#' to=c("a_new", "c_new","b_new"));
#' df3;
#'
#' @export
renameColumn <- function
(x,
from,
to,
verbose=FALSE,
...)
{
## Purpose is simply to rename one or more colnames in a data.frame or matrix.
## This method makes sure to rename only colnames which exist in 'from'
## and renames them in the appropriate order to 'to'.
## Therefore you can re-run this method and it will not make changes
## that are not warranted. Note that it will also only make changes, thus
## if you perform some operation on 'from' to generate 'to' and some entries
## do not change, this method will not rename those columns.
# coerce factor to character to prevent being coerced to integer
if ("factor" %in% class(from)) {
from <- as.character(from);
}
if ("factor" %in% class(to)) {
to <- as.character(to);
}
if (length(to) != length(from)) {
stop("length(from) must be equal to length(to)");
}
if (igrepHas("ucscdata|granges|iranges", class(x))) {
if (verbose) {
printDebug("renameColumn(): ",
"Recognized GRanges input.");
}
if (!check_pkg_installed("IRanges")) {
stop(paste(
"Input data requires the IRanges Bioconductor package,",
"install with BiocManager::install('IRanges')"));
}
renameSet <- which(from %in% colnames(IRanges::values(x)) & from != to);
renameWhich <- match(from[renameSet], colnames(IRanges::values(x)));
if (verbose) {
printDebug("renameColumn(): ",
"Renaming ",
formatInt(length(renameWhich)),
" columns.");
}
if (length(renameWhich) > 0) {
colnames(IRanges::values(x))[renameWhich] <- to[renameSet];
}
} else {
renameSet <- which(from %in% colnames(x) & from != to);
renameWhich <- match(from[renameSet], colnames(x));
if (verbose) {
printDebug("renameColumn(): ",
"Renaming ",
formatInt(length(renameWhich)),
" columns.");
}
if (length(renameWhich) > 0) {
colnames(x)[renameWhich] <- to[renameSet];
}
}
return(x);
}
#' Fill blank entries in a vector
#'
#' Fill blank entries in a vector
#'
#' This function takes a character vector and fills any blank (missing)
#' entries with the last non-blank entry in the vector. It is intended
#' for situations like imported Excel data, where there may be one
#' header value representing a series of cells.
#'
#' The method used does not loop through the data, and should scale
#' fairly well with good efficiency even for extremely large vectors.
#'
#' @return
#' Character vector where blank entries are filled with the
#' most recent non-blank value.
#'
#' @param x character vector
#' @param blankGrep vector of grep patterns, or `NA`, indicating
#' the type of entry to be considered blank.
#' Each `blankGrep` pattern is searched using `jamba::proigrep()`, which
#' by default uses case-insensitive regular expression pattern
#' matching.
#' @param first options character string intended when the first
#' entry of `x` is blank. By default `""` is used.
#' @param ... additional parameters are ignored.
#'
#' @examples
#' x <- c("A", "", "", "", "B", "C", "", "", NA,
#' "D", "", "", "E", "F", "G", "", "");
#' data.frame(x, fillBlanks(x));
#'
#' @family jam string functions
#'
#' @export
fillBlanks <- function
(x,
blankGrep=c("[ \t]*"),
first="",
...)
{
## Purpose is to take a character vector, and fill blank values
## with the most recent non-blank value.
## Intended for data imported from something like Excel, where
## people sometimes use one heading for a section of multiple
## columns or rows
## Ensure all non-NA patterns have a leading "^"
blankGrep <- rmNA(blankGrep);
if (length(blankGrep) == 0) {
blankGrep <- "^$";
}
addToGrep1 <- which(!is.na(blankGrep) & !grepl("^\\^|^[(]*\\^", blankGrep));
blankGrep[addToGrep1] <- paste0("^", blankGrep[addToGrep1]);
## Ensure all non-NA patterns have a leading "^"
addToGrep2 <- which(!is.na(blankGrep) & !grepl("\\$$|\\$[)]*$", blankGrep));
blankGrep[addToGrep2] <- paste0(blankGrep[addToGrep2], "$");
xBlank <- sort(proigrep(blankGrep, x));
if (length(xBlank) == 0) {
return(x);
}
if (1 %in% xBlank) {
xBlank <- setdiff(xBlank, 1);
x[1] <- first;
}
xIsBlank <- rep(FALSE, length.out=length(x));
xIsBlank[xBlank] <- TRUE;
whichNotBlank <- which(!xIsBlank);
xWhichNotBlankU <- unique(c(1, whichNotBlank));
xNonBlankVals <- x[!xIsBlank];
xNonBlankWhich <- as.numeric(cut(seq_along(x), c(xWhichNotBlankU-1, Inf)));
xFilled <- xNonBlankVals[xNonBlankWhich];
return(xFilled);
}
#' Format an integer as a string
#'
#' Format an integer as a string
#'
#' This function is a quick wrapper function around `base::format()`
#' to display integer values as text strings. It will also return a
#' matrix if the input is a matrix.
#'
#' @return character vector if `x` is a vector, or if `x` is a matrix
#' a matrix will be returned.
#'
#' @family jam string functions
#'
#' @param x numeric vector or matrix
#' @param big.mark,trim,scientific options sent to `base::format()` but
#' configured with defaults intended for integer values.
#' @param forceInteger logical indicating whether numeric values should
#' be rounded to the nearest integer value prior to `base::format()`.
#' This option is intended to hide decimal values where they are not
#' informative.
#'
#' @examples
#' x <- c(1234, 1234.56, 1234567.89);
#' ## By default, commas are used for big.mark, and decimal values are hidden
#' formatInt(x);
#'
#' ## By default, commas are used for big.mark
#' formatInt(x, forceInteger=FALSE);
#'
#' @export
formatInt <- function
(x,
big.mark=",",
trim=TRUE,
forceInteger=TRUE,
scientific=FALSE,
...)
{
## Purpose is to format a pretty integer, with commas separating thousandths digits
if (forceInteger) {
y <- format(x=round(x),
big.mark=big.mark,
trim=trim,
scientific=scientific,
...);
} else {
y <- format(x=x,
big.mark=big.mark,
trim=trim,
scientific=scientific,
...);
}
if ("matrix" %in% class(x)) {
y <- matrix(data=y,
nrow=nrow(x),
ncol=ncol(x),
dimnames=list(rownames(x), colnames(x)));
} else if (igrepHas("character|num|integer|float", class(x))) {
if (!is.null(names(x))) {
names(y) <- names(x);
}
}
y;
}
#' Convert list of vectors to data.frame with item, value, name
#'
#' Convert list of vectors to data.frame with item, value, name
#'
#' This function converts a list of vectors to a tall data.frame
#' with colnames `item` to indicate the list name, `value` to indicate
#' the vector value, and `name` to indicate the vector name if
#' `useVectorNames=TRUE` and if names exist.
#'
#' @param x list of vectors
#' @param makeUnique logical indicating whether the data.frame should
#' contain unique rows.
#' @param useVectorNames logical indicating whether vector names should
#' be included in the data.frame, if they exist.
#' @param ... additional arguments are ignored.
#'
#' @family jam list functions
#'
#' @export
list2df <- function
(x,
makeUnique=TRUE,
useVectorNames=TRUE,
...)
{
## Purpose is to take a list of vectors, and turn it
## into a data.frame, where the list names are in column 1
## and the values are in column 2.
##
## This function is good for converting something like
## conflicts(,TRUE) into a usable data.frame, e.g.:
## cf1 <- conflicts(,TRUE);
## xdf <- list2df(cf1);
## irMethods <- xdf[igrep("IRanges", xdf[,1]),2];
## irConflicts <- xdf[xdf[,2] %in% irMethods & !xdf[,1] %in% "package:IRanges",];
xdf <- data.frame(item=rep(names(x), lengths(x)),
value=unname(unlist(x)));
if (length(useVectorNames) > 0 && useVectorNames) {
xdf$name <- unname(unlist(lapply(x, names)));
}
if (makeUnique) {
xdf <- unique(xdf);
}
return(xdf);
}
#' Jam-specific recursive apply
#'
#' Jam-specific recursive apply
#'
#' This function is a very lightweight customization to `base::rapply()`,
#' specifically that it does not remove `NULL` entries.
#'
#' @family jam list functions
#'
#' @param x `list`
#' @param FUN `function` to be called on non-list elements in `x`.
#' @param how `character` string indicating whether to return the
#' `list` or whether to call `unlist()` on the result.
#' @param ... additional arguments are passed to `FUN`.
#'
#' @examples
#' L <- list(entryA=c("miR-112", "miR-12", "miR-112"),
#' entryB=factor(c("A","B","A","B"),
#' levels=c("B","A")),
#' entryC=factor(c("C","A","B","B","C"),
#' levels=c("A","B","C")),
#' entryNULL=NULL)
#' rapply(L, length)
#' jam_rapply(L, length)
#'
#' L0 <- list(A=1:3, B=list(C=1:3, D=4:5, E=NULL));
#' rapply(L0, length)
#' jam_rapply(L0, length)
#'
#' @export
jam_rapply <- function
(x,
FUN,
how=c("unlist", "list"),
...)
{
how <- match.arg(how);
newlist <- lapply(x, function(i){
if (is.list(i)){
jam_rapply(i,
FUN=FUN,
how=how,
...);
} else {
FUN(i,
...)
}
});
if ("unlist" %in% how) {
newlist <- unlist(newlist);
}
return(newlist);
}
#' sort alphanumeric values within a list format
#'
#' sort alphanumeric values within a list format
#'
#' This function is an extension to `mixedSort()` to sort each vector
#' in a list. It applies the sort to the whole unlisted vector then
#' splits back into list form.
#'
#' In the event the input is a nested list of lists, only the first
#' level of list structure is maintained in the output data. For
#' more information, see `rlengths()` which calculates the recursive
#' nested list sizes. An exception is when the data contained in `x`
#' represents multiple classes, see below.
#'
#' When data in `x` represents multiple classes, for example `character`
#' and `factor`, the mechanism is slightly different and not as well-
#' optimized for large length `x`. The method uses
#' `rapply(x, how="replace", mixedSort)` which recursively, and iteratively,
#' calls `mixedSort()` on each vector, and therefore returns data in the
#' same nested `list` structure as provided in `x`.
#'
#' When data in `x` represents only one class, data is `unlist()` to one
#' large vector, which is sorted with `mixedSort()`, then split back into
#' `list` structure representing `x` input.
#'
#' @family jam sort functions
#' @family jam string functions
#' @family jam list functions
#'
#' @inheritParams mixedSort
#' @param xclass `character` vector of classes in `x`, used for slight
#' optimization to re-use this vector if it has already been
#' defined for `x`. When `NULL` it is created within this function.
#' @param indent `numeric` used only when `verbose=TRUE` to determine
#' the number of spaces indented for verbose output, passed to
#' `printDebug()`.
#'
#' @examples
#' # set up an example list of mixed alpha-numeric strings
#' set.seed(12);
#' x <- paste0(sample(letters, replace=TRUE, 52), rep(1:30, length.out=52));
#' x;
#' # split into a list as an example
#' xL <- split(x, rep(letters[1:5], c(6,7,5,4,4)));
#' xL;
#'
#' # now run mixedSorts(xL)
#' # Notice "e6" is sorted before "e30"
#' mixedSorts(xL)
#'
#' # for fun, compare to lapply(xL, sort)
#' # Notice "e6" is sorted after "e30"
#' lapply(xL, sort)
#'
#' # test super-long list
#' xL10k <- rep(xL, length.out=10000);
#' names(xL10k) <- as.character(seq_along(xL10k));
#' print(head(mixedSorts(xL10k), 10))
#'
#' # Now make some list vectors into factors
#' xF <- xL;
#' xF$c <- factor(xL$c)
#' # for fun, reverse the levels
#' xF$c <- factor(xF$c,
#' levels=rev(levels(xF$c)))
#' xF
#' mixedSorts(xF)
#'
#' # test super-long list
#' xF10k <- rep(xF, length.out=10000);
#' names(xF10k) <- as.character(seq_along(xF10k));
#' print(head(mixedSorts(xF10k), 10))
#'
#' # Make a nested list
#' set.seed(1);
#' l1 <- list(
#' A=sample(nameVector(11:13, rev(letters[11:13]))),
#' B=list(
#' C=sample(nameVector(4:8, rev(LETTERS[4:8]))),
#' D=sample(nameVector(LETTERS[2:5], rev(LETTERS[2:5])))
#' )
#' )
#' l1;
#' # The output is a nested list with the same structure
#' mixedSorts(l1, verbose=TRUE);
#' mixedSorts(l1, sortByName=TRUE, verbose=TRUE);
#'
#' # Make a nested list with two sub-lists
#' set.seed(1);
#' l2 <- list(
#' A=list(
#' E=sample(nameVector(11:13, rev(letters[11:13])))
#' ),
#' B=list(
#' C=sample(nameVector(4:8, rev(LETTERS[4:8]))),
#' D=sample(nameVector(LETTERS[2:5], rev(LETTERS[2:5])))
#' )
#' )
#' l2;
#' # The output is a nested list with the same structure
#' mixedSorts(l2);
#' mixedSorts(l2, sortByName=TRUE);
#'
#' # when one entry is missing
#' L0 <- list(A=3:1,
#' B=list(C=c(1:3,NA,0),
#' D=LETTERS[c(4,5,2)],
#' E=NULL));
#' L0
#' mixedSorts(L0)
#' mixedSorts(L0, na.rm=TRUE)
#'
#' @export
mixedSorts <- function
(x,
blanksFirst=TRUE,
na.last=NAlast,
keepNegative=FALSE,
keepInfinite=TRUE,
keepDecimal=FALSE,
ignore.case=TRUE,
useCaseTiebreak=TRUE,
sortByName=FALSE,
na.rm=FALSE,
verbose=FALSE,
NAlast=TRUE,
honorFactor=TRUE,
xclass=NULL,
indent=0,
debug=FALSE,
...)
{
## Purpose is to take a list of vectors and run mixedSort() efficiently
##
# if empty then return without change
if (length(x) == 0) {
return(x)
}
# if entirely empty then return without change
if (all(lengths(x) == 0)) {
return(x)
}
xNames <- names(x);
# recursive class() because rapply drops NULL entries
rclass <- function(x){
if (is.list(x)) {
return(lapply(x, rclass))
}
return(class(x))
}
if (length(xclass) == 0) {
# xclass <- rapply(x, class, how="unlist");
xclass <- unlist(rclass(x))
}
xclassu <- unique(xclass);
if (!TRUE %in% honorFactor &&
any(c("factor") %in% xclass)) {
xclass[xclass %in% "factor"] <- "character"
}
xclass <- unique(xclass);
if (length(names(x)) == 0) {
names(x) <- seq_along(x);
} else {
if (any(duplicated(names(x)))) {
names(x) <- makeNames(names(x));
}
}
# check for nested list
x_has_list <- any(sapply(x, function(i){"list" %in% class(i)}));
# if (!TRUE %in% sortByName && length(xclass) > 1) {
if (length(xclass) > 1) {
# slight optimization
# split simple (non-nested) list into subsets by class
# then run each class in bulk/optimized mode
# then reassign
if (!x_has_list) {
#
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Performing sort for each class subtype.");
}
xclass_sets <- cPaste(lapply(x, class));
for (xclass_set in unique(xclass_sets)) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent + 3,
"Class subtype: ", xclass_set);
}
k <- which(xclass_sets %in% xclass_set);
if ("NULL" %in% xclass_set) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent + 6,
"Skipping NULL class subtype: ", xclass_set);
}
} else {
x[k] <- mixedSorts(x[k],
blanksFirst=blanksFirst,
na.last=na.last,
keepNegative=keepNegative,
keepInfinite=keepInfinite,
keepDecimal=keepDecimal,
ignore.case=ignore.case,
useCaseTiebreak=useCaseTiebreak,
sortByName=sortByName,
na.rm=na.rm,
xclass=xclass_sets[k],
honorFactor=honorFactor,
indent=indent + 12,
verbose=verbose,
...)
}
}
names(x) <- xNames;
return(x);
} else {
# iterate nested list individually
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Performing rapply() mixedSort() for each nested sublist.");
}
xnew <- rapply(x, how="replace", function(i){
mixedSort(i,
blanksFirst=blanksFirst,
na.last=na.last,
keepNegative=keepNegative,
keepInfinite=keepInfinite,
keepDecimal=keepDecimal,
ignore.case=ignore.case,
useCaseTiebreak=useCaseTiebreak,
sortByName=sortByName,
honorFactor=honorFactor,
...)
});
names(xnew) <- xNames;
return(xnew);
}
}
# at this point xclass only has one value
# unless sortByName=TRUE
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Sorting list containing a single class.");
}
## unlist values
if (length(xclassu) > 1 &&
"factor" %in% xclassu) {
# when factor is included, it is converted to character at this step
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Condensing factor sublist to character.");
}
xu <- unlist(rapply(x, how="unlist", function(i){
if ("factor" %in% class(i)) {
as.character(i)
} else {
i
}
}));
} else {
xu <- unlist(x);
}
## vector names
xun <- unname(jam_rapply(x, names));
if (TRUE %in% sortByName &&
length(xun) < length(xu)) {
stop("Cannot sort by name because not all vectors have names.");
}
## We define a vector of names as a factor, so the
## order of the factor levels will maintain the
## original order of input data during the
## split() which occurs later.
## Using a factor also preserves empty levels,
## in the case that NA values are removed.
xrn <- jam_rapply(x, length);
xn <- factor(
rep(names(xrn),
xrn),
levels=names(xrn));
#xn <- factor(rep(names(x), rlengths(x)),
# levels=names(x));
if (TRUE %in% sortByName) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Performing sortByName.");
}
xu_use <- xun;
} else {
xu_use <- xu;
}
# print("xu_use:");print(xu_use);
if (honorFactor %in% TRUE && "factor" %in% class(xu_use)) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Ordering by factor levels.");
}
xuOrder <- order(xu_use,
na.last=na.last);
} else {
xuOrder <- mixedOrder(xu_use,
blanksFirst=blanksFirst,
na.last=na.last,
keepNegative=keepNegative,
keepInfinite=keepInfinite,
keepDecimal=keepDecimal,
ignore.case=ignore.case,
useCaseTiebreak=useCaseTiebreak,
honorFactor=honorFactor,
...);
}
xu <- xu[xuOrder];
xn <- xn[xuOrder];
xu_use <- xu_use[xuOrder];
if (length(xun) > 0) {
xun <- xun[xuOrder];
}
## Optionally remove NA values
if (TRUE %in% na.rm && any(is.na(xu_use))) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Removing NA values.");
}
whichNotNA <- which(!is.na(xu_use));
xu <- xu[whichNotNA];
xn <- xn[whichNotNA];
if (length(xun) > 0) {
xun <- xun[whichNotNA];
}
xu_use <- xu_use[whichNotNA];
# shrink input x so it still works with relist() below
if (any(x_has_list)) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Removing NA values from nested input list.");
}
x <- jam_rapply(x, function(i){i[!is.na(i)]}, "list")
}
}
if (length(xun) > 0) {
names(xu) <- xun;
} else {
xu <- unname(xu);
}
## split() using a factor keeps the data in original order
if (debug) {
return(list(
xu=xu,
xn=xn,
xun=xun,
x=x)
);
}
if (x_has_list) {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Re-creating nested list structure.");
printDebug("mixedSorts(): ",
indent=indent,
"xu:");
print(xu)
printDebug("mixedSorts(): ",
indent=indent,
"xn:");
print(xn)
}
# xu_ordered <- unlist(unname(split(xu, xn)))
xu_ordered <- unlist(unname(split(xu, xn)))
xnew <- relist_named(xu_ordered, x);
if (length(xnew) == length(xNames)) {
names(xnew) <- xNames;
}
} else {
if (verbose) {
printDebug("mixedSorts(): ",
indent=indent,
"Re-creating list structure.");
}
xnew <- split(xu, xn);
names(xnew) <- xNames;
}
return(xnew);
}
#' relist a vector which allows re-ordered names
#'
#' relist a vector which imposes the model object list structure while
#' allowing vector elements and names to be re-ordered
#'
#' This function is a simple update to `utils::relist()`
#' that allows the order of vectors to change, alongside the
#' correct names for each element.
#'
#' More specifically, this function does not replace the
#' updated names with the corresponding names from
#' the list `skeleton`, as is the case in default implementation of
#' `utils::relist()`.
#'
#' This function is called by `mixedSorts()` which iteratively calls
#' `mixedOrder()` on each vector component of the input `list`,
#' and permits nested lists. The result is a single sorted vector
#' which is split into the `list` components, then relist-ed to
#' the original structure. During the process, it is important
#' to retain vector names in the order defined by `mixedOrder()`.
#'
#' @return `list` object with the same structure as the `skeleton`.
#'
#' @family jam list functions
#'
#' @param x vector to be applied to the `skeleton` list
#' structure in order.
#' @param skeleton `list` object representing the desired
#' final list structure, or `vector` when the input
#' data `x` should be returned as-is, without change.
#' Specifically, when `skeleton` is a `vector`, the
#' `names(x)` are maintained without change.
#' @param ... additional arguments are ignored.
#'
#' @examples
#' # generate nested list
#' x <- list(A=nameVector(LETTERS[3:1]),
#' B=list(
#' E=nameVector(LETTERS[10:7]),
#' D=nameVector(LETTERS[5:4])),
#' C=list(
#' G=nameVector(LETTERS[19:16]),
#' F=nameVector(LETTERS[15:11]),
#' H=list(
#' I=nameVector(LETTERS[22:20]))
#' ))
#' x
#'
#' # unlisted vector of items
#' xu <- unlist(unname(x))
#' # unlisted vector of names
#' xun <- unname(jam_rapply(x, names));
#' names(xu) <- xun;
#'
#' # recursive list element lengths
#' xrn <- jam_rapply(x, length);
#' # define factor in order of list structure
#' xn <- factor(
#' rep(names(xrn),
#' xrn),
#' levels=names(xrn));
#'
#' # re-create the original list
#' xu_new <- unlist(unname(split(xu, xn)))
#' xnew <- relist_named(xu_new, x);
#' xnew
#'
#' # re-order elements
#' k <- mixedOrder(xu_new);
#' xuk <- unlist(unname(split(xu[k], xn[k])))
#' xk <- relist_named(xuk, x);
#' xk
#'
#' # the default relist() function does not support this use case
#' xdefault <- relist(xuk, x);
#' xdefault
#'
#' @export
relist_named <- function
(x,
skeleton,
...)
{
##
ind <- 1L;
result <- skeleton;
if ("list" %in% class(skeleton)) {
for (i in seq_along(skeleton)) {
size <- length(unlist(result[[i]]));
result[[i]] <- relist_named(
x[seq.int(ind, length.out=size)],
result[[i]]);
ind <- ind + size;
}
} else {
result <- x;
}
result;
}
#' Uppercase the first letter in each word
#'
#' Uppercase the first letter in each word
#'
#' This function is a simple mimic of the Perl function `ucfirst` which
#' converts the first letter in each word to uppercase. When
#' `lowercaseAll=TRUE` it also forces all other letters to lowercase,
#' otherwise mixedCase words will retain capital letters in the middle
#' of words.
#'
#' @param x character vector.
#' @param lowercaseAll logical indicating whether to force all letters
#' to lowercase before applying uppercase to the first letter.
#' @param firstWordOnly logical indicating whether to apply the
#' uppercase only to the first word in each string. Note that it
#' still applies the logic to every entry in the input vector `x`.
#' @param ... additional arguments are ignored.
#'
#' @family jam string functions
#'
#' @examples
#' ucfirst("TESTING_ALL_UPPERCASE_INPUT")
#' ucfirst("TESTING_ALL_UPPERCASE_INPUT", TRUE)
#' ucfirst("TESTING_ALL_UPPERCASE_INPUT", TRUE, TRUE)
#'
#' ucfirst("testing mixedCase upperAndLower case input")
#' ucfirst("testing mixedCase upperAndLower case input", TRUE)
#' ucfirst("testing mixedCase upperAndLower case input", TRUE, TRUE)
#'
#' @export
ucfirst <- function
(x,
lowercaseAll=FALSE,
firstWordOnly=FALSE,
...)
{
## Purpose is to mimic the Perl function,
## and upper-case the first letter of a word
##
## lowercaseAll=TRUE will make everything after
## the first character into lowercase.
##
if (lowercaseAll) {
x <- tolower(x);
}
if (firstWordOnly) {
newX <- sub("(^|\\b|[^[a-zA-Z0-9])([a-zA-Z])",
"\\1\\U\\2",
x,
perl=TRUE);
} else {
newX <- gsub("(^|\\b|[^[a-zA-Z0-9])([a-zA-Z])",
"\\1\\U\\2",
x,
perl=TRUE);
}
return(newX);
}
#' Global substitution into ordered factor
#'
#' Global substitution into ordered factor
#'
#' This function is an extension of `base::gsub()` that
#' returns an ordered factor output. When input is also a
#' factor, the output factor levels are retained in the
#' same order, after applying the string substitution.
#'
#' This function is very useful when making changes via `base::gsub()`
#' to a factor with ordered levels, because it retains the
#' the order of levels after modification.
#'
#' Tips:
#'
#' * To convert a character vector to a factor, whose levels are
#' sorted, use `sortFunc=sort`.
#' * To convert a character vector to a factor, whose levels are
#' the order they appear in the input `x`, use `sortFunc=c`.
#' * To convert a character vector to a factor, whose levels are
#' sorted alphanumerically, use `sortFunc=mixedSort`.
#'
#' @return factor whose levels are based upon the order of
#' input levels when the input `x` is a factor; or if the
#' input `x` is not a factor, it is converted to a factor
#' using the provided sort function `sortFunc`.
#'
#' @param pattern,replacement,x,ignore.case,perl,fixed,useBytes
#' arguments sent to `base::gsub()`
#' @param sortFunc function used to sort factor levels, which
#' is not performed if the input `x` is a `factor`.
#' @param ... additional arguments are passed to `sortFunc`
#'
#' @family jam string functions
#'
#' @examples
#' x <- c(paste0(
#' rep(c("first", "second", "third"), 2),
#' rep(c("Section", "Choice"), each=3)),
#' "Choice");
#' f <- factor(x, levels=x);
#' f;
#'
#' # default gsub() will return a character vector
#' gsub("(first|second|third)", "", f)
#' # converting to factor resets the factor level order
#' factor(gsub("(first|second|third)", "", f))
#'
#' ## gsubOrdered() maintains the factor level order
#' gsubOrdered("(first|third)", "", f)
#' gsubOrdered("(first)", "", f)
#'
#' # to convert character vector to factor, levels in order they appear
#' gsubOrdered("", "", x, sortFunc=c)
#'
#' # to convert character vector to factor, levels alphanumeric sorted
#' gsubOrdered("", "", x, sortFunc=mixedSort)
#'
#' @export
gsubOrdered <- function
(pattern,
replacement,
x,
ignore.case=FALSE,
perl=FALSE,
fixed=FALSE,
useBytes=FALSE,
sortFunc=mixedSort,
...)
{
## Purpose is to perform gsub() but maintain order of factor levels consistent with the
## input data.
##
## If input data is not a factor, it is converted to a factor,
## using sortFunc() to order the levels.
##
## To have levels ordered based upon their original order,
## use sortFunc=c
##
## To have levels ordered based upon sample sorting,
## use sortFunc=sortSamples
##
## To have levels ordered based upon alphenumeric sorting,
## use sortFunc=mixedSort
##
## The special case where is.na(pattern) it will change NA values
## to the replacement, and relevel the factor accordingly
xNames <- names(x);
if (!igrepHas("factor", class(x))) {
if (is.na(pattern) && any(is.na(x))) {
x <- addNA(factor(x, levels=unique(sortFunc(x))));
} else {
x <- factor(x, levels=unique(sortFunc(x)));
}
}
if (is.na(pattern)) {
if (!any(is.na(x))) {
return(x);
}
xNA <- which(is.na(x));
y <- as.character(x);
y[xNA] <- replacement;
yLevels <- levels(x);
yLevels[is.na(yLevels)] <- replacement;
yLevels <- unique(yLevels);
} else {
y <- gsub(pattern=pattern, replacement=replacement, x=x, ignore.case=ignore.case,
perl=perl, fixed=fixed, useBytes=useBytes, ...);
yLevels <- unique(gsub(pattern=pattern, replacement=replacement, x=levels(x), ignore.case=ignore.case,
perl=perl, fixed=fixed, useBytes=useBytes, ...));
}
if (!is.null(xNames)) {
names(y) <- xNames;
}
return(factor(y, levels=yLevels));
}
#' Pattern replacement with multiple patterns
#'
#' Pattern replacement with multiple patterns
#'
#' This function is a simple wrapper around `base::gsub()`
#' when considering a series of pattern-replacement
#' combinations. It applies each pattern match and replacement
#' in order and is therefore not vectorized.
#'
#' When `x` input is a `list` each vector in the `list` is processed,
#' somewhat differently than processing one vector.
#' 1. When the `list` contains another `list`, or when `length(x) < 100`,
#' each value in `x` is iterated calling `gsubs()`.
#' This process is the slowest option, however not noticeble until
#' `x` has length over 10,000.
#' 2. When the `list` does not contain another `list` and all values are
#' non-factor, or all values are `factor`, they are unlisted,
#' processed as a vector, then relisted. This process is nearly the
#' same speed as processing one single vector, except the time it
#' takes to confirm the list element classes.
#' 3. When values contain a mix of non-factor and `factor` values, they
#' are separately unlisted, processed by `gsubs()`, then relisted
#' and combined afterward. Again, this process is only slightly slower
#' than option 2 above, given that it calls `gsubs()` twice, with two
#' vectors.
#' 4. Note that `factor` values at input are
#' replaced with `character` values at output, consistent with `gsub()`.
#'
#' @return `character` vector when input `x` is an atomic vector,
#' or `list` when input `x` is a `list`.
#'
#' @family jam string functions
#'
#' @param pattern character vector of patterns
#' @param replacement character vector of replacements
#' @param x character vector with input data to be curated
#' @param ignore.case logical indicating whether to perform
#' pattern matching in case-insensitive manner, where
#' `ignore.case=TRUE` will ignore the uppercase/lowercase
#' distinction.
#' @param replace_multiple logical vector indicating whether to perform
#' global substitution, where `replace_multiple=FALSE` will
#' only replace the first occurrence of the pattern, using
#' `base::sub()`. Note that this vector can refer to individual
#' entries in `pattern`.
#' @param ... additional arguments are passed to `base::gsub()`
#' or `base::sub()`.
#'
#' @export
gsubs <- function
(pattern,
replacement,
x,
ignore.case=TRUE,
replaceMultiple=rep(TRUE, length(pattern)),
...)
{
## Purpose is to curate a text field using a series of gsub()
## commands, operating on a vector of from,to vectors.
## 'pattern' is expected to be a vector of regular expression patterns
## used by gsub()
##
## 'replacement' is expected to be a vector of replacement patterns, as
## used by gsub(), including relevant regular expression operators.
## If 'replacement' is empty, the "" is used, thereby replacing patterns with
## empty characters.
##
## replaceMultiple is a logical vector indicating whether each pattern
## replacement should use gsub() if replaceMultiple==TRUE, or sub()
## if replaceMultiple==FALSE. The default is TRUE, which uses gsub().
## One would use replaceMultiple=FALSE in order to replace only the
## first occurrence of a pattern, like replacing the first tab character
## only.
##
## This function allows the patterns and replacements to be defined
## upfront, then applied to any relevant character vectors consistently,
## for example across columns of a data.frame.
if (length(x) == 0 || length(pattern) == 0) {
return(x);
}
if (length(replaceMultiple) == 0) {
replaceMultiple <- TRUE;
}
replaceMultiple <- rep(replaceMultiple, length.out=length(pattern));
if (length(replacement) == 0) {
replacement <- "";
}
replacement <- rep(replacement, length.out=length(pattern));
# if input x is a list, iterate each element in the list.
# Possible optimization for long lists that are not nested:
# unlist into one character vector, split back into original form afterward.
if (is.list(x) || "list" %in% class(x)) {
# iterate each entry in x
if (length(x) > 100) {
# with more than 100 entries, iterative replacement gets slow
x_class <- (sclass(x))
if ("list" %in% x_class) {
x_class <- cPaste(x_class)
}
if (!igrepHas("list", x_class)) {
x_is_factor <- grepl("factor", x_class)
x_lengths <- lengths(x)
x_split <- factor(rep(seq_along(x), x_lengths),
levels=seq_along(x))
if (all(x_is_factor) || !any(x_is_factor)) {
# if no factor, or all factor, we can unlist()
x_unlist <- unlist(x, use.names=FALSE)
x_new_vector <- gsubs(pattern=pattern,
replacement=replacement,
x=x_unlist,
ignore.case=ignore.case,
replaceMultiple=replaceMultiple,
...)
x_new <- split(x_new_vector, x_split)
return(x_new)
} else if (any(x_is_factor) && !all(x_is_factor)) {
xnf <- x[!x_is_factor]
xif <- x[x_is_factor]
# non-factor
xnf_split <- factor(
rep(seq_along(x)[!x_is_factor],
x_lengths[!x_is_factor]),
levels=seq_along(x))
xnf_unlist <- unlist(x[!x_is_factor], use.names=FALSE)
xnf_new_vector <- gsubs(pattern=pattern,
replacement=replacement,
x=xnf_unlist,
ignore.case=ignore.case,
replaceMultiple=replaceMultiple,
...)
# factor
xif_split <- factor(
rep(seq_along(x)[x_is_factor],
x_lengths[x_is_factor]),
levels=seq_along(x))
xif_unlist <- unlist(x[x_is_factor], use.names=FALSE)
# printDebug("head(xif_unlist, 10):");print(head(xif_unlist, 10))
xif_new_vector <- gsubs(pattern=pattern,
replacement=replacement,
x=xif_unlist,
ignore.case=ignore.case,
replaceMultiple=replaceMultiple,
...)
# printDebug("head(xif_new_vector, 3):");print(head(xif_new_vector, 3));
# re-assemble
x_new <- split(xnf_new_vector, x_split[!x_is_factor])
xif_new <- split(xif_new_vector, x_split[x_is_factor])
# printDebug("head(xif_new, 3):");print(head(xif_new, 3));
x_new[x_is_factor] <- xif_new[x_is_factor];
return(x_new)
}
}
}
x_new <- lapply(x, function(ix){
gsubs(pattern=pattern,
replacement=replacement,
x=ix,
ignore.case=ignore.case,
replaceMultiple=replaceMultiple,
...)
})
return(x_new);
}
for (i in seq_along(pattern)) {
if (replaceMultiple[[i]]) {
x <- gsub(pattern=pattern[i],
replacement=replacement[i],
x=x,
ignore.case=ignore.case,
...);
} else {
x <- sub(pattern=pattern[i],
replacement=replacement[i],
x=x,
ignore.case=ignore.case,
...);
}
}
return(x);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.