#' Alternative Implementations of Base R Functions
#'
#' Alternative implementations of some base R functions, including sort, order,
#' and match. Functions are simplified but can be faster or have other
#' advantages. See the documentation of individual functions for details and
#' benchmarks.
#'
#' Note that these functions cannot be considered drop-in replacements for the
#' functions in base \code{R}. They do not implement all the same parameters and
#' do not work for all data types. Utilize these with caution in specialized
#' applications that require them.
#'
#' @name grr
#' @docType package
#' @useDynLib grr
NULL
#' Sorting vectors
#'
#' Simplified implementation of \code{\link{sort}} utilizing parallelized C++.
#' For large vectors, typically is about 2-10x faster for numbers and 20-100x
#' faster for characters and factors (depending on system).
#'
#' @param x a vector of class numeric, integer, character, factor, or logical.
#' Long vectors are not supported.
#' @export
#' @examples
#' chars<-as.character(sample(1e3,1e4,TRUE))
#' system.time(a<-sort(chars))
#' system.time(b<-sort2(chars))
#' identical(a,b)
#'
#' ints<-as.integer(sample(1e3,1e4,TRUE))
#' system.time(a<-sort(ints))
#' system.time(b<-sort2(ints))
#' identical(a,b)
#'
#' nums<-runif(1e4)
#' system.time(a<-sort(nums))
#' system.time(b<-sort2(nums))
#' identical(a,b)
#'
#' logs<-as.logical(sample(0:1,1e6,TRUE))
#' system.time(result<-sort(logs))
#' system.time(result<-sort2(logs))
#'
#' facts<-as.factor(as.character(sample(1e3,1e4,TRUE)))
#' system.time(a<-sort(facts))
#' system.time(b<-sort2(facts))
#' identical(a,b)
#'
#' #How are special values like NA and Inf handled?
#' #For numerics, values sort correctly with NA and NaN after all real numbers.
#' sort2(c(NaN,1,2,NA,NaN,Inf,-Inf))
#' #For characters, values sort correctly with NA at the end.
#' sort2(c(NA,'C','B',NA,'A'))
#' #For factors, values sort correctly with NA at the end
#' sort2(as.factor(c(NA,'C','B',NA,'A')))
#'
#' \dontrun{
#' chars<-as.character(sample(1e5,1e6,TRUE))
#' system.time(a<-sort(chars))
#' system.time(b<-sort2(chars))
#'
#' ints<-as.integer(sample(1e5,1e6,TRUE))
#' system.time(result<-sort(ints))
#' system.time(result<-sort2(ints))
#'
#' nums<-runif(1e6)
#' system.time(result<-sort(nums))
#' system.time(result<-sort2(nums))
#'
#' logs<-as.logical(sample(0:1,1e7,TRUE))
#' system.time(result<-sort(logs))
#' system.time(result<-sort2(logs))
#'
#' facts<-as.factor(as.character(sample(1e5,1e6,TRUE)))
#' system.time(a<-sort(facts))
#' system.time(b<-sort2(facts))
#' }
sort2<-function(x)
{
result<-.Call('sortcpp',x)
return(result)
}
#' Ordering vectors
#'
#' Simplified implementation of \code{\link{order}} utilizing parallelized C++.
#' For large vectors, typically is about 2-10x faster for numbers and 20-100x
#' faster for characters and factors (depending on system).
#'
#' @param x a vector of class numeric, integer, character, factor, or logical.
#' Long vectors are not supported.
#' @export
#' @examples
#' chars<-as.character(sample(1e3,1e4,TRUE))
#' system.time(a<-order(chars))
#' system.time(b<-order2(chars))
#' identical(chars[a],chars[b])
#'
#' ints<-as.integer(sample(1e3,1e4,TRUE))
#' system.time(a<-order(ints))
#' system.time(b<-order2(ints))
#' identical(ints[a],ints[b])
#'
#' nums<-runif(1e4)
#' system.time(a<-order(nums))
#' system.time(b<-order2(nums))
#' identical(nums[a],nums[b])
#'
#' logs<-as.logical(sample(0:1,1e6,TRUE))
#' system.time(a<-order(logs))
#' system.time(b<-order2(logs))
#' identical(logs[a],logs[b])
#'
#' facts<-as.factor(as.character(sample(1e3,1e4,TRUE)))
#' system.time(a<-order(facts))
#' system.time(b<-order2(facts))
#' identical(facts[a],facts[b])
#'
#' #How are special values like NA and Inf handled?
#' #For numerics, values sort correctly with NA and NaN after all real numbers.
#' (function (x) x[order2(x)])(c(NA,1,2,NA,NaN,Inf,-Inf))
#' #For characters, values sort correctly with NA at the end.
#' (function (x) x[order2(x)])(c(NA,'C','B',NA,'A'))
#' #For factors, values sort correctly with NA at the end.
#' (function (x) x[order2(x)])(as.factor(c(NA,'C','B',NA,'A')))
#'
#'
#' #Ordering a data frame using order2
#' df<-data.frame(one=as.character(1:4e5),
#' two=sample(1:1e5,4e5,TRUE),
#' three=sample(letters,4e5,TRUE),stringsAsFactors=FALSE)
#' system.time(a<-df[order(df$one),])
#' system.time(b<-df[order2(df$one),])
#' system.time(a<-df[order(df$two),])
#' system.time(b<-df[order2(df$two),])
#'
#' \dontrun{
#' chars<-as.character(sample(1e5,1e6,TRUE))
#' system.time(a<-order(chars))
#' system.time(b<-order2(chars))
#'
#' ints<-as.integer(sample(1e5,1e6,TRUE))
#' system.time(result<-order(ints))
#' system.time(result<-order2(ints))
#'
#' nums<-runif(1e6)
#' system.time(result<-order(nums))
#' system.time(result<-order2(nums))
#'
#' logs<-as.logical(sample(0:1,1e7,TRUE))
#' system.time(result<-order(logs))
#' system.time(result<-order2(logs))
#'
#' facts<-as.factor(as.character(sample(1e5,1e6,TRUE)))
#' system.time(a<-order(facts))
#' system.time(b<-order2(facts))
#' identical(facts[a],facts[b])
#'
#'
#' }
order2<-function(x)
{
result<-.Call('ordercpp',x)
return(result)
}
#' Value Matching
#'
#' Returns a lookup table or list of the positions of ALL matches of its first
#' argument in its second and vice versa. Similar to \code{\link{match}}, though
#' that function only returns the first match.
#'
#' This behavior can be imitated by using joins to create lookup tables, but
#' \code{matches} is simpler and faster: usually faster than the best joins in
#' other packages and thousands of times faster than the built in
#' \code{\link{merge}}.
#'
#' \code{all.x/all.y} correspond to the four types of database joins in the
#' following way:
#'
#' \describe{ \item{left}{\code{all.x=TRUE}, \code{all.y=FALSE}}
#' \item{right}{\code{all.x=FALSE}, \code{all.y=TRUE}}
#' \item{inner}{\code{all.x=FALSE}, \code{all.y=FALSE}}
#' \item{full}{\code{all.x=TRUE}, \code{all.y=TRUE}} }
#'
#' Note that \code{NA} values will match other \code{NA} values.
#'
#' @param x vector. The values to be matched. Long vectors are not currently
#' supported.
#' @param y vector. The values to be matched. Long vectors are not currently
#' supported.
#' @param all.x logical; if \code{TRUE}, then each value in \code{x} will be
#' included even if it has no matching values in \code{y}
#' @param all.y logical; if \code{TRUE}, then each value in \code{y} will be
#' included even if it has no matching values in \code{x}
#' @param list logical. If \code{TRUE}, the result will be returned as a list
#' of vectors, each vector being the matching values in y. If \code{FALSE},
#' result is returned as a data frame with repeated values for each match.
#' @param indexes logical. Whether to return the indices of the matches or the
#' actual values.
#' @param nomatch the value to be returned in the case when no match is found.
#' If not provided and \code{indexes=TRUE}, items with no match will be
#' represented as \code{NA}. If set to \code{NULL}, items with no match will
#' be set to an index value of \code{length+1}. If {indexes=FALSE}, they will
#' default to \code{NA}.
#' @export
#' @examples
#' one<-as.integer(1:10000)
#' two<-as.integer(sample(1:10000,1e3,TRUE))
#' system.time(a<-lapply(one, function (x) which(two %in% x)))
#' system.time(b<-matches(one,two,all.y=FALSE,list=TRUE))
#'
#' #Only retain items from one with a match in two
#' b<-matches(one,two,all.x=FALSE,all.y=FALSE,list=TRUE)
#' length(b)==length(unique(two))
#'
#' one<-round(runif(1e3),3)
#' two<-round(runif(1e3),3)
#' system.time(a<-lapply(one, function (x) which(two %in% x)))
#' system.time(b<-matches(one,two,all.y=FALSE,list=TRUE))
#'
#' one<-as.character(1:1e5)
#' two<-as.character(sample(1:1e5,1e5,TRUE))
#' system.time(b<-matches(one,two,list=FALSE))
#' system.time(c<-merge(data.frame(key=one),data.frame(key=two),all=TRUE))
#'
#' \dontrun{
#' one<-as.integer(1:1000000)
#' two<-as.integer(sample(1:1000000,1e5,TRUE))
#' system.time(b<-matches(one,two,indexes=FALSE))
#' if(requireNamespace("dplyr",quietly=TRUE))
#' system.time(c<-dplyr::full_join(data.frame(key=one),data.frame(key=two)))
#' if(require(data.table,quietly=TRUE))
#' system.time(d<-merge(data.table(data.frame(key=one))
#' ,data.table(data.frame(key=two))
#' ,by='key',all=TRUE,allow.cartesian=TRUE))
#'
#' one<-as.character(1:1000000)
#' two<-as.character(sample(1:1000000,1e5,TRUE))
#' system.time(a<-merge(one,two)) #Times out
#' system.time(b<-matches(one,two,indexes=FALSE))
#' if(requireNamespace("dplyr",quietly=TRUE))
#' system.time(c<-dplyr::full_join(data.frame(key=one),data.frame(key=two)))#'
#' if(require(data.table,quietly=TRUE))
#' {
#' system.time(d<-merge(data.table(data.frame(key=one))
#' ,data.table(data.frame(key=two))
#' ,by='key',all=TRUE,allow.cartesian=TRUE))
#' identical(b[,1],as.character(d$key))
#' }
#' }
matches<-function(x,y,all.x=TRUE,all.y=TRUE,list=FALSE,indexes=TRUE,nomatch=NA)
{
result<-.Call('matches',x,y)
result<-data.frame(x=result[[1]],y=result[[2]])
if(!all.y)
result<-result[result$x!=length(x)+1,]
if(!all.x)
result<-result[result$y!=length(y)+1,]
if(!indexes)
{
result$x<-x[result$x]
result$y<-y[result$y]
}
else if(!is.null(nomatch))
{
result$x[result$x==length(x)+1]<-nomatch
result$y[result$y==length(y)+1]<-nomatch
}
if(list)
result<-tapply(result$y,result$x,function (z) z[!is.na(z)])
return(result)
}
#'Extract/return parts of objects
#'
#'Alternative to built-in \code{\link{Extract}} or \code{[}. Allows for
#'extraction operations that are ambivalent to the data type of the object. For
#'example, \code{extract(x,i)} will work on lists, vectors, data frames,
#'matrices, etc.
#'
#'Extraction is 2-100x faster on data frames than with the built in operation -
#'but does not preserve row names.
#'
#'@param x object from which to extract elements
#'@param i,j indices specifying elements to extract. Can be \code{numeric},
#' \code{character}, or \code{logical} vectors.
#'@export
#'@examples
#'#Typically about twice as fast on normal subselections
#'orders<-data.frame(orderNum=1:1e5,
#' sku=sample(1e3, 1e5, TRUE),
#' customer=sample(1e4,1e5,TRUE))
#'a<-sample(1e5,1e4)
#' system.time(b<-orders[a,])
#'system.time(c<-extract(orders,a))
#'rownames(b)<-NULL
#'rownames(c)<-NULL
#'identical(b,c)
#'
#'#Speedup increases to 50-100x with oversampling
#'a<-sample(1e5,1e6,TRUE)
#' system.time(b<-orders[a,])
#'system.time(c<-extract(orders,a))
#'rownames(b)<-NULL
#'rownames(c)<-NULL
#'identical(b,c)
#'
#'#Can create function calls that work for multiple data types
#'alist<-as.list(1:50)
#'avector<-1:50
#'extract(alist,1:5)
#'extract(avector,1:5)
#'extract(orders,1:5)#'
#'
#'\dontrun{
#'orders<-data.frame(orderNum=as.character(sample(1e5, 1e6, TRUE)),
#' sku=sample(1e3, 1e6, TRUE),
#' customer=sample(1e4,1e6,TRUE))
#'a<-sample(1e6,1e7,TRUE)
#' system.time(b<-orders[a,])
#'system.time(c<-extract(orders,a))
#'}
extract<-function(x,i=NULL,j=NULL)
{
if(is.null(dim(x)))
{
x<-x[i]
return(x)
}
else
if(!is.null(j))
x<-x[,j]
if(!is.null(i))
{
if(is.data.frame(x))
x<-as.data.frame(lapply(x,function (a) a[i]))
else
x<-x[i,]
}
return(x)
}
#' A wrapper for \code{\link{sample.int}} and \code{\link[grr]{extract}} that
#' makes it easy to quickly sample rows from any object, including Matrix and
#' sparse matrix objects.
#'
#' Row names are not preserved.
#'
#' @param x object from which to extract elements
#' @param size a positive number, the number of items to choose.
#' @param replace Should sampling be with replacement?
#' @param prob A vector of probability weights for obtaining the elements of the
#' vector being sampled.
#' @export
#' @examples
#'
#' #Sampling from a list
#' l1<-as.list(1:1e6)
#' b<-sample2(l1,1e5)
#'
#' #Sampling from a data frame
#' orders<-data.frame(orderNum=sample(1e5, 1e6, TRUE),
#' sku=sample(1e3, 1e6, TRUE),
#' customer=sample(1e4,1e6,TRUE),stringsAsFactors=FALSE)
#'
#' a<-sample2(orders,250000)
#'
#' #With oversampling sample2 can be much faster than the alternatives,
#' #with the caveat that it does not preserve row names.
#' system.time(a<-sample2(orders,2000000,TRUE))
#' system.time(b<-orders[sample.int(nrow(orders),2000000,TRUE),])
#' \dontrun{
#'
#' system.time(c<-dplyr::sample_n(orders,2000000,replace=TRUE))
#'
#' #Can quickly sample for sparse matrices while preserving sparsity
#' sm<-rsparsematrix(20000000,10000,density=.0001)
#' sm2<-sample2(sm,1000000)
#' }
sample2<-function(x,size,replace=FALSE,prob=NULL)
{
index<-sample.int(len(x),size,replace,prob)
return(extract(x,index))
}
#' Convert string representations of numbers in any base to any other base.
#'
#' @param x a vector of integers or strings to be converted
#' @param base1 the base of x
#' @param base2 the base of the output
#' @seealso \code{\link{as.octmode}}
#' @seealso \code{\link{as.hexmode}}
#' @seealso \code{\link{strtoi}}
#' @export
#' @examples
#'
#'identical(convertBase(1234,base2=8),as.character(as.octmode(1234)))
#'
#'convertBase(17771,base1=8,base2=30)
#'convertBase(17771,base1=8,base2=10)
#'convertBase(8185,base1=10,base2=30)
#'
convertBase<-function(x, base1=10, base2=10)
{
if(base1!=10)
x<-toBase10(x,base1)
if(base2!=10)
x<-fromBase10(x,base2)
return(x)
}
toBase10<-function(x,base)
{
if(base>36 | base < 2)
stop('This function is only implemented for bases 2-36.')
if(base>10)
x<-tolower(x)
lookup<-c(0:35)
names(lookup)<-c(0:9,letters)
lookup[(base+1):36]<-NA
splits<-strsplit( gsub("(.)","\\1~",x), "~" )
results<-unlist(lapply(splits,function (x) sum(lookup[unlist(x)]*(base^(length(x):1-1)))))
return(results)
}
fromBase10<-function(x,base)
{
if(base>36 | base < 2)
stop('This function is only implemented for bases 2-36.')
if(!is.numeric(x))
x<-as.integer(x)
lookup<-c(0:9,letters)
results<-sapply(x, function (x)
{
answer<-NULL
while (x>0)
{
remainder <- x %% base
x <- x %/% base
answer<-c(remainder,answer)
}
result<-lookup[answer+1]
result<-paste0(result,collapse = '')
return(result)
})
return(results)
}
len<-function (data)
{
result <- ifelse(is.null(nrow(data)), length(data), nrow(data))
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.