R/multiplyr.R

Defines functions onLoad

#' Data Manipulation with Parallelism and Shared Memory Matrices
#'
#' @description
#' Provides a new form of data frame backed by shared memory matrices and
#' a way to manipulate them. Upon creation these data frames are shared
#' across multiple local nodes to allow for simple parallel processing. Run the
#' following command for a more thorough explanation: \code{vignette("basics")}
#'
#' @section Major differences from dplyr:
#'
#' \code{summarise} with dplyr will return a single number, but here it
#' will return N values depending on how many nodes there are. Typically
#' you should follow \code{summarise} with \code{\link{reduce}}, which is
#' run locally.
#'
#' @section Standard dplyr-like functions:
#' \tabular{ll}{
#'     \code{\link{arrange}}     \tab Sort data \cr
#'     \code{\link{distinct}}    \tab Select unique rows or unique combinations of variables \cr
#'     \code{\link{filter}}      \tab Filter data \cr
#'     \code{\link{group_by}}    \tab Group data \cr
#'     \code{\link{group_sizes}} \tab Return size of groups \cr
#'     \code{\link{groupwise}}   \tab Use grouped data (also known as \code{ungroup})\cr
#'     \code{\link{mutate}}      \tab Change values of existing variables (and create new ones) \cr
#'     \code{\link{n_groups}}    \tab Return number of groups \cr
#'     \code{\link{rename}}      \tab Rename variables \cr
#'     \code{\link{rowwise}}     \tab Use data as individual rows \cr
#'     \code{\link{select}}      \tab Retain only specified variables \cr
#'     \code{\link{slice}}       \tab Select rows by position\cr
#'     \code{\link{summarise}}   \tab Summarise data \cr
#'     \code{\link{transmute}}   \tab Change variables and drop all others \cr
#' }
#'
#' @section Parallel functions:
#' \tabular{ll}{
#'     \code{\link{partition_even}}  \tab Partition data evenly amongst cluster nodes \cr
#'     \code{\link{partition_group}} \tab Partition data so that each group is wholly on a node \cr
#'     \code{\link{within_group}}    \tab Execute code within a group \cr
#'     \code{\link{within_node}}     \tab Execute code within a node \cr
#' }
#'
#' @section Additional data frame functions:
#' \tabular{ll}{
#'     \code{\link{Multiplyr}}   \tab Create new parallel data frame \cr
#'     \code{\link{define}}      \tab Define new variables \cr
#'     \code{\link{nsa}}         \tab No strings attached mode \cr
#'     \code{\link{reduce}}      \tab Summarise locally only \cr
#'     \code{\link{regroup}}     \tab Return to grouped data \cr
#'     \code{\link{undefine}}    \tab Delete variables \cr
#' }
#'
#' @section Data manipulation adjuncts:
#' \tabular{ll}{
#'     \code{\link{between}} \tab Tests whether elements of a vector lie between two values (inclusively) \cr
#'     \code{\link{cumall}}  \tab Cumulative all \cr
#'     \code{\link{cumany}}  \tab Cumulative any \cr
#'     \code{\link{cummean}} \tab Cumulative mean \cr
#'     \code{\link{first}}   \tab Returns first value in vector \cr
#'     \code{\link{last}}    \tab Returns last value in vector \cr
#'     \code{\link{lag}}     \tab Offset x backwards by n \cr
#'     \code{\link{lead}}    \tab Offset x forwards by n \cr
#'     \code{\link{n}}       \tab Number of items in current group \cr
#'     \code{\link{nth}}     \tab Return the nth item from a vector \cr
#' }
#'
#' @importFrom bigmemory sub.big.matrix attach.big.matrix mwhich mpermute
#' @importFrom bigmemory.sri describe
#' @importFrom magrittr %>%
#' @importFrom parallel makeCluster stopCluster clusterExport clusterEvalQ
#' @importClassesFrom bigmemory big.matrix big.matrix.descriptor
#' @docType package
#' @name multiplyr
NULL

# Declare the dot-prefixed names below as known globals so that R CMD check
# does not raise NOTEs about missing global variable bindings.
# NOTE(review): the version guard presumably reflects the R release in which
# utils::globalVariables() became available -- confirm before removing it.
if (getRversion() >= "2.15.1") {
    utils::globalVariables(
        c(
            ".Gbase", ".end", ".expr", ".grouped", ".groups", ".local",
            ".rows", ".start", ".tg", ".gcdesc", ".offset"
        )
    )
}

# Package load hook: registers a default for the `multiplyr.cores` option,
# leaving any value the user has already set untouched.
#
# The default comes from the R_MULTIPLYR_CORES environment variable when it
# holds a finite number >= 1. Otherwise it falls back to one less than the
# number of detected cores, but never less than 1.
#
# libname, pkgname: the standard .onLoad arguments (not used here).
# Returns NULL invisibly; called for its side effect on options().
.onLoad <- function (libname, pkgname) {
    # detectCores() may return NA and is 1 on single-core machines, so a
    # plain `detectCores() - 1` could produce NA or 0.
    detected <- parallel::detectCores()
    fallback <- if (is.na(detected)) 1 else max(1, detected - 1)

    # unset = NA makes a missing variable coerce to NA; empty or non-numeric
    # values also coerce to NA (that coercion warning is expected, so it is
    # silenced for this one call only).
    requested <- suppressWarnings(
        as.numeric(Sys.getenv("R_MULTIPLYR_CORES", unset = NA))
    )
    cores <- if (is.finite(requested) && requested >= 1) requested else fallback

    op.multiplyr <- list(multiplyr.cores = cores)

    # Only set options that are not already present in the session
    toset <- !(names(op.multiplyr) %in% names(options()))
    if (any(toset)) {
        options(op.multiplyr[toset])
    }
    invisible(NULL)
}

Try the multiplyr package in your browser

Any scripts or data that you put into this service are public.

multiplyr documentation built on May 30, 2017, 12:09 a.m.