# R/tbl_df_methods.R

#' @include tbl_xdf.R
NULL

#' Convert a data source or tbl to a data frame
#'
#' @param x A data source object, or tbl wrapping the same.
#' @param maxRowsByCols The maximum dataset size to convert, expressed in terms of rows times columns. Defaults to \code{NULL}, meaning no maximum.
#' @param row.names,optional For compatibility with the \code{as.data.frame} generic. Not used.
#' @param ... Other arguments to \code{rxDataStep}.
#' @details
#' These are simple wrappers around \code{\link[RevoScaleR]{rxDataStep}}, with the check on the maximum table size turned off. You should ensure that you have enough memory for your data.
#'
#' \code{as.data.frame} converts a data source object (typically an Xdf file, but can also be any data source type that \code{rxDataStep} supports) into a data frame. The \code{pull}, \code{$} and \code{[[} methods extract a single column from a data source, as a vector.
#'
#' @seealso
#' \code{\link[base]{as.data.frame}}, \code{\link{collect}}
#'
#' @examples
#' mtx <- as_xdf(mtcars, overwrite=TRUE)
#' mtx$mpg
#' mtx[["mpg"]]
#' pull(mtx, mpg)
#' as.data.frame(mtx)
#' as_data_frame(mtx)  # returns a tbl_df
#' @aliases as.data.frame
#' @rdname as.data.frame
#' @export
as.data.frame.RxXdfData <- function(x, maxRowsByCols=NULL, row.names=NULL, optional=TRUE, ...)
{
    # row.names and optional exist only to match the as.data.frame generic; they are ignored

    # data not in HDFS: one rxDataStep call with no output file, wrapped in local_exec
    if(!in_hdfs(x))
        return(local_exec(rxDataStep(x, outFile=NULL, maxRowsByCols=maxRowsByCols, ...)))

    # data in HDFS: rxDataStep from a remote client is very slow, so hand off to collect()
    # (direct download) and ask it for a data frame
    collect(x, as_data_frame=TRUE, maxRowsByCols=maxRowsByCols, ...)
}


#' @rdname as.data.frame
#' @export
as.data.frame.RxDataSource <- function(x, maxRowsByCols=NULL, row.names=NULL, optional=TRUE, ...)
{
    # row.names and optional exist only to match the as.data.frame generic; they are ignored

    # data not in HDFS: read straight into memory via rxDataStep with no output file
    if(!in_hdfs(x))
        return(local_exec(rxDataStep(x, outFile=NULL, maxRowsByCols=maxRowsByCols, ...)))

    # data in HDFS: import to xdf first, then let as.data.frame dispatch on the result
    # to download and convert it; the intermediate xdf is removed when this function exits
    imported <- compute(x)
    on.exit(delete_xdf(imported))
    as.data.frame(imported, maxRowsByCols=maxRowsByCols, ...)
}



#' @param name The name of a column to extract from a data source object
#' @rdname as.data.frame
#' @export
"$.RxFileData" <- function(x, name)
{
    # read only the requested column, with the size check disabled (maxRowsByCols=NULL)
    column_df <- rxDataStep(unTbl(x), outFile=NULL, varsToKeep=name, maxRowsByCols=NULL)

    # hand back the first element of the result rather than the container
    column_df[[1]]
}


## rxGetVarInfo depends on Rx* sources not having a [ method
##' @export
#"[.RxFileData" <- function(x, i, j, drop=FALSE, maxRowsByCols=NULL, ...)
#{
    ## try to detect if list-style indexing used (no j argument, missing or otherwise)
    #missingI <- missing(i)
    #missingJ <- missing(j)
    #nArgs <- nargs()
    #nOtherArgs <- length(match.call(expand=FALSE)$`...`) + !missing(drop) + !missing(maxRowsByCols)
    #asList <- (nArgs - nOtherArgs <= 2) || (nArgs - nOtherArgs <= 1 && missingJ)

    #if(asList)
    #{
        ## morph to matrix-style indexing: set 2nd index (if present) to 1st, set 1st index to missing
        #if(!missingI)
            #j <- i
        #missingJ <- missingI
        #missingI <- TRUE
    #}

    ## from here on, assume matrix-style indexing (2 indices)
    #varsToKeep <- if(!missingJ)
    #{
        #nams <- names(x)
        #if(is.logical(j) || is.numeric(j))
            #varsToKeep <- nams[j]
        #else varsToKeep <- as.character(j)
    #}
    #else NULL
    #df <- rxDataStep(x, outFile=NULL, varsToKeep=varsToKeep, maxRowsByCols=maxRowsByCols, ...)
    ## must do this separately because by semantics of [, row selection expression is evaluated in calling frame
    #if(!missingI)
        #df[i, , drop=drop]
    #else df
#}


#' @rdname as.data.frame
#' @export
"[[.RxFileData" <- function(x, name, maxRowsByCols=NULL, ...)
{
    # Extract a single column from a data source.
    #   x:             the data source (or tbl wrapping one)
    #   name:          column selector -- a name, or a numeric/logical index into names(x)
    #   maxRowsByCols: passed through to rxDataStep; NULL by default
    #   ...:           further arguments passed to rxDataStep
    # Returns the first element of the rxDataStep result.
    # Errors if more than one selector is given, or if the selector does not resolve
    # to an existing column name.
    if(length(name) > 1)
        stop("attempt to select more than one column")

    # translate positional/logical selectors into column names; the variable names are
    # looked up only in this case, so a character selector does not trigger that lookup
    varsToKeep <- if(is.logical(name) || is.numeric(name))
        names(x)[name]
    else as.character(name)

    # an out-of-range index, an NA selector or a zero-length selector leaves nothing valid
    # to keep: fail here with a clear message rather than passing NA/character(0) downstream
    if(length(varsToKeep) == 0 || anyNA(varsToKeep))
        stop("subscript out of bounds")

    rxDataStep(unTbl(x), outFile=NULL, varsToKeep=varsToKeep, maxRowsByCols=maxRowsByCols, ...)[[1]]
}


#' @rdname as.data.frame
#' @export
pull.RxFileData <- function(.data, var=-1)
{
    # same approach as the data frame method: resolve var against the column names,
    # then extract that single column with [[
    col_names <- names(.data)
    chosen <- select_var(col_names, !!enquo(var))
    .data[[chosen]]
}
# RevolutionAnalytics/dplyrXdf documentation built on June 3, 2019, 9:08 p.m.