# R/workdir_utils.R

#' Get, set and clean the xdf tbl directory
#'
#' By default, dplyrXdf will save the xdf files it creates in the R temporary directory. This can be a problem if it is in a location with limited disk space. Use \code{set_dplyrxdf_dir} to change the xdf tbl directory, and \code{get_dplyrxdf_dir} to view it.
#'
#' @param path Location in which to save xdf tbls. If missing, defaults to the R temporary directory.
#' @param fileSystem The filesystem for which to set or get the tbl directory; can be either "hdfs" or "native". Currently only the native (local) filesystem is supported.
#'
#' @details
#' If \code{path} is supplied, \code{set_dplyrxdf_dir} creates a new directory (with a unique name) located \emph{under} \code{path}. This ensures that the files managed by dplyrXdf are properly isolated from the rest of the filesystem.
#'
#' @seealso
#' \code{\link{rxGetFileSystem}}, \code{\link{rxSetFileSystem}}
#' @rdname workdir
#' @export
set_dplyrxdf_dir <- function(path, fileSystem=rxGetFileSystem())
{
    # Record a fresh, uniquely-named working directory under `path` as the
    # location for xdf tbls, for either the HDFS or the native filesystem.
    fileSystem <- validateFileSystem(fileSystem)
    isHdfs <- inherits(fileSystem, "RxHdfsFileSystem")
    if(isHdfs)
    {
        if(missing(path))
            path <- "/tmp"

        workDir <- tempfile(pattern="dxTmp", tmpdir=path)
        # HDFS paths always use forward slashes, even on Windows clients
        workDir <- gsub("\\", "/", workDir, fixed=TRUE)
        .dxOptions$hdfsWorkDir <- workDir
        # store the host separately: allows for Azure Data Lake storage
        .dxOptions$hdfsHost <- fileSystem$hostName
        # actual creation of the HDFS directory is deferred until first use
        # (see make_dplyrxdf_dir)
        .dxOptions$hdfsWorkDirCreated <- FALSE
    }
    else
    {
        if(missing(path))
            path <- tempdir()
        workDir <- normalizePath(tempfile(pattern="dxTmp", tmpdir=path), mustWork=FALSE)
        dir.create(workDir, recursive=TRUE)
        .dxOptions$localWorkDir <- workDir
    }
    invisible(NULL)
}


#' @rdname workdir
#' @export
get_dplyrxdf_dir <- function(fileSystem=rxGetFileSystem())
{
    # Return the current xdf tbl directory for the given filesystem.
    fileSystem <- validateFileSystem(fileSystem)
    if(!in_hdfs(fileSystem))
        return(.dxOptions$localWorkDir)

    # if we have a live HDFS connection, ensure the directory actually exists
    if(!is.na(detectHdfsConnection(FALSE)))
        make_dplyrxdf_dir(fileSystem)
    makeHdfsUri(.dxOptions$hdfsHost, normalizeHdfsPath(.dxOptions$hdfsWorkDir))
}


make_dplyrxdf_dir <- function(fileSystem=rxGetFileSystem())
{
    # Ensure the working directory exists, creating it if necessary.
    # Returns the logical result of the creation attempt if one was made,
    # otherwise NULL.
    fileSystem <- validateFileSystem(fileSystem)
    if(!in_hdfs(fileSystem))
    {
        localPath <- .dxOptions$localWorkDir
        if(!dir.exists(localPath))
            return(dir.create(localPath))
        return(NULL)
    }

    # HDFS: creation is deferred from set_dplyrxdf_dir; only attempt it once
    if(.dxOptions$hdfsWorkDirCreated)
        return(NULL)

    message("Creating HDFS working directory")
    host <- fileSystem$hostName
    created <- hdfs_dir_create(.dxOptions$hdfsWorkDir, host=host)

    if(created)
        .dxOptions$hdfsWorkDirCreated <- TRUE
    else warning("unable to create HDFS working directory", call.=FALSE)

    .dxOptions$hdfsHost <- host
    created
}


#' @details
#' \code{clean_dplyrxdf_dir} is a utility function to delete the files generated by dplyrXdf. Note that all files in the specified location will be removed!
#' @rdname workdir
#' @export
clean_dplyrxdf_dir <- function(fileSystem=rxGetFileSystem())
{
    # Delete all files generated by dplyrXdf in the tbl directory for the
    # given filesystem. Everything inside the directory is removed; the
    # directory itself is kept.
    fileSystem <- validateFileSystem(fileSystem)
    path <- get_dplyrxdf_dir(fileSystem)

    if(inherits(fileSystem, "RxHdfsFileSystem"))
    {
        host <- .dxOptions$hdfsHost
        if(hdfs_dir_exists(path, host))
        {
            contents <- hdfs_dir(path, full_path=TRUE, host=host)
            if(length(contents) > 0)
                hdfs_dir_remove(contents, skipTrash=TRUE, host=host)
        }
    }
    else if(inherits(fileSystem, "RxNativeFileSystem"))
    {
        contents <- dir(path, full.names=TRUE)
        unlink(contents, recursive=TRUE)
    }

    invisible(NULL)
}


validateFileSystem <- function(fs)
{
    # Coerce a filesystem specification to an RxFileSystem object.
    # Accepts an existing RxFileSystem object (returned unchanged), or a
    # single string: "hdfs" (case-insensitive) maps to HDFS; anything else
    # (normally "native") maps to the local filesystem.
    if(inherits(fs, "RxFileSystem"))
        return(fs)

    # fail early with a clear message instead of a cryptic error from
    # tolower()/if() on malformed input (e.g. character(0), NA, a list)
    if(!is.character(fs) || length(fs) != 1 || is.na(fs))
        stop("filesystem must be an RxFileSystem object or a single string ('hdfs' or 'native')",
             call.=FALSE)

    if(tolower(fs) == "hdfs")
        RxHdfsFileSystem()
    else RxNativeFileSystem()
}
# RevolutionAnalytics/dplyrXdf documentation built on June 3, 2019, 9:08 p.m.