# R/dbgit.R
#
# Defines functions: remove_empty, runit, cli_install, path_up, find_up, get_env, pjoin, dbgit_init, dbgit_pull, dbgit_push, dbgit_pushfile
#
# Documented in: dbgit_init, dbgit_pull, dbgit_push, dbgit_pushfile

#if (!require("pacman")) install.packages("pacman")
#pacman::p_load(package1, package2, package_n)
#if (!require("processx")) install.packages("processx")

library(processx)
library(fs)
library(utils)
library(dplyr)

# Name of the marker file that identifies a dbgit repository root.
# dbgit_init() writes the Databricks workspace folder into it, and get_env()
# searches for it upwards from the working directory and reads it back.
dbgit_wksp_file <- ".databricks"

# Drop the empty-string elements of `x`, keeping the remaining elements in
# their original order.
remove_empty <- function(x) {
    keep <- x != ""
    x[keep]
}
# Run an external command through processx, echoing the command line and its
# output while it runs and showing a spinner.
#
# @param cmd  Name (or path) of the executable to run.
# @param args Character vector of arguments; empty strings are dropped first,
#   so optional flags can be passed as "" when they are switched off.
# @return Whatever `run()` returns (callers in this file read `$stdout`).
runit <- function(cmd, args) {
    # TRUE rather than T: T is an ordinary variable and can be reassigned.
    run(
        cmd,
        args = remove_empty(args),
        spinner = TRUE,
        echo = TRUE,
        echo_cmd = TRUE
    )
}

# Install or upgrade the Databricks CLI and make sure its configuration is
# present in the HOME directory.
#
# Steps:
#   1. `pip install --upgrade databricks-cli`
#   2. copy `.databrickscfg` and `.gitconfig` from the parent of HOME into
#      HOME, overwriting any copies already there
#   3. stop with an error when HOME still has no `.databrickscfg`
cli_install <- function() {
    runit("pip", args = c("install", "--upgrade", "databricks-cli"))

    home <- Sys.getenv("HOME")

    # Bring the configuration files from the folder above HOME into HOME
    # (the original note describes this as "root home to Documents").
    for (cfg in c(".databrickscfg", ".gitconfig")) {
        file.copy(file.path(home, "..", cfg), file.path(home, cfg), overwrite = TRUE)
    }

    cfgpath <- file.path(home, '.databrickscfg')
    if (!file.exists(cfgpath)) stop(paste('Cannot find databricks configuration file in ', cfgpath))
}
#cli_install()

# Return the parent directory of path `p`.
#
# A filesystem root is its own parent and a bare relative name collapses to
# ".", so repeated calls always reach a fixed point instead of producing an
# empty path.
path_up <- function(p) {
    dirname(p)
}

# Search `pwd` and each of its ancestors for an entry called `name`.
#
# @param pwd  Directory to start from (a single path string).
# @param name File or directory name to look for.
# @return The nearest directory (as a character string) that contains `name`,
#   or NA when neither `pwd` nor any of its ancestors contains it.
find_up <- function(pwd, name) {
    current <- pwd
    repeat {
        if (file.exists(file.path(current, name))) {
            return(paste0(current))
        }
        parent <- path_up(current)
        # Once the parent equals the current directory we are at the top
        # (e.g. "/" or "."): there is nothing left to search.
        if (identical(paste0(parent), paste0(current))) {
            return(NA)
        }
        current <- parent
    }
}

# Describe the dbgit repository that contains the current working directory.
#
# Looks for the marker file (`dbgit_wksp_file`) in the working directory and
# its ancestors and reads the Databricks workspace location stored in it.
#
# @return A list with:
#   pwd       - the current working directory
#   base      - the directory that holds the marker file
#   relfolder - what remains of `pwd` after the first nchar(base) characters
#               (empty when `pwd` is the repository root)
#   dbbase    - the first non-empty line of the marker file
# Stops with an error when no marker file is found or when it holds no
# non-empty line.
get_env <- function() {
    pwd <- getwd()
    base <- find_up(pwd, dbgit_wksp_file)
    if (is.na(base)) {
        stop("Databricks folder not found")
    }

    cfgfile <- file.path(base, dbgit_wksp_file)
    entries <- readLines(cfgfile)
    # The marker file must carry exactly one usable location; ignore blank
    # lines and refuse an empty file instead of handing callers a
    # zero-length or multi-element path.
    entries <- entries[entries != ""]
    if (length(entries) == 0) {
        stop(paste("Databricks workspace location is empty in", cfgfile))
    }

    list(
        pwd = pwd,
        base = base,
        relfolder = substring(pwd, nchar(base, allowNA = TRUE) + 1),
        dbbase = entries[[1]]
    )
}

# Join path components with "/" and collapse every run of consecutive slashes
# into a single one, so components that are empty or already carry leading or
# trailing slashes do not leave "//" in the result.
pjoin <- function(...) {
    # "/+" rather than "//": a fixed two-slash pattern turns "///" into "//".
    gsub("/+", "/", file.path(...))
}

#' Initialize dbgit repository
#'
#' This function configures the current working directory to be used as a
#' sync repository with databricks: it runs `git init`, stores the workspace
#' folder in the `.databricks` marker file and commits that file.
#'
#' @param wksp_folder indicates the absolute location of databricks workspace e.g. /Users/some@email.com for pointing to a home folder.
#'   When missing, it is asked for interactively.
#'
#' @return The confirmation message, which is also printed.
#'
#' @export
dbgit_init <- function(wksp_folder) {
    cli_install()

    if (missing(wksp_folder)) {
        wksp_folder <- readline("Please, enter databricks workspace folder:")
    }
    print(wksp_folder)

    # An empty location would produce a marker file that no later command
    # can use, so refuse it before touching the repository.
    if (length(wksp_folder) != 1 || is.na(wksp_folder) || wksp_folder == "") {
        stop("Databricks workspace folder must be a single non-empty value")
    }

    gitpath <- file.path(Sys.getenv("HOME"), '.gitconfig')
    if (!file.exists(gitpath)) stop(paste('Cannot find GIT configuration file in ', gitpath))

    runit("git", args = c("init"))
    write(wksp_folder, dbgit_wksp_file)
    runit("git", args = c("add", dbgit_wksp_file))
    # Arguments reach git without going through a shell, so the message must
    # not carry its own quotes (they would become part of the commit message).
    runit("git", args = c("commit", "-m", "initial commit"))
    print(paste("dbgit_init executed for workspace location", wksp_folder))
}


#' Pull a directory from databricks
#'
#' This function allows you to sync by pulling a folder FROM databricks TO the
#' local git folder. It exports the workspace folder that corresponds to the
#' current working directory, overwriting local files, and refuses to run
#' while `git status` reports local changes.
#'
#' @return Invisibly, the result of the databricks CLI run.
#'
#' @export
#'
dbgit_pull <- function() {
    cli_install()
    env <- get_env()

    # Exported files overwrite local ones (-o), so insist on a clean tree.
    status <- runit("git", args = c("status", "--porcelain"))
    if (nchar(status$stdout) > 0) {
        stop("Local changes found! Please commit or stash the files first to avoid losing data.")
    }

    # relfolder starts with "/" below the repository root; pjoin keeps the
    # workspace path free of doubled slashes, as dbgit_pushfile() does.
    remote <- pjoin(env$dbbase, env$relfolder)
    local <- file.path(env$base, env$relfolder)

    invisible(runit("databricks",
                    args = c("workspace",
                             "export_dir",
                             "-o",
                             remote,
                             local)))
}

#' Push a directory to databricks
#'
#' This function allows you to sync by pushing a local git folder TO
#' databricks. The current working directory is imported into the matching
#' folder of the configured workspace location.
#'
#' @param overwrite is a boolean indicating if we should overwrite the file if already exists. By default is FALSE.
#'
#' @return Invisibly, the result of the databricks CLI run.
#'
#' @export
#'
dbgit_push <- function(overwrite=FALSE) {
    cli_install()
    env <- get_env()
    print("Pushing directory to databricks..")

    # An empty flag is dropped by runit() before the command is run.
    writemode <- if (overwrite) "-o" else ""

    # relfolder starts with "/" below the repository root; pjoin keeps the
    # workspace path free of doubled slashes, as dbgit_pushfile() does.
    local <- file.path(env$base, env$relfolder)
    remote <- pjoin(env$dbbase, env$relfolder)

    invisible(runit("databricks",
                    args = c("workspace",
                             "import_dir",
                             writemode,
                             "-e",
                             local,
                             remote)))
}

#' Push a file to databricks
#'
#' This function allows you to sync by pushing a local git file TO databricks.
#' The notebook language is derived from the file extension (.py, .scala, .r
#' or .sql, case-insensitive) and the file is imported under its name without
#' the extension.
#'
#' @param filename indicating the filename to upload to databricks.
#' @param overwrite is a boolean indicating if we should overwrite the file if already exists. By default is FALSE.
#'
#' @return Invisibly, the result of the databricks CLI run.
#'
#' @export
#'
dbgit_pushfile <- function(filename, overwrite=FALSE) {
    lowerfile <- tolower(filename)
    language <- case_when(
        endsWith(lowerfile, ".py") ~ "PYTHON",
        endsWith(lowerfile, ".scala") ~ "SCALA",
        endsWith(lowerfile, ".r") ~ "R",
        endsWith(lowerfile, ".sql") ~ "SQL",
        TRUE ~ "unknown"
    )
    # Fail before installing or calling the CLI: "unknown" is not a language
    # the import command can use.
    if (language == "unknown") {
        stop(paste("Cannot determine the notebook language from the extension of", filename))
    }

    cli_install()
    env <- get_env()
    print("Pushing file to databricks..")

    # An empty flag is dropped by runit() before the command is run.
    writemode <- if (overwrite) "-o" else ""
    fileonly <- path_ext_remove(filename)

    invisible(runit("databricks",
                    args = c("workspace",
                             "import",
                             "-l",
                             language,
                             writemode,
                             pjoin(env$base, env$relfolder, filename),
                             pjoin(env$dbbase, env$relfolder, fileonly))))
}

#dbgit_init('/DEV')
#dbgit_pull()
# dwerbam/databricks_git_R documentation built on Nov. 5, 2019, 2:39 p.m.