# knitr setup: chunks are display-only (eval = FALSE), printed output is
# prefixed with "#>", and figures default to 5 inches wide.
knitr::opts_chunk$set(
  eval = FALSE,
  comment = "#>",
  fig.width = 5
)
Point SPARK_HOME to an existing installation of Spark:
export SPARK_HOME='/path/to/spark/installation'
Set the HADOOP_CONF_DIR and Spark MASTER environment variables:
export HADOOP_CONF_DIR=/etc/hadoop/conf
export MASTER="yarn-client"
Generally, these are already configured by the Operations Department;
the versions used here are:
cd sparkling-water-2.1.8/ nohup bin/sparkling-shell --num-executors 3 --executor-memory 5g --master yarn-client > ./test.out &
Configuration options named sparklyr.shell.* are passed through to
spark-submit as command-line parameters;
for details, see http://spark.rstudio.com/deployment.html#package_options
The configuration below starts a cluster with 15 GB of executor memory
(3 executors x 5 GB) and 6 executor cores
(3 executors x 2 cores).
# Point sparklyr at the local Spark installation.
Sys.setenv(SPARK_HOME = "/home/Rstudio/spark-2.1.0-bin-hadoop2.7")

# Tell rsparkling which Sparkling Water build to use and where its
# assembly jar lives.
options(rsparkling.sparklingwater.version = "2.1.8")
options(rsparkling.sparklingwater.location = "/home/Rstudio/sparkling-water-2.1.8/assembly/build/libs/sparkling-water-assembly_2.11-2.1.8-all.jar")

library(dplyr)
library(sparklyr)
library(h2o)
library(rsparkling)

# Resource request forwarded to spark-submit:
# 3 executors x 5 GB (+ 4 GB driver), 2 cores per executor.
cluster_conf <- list(
  "sparklyr.shell.driver-memory"   = "4g",
  "sparklyr.shell.num-executors"   = 3,
  "sparklyr.shell.executor-memory" = "5g",
  "sparklyr.shell.executor-cores"  = 2
)

# Connect to Spark running on YARN.
sc <- spark_connect(
  master     = "yarn-client",
  version    = "2.1.0",
  spark_home = "/home/Rstudio/spark-2.1.0-bin-hadoop2.7",
  config     = cluster_conf
)

# Open H2O Flow (the Sparkling Water web UI) for this connection.
h2o_flow(sc)

# Inspect the status of the H2O cloud.
h2o.clusterInfo()
h2o.clusterStatus()
# Read a file from HDFS directly into an H2O frame.
# (H2O Flow's importFiles dialog does the same thing interactively.)
mydata <- h2o.importFile(
  path              = "hdfs://nameservice1/test/test.txt",
  destination_frame = "test.hex"
)

# Expose the H2O frame to Spark as a Spark DataFrame.
mydata_tbl <- as_spark_dataframe(sc, mydata)
I recommend using H2O Flow's importFiles to load data — it is quite
easy; afterwards you only need
mydata <- h2o.getFrame("data_name_id.hex")
to bring the frame's key into R.
http://spark.rstudio.com/h2o.html#deep_learning
# Locate the demo CSV bundled with the h2o package and read it into R.
csv_path <- system.file("extdata", "prostate.csv", package = "h2o")
prostate_raw <- readr::read_csv(csv_path)

# Copy the local R data frame into Spark.
prostate_df <- sdf_copy_to(sc, prostate_raw, "prostate")
head(prostate_df)

# Hand the Spark DataFrame over to H2O as an H2O frame.
prostate_hf <- as_h2o_frame(sc, prostate_df, name = "prostate.hex")

# Train/test split with a fixed seed for reproducibility.
splits <- h2o.splitFrame(prostate_hf, seed = 1)
h2o.ls()

# Response column; predictors are every column except the response and ID.
y <- "VOL"
x <- setdiff(names(prostate_hf), c("ID", y))

# Fit a small feed-forward network on the training split.
dl_fit <- h2o.deeplearning(
  x                   = x,
  y                   = y,
  training_frame      = splits[[1]],
  epochs              = 15,
  activation          = "Rectifier",
  hidden              = c(10, 5, 10),
  input_dropout_ratio = 0.7
)

# Evaluate the model on the hold-out split.
h2o.performance(dl_fit, newdata = splits[[2]])
h2o.ls()
# Clean-up: trigger H2O's internal JVM garbage collection (note that :::
# accesses an unexported helper, which may change between h2o releases),
# shut down the H2O cluster without an interactive prompt, then close the
# Spark connection.
h2o:::.h2o.garbageCollect()
h2o.shutdown(prompt=FALSE)
spark_disconnect(sc)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.