Use Case

A common use case for a Data Scientist is to create their R programs to analyse a dataset on their local compute platform (e.g., a laptop with 6GB RAM running Ubuntu with R installed). Development is performed with a subset of the full dataset (a random sample) that will not exceed the available memory and will return results quickly. When the experimental setup is complete the script can be sent across to a considerably more capable compute engine on Azure for modelling the whole population.

In this vignette a Linux Data Science Virtual Machine (DSVM) cluster is deployed, a distributed/parallel analysis is completed, the results are collected, and the compute resources are deleted. Azure consumption is incurred only for the lifetime of the cluster.


# Load the required subscription resources: TID, CID, and KEY.
# Also includes the ssh PUBKEY for the user.

# The credentials file is named after the local login name (for
# example "alice_credentials.R") and is resolved relative to the
# current working directory.
# NOTE(review): the right-hand side of this assignment was truncated
# in the source ("USER <-[['user']]"); Sys.info() is the base R call
# that returns a "user" element -- confirm against the original.

USER <- Sys.info()[["user"]]

source(paste0(USER, "_credentials.R"))

# Load the required packages.

library(AzureSMR)    # Support for managing Azure resources.
library(AzureDSVM)    # Further support for the Data Scientist.

# Parameters for this script: the name for the new resource group and
# its location across the Azure cloud. The resource name is used to
# name the resource group that we will create transiently for the
# purposes of this script.

# Create a random resource group to reduce likelihood of conflict with
# other users.

# BASE is four random lowercase letters. The indices are drawn with
# runif(4, 1, 26) and rounded, so each one lies in 1..26.

BASE <- paste(letters[round(runif(4, 1, 26))], collapse = "")
cat(sprintf("Base name:\t\t%s", BASE), "\n")

RG <- paste0("my_dsvm_", BASE, "_rg_sea")
cat(sprintf("Resource group:\t\t%s", RG), "\n")

# Choose a data centre location.

LOC <- "southeastasia"
cat(sprintf("Data centre location:\t%s", LOC), "\n")

# Include the random BASE in the hostname to reduce the likelihood of
# conflict.

HOST <- paste0("my", BASE)
cat(sprintf("Hostname:\t\t%s", HOST), "\n")


# Connect to the Azure subscription and use this as the context for
# all of our activities.

context <- createAzureContext(tenantID = TID, clientID = CID, authKey = KEY)

# Check if the resource group already exists. Take note this script
# will not remove the resource group if it pre-existed.

rg_pre_exists <- existsRG(context, RG, LOC)
print(rg_pre_exists)

if (!rg_pre_exists) {
  # Create a new resource group into which we create the VMs and
  # related resources. Resource group name is RG.

  # Note that to create a new resource group one needs to add access
  # control of Active Directory application at subscription level.

  azureCreateResourceGroup(context, RG, LOC)
}


# Deploy a cluster of 3 DSVMs.

COUNT <- 3

# NOTE(review): the head of this call was truncated in the source
# (only "= RG," survived). The function name and the first two
# arguments are reconstructed -- confirm against the AzureDSVM
# documentation.

deployDSVMCluster(context,
                  resource.group = RG,
                  location       = LOC,
                  hostname       = BASE,
                  username       = USER,
                  authen         = "Key",
                  pubkey         = PUBKEY,
                  count          = COUNT)

cluster <- azureListVM(context, RG, LOC)

# Check each node in turn: print its name, ask Azure for its status,
# and run `uptime` on it over ssh.

for (i in seq_len(COUNT)) {
  vm <- cluster[i, "name"]

  # NOTE(review): the tail of this expression was missing from the
  # source. It is reconstructed as <name>.<location>.cloudapp.azure.com,
  # the usual Azure public DNS pattern -- confirm.
  fqdn <- paste(vm,
                cluster[i, "location"],
                "cloudapp.azure.com",
                sep = ".")

  cat(vm, "\n")

  operateDSVM(context, RG, vm, operation = "Check")

  # Host key checking is switched off because the freshly created VMs
  # are not yet in known_hosts.
  cmd <- paste("ssh -q",
               "-o StrictHostKeyChecking=no",
               "-o UserKnownHostsFile=/dev/null",
               fqdn, "uptime")
  print(system(cmd, intern = TRUE))
}

Run analytics.

Next step is to use the DSVM for data analytics.

There are many ways of interacting with a DSVM. For both Linux- and Windows-based DSVMs it is convenient to log in remotely to the host through a GUI (more detailed information can be found here). Data scientists often prefer remote execution from within an R session, as it can be efficiently automated through R scripts. The following chunks of code demonstrate how to use R for remote execution of R scripts under a desired computing context.

We begin with a very simple experiment with random number generation. The function executeScript() handles the remote execution. (Note that the current version only supports remote execution of a script on a Linux DSVM, and the remote execution is achieved via an ssh channel.) The computing context can be specified for the execution. In the case of "clusterParallel", a cluster of DSVMs is used.

Note that Microsoft R Server (>= 9.0) allows remote execution on a properly configured DSVM. One can follow the steps here to configure the deployed DSVMs for remote interaction with Microsoft R Server.

# Create a script for remote execution.

# NOTE(review): the closing quote of this string was missing from the
# source; any further script lines the original may have contained
# are not recoverable from here.

code <- "
x <- seq(1, 500)
y <- x * rnorm(length(x), 0, 0.1)
"

tmpf1 <- tempfile("AzureDSVM_experiment_01_")
writeLines(code, tmpf1)

# Fully qualified names of the cluster nodes.
# NOTE(review): the paste() arguments were truncated in the source.
# They are reconstructed as <name>.<location>.cloudapp.azure.com, the
# usual Azure public DNS pattern -- confirm.

nodes <- paste(cluster$name,
               cluster$location,
               "cloudapp.azure.com",
               sep = ".")

# Local parallelism on node cores.

t1 <- Sys.time()

# NOTE(review): the head of each executeScript() call was truncated in
# the source (only "= RG," survived); "context, resource.group" is
# reconstructed -- confirm against the AzureDSVM documentation.

executeScript(context,
              resource.group  = RG,
              hostname        = cluster$name[1],
              remote          = nodes[1],
              username        = unique(cluster$admin),
              script          = tmpf1,
              compute.context = "localParallel")

t2 <- Sys.time()

# Cluster parallelism across nodes: the first node is the master and
# the remaining nodes are the slaves.

executeScript(context,
              resource.group  = RG,
              hostname        = cluster$name[1],
              remote          = nodes[1],
              master          = nodes[1],
              slaves          = nodes[-1],
              username        = unique(cluster$admin),
              script          = tmpf1,
              compute.context = "clusterParallel")

t3 <- Sys.time()

# Elapsed wall-clock time of the two runs.

performance1 <- t2 - t1
performance2 <- t3 - t2


Another example demonstrates parallel execution using the rxExec function from the Microsoft RevoScaleR package.

# Parallelizing k-means clustering on the iris dataset.

# The remote script is assembled one statement per line.
# NOTE(review): the end of this paste() call was missing from the
# source; the separator is reconstructed as a newline -- confirm.
# NOTE(review): scale() is a base R function, so library(scales) looks
# unnecessary in the remote script; left unchanged.

codes <- paste("library(scales)",
               "df <- scale(iris[, -5])",
               "rxExec(kmeans, x=df, centers=2)",
               sep = "\n")

tmpf2 <- tempfile("AzureDSVM_experiment_02_")
writeLines(codes, tmpf2)

# Fully qualified names of the cluster nodes.
# NOTE(review): the paste() arguments were truncated in the source.
# They are reconstructed as <name>.<location>.cloudapp.azure.com, the
# usual Azure public DNS pattern -- confirm.

nodes <- paste(cluster$name,
               cluster$location,
               "cloudapp.azure.com",
               sep = ".")

t4 <- Sys.time()

# NOTE(review): the head of this call was truncated in the source
# (only "= RG," survived); "context, resource.group" is
# reconstructed -- confirm against the AzureDSVM documentation.

executeScript(context,
              resource.group  = RG,
              hostname        = cluster$name[1],
              remote          = nodes[1],
              master          = nodes[1],
              slaves          = nodes[-1],
              username        = unique(cluster$admin),
              script          = tmpf2,
              compute.context = "clusterParallel")

t5 <- Sys.time()

# Elapsed wall-clock time of the run.

performance3 <- t5 - t4


Clean up.

# Remove the temporary script files created for the two experiments.

file.remove(c(tmpf1, tmpf2))

# Tear down the resource group, but only when this script created it;
# a group that already existed before the run is left in place.
# Deletion seems to take 10 minutes or more, and there is probably no
# need to wait for it.

if (!rg_pre_exists) {
  azureDeleteResourceGroup(context, RG)
}

