# samples/resource_files/resource_files_example.R

# =======================================
# === Setup / Install and Credentials ===
# =======================================
# install packages from github
library(devtools)
devtools::install_github("azure/doAzureParallel")

# import packages
library(doAzureParallel)

# set azure credentials
doAzureParallel::setCredentials("credentials.json")

# Add the data.table package to the CRAN packages and Azure/rAzureBatch to the Github packages
# in your cluster configuration so that they are installed on every node of the cluster.
# Since reading the large datasets requires a lot of memory, we recommend using Standard_D11_v2:
# "rPackages": {
#   "cran": ["data.table"],
#   "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }
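#
# For reference, a resource_files_cluster.json along these lines should work. The cluster
# name, pool sizes, and extra packages below are illustrative assumptions, not requirements:
# {
#   "name": "resource-files-cluster",
#   "vmSize": "Standard_D11_v2",
#   "maxTasksPerNode": 1,
#   "poolSize": {
#     "dedicatedNodes": { "min": 3, "max": 3 },
#     "lowPriorityNodes": { "min": 0, "max": 0 },
#     "autoscaleFormula": "QUEUE"
#   },
#   "rPackages": {
#     "cran": ["data.table", "ggplot2"],
#     "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
#   },
#   "commandLine": []
# }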

# ===================================================
# === Setting up your cluster with resource files ===
# ===================================================

# Now we will use resource files to get our dataset onto each node of our cluster.
# Currently, our data is stored in Azure Blob Storage in an account called 'playdatastore',
#   in a public container called "nyc-taxi-dataset". (Note that containers created via
#   doAzureParallel or Azure Storage Explorer are private by default.)
#   To get this dataset onto each node, we will create a resourceFile object for each blob
#   and pass the resourceFiles when building the cluster, so that each node in the cluster
#   knows to download these files after it is provisioned.
# Using the NYC taxi datasets: http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset"
resource_files <- list(
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"),
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"),
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"),
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"),
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"),
  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv")
)
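
# Equivalently (a sketch only, same createResourceFile calls as above), the list can be
# built programmatically instead of spelling out each of the six files:
# resource_files <- lapply(1:6, function(i) {
#   rAzureBatch::createResourceFile(
#     httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-", i, ".csv"),
#     filePath = paste0("yellow_tripdata_2016-", i, ".csv"))
# })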

# pass the 'resourceFiles' parameter so that the files are downloaded onto each node
cluster <- makeCluster("resource_files_cluster.json", resourceFiles = resource_files)

# when the cluster is provisioned, register the cluster as your parallel backend
registerDoAzureParallel(cluster)
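
# Optional sanity check (standard foreach API, not specific to this sample): confirm how
# many workers the registered backend reports before submitting work.
foreach::getDoParWorkers()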

# ======================================================
# === Setting up storage account to write results to ===
# ======================================================

# Setup storage location to write your results to:
# This step will allow you to upload your results from within your doAzureParallel foreach loop:
#
#   1. Replace the "mystorageaccount" with the name of the storage account you wish to write your results to.
#   2. Create an output container named "nyc-taxi-graphs" to store your results in
#   3. Create a SasToken that allows us to read and write ("rw") to the container
#   4. Notice the parameter 'sr = "c"' in the createSasToken method; this
#      simply means that the token is scoped to the entire container in storage
#
storageAccountName <- "mystorageaccount"
outputsContainer <- "nyc-taxi-graphs"
rAzureBatch::createContainer(outputsContainer)

# permissions: r = read, w = write.
outputSas <- rAzureBatch::createSasToken(permission = "rw", sr = "c", outputsContainer)

# =======================================================
# === Foreach with resourceFiles & writing to storage ===
# =======================================================

results <- foreach(i = 1:6) %dopar% {

  library(data.table)
  library(ggplot2)
  library(rAzureBatch)

  # To access your Azure resource files on the node, use the AZ_BATCH_NODE_STARTUP_DIR
  # environment variable to locate the directory the files were downloaded to
  fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd")
  print(fileDirectory)
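
  # Optional diagnostic (base R only): list the working directory to confirm the
  # resource files were downloaded to this node as expected.
  print(list.files(fileDirectory))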

  # columns to keep for the data frame
  colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance")

  # read in data from CSV that was downloaded from the resource file
  file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep)

  # set the coordinates for the bounds of the plot
  min_lat <- 40.5774
  max_lat <- 40.9176
  min_long <- -74.15
  max_long <- -73.7004

  # compute-intensive plotting
  plot <- ggplot(file, aes(x=pickup_longitude, y=pickup_latitude)) +
    geom_point(size=0.06) +
    scale_x_continuous(limits=c(min_long, max_long)) +
    scale_y_continuous(limits=c(min_lat, max_lat)) +
    scale_color_gradient(low="#CCCCCC", high="#8E44AD", trans="log") +
    labs(title = paste0("Map of NYC, Plotted Using Locations of All Yellow Taxi Pickups in Month ", i))

  # build image from plot
  image <- paste0("nyc-taxi-", i, ".png")
  ggsave(image, plot = plot)

  # save image to the storage account using the Sas token we created above
  blob <- rAzureBatch::uploadBlob(containerName = outputsContainer,
                                  image,
                                  sasToken = outputSas,
                                  accountName = storageAccountName)

  # return the blob url
  blob$url
}

# The results object is a list of pointers to files in Azure Storage. Copy and paste the links into your favorite browser
# to see the output per run.
results
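
# As a convenience, the blob URLs can also be opened directly from R (base utils);
# uncomment to launch each plot in your default browser:
# for (url in results) utils::browseURL(url)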

# deprovision your cluster after your work is complete
stopCluster(cluster)