R/awsFunctions.R


## A little simplification would be the first step toward rational living, I think.
## Eleanor Roosevelt 


## Lower logging level: LogManager.getLogManager().getLogger("com.amazonaws.request").setLevel(Level.OFF);
## ref: https://forums.aws.amazon.com/thread.jspa?messageID=186655&#186655

##' AWS Support Function: Set up credentials
##'
##' Sets up the credentials needed to access AWS and optionally sets environment
##' variables so the credentials are loaded automatically in the future.
##' @param awsAccessKeyText your AWS Access Key as a string
##' @param awsSecretKeyText your AWS Secret Key as a string
##' @param setEnvironmentVariables T/F whether to set environment variables so
##' Segue will read the credentials on load
##' @author James "JD" Long
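##' @examples
##' \dontrun{
##' ## minimal sketch; the key strings below are placeholders, not real credentials
##' setCredentials("your-access-key", "your-secret-key",
##'                setEnvironmentVariables = FALSE)
##' }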
##' @export
setCredentials <- function(awsAccessKeyText, awsSecretKeyText, setEnvironmentVariables = TRUE){
    awsCreds <- new(com.amazonaws.auth.BasicAWSCredentials, awsAccessKeyText, awsSecretKeyText)
    assign("awsCreds", awsCreds, envir = .GlobalEnv)

    if (setEnvironmentVariables == TRUE) {
      Sys.setenv(AWSACCESSKEY = awsAccessKeyText, AWSSECRETKEY = awsSecretKeyText)
    }
}
##' AWS Support Function: Delete an S3 Key (a.k.a file)
##'
##' Deletes a key in a given bucket on S3
##' @param bucketName name of the bucket
##' @param keyName the key in the bucket
##' @author James "JD" Long
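##' @examples
##' \dontrun{
##' ## illustrative sketch; the bucket and key names are placeholders
##' deleteS3Key("my-segue-bucket", "results.csv")
##' }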
##' @export
deleteS3Key <- function(bucketName, keyName){
  tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
  s3 <- tx$getAmazonS3Client()
  if (s3$doesBucketExist(bucketName)) { 
    s3$deleteObject(bucketName, keyName)
  }
}

##' AWS Support Function: Empty an S3 bucket
##'
##' Deletes all keys in the designated bucket
##' @param bucketName Name of the bucket to be emptied
##' @author James "JD" Long
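##' @examples
##' \dontrun{
##' ## illustrative sketch; assumes credentials were set with setCredentials()
##' emptyS3Bucket("my-segue-bucket")
##' }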
##' @export
emptyS3Bucket <- function(bucketName){
  tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
  s3 <- tx$getAmazonS3Client()

  # TODO: need a check to make sure the current user owns the bucket
  #       before trying to delete everything in it
  #       there's some risk this might loop forever if they don't own the bucket
  if (s3$doesBucketExist(bucketName)) {  
    lst <- s3$listObjects(bucketName)
    objSums <- lst$getObjectSummaries()
    listJavaObjs <- .jevalArray(objSums$toArray())
    if (length(listJavaObjs)>0){
      for (i in 1:length(listJavaObjs)) {
        deleteS3Key(bucketName, listJavaObjs[[i]]$getKey()[[1]])
      }
    }
    if (lst$isTruncated()){
      #recursion FTW!
      emptyS3Bucket(bucketName)
    }
  }
}

##' AWS Support Function: Delete an S3 Bucket
##'
##' Does nothing if bucketName does not exist. If the bucket contains keys,
##' all keys are deleted first.
##' @param bucketName the bucket to be deleted
##' @author James "JD" Long
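##' @examples
##' \dontrun{
##' ## illustrative sketch using a placeholder bucket name
##' deleteS3Bucket("my-segue-bucket")
##' }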
##' @export
deleteS3Bucket <- function(bucketName){
  tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
  s3 <- tx$getAmazonS3Client()
  if (s3$doesBucketExist(bucketName) == TRUE) {
    emptyS3Bucket(bucketName)
    tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
    s3 <- tx$getAmazonS3Client()
    s3$deleteBucket(bucketName)
  }
}

##' AWS Support Function: Creates an S3 Bucket
##'
##' Creates an S3 bucket. If a bucket with the same name already exists,
##' no bucket is created and a warning is issued.
##' @param bucketName string of the name of the bucket to be created
##' @author James "JD" Long
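##' @examples
##' \dontrun{
##' ## sketch only; S3 bucket names must be globally unique, this one is a placeholder
##' makeS3Bucket("my-segue-bucket")
##' }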
##' @export
makeS3Bucket <- function(bucketName){
    #awsCreds <- get("awsCreds", envir = segue.env)
    tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
    s3 <- tx$getAmazonS3Client()
    #test if the bucket exists; if not,  make bucket
    if (s3$doesBucketExist(bucketName) == FALSE) {
      s3$createBucket(bucketName)
    } else {
      warning("Unable to Create Bucket. Bucket with same name already exists.", call. = FALSE)
    }
}

##' AWS Support Function: Uploads a local file to an S3 Bucket
##'
##' If bucketName does not exist, it is created and a warning is issued.
##' @param bucketName destination bucket
##' @param localFile local file to be uploaded
##' @author James "JD" Long
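##' @examples
##' \dontrun{
##' ## sketch: upload a local file to a placeholder bucket
##' uploadS3File("my-segue-bucket", "/tmp/emrData.RData")
##' }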
##' @export
uploadS3File <- function(bucketName, localFile){
    tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
    s3 <- tx$getAmazonS3Client()
    fileToUpload <-  new(File, localFile)
    request <- new(com.amazonaws.services.s3.model.PutObjectRequest, bucketName, fileToUpload$getName(), fileToUpload)
    s3$putObject(request)
}

##' AWS Support Function: Downloads a key from an S3 Bucket into a local file.
##'
##' Pulls a key (file) from a bucket into a localFile. If the keyName = ".all" then
##' all files from the bucket are pulled and localFile should be a
##' directory name. Ignores "sub directories" in buckets. 
##' @param bucketName destination bucket
##' @param keyName key to download. ".all" to pull all keys
##' @param localFile local file name or path if ".all" is called for keyName
##' @author James "JD" Long
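##' @examples
##' \dontrun{
##' ## sketch with placeholder names: pull one key, then every key in a bucket
##' downloadS3File("my-segue-bucket", "results/part-00000", "/tmp/part-00000")
##' downloadS3File("my-segue-bucket", ".all", "/tmp/segue-results")
##' }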
##' @export
downloadS3File <- function(bucketName, keyName, localFile){
    tx <- new(com.amazonaws.services.s3.transfer.TransferManager, awsCreds)
    s3 <- tx$getAmazonS3Client()
    if (keyName != ".all") {
      request <- new(com.amazonaws.services.s3.model.GetObjectRequest, bucketName, keyName)
      theObject <- s3$getObject(request, new(java.io.File, localFile))
    } else {
     # this will only pull the first page of listings
     # so if there are a lot of files it won't grab them all
     # 
     # TODO: make it pull multiple pages of files
     # TODO: pull subdirectories too
      system(paste("mkdir", localFile), ignore.stderr = TRUE)
      lst <- s3$listObjects(bucketName)
      objSums <- lst$getObjectSummaries()
      listJavaObjs <- .jevalArray(objSums$toArray())
      if (length(listJavaObjs)>0){
        for (i in 1:length(listJavaObjs)) {
          # if statement here just to filter out subdirs
          key <- listJavaObjs[[i]]$getKey()[[1]]
          #if ( length( unlist(strsplit(key, split="/")) ) == 1) {
            if (substring( key, nchar( key ) - 7, nchar( key ) )  != "$folder$") {
              localFullFile <- paste(localFile, "/", listJavaObjs[[i]]$getKey()[[1]], sep="")
              downloadS3File(bucketName, listJavaObjs[[i]]$getKey()[[1]], localFullFile)
            }
          #}
        }
      }
    }
}
  
##' Creates the configuration object, uploads needed files, and starts
##' a Segue Hadoop cluster on Elastic Map Reduce. 
##'
##' The needed files are uploaded to S3 and the EMR nodes are started.
##' @param numInstances number of nodes (EC2 instances)
##' @param cranPackages vector of string names of CRAN packages to load on each cluster node
##' @param customPackages vector of string file names of custom packages to load on each cluster node
##' @param filesOnNodes vector of string names of full path of files to be loaded on each node.
##' Files will be loaded into the local
##' path (i.e. ./file) on each node. 
##' @param rObjectsOnNodes a named list of R objects which will be passed to the R
##' session on the worker nodes. Be sure the list has names. The list will be attached
##' on the remote nodes using attach(rObjectsOnNodes). If your list does not have names,
##' this will fail.
##' @param enableDebugging T/F whether EMR debugging should be enabled
##' @param instancesPerNode Number of R instances per node. Default of NULL uses AWS defaults.
##' @param masterInstanceType EC2 instance type for the master node
##' @param slaveInstanceType EC2 instance type for the slave nodes
##' @param location EC2 location name for the cluster
##' @param ec2KeyName EC2 Key used for logging into the main node. Use the user name 'hadoop'
##' @param copy.image T/F whether to copy the entire local environment to the nodes. If this feels
##' fast and loose... you're right! It's nuts. Use it with caution. Very handy when you really need it.
##' @param otherBootstrapActions a list-of-lists of other bootstrap actions to run; child list members
##' are: "name" == unique identifier of this bootstrap action; "localFile" == path to a local script
##' to be uploaded to the temp area in S3; "s3file" == path to an existing script in S3 (won't be
##' uploaded to the temp area); "args" == vector of character arguments. "localFile" and "s3file"
##' are mutually exclusive, but one is required; "args" is optional.
##' @param sourcePackagesToInstall vector of full paths to source packages to be installed on each node
##' @param masterBidPrice Bid price for master server
##' @param slaveBidPrice Bid price for slave (task) server
##' @return an emrlapply() cluster object with appropriate fields
##'   populated. Keep in mind that this creates the cluster and starts the cluster running.
##' @author James "JD" Long
##' @examples
##' \dontrun{
##' myCluster   <- createCluster(numInstances=2,
##'  cranPackages=c("Hmisc", "plyr"))
##' }
##' @export
createCluster <- function(numInstances=2,
                          cranPackages=NULL,
                          customPackages=NULL,
                          filesOnNodes=NULL,
                          rObjectsOnNodes=NULL, 
                          enableDebugging=FALSE,
                          instancesPerNode=NULL,
                          masterInstanceType="m1.large",
                          slaveInstanceType="m1.large",
                          location = "us-east-1c",
                          ec2KeyName=NULL,
                          copy.image=FALSE ,
                          otherBootstrapActions=NULL,
                          sourcePackagesToInstall=NULL,
                          masterBidPrice=NULL,
                          slaveBidPrice=NULL
                          ){
  ## this used to be an argument but not bootstrapping
  ## caused too many problems
  bootStrapLatestR=TRUE 

  clusterObject <- list(numInstances = numInstances,
                        cranPackages = cranPackages,
                        customPackages = customPackages, 
                        enableDebugging = enableDebugging,
                        bootStrapLatestR = bootStrapLatestR,
                        filesOnNodes = filesOnNodes,
                        rObjectsOnNodes = rObjectsOnNodes, 
                        instancesPerNode = instancesPerNode,
                        masterInstanceType = masterInstanceType,
                        slaveInstanceType = slaveInstanceType,
                        location = location,
                        ec2KeyName = ec2KeyName ,
                        copy.image = copy.image ,
                        otherBootstrapActions = otherBootstrapActions,
                        sourcePackagesToInstall = sourcePackagesToInstall,
                        masterBidPrice = masterBidPrice,
                        slaveBidPrice = slaveBidPrice
                        )

  if ( tolower(masterInstanceType) == "m1.small") {
    clusterObject$masterInstanceType <- "m1.large"
    print("WARNING: masterInstanceType set to m1.small. Segue requires 64 bit OS so the masterInstanceType is being changed to m1.large. You will be billed by Amazon accordingly.")
  }
  if ( tolower(slaveInstanceType) == "m1.small") {
    clusterObject$slaveInstanceType <- "m1.large"
    print("WARNING: slaveInstanceType set to m1.small. Segue requires 64 bit OS so the slaveInstanceType is being changed to m1.large. You will be billed by Amazon accordingly.")
  }


  
  localTempDir <- paste(tempdir(),
                        paste(sample(c(0:9, letters), 10, rep=T), collapse=""),
                        "-segue",
                        sep="")
  
  clusterObject$localTempDir <- localTempDir
  clusterObject$localTempDirOut <- paste(localTempDir, "/out", sep="")

  dir.create(localTempDir, showWarnings = TRUE, recursive = TRUE, mode = "0777")
  dir.create(clusterObject$localTempDirOut, showWarnings = TRUE, recursive = TRUE, mode = "0777")
  
  s3TempDir <- tolower(unlist(strsplit(localTempDir, "/"))[length(unlist(strsplit(localTempDir, "/")))])
  deleteS3Bucket(s3TempDir)
  clusterObject$s3TempDir <- s3TempDir
  
  s3TempDirOut <- tolower(paste(s3TempDir , "out", sep=""))
  deleteS3Bucket(s3TempDirOut)
  clusterObject$s3TempDirOut <- s3TempDirOut

  #create the s3 bucket
  ## TODO: error check this
  makeS3Bucket(s3TempDir)
  
  #upload the bootstrapper to S3 
  if (bootStrapLatestR==TRUE) {
    ##TODO: error checking in the uploadS3File function
    uploadS3File(s3TempDir, system.file("bootstrapLatestR.sh", package="segue") )
    uploadS3File(s3TempDir, system.file("update.R", package="segue") )
    
  }
  clusterObject$bootStrapLatestR <- bootStrapLatestR

  ## if copy.image is TRUE then save an image and  use the fileOnNodes
  ## feature to add the saved image to the nodes
  if (copy.image == TRUE) {
    imageFile <- paste( localTempDir, "/local-workspace-image.RData", sep="" )
    save.image( file=imageFile, compress=TRUE )
    clusterObject$filesOnNodes = c(clusterObject$filesOnNodes, imageFile)
  }
  
  ## if customPackages are present, add them to the filesOnNodes
  if (is.null(customPackages) == FALSE) {
    clusterObject$filesOnNodes = c(clusterObject$filesOnNodes, customPackages)
  }
  
  # start cluster
  jobFlowId <- startCluster(clusterObject)
  clusterObject$jobFlowId <- jobFlowId
  return(clusterObject)
}

##' AWS Support Function: Checks the status of a given job on EMR
##'
##' Checks the status of a previously issued job.
##' @param jobFlowId the Job Flow Id of the job to check
##' @return Job Status 
##' @author James "JD" Long
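##' @examples
##' \dontrun{
##' ## sketch: poll a cluster started with createCluster(); myCluster is hypothetical
##' checkStatus(myCluster$jobFlowId)
##' }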
##' @export
checkStatus <- function(jobFlowId){
  service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )
  request <- new( com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsRequest )
  detailsList <- new( java.util.ArrayList )
  detailsList$add(jobFlowId)
  request$setJobFlowIds(detailsList)
  descriptions <- as.list(service$describeJobFlows(request)$getJobFlows())
  descriptions[[1]]$getExecutionStatusDetail()$getState()
}

##' AWS Support Function: Checks the status of the last step of a given job on EMR
##'
##' Checks the status of a previously issued step.
##' @param jobFlowId the Job Flow Id of the job to check
##' @return Status of the last step 
##' @author James "JD" Long
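##' @examples
##' \dontrun{
##' ## sketch: check the most recent step of a hypothetical running cluster
##' checkLastStepStatus(myCluster$jobFlowId)
##' }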
##' @export
checkLastStepStatus <- function(jobFlowId){
  service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )
  request <- new( com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsRequest )
  detailsList <- new( java.util.ArrayList )
  detailsList$add(jobFlowId)
  request$setJobFlowIds(detailsList)
  descriptions <- as.list(service$describeJobFlows(request)$getJobFlows())
  #descriptions[[1]]$getExecutionStatusDetail()$getState()

  steps <- as.list(descriptions[[1]]$getSteps())
  step <- steps[[length(steps)]] #grab the last step only
  status <- step$getExecutionStatusDetail()
  status$getState()
}


##' Starts a cluster on Amazon's EMR service
##' 
##' After a cluster has been defined with createCluster() this function actually
##' starts the machines running. Currently exported, but soon will be internal only.
##' 
##' @param clusterObject cluster object to start
##' @return a Job Flow ID
##' 
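##' @examples
##' \dontrun{
##' ## normally called for you by createCluster(); shown here only as a sketch
##' jobFlowId <- startCluster(myCluster)
##' }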
##' @export
startCluster <- function(clusterObject){
  numInstances     <- clusterObject$numInstances
  s3TempDir        <- clusterObject$s3TempDir 
  s3TempDirOut     <- clusterObject$s3TempDirOut
  bootStrapLatestR <- clusterObject$bootStrapLatestR
  verbose          <- TRUE
 
  service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )
  request <- new( com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest )
  conf    <- new( com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig )

  #creates the bootstrap list
  bootStrapList <- new( java.util.ArrayList )
  
  if (bootStrapLatestR == TRUE) {
   scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)
   scriptBootActionConfig$setPath(paste("s3://", s3TempDir, "/bootstrapLatestR.sh", sep=""))
  
   bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
     with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
     with( bootStrapConfig, setName("R-InstallLatest"))
 
   bootStrapList$add(bootStrapConfig)

   ## update packages
   scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)
   scriptBootActionConfig$setPath(paste("s3://", s3TempDir, "/update.R", sep=""))
  
   bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
     with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
     with( bootStrapConfig, setName("R-UpdatePackages"))
 
   bootStrapList$add(bootStrapConfig)
   
  }

  ## handle additional bootstrap actions, if requested.
  if ( ! is.null(clusterObject$otherBootstrapActions) ){
    ## TODO: more graceful exit here? or would stopifnot() be appropriate, in this case?
    stopifnot( "list" == class(clusterObject$otherBootstrapActions) )

    invisible( sapply( clusterObject$otherBootstrapActions , function( action ){
      scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)

      ## are we uploading a local file to run? or will we use a script that already exists in
      ## (a non-temporary) S3 bucket?
      if( ! is.null( action$localFile ) ){
        uploadS3File(clusterObject$s3TempDir , action$localFile)
        scriptBootActionConfig$setPath(paste("s3://", clusterObject$s3TempDir, "/" , basename( action$localFile ) , sep=""))
      }else if( ! is.null( action$s3file ) ){
        scriptBootActionConfig$setPath(action$s3file)
      }

      if( ! is.null( action$args ) ){
        ## TODO: proper quoting around args? or leave that for caller?
        argsAsList <- new( java.util.ArrayList )
        sapply( action$args , function(item){ argsAsList$add(item) } )
        scriptBootActionConfig$withArgs(argsAsList)
      }

  
      bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
        with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
        with( bootStrapConfig, setName(action$name))

      bootStrapList$add(bootStrapConfig)
    
    } ) )
  }

  if (is.null(clusterObject$filesOnNodes) == FALSE) { # putting files on each node
    print("INFO: You have selected files to be put on each node. These files are being uploaded to S3.")
    ## build a batch file that includes each element of filesOnNodes
    ## then add the batch file as a boot strap action

    ## open the output file (bootStrapFiles.sh) in clusterObject$tempDir
    ## open an output file connection
    outfile <- file( paste( clusterObject$localTempDir, "/bootStrapFiles.sh", sep="" ), "w" )  
    cat("#!/bin/bash", "", file = outfile, sep = "\n")
    cat("mkdir /tmp/segue-upload/", "", file = outfile, sep = "\n")
     ## for each element in filesOnNodes add a hadoop -fs line
    for ( file in clusterObject$filesOnNodes ){
      remotePath <- paste( "/tmp/segue-upload/", tail(strsplit(file,"/")[[1]], 1), sep="" )
      fileName <- tail(strsplit(file,"/")[[1]], 1)
      s3Path <- paste( "s3://", clusterObject$s3TempDir, "/", fileName, sep="" )
      cat( paste( "hadoop fs -get ", s3Path, remotePath)
          , file = outfile, sep = "\n" )
      cat( "\n", file = outfile )
      
      # copy each file to S3
      uploadS3File( clusterObject$s3TempDir, file )
    }
    close( outfile )
     # copy bootStrapFiles.sh to clusterObject$s3TempDir
    uploadS3File( clusterObject$s3TempDir, paste( clusterObject$localTempDir, "/bootStrapFiles.sh", sep="" ) )

     # add a bootstrap action that runs bootStrapFiles.sh
   scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)
   scriptBootActionConfig$setPath(paste("s3://", s3TempDir, "/bootStrapFiles.sh", sep=""))
  
   bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
     with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
     with( bootStrapConfig, setName("RBootStrapFiles"))
 
   bootStrapList$add(bootStrapConfig)
   print("INFO: Upload of files to S3 is complete.")
  }

  
  if (is.null(clusterObject$sourcePackagesToInstall) == FALSE) {
    print("INFO: Now building sources packages to install and uploading them based on the sourcePackagesToInstall list.")
    ## build a batch file that includes each file in sourcePackagesToInstall
    ## then add the batch file as a boot strap action

    ## open the output file (installSourcePackages.sh) in clusterObject$tempDir
    ## open an output file connection
    outfile <- file( paste( clusterObject$localTempDir, "/installSourcePackages.sh", sep="" ), "w" )  
    cat("#!/bin/bash", "", file = outfile, sep = "\n")
    cat("mkdir /tmp/segue-source-packages/", "", file = outfile, sep = "\n")
     ## for each element in sourcePackagesToInstall add a hadoop -fs line
    for ( file in clusterObject$sourcePackagesToInstall ){
      remotePath <- paste( "/tmp/segue-source-packages/", tail(strsplit(file,"/")[[1]], 1), sep="" )
      fileName <- tail(strsplit(file,"/")[[1]], 1)
      s3Path <- paste( "s3://", clusterObject$s3TempDir, "/", fileName, sep="" )
      cat( paste( "hadoop fs -get ", s3Path, remotePath)
          , file = outfile, sep = "\n" )
      cat( "\n", file = outfile )
      cat( "sudo R CMD INSTALL ", remotePath, "\n", file = outfile, sep = "" )
      
      # copy each file to S3
      uploadS3File( clusterObject$s3TempDir, file )
    }
    close( outfile )
     # copy installSourcePackages.sh to clusterObject$s3TempDir
    uploadS3File( clusterObject$s3TempDir, paste( clusterObject$localTempDir, "/installSourcePackages.sh", sep="" ) )

     # add a bootstrap action that runs installSourcePackages.sh
   scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)
   scriptBootActionConfig$setPath(paste("s3://", s3TempDir, "/installSourcePackages.sh", sep=""))
  
   bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
     with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
     with( bootStrapConfig, setName("RinstallSourcePackages"))
 
   bootStrapList$add(bootStrapConfig)
   print("INFO: Source packages uploaded.")
  }

  

  if (is.null(clusterObject$instancesPerNode) == FALSE) { # seriously... test this
   scriptBootActionConfig <- new(com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig)
   scriptBootActionConfig$setPath("s3://elasticmapreduce/bootstrap-actions/configure-hadoop")

   argList <- new( java.util.ArrayList )
   argList$add( "-s" )
   argList$add( paste( "mapred.tasktracker.map.tasks.maximum=", clusterObject$instancesPerNode, sep="") )
   argList$add( "-s" )
   argList$add( paste( "mapred.tasktracker.reduce.tasks.maximum=", clusterObject$instancesPerNode, sep="") )
 
   scriptBootActionConfig$setArgs( argList )
                                  
   bootStrapConfig <- new( com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig)
     with( bootStrapConfig, setScriptBootstrapAction(scriptBootActionConfig))
     with( bootStrapConfig, setName("SetInstancePerNode"))
 
   bootStrapList$add(bootStrapConfig)    
  }
          
   ## this adds the bootstrap to the request
  
   request$setBootstrapActions(bootStrapList)
  
  if ( is.null( clusterObject$ec2KeyName ) != TRUE ) {
      conf$setEc2KeyName(clusterObject$ec2KeyName)
  }
  
  
  instanceGroups <- .jnew("java/util/Vector")
  if (!is.null(clusterObject$masterBidPrice)) {
      masterGroupConf <- new( com.amazonaws.services.elasticmapreduce.model.InstanceGroupConfig )
      masterGroupConf$setInstanceCount(new(Integer, as.character(1)))
      masterGroupConf$setInstanceRole(new(String, as.character("MASTER"))) 
      masterGroupConf$setInstanceType(new(String, as.character(clusterObject$masterInstanceType))) 
      masterGroupConf$setMarket(new(String, as.character("SPOT"))) 
      masterGroupConf$setBidPrice(new(String, as.character(clusterObject$masterBidPrice))) 
      instanceGroups$add(masterGroupConf)
  }
  if (!is.null(clusterObject$slaveBidPrice)) {
      slaveGroupConf <- new( com.amazonaws.services.elasticmapreduce.model.InstanceGroupConfig )
      slaveGroupConf$setInstanceCount(new(Integer, as.character(numInstances - 1)))
      slaveGroupConf$setInstanceRole(new(String, as.character("CORE"))) 
      slaveGroupConf$setInstanceType(new(String, as.character(clusterObject$slaveInstanceType))) 
      slaveGroupConf$setMarket(new(String, as.character("SPOT"))) 
      slaveGroupConf$setBidPrice(new(String, as.character(clusterObject$slaveBidPrice))) 
      instanceGroups$add(slaveGroupConf)
  }
  if (!is.null(clusterObject$masterBidPrice) || !is.null(clusterObject$slaveBidPrice)) {
      conf$setInstanceGroups(instanceGroups)
  } else {
      # Must configure instances either using instance count, 
      # master and slave instance type or instance groups but not both
      conf$setInstanceCount(new(Integer, as.character(numInstances)))
      conf$setMasterInstanceType( clusterObject$masterInstanceType )      
      conf$setSlaveInstanceType( clusterObject$slaveInstanceType )
  }
  conf$setKeepJobFlowAliveWhenNoSteps(new(Boolean, TRUE))

  conf$setPlacement(new(com.amazonaws.services.elasticmapreduce.model.PlacementType, clusterObject$location))
  conf$setHadoopVersion("0.20.205")
  request$setInstances(conf)
  request$setLogUri(paste("s3://", s3TempDir, "-logs", sep=""))
  jobFlowName <- paste("RJob-", date(), sep="")
  request$setName(jobFlowName)
  request$setAmiVersion("2.0.4")
  

  result <- service$runJobFlow(request)

  ## seems like this sleep should not be needed... but otherwise
  ## getJobFlowId() does not get the correct jobflowid

  Sys.sleep(15)
  jobFlowId <- result$getJobFlowId()

  currentStatus <- checkStatus(jobFlowId)
  while (currentStatus  %in% c("COMPLETED", "FAILED", "TERMINATED", "WAITING", "CANCELLED")  == FALSE) {
    Sys.sleep(30)
    currentStatus <- checkStatus(jobFlowId)
    message(paste(currentStatus, " - ", Sys.time(), sep="" ))
  }
 
  if (currentStatus == "WAITING") {
    message("Your Amazon EMR Hadoop Cluster is ready for action. \nRemember to terminate your cluster with stopCluster().\nAmazon is billing you!")
  }
  return(jobFlowId)
  
  ## TODO: need to catch situations where the cluster failed
}

##' Stops a running cluster
##'
##' Stops a running cluster and deletes its temporary S3 buckets
##' 
##' @return nothing
##' @author James "JD" Long
##' @param clusterObject a cluster object to stop
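##' @examples
##' \dontrun{
##' ## sketch: terminate a cluster object returned earlier by createCluster()
##' stopCluster(myCluster)
##' }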
##' @export
stopCluster <- function(clusterObject){
  jobFlowId <- clusterObject$jobFlowId

  service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )
  request <- new( com.amazonaws.services.elasticmapreduce.model.TerminateJobFlowsRequest )
  detailsList <- new( java.util.ArrayList )
  detailsList$add(jobFlowId)
  request$withJobFlowIds(detailsList)
  service$terminateJobFlows(request)

  ## I have no idea why AWS needs sleep before
  ## I can delete the temp dirs, but these fail
  ## if I don't have the sleep
  Sys.sleep(10)
  try( deleteS3Bucket( clusterObject$s3TempDir ), silent=TRUE )
  try( deleteS3Bucket( paste( clusterObject$s3TempDir, "-logs", sep="" ) ), silent=TRUE )
  try( deleteS3Bucket( clusterObject$s3TempDirOut ), silent=TRUE )

  ## something weird is going on... I have to do this twice or it
  ## does not fully delete the s3TempDir's subdirectory
  ## will need to give this some attention later
  Sys.sleep(15)
  try( deleteS3Bucket( clusterObject$s3TempDir ), silent=TRUE )
  try( deleteS3Bucket( paste( clusterObject$s3TempDir, "-logs", sep="" ) ), silent=TRUE )
  try( deleteS3Bucket( clusterObject$s3TempDirOut ), silent=TRUE )
 
}

##' Submits a job to a running cluster
##' 
##' After a cluster has been started this function submits jobs to that cluster.
##' If a job is submitted with enableDebugging=TRUE, all jobs submitted to that
##' cluster will also have debugging enabled. To turn debugging off, the cluster
##' must be stopped and restarted.
##' 
##' 
##' @param clusterObject a cluster object to submit to
##' @param stopClusterOnComplete set to TRUE if you want the cluster to be shut down
##' after the job completes
##' @param taskTimeout maximum time a single unit of work can run (in minutes)
##' @return Execution status of this job
##' 
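##' @examples
##' \dontrun{
##' ## sketch: emrlapply() calls this internally; myCluster is a createCluster() object
##' submitJob(myCluster, stopClusterOnComplete = FALSE, taskTimeout = 10)
##' }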
##' @export
submitJob <- function(clusterObject, stopClusterOnComplete=FALSE, taskTimeout=10){

  jobFlowId       <- clusterObject$jobFlowId
  s3TempDir       <- clusterObject$s3TempDir
  s3TempDirOut    <- clusterObject$s3TempDirOut
  enableDebugging <- clusterObject$enableDebugging
 
  try(deleteS3Bucket(s3TempDirOut), silent=TRUE)
  unlink(clusterObject$localTempDirOut, recursive = TRUE)
  dir.create(clusterObject$localTempDirOut)
  
  jobFlowId <- clusterObject$jobFlowId

  if (enableDebugging==TRUE){
    service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )
    hadoopJarStep <- new(com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig)
    hadoopJarStep$setJar("s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar")
    argList <- new( java.util.ArrayList )
    argList$add( "s3://us-east-1.elasticmapreduce/libs/state-pusher/0.1/fetch" )
    hadoopJarStep$setArgs(argList)
    stepName <- format(Sys.time(), "%Y-%m-%d_%H:%M:%OS5") 
    stepConfig <- new(com.amazonaws.services.elasticmapreduce.model.StepConfig, stepName, hadoopJarStep)
    stepConfig$setActionOnFailure("CANCEL_AND_WAIT")
    stepList <- new( java.util.ArrayList )
    stepList$add( stepConfig )
    request <- new( com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest, jobFlowId, stepList)
    requestResult <- service$addJobFlowSteps(request)
  }
  
  service <- new( com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient, awsCreds )

  hadoopJarStep <- new(com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig)
  hadoopJarStep$setJar("/home/hadoop/contrib/streaming/hadoop-streaming.jar")
  argList <- new( java.util.ArrayList )

  # the task timeout is passed to us in minutes, but AWS/EMR expects it in milliseconds
  taskTimeoutMilliseconds <- taskTimeout * 60 * 1000
  argList$add( "-D" )
  argList$add( paste( "mapred.task.timeout=" , taskTimeoutMilliseconds , sep="" ) )

  argList$add( "-cacheFile" )
  argList$add( paste("s3n://", s3TempDir, "/emrData.RData#emrData.RData", sep=""))
  argList$add( "-input" )
  argList$add( paste("s3n://", s3TempDir, "/stream.txt", sep="") )
  argList$add( "-output" )
  argList$add( paste("s3n://", s3TempDirOut, "/results", sep="") )
  argList$add( "-mapper" )
  argList$add( "cat" )
  argList$add( "-reducer" )
  argList$add( paste("s3n://", s3TempDir, "/mapper.R", sep="" ) )


  hadoopJarStep$setArgs(argList)

  stepName <- format(Sys.time(), "%Y-%m-%d_%H:%M:%OS5") 
  
  stepConfig <- new(com.amazonaws.services.elasticmapreduce.model.StepConfig, stepName, hadoopJarStep)
  stepConfig$setActionOnFailure("CANCEL_AND_WAIT")
  
  stepList <- new( java.util.ArrayList )
  stepList$add( stepConfig )
  request <- new( com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest, jobFlowId, stepList)

  try(deleteS3Bucket(clusterObject$s3TempDirOut), silent=TRUE)
  
  #submit to EMR happens here
  service$addJobFlowSteps(request)

  Sys.sleep(15)

  checkLastStepStatus(jobFlowId)

  Sys.sleep(15)

  currentStatus <- checkStatus(jobFlowId)
  while (currentStatus  %in% c("COMPLETED", "FAILED", "TERMINATED", "WAITING", "CANCELLED")  == FALSE) {
    Sys.sleep(30)
    currentStatus <- checkStatus(jobFlowId)
    message(paste(currentStatus, " - ", Sys.time(), sep="" ))
  }
  if (stopClusterOnComplete==TRUE) {
    stopCluster(clusterObject)
  }
  return(currentStatus)
  
}