R/pairRDD.R

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Operations supported on RDDs containing pairs (i.e. key, value)
#' @include generics.R jobj.R RDD.R
NULL

############ Actions and Transformations ############

#' Look up elements of a key in an RDD
#'
#' @description
#' \code{lookup} returns a list of values in this RDD for the given key.
#'
#' @param x The RDD to look up the key in.
#' @param key The key to look up.
#' @return a list of values in this RDD for the given key
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(c(1, 1), c(2, 2), c(1, 3))
#' rdd <- parallelize(sc, pairs)
#' lookup(rdd, 1) # list(1, 3)
#'}
# nolint end
#' @rdname lookup
#' @aliases lookup,RDD-method
#' @noRd
setMethod("lookup",
          signature(x = "RDD", key = "ANY"),
          function(x, key) {
            partitionFunc <- function(part) {
              filtered <- part[unlist(lapply(part, function(i) { identical(key, i[[1]]) }))]
              lapply(filtered, function(i) { i[[2]] })
            }
            valsRDD <- lapplyPartition(x, partitionFunc)
            collectRDD(valsRDD)
          })

#' Count the number of elements for each key, and return the result to the
#' master as lists of (key, count) pairs.
#'
#' Same as countByKey in Spark.
#'
#' @param x The RDD whose keys are to be counted.
#' @return list of (key, count) pairs, where count is the number of occurrences
#'         of each key in the RDD.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
#' countByKey(rdd) # ("a", 2L), ("b", 1L)
#'}
# nolint end
#' @rdname countByKey
#' @aliases countByKey,RDD-method
#' @noRd
setMethod("countByKey",
          signature(x = "RDD"),
          function(x) {
            keys <- lapply(x, function(item) { item[[1]] })
            countByValue(keys)
          })

#' Return an RDD with the keys of each tuple.
#'
#' @param x The RDD from which the key of each tuple is returned.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
#' collectRDD(keys(rdd)) # list(1, 3)
#'}
# nolint end
#' @rdname keys
#' @aliases keys,RDD
#' @noRd
setMethod("keys",
          signature(x = "RDD"),
          function(x) {
            func <- function(k) {
              k[[1]]
            }
            lapply(x, func)
          })

#' Return an RDD with the values of each tuple.
#'
#' @param x The RDD from which the value of each tuple is returned.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
#' collectRDD(values(rdd)) # list(2, 4)
#'}
# nolint end
#' @rdname values
#' @aliases values,RDD
#' @noRd
setMethod("values",
          signature(x = "RDD"),
          function(x) {
            func <- function(v) {
              v[[2]]
            }
            lapply(x, func)
          })

#' Applies a function to all values of the elements, without modifying the keys.
#'
#' The same as \code{mapValues()} in Spark.
#'
#' @param X The RDD to apply the transformation to.
#' @param FUN The transformation to apply to the value of each element.
#' @return a new RDD created by the transformation.
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, 1:10)
#' makePairs <- lapply(rdd, function(x) { list(x, x) })
#' collectRDD(mapValues(makePairs, function(x) { x * 2 }))
#' # Output: list(list(1, 2), list(2, 4), list(3, 6), ...)
#'}
#' @rdname mapValues
#' @aliases mapValues,RDD,function-method
#' @noRd
setMethod("mapValues",
          signature(X = "RDD", FUN = "function"),
          function(X, FUN) {
            func <- function(x) {
              list(x[[1]], FUN(x[[2]]))
            }
            lapply(X, func)
          })

#' Pass each value in the key-value pair RDD through a flatMap function without
#' changing the keys; this also retains the original RDD's partitioning.
#'
#' The same as \code{flatMapValues()} in Spark.
#'
#' @param X The RDD to apply the transformation to.
#' @param FUN The transformation to apply to the value of each element.
#' @return a new RDD created by the transformation.
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, c(1,2)), list(2, c(3,4))))
#' collectRDD(flatMapValues(rdd, function(x) { x }))
#' # Output: list(list(1, 1), list(1, 2), list(2, 3), list(2, 4))
#'}
#' @rdname flatMapValues
#' @aliases flatMapValues,RDD,function-method
#' @noRd
setMethod("flatMapValues",
          signature(X = "RDD", FUN = "function"),
          function(X, FUN) {
            flatMapFunc <- function(x) {
              lapply(FUN(x[[2]]), function(v) { list(x[[1]], v) })
            }
            flatMap(X, flatMapFunc)
          })

############ Shuffle Functions ############

#' Partition an RDD by key
#'
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
#' For each element of this RDD, the partition function is used to compute a
#' hash of the key, and the RDD is partitioned using this hash value.
#'
#' @param x The RDD to partition. Should be an RDD where each element is
#'             list(K, V) or c(K, V).
#' @param numPartitions Number of partitions to create.
#' @param ... Other optional arguments to partitionBy.
#'
#' @param partitionFunc The partition function to use. Uses a default hashCode
#'                      function if not provided.
#' @return An RDD partitioned using the specified partitioner.
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' parts <- partitionByRDD(rdd, 2L)
#' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
#'}
#' @rdname partitionBy
#' @aliases partitionBy,RDD,integer-method
#' @noRd
setMethod("partitionByRDD",
          signature(x = "RDD"),
          function(x, numPartitions, partitionFunc = hashCode) {
            stopifnot(is.numeric(numPartitions))

            partitionFunc <- cleanClosure(partitionFunc)
            serializedHashFuncBytes <- serialize(partitionFunc, connection = NULL)

            packageNamesArr <- serialize(.sparkREnv$.packages,
                                         connection = NULL)
            broadcastArr <- lapply(ls(.broadcastNames),
                                   function(name) { get(name, .broadcastNames) })
            jrdd <- getJRDD(x)

            # We create a PairwiseRRDD that extends RDD[(Int, Array[Byte])],
            # where the key is the target partition number, the value is
            # the content (key-val pairs).
            pairwiseRRDD <- newJObject("org.apache.spark.api.r.PairwiseRRDD",
                                       callJMethod(jrdd, "rdd"),
                                       numToInt(numPartitions),
                                       serializedHashFuncBytes,
                                       getSerializedMode(x),
                                       packageNamesArr,
                                       broadcastArr,
                                       callJMethod(jrdd, "classTag"))

            # Create a corresponding partitioner.
            rPartitioner <- newJObject("org.apache.spark.HashPartitioner",
                                       numToInt(numPartitions))

            # Call partitionBy on the obtained PairwiseRDD.
            javaPairRDD <- callJMethod(pairwiseRRDD, "asJavaPairRDD")
            javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner)

            # Call .values() on the result to get back the final result, the
            # shuffled actual content key-val pairs.
            r <- callJMethod(javaPairRDD, "values")

            RDD(r, serializedMode = "byte")
          })
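
# A usage sketch (not run) of supplying a custom partition function; assumes a
# SparkContext `sc` from sparkR.init(). The function maps a key to an integer,
# and the key's target partition is (roughly speaking) that value modulo
# numPartitions, so even and odd integer keys are separated here:
#
#   rdd <- parallelize(sc, list(list(1, "a"), list(2, "b"), list(3, "c")))
#   parts <- partitionByRDD(rdd, 2L, function(key) { as.integer(key) %% 2L })
#   collectPartition(parts, 0L) # the even keys, i.e. list(list(2, "b"))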

#' Group values by key
#'
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
#' and groups the values for each key in the RDD into a single sequence.
#'
#' @param x The RDD to group. Should be an RDD where each element is
#'             list(K, V) or c(K, V).
#' @param numPartitions Number of partitions to create.
#' @return An RDD where each element is list(K, list(V))
#' @seealso reduceByKey
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' parts <- groupByKey(rdd, 2L)
#' grouped <- collectRDD(parts)
#' grouped[[1]] # Should be a list(1, list(2, 4))
#'}
#' @rdname groupByKey
#' @aliases groupByKey,RDD,integer-method
#' @noRd
setMethod("groupByKey",
          signature(x = "RDD", numPartitions = "numeric"),
          function(x, numPartitions) {
            shuffled <- partitionByRDD(x, numPartitions)
            groupVals <- function(part) {
              vals <- new.env()
              keys <- new.env()
              pred <- function(item) exists(item$hash, keys)
              appendList <- function(acc, i) {
                addItemToAccumulator(acc, i)
                acc
              }
              makeList <- function(i) {
                acc <- initAccumulator()
                addItemToAccumulator(acc, i)
                acc
              }
              # Each item in the partition is list of (K, V)
              lapply(part,
                     function(item) {
                       item$hash <- as.character(hashCode(item[[1]]))
                       updateOrCreatePair(item, keys, vals, pred,
                                          appendList, makeList)
                     })
              # extract out data field
              vals <- eapply(vals,
                             function(i) {
                               length(i$data) <- i$counter
                               i$data
                             })
              # Every key in the environment contains a list
              # Convert that to list(K, Seq[V])
              convertEnvsToList(keys, vals)
            }
            lapplyPartition(shuffled, groupVals)
          })
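
# The grouping above uses R environments as hash maps. A standalone sketch of
# the same pattern (the real code keys the environments by hashCode(key) and
# uses accumulators so that repeated appends stay cheap):
#
#   env <- new.env()
#   for (pair in list(list("a", 1), list("b", 2), list("a", 3))) {
#     k <- pair[[1]]
#     assign(k, c(get0(k, envir = env), list(pair[[2]])), envir = env)
#   }
#   as.list(env) # $a is list(1, 3), $b is list(2)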

#' Merge values by key
#'
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
#' and merges the values for each key using an associative and commutative reduce function.
#'
#' @param x The RDD to reduce by key. Should be an RDD where each element is
#'             list(K, V) or c(K, V).
#' @param combineFunc The associative and commutative reduce function to use.
#' @param numPartitions Number of partitions to create.
#' @return An RDD where each element is list(K, V') where V' is the merged
#'         value
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' parts <- reduceByKey(rdd, "+", 2L)
#' reduced <- collectRDD(parts)
#' reduced[[1]] # Should be a list(1, 6)
#'}
#' @rdname reduceByKey
#' @aliases reduceByKey,RDD,integer-method
#' @noRd
setMethod("reduceByKey",
          signature(x = "RDD", combineFunc = "ANY", numPartitions = "numeric"),
          function(x, combineFunc, numPartitions) {
            reduceVals <- function(part) {
              vals <- new.env()
              keys <- new.env()
              pred <- function(item) exists(item$hash, keys)
              lapply(part,
                     function(item) {
                       item$hash <- as.character(hashCode(item[[1]]))
                       updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
                     })
              convertEnvsToList(keys, vals)
            }
            locallyReduced <- lapplyPartition(x, reduceVals)
            shuffled <- partitionByRDD(locallyReduced, numToInt(numPartitions))
            lapplyPartition(shuffled, reduceVals)
          })
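
# A usage sketch (not run) with a user-defined combine function; assumes a
# SparkContext `sc` from sparkR.init(). Because the function runs both before
# and after the shuffle, it must be associative and commutative (max is both):
#
#   rdd <- parallelize(sc, list(list("a", 1), list("a", 5), list("b", 2)))
#   collectRDD(reduceByKey(rdd, function(v1, v2) { max(v1, v2) }, 2L))
#   # list(list("a", 5), list("b", 2)), in some order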

#' Merge values by key locally
#'
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
#' and merges the values for each key using an associative and commutative reduce function,
#' but returns the results immediately to the driver as an R list.
#'
#' @param x The RDD to reduce by key. Should be an RDD where each element is
#'             list(K, V) or c(K, V).
#' @param combineFunc The associative and commutative reduce function to use.
#' @return A list of elements of type list(K, V') where V' is the merged value for each key
#' @seealso reduceByKey
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' reduced <- reduceByKeyLocally(rdd, "+")
#' reduced # list(list(1, 6), list(1.1, 3))
#'}
# nolint end
#' @rdname reduceByKeyLocally
#' @aliases reduceByKeyLocally,RDD,integer-method
#' @noRd
setMethod("reduceByKeyLocally",
          signature(x = "RDD", combineFunc = "ANY"),
          function(x, combineFunc) {
            reducePart <- function(part) {
              vals <- new.env()
              keys <- new.env()
              pred <- function(item) exists(item$hash, keys)
              lapply(part,
                     function(item) {
                       item$hash <- as.character(hashCode(item[[1]]))
                       updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
                     })
              list(list(keys, vals)) # return hash to avoid re-compute in merge
            }
            mergeParts <- function(accum, x) {
              pred <- function(item) {
                exists(item$hash, accum[[1]])
              }
              lapply(ls(x[[1]]),
                     function(name) {
                       item <- list(x[[1]][[name]], x[[2]][[name]])
                       item$hash <- name
                       updateOrCreatePair(item, accum[[1]], accum[[2]], pred, combineFunc, identity)
                     })
              accum
            }
            reduced <- mapPartitions(x, reducePart)
            merged <- reduce(reduced, mergeParts)
            convertEnvsToList(merged[[1]], merged[[2]])
          })

#' Combine values by key
#'
#' Generic function to combine the elements for each key using a custom set of
#' aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)],
#' for a "combined type" C. Note that V and C can be different -- for example, one
#' might group an RDD of type (Int, Int) into an RDD of type (Int, Seq[Int]).
#' Users provide three functions:
#' \itemize{
#'   \item createCombiner, which turns a V into a C (e.g., creates a one-element list)
#'   \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list)
#'   \item mergeCombiners, to combine two C's into a single one (e.g., concatenates
#'    two lists).
#' }
#'
#' @param x The RDD to combine. Should be an RDD where each element is
#'             list(K, V) or c(K, V).
#' @param createCombiner Create a combiner (C) given a value (V)
#' @param mergeValue Merge the given value (V) with an existing combiner (C)
#' @param mergeCombiners Merge two combiners and return a new combiner
#' @param numPartitions Number of partitions to create.
#' @return An RDD where each element is list(K, C) where C is the combined type
#' @seealso groupByKey, reduceByKey
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' parts <- combineByKey(rdd, function(x) { x }, "+", "+", 2L)
#' combined <- collectRDD(parts)
#' combined[[1]] # Should be a list(1, 6)
#'}
# nolint end
#' @rdname combineByKey
#' @aliases combineByKey,RDD,ANY,ANY,ANY,integer-method
#' @noRd
setMethod("combineByKey",
          signature(x = "RDD", createCombiner = "ANY", mergeValue = "ANY",
                    mergeCombiners = "ANY", numPartitions = "numeric"),
          function(x, createCombiner, mergeValue, mergeCombiners, numPartitions) {
            combineLocally <- function(part) {
              combiners <- new.env()
              keys <- new.env()
              pred <- function(item) exists(item$hash, keys)
              lapply(part,
                     function(item) {
                       item$hash <- as.character(hashCode(item[[1]]))
                       updateOrCreatePair(item, keys, combiners, pred, mergeValue, createCombiner)
                     })
              convertEnvsToList(keys, combiners)
            }
            locallyCombined <- lapplyPartition(x, combineLocally)
            shuffled <- partitionByRDD(locallyCombined, numToInt(numPartitions))
            mergeAfterShuffle <- function(part) {
              combiners <- new.env()
              keys <- new.env()
              pred <- function(item) exists(item$hash, keys)
              lapply(part,
                     function(item) {
                       item$hash <- as.character(hashCode(item[[1]]))
                       updateOrCreatePair(item, keys, combiners, pred, mergeCombiners, identity)
                     })
              convertEnvsToList(keys, combiners)
            }
            lapplyPartition(shuffled, mergeAfterShuffle)
          })
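
# A sketch (not run) of the three-function contract, computing a per-key mean
# with a (sum, count) pair as the combined type C; assumes a SparkContext `sc`:
#
#   rdd <- parallelize(sc, list(list("a", 1), list("a", 3), list("b", 2)))
#   sumCounts <- combineByKey(rdd,
#                             function(v) { list(v, 1L) },                          # createCombiner
#                             function(acc, v) { list(acc[[1]] + v,
#                                                     acc[[2]] + 1L) },             # mergeValue
#                             function(c1, c2) { list(c1[[1]] + c2[[1]],
#                                                     c1[[2]] + c2[[2]]) },         # mergeCombiners
#                             2L)
#   lapply(collectRDD(sumCounts),
#          function(p) { list(p[[1]], p[[2]][[1]] / p[[2]][[2]]) })
#   # list(list("a", 2), list("b", 2)), in some order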

#' Aggregate a pair RDD by each key.
#'
#' Aggregate the values of each key in an RDD, using given combine functions
#' and a neutral "zero value". This function can return a different result type,
#' U, than the type of the values in this RDD, V. Thus, we need one operation
#' for merging a V into a U and one operation for merging two U's. The former
#' operation is used for merging values within a partition, and the latter is
#' used for merging values between partitions. To avoid memory allocation, both
#' of these functions are allowed to modify and return their first argument
#' instead of creating a new U.
#'
#' @param x An RDD.
#' @param zeroValue A neutral "zero value".
#' @param seqOp A function to aggregate the values of each key. It may return
#'              a different result type from the type of the values.
#' @param combOp A function to aggregate results of seqOp.
#' @return An RDD containing the aggregation result.
#' @seealso foldByKey, combineByKey
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
#' zeroValue <- list(0, 0)
#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
#' aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
#'   # list(list(1, list(3, 2)), list(2, list(7, 2)))
#'}
# nolint end
#' @rdname aggregateByKey
#' @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
#' @noRd
setMethod("aggregateByKey",
          signature(x = "RDD", zeroValue = "ANY", seqOp = "ANY",
                    combOp = "ANY", numPartitions = "numeric"),
          function(x, zeroValue, seqOp, combOp, numPartitions) {
            createCombiner <- function(v) {
              do.call(seqOp, list(zeroValue, v))
            }

            combineByKey(x, createCombiner, seqOp, combOp, numPartitions)
          })
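
# Note that aggregateByKey reduces to combineByKey with the combiner seeded
# from zeroValue: with the zeroValue and seqOp of the example above,
# createCombiner(5) evaluates to seqOp(list(0, 0), 5), i.e. list(5, 1).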

#' Fold a pair RDD by each key.
#'
#' Aggregate the values of each key in an RDD, using an associative function "func"
#' and a neutral "zero value" which may be added to the result an arbitrary
#' number of times, and must not change the result (e.g., 0 for addition, or
#' 1 for multiplication).
#'
#' @param x An RDD.
#' @param zeroValue A neutral "zero value".
#' @param func An associative function for folding values of each key.
#' @return An RDD containing the aggregation result.
#' @seealso aggregateByKey, combineByKey
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
#' foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
#'}
# nolint end
#' @rdname foldByKey
#' @aliases foldByKey,RDD,ANY,ANY,integer-method
#' @noRd
setMethod("foldByKey",
          signature(x = "RDD", zeroValue = "ANY",
                    func = "ANY", numPartitions = "numeric"),
          function(x, zeroValue, func, numPartitions) {
            aggregateByKey(x, zeroValue, func, func, numPartitions)
          })

############ Binary Functions #############

#' Join two RDDs
#'
#' @description
#' \code{joinRDD} joins two RDDs where every element is of the form list(K, V).
#' The key types of the two RDDs should be the same.
#'
#' @param x An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param y An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param numPartitions Number of partitions to create.
#' @return a new RDD containing all pairs of elements with matching keys in
#'         two input RDDs.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
#' joinRDD(rdd1, rdd2, 2L) # list(list(1, list(1, 2)), list(1, list(1, 3)))
#'}
# nolint end
#' @rdname join-methods
#' @aliases join,RDD,RDD-method
#' @noRd
setMethod("joinRDD",
          signature(x = "RDD", y = "RDD"),
          function(x, y, numPartitions) {
            xTagged <- lapply(x, function(i) { list(i[[1]], list(1L, i[[2]])) })
            yTagged <- lapply(y, function(i) { list(i[[1]], list(2L, i[[2]])) })

            doJoin <- function(v) {
              joinTaggedList(v, list(FALSE, FALSE))
            }

            joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions),
                                    doJoin)
          })
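
# On the example above, the tagging step produces the intermediate pairs
#   list(1, list(1L, 1)), list(2, list(1L, 4))   # from x
#   list(1, list(2L, 2)), list(1, list(2L, 3))   # from y
# groupByKey then gathers all tagged values for each key, and joinTaggedList
# emits one (v, w) pair for every combination of an x-tagged and a y-tagged
# value. The outer joins below reuse this, differing only in whether a missing
# side is filled in with NULL.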

#' Left outer join two RDDs
#'
#' @description
#' \code{leftOuterJoin} left-outer-joins two RDDs where every element is of
#' the form list(K, V). The key types of the two RDDs should be the same.
#'
#' @param x An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param y An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param numPartitions Number of partitions to create.
#' @return For each element (k, v) in x, the resulting RDD will either contain
#'         all pairs (k, (v, w)) for (k, w) in y, or the pair (k, (v, NULL))
#'         if no elements in y have key k.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
#' leftOuterJoin(rdd1, rdd2, 2L)
#' # list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
#'}
# nolint end
#' @rdname join-methods
#' @aliases leftOuterJoin,RDD,RDD-method
#' @noRd
setMethod("leftOuterJoin",
          signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
          function(x, y, numPartitions) {
            xTagged <- lapply(x, function(i) { list(i[[1]], list(1L, i[[2]])) })
            yTagged <- lapply(y, function(i) { list(i[[1]], list(2L, i[[2]])) })

            doJoin <- function(v) {
              joinTaggedList(v, list(FALSE, TRUE))
            }

            joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions), doJoin)
          })

#' Right outer join two RDDs
#'
#' @description
#' \code{rightOuterJoin} right-outer-joins two RDDs where every element is of
#' the form list(K, V). The key types of the two RDDs should be the same.
#'
#' @param x An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param y An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param numPartitions Number of partitions to create.
#' @return For each element (k, w) in y, the resulting RDD will either contain
#'         all pairs (k, (v, w)) for (k, v) in x, or the pair (k, (NULL, w))
#'         if no elements in x have key k.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
#' rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
#' rightOuterJoin(rdd1, rdd2, 2L)
#' # list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
#'}
# nolint end
#' @rdname join-methods
#' @aliases rightOuterJoin,RDD,RDD-method
#' @noRd
setMethod("rightOuterJoin",
          signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
          function(x, y, numPartitions) {
            xTagged <- lapply(x, function(i) { list(i[[1]], list(1L, i[[2]])) })
            yTagged <- lapply(y, function(i) { list(i[[1]], list(2L, i[[2]])) })

            doJoin <- function(v) {
              joinTaggedList(v, list(TRUE, FALSE))
            }

            joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions), doJoin)
          })

#' Full outer join two RDDs
#'
#' @description
#' \code{fullOuterJoin} full-outer-joins two RDDs where every element is of
#' the form list(K, V). The key types of the two RDDs should be the same.
#'
#' @param x An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param y An RDD to be joined. Should be an RDD where each element is
#'             list(K, V).
#' @param numPartitions Number of partitions to create.
#' @return For each element (k, v) in x and (k, w) in y, the resulting RDD
#'         will contain all pairs (k, (v, w)) for both (k, v) in x and
#'         (k, w) in y, or the pair (k, (NULL, w))/(k, (v, NULL)) if no elements
#'         in x/y have key k.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
#' rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
#' fullOuterJoin(rdd1, rdd2, 2L) # list(list(1, list(2, 1)),
#'                               #      list(1, list(3, 1)),
#'                               #      list(2, list(NULL, 4)),
#'                               #      list(3, list(3, NULL)))
# nolint end
#' @rdname join-methods
#' @aliases fullOuterJoin,RDD,RDD-method
#' @noRd
setMethod("fullOuterJoin",
          signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
          function(x, y, numPartitions) {
            xTagged <- lapply(x, function(i) { list(i[[1]], list(1L, i[[2]])) })
            yTagged <- lapply(y, function(i) { list(i[[1]], list(2L, i[[2]])) })

            doJoin <- function(v) {
              joinTaggedList(v, list(TRUE, TRUE))
            }

            joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions), doJoin)
          })

#' For each key k in several RDDs, return a resulting RDD
#' whose values are a list of the values for that key in all RDDs.
#'
#' @param ... Several RDDs.
#' @param numPartitions Number of partitions to create.
#' @return a new RDD where each element is list(K, result), with result a list
#'         holding, for each input RDD, the list of values for key K.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
#' cogroup(rdd1, rdd2, numPartitions = 2L)
#' # list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list())))
#'}
# nolint end
#' @rdname cogroup
#' @aliases cogroup,RDD-method
#' @noRd
setMethod("cogroup",
          "RDD",
          function(..., numPartitions) {
            rdds <- list(...)
            rddsLen <- length(rdds)
            for (i in 1:rddsLen) {
              rdds[[i]] <- lapply(rdds[[i]],
                                  function(x) { list(x[[1]], list(i, x[[2]])) })
            }
            union.rdd <- Reduce(unionRDD, rdds)
            group.func <- function(vlist) {
              res <- list()
              length(res) <- rddsLen
              for (x in vlist) {
                i <- x[[1]]
                acc <- res[[i]]
                # Create an accumulator.
                if (is.null(acc)) {
                  acc <- initAccumulator()
                }
                addItemToAccumulator(acc, x[[2]])
                res[[i]] <- acc
              }
              lapply(res, function(acc) {
                if (is.null(acc)) {
                  list()
                } else {
                  acc$data
                }
              })
            }
            cogroup.rdd <- mapValues(groupByKey(union.rdd, numPartitions),
                                     group.func)
          })
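
# On the example above, tagging each pair with its RDD's position yields
#   list(1, list(1, 1)), list(2, list(1, 4))   # from rdd1 (position 1)
#   list(1, list(2, 2)), list(1, list(2, 3))   # from rdd2 (position 2)
# group.func then splits each key's grouped values back into one list per
# input RDD, substituting list() where an RDD has no values for that key.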

#' Sort a (k, v) pair RDD by k.
#'
#' @param x A (k, v) pair RDD to be sorted.
#' @param ascending A flag to indicate whether the sorting is ascending or descending.
#' @param numPartitions Number of partitions to create.
#' @return An RDD where all (k, v) pair elements are sorted.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(3, 1), list(2, 2), list(1, 3)))
#' collectRDD(sortByKey(rdd)) # list(list(1, 3), list(2, 2), list(3, 1))
#'}
# nolint end
#' @rdname sortByKey
#' @aliases sortByKey,RDD,RDD-method
#' @noRd
setMethod("sortByKey",
          signature(x = "RDD"),
          function(x, ascending = TRUE, numPartitions = SparkR:::getNumPartitionsRDD(x)) {
            rangeBounds <- list()

            if (numPartitions > 1) {
              rddSize <- countRDD(x)
              # constant from Spark's RangePartitioner
              maxSampleSize <- numPartitions * 20
              fraction <- min(maxSampleSize / max(rddSize, 1), 1.0)

              samples <- collectRDD(keys(sampleRDD(x, FALSE, fraction, 1L)))

              # Note: the built-in R sort() function only works on atomic vectors
              samples <- sort(unlist(samples, recursive = FALSE), decreasing = !ascending)

              if (length(samples) > 0) {
                rangeBounds <- lapply(seq_len(numPartitions - 1),
                                      function(i) {
                                        j <- ceiling(length(samples) * i / numPartitions)
                                        samples[j]
                                      })
              }
            }

            rangePartitionFunc <- function(key) {
              partition <- 0

              # TODO: Use binary search instead of linear search, similar with Spark
              while (partition < length(rangeBounds) && key > rangeBounds[[partition + 1]]) {
                partition <- partition + 1
              }

              if (ascending) {
                partition
              } else {
                numPartitions - partition - 1
              }
            }

            partitionFunc <- function(part) {
              sortKeyValueList(part, decreasing = !ascending)
            }

            newRDD <- partitionByRDD(x, numPartitions, rangePartitionFunc)
            lapplyPartition(newRDD, partitionFunc)
          })
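
# A local sketch of how the range bounds fall out of the sample: boundary i is
# the ceiling(length(samples) * i / numPartitions)-th sorted sample, i.e.
# approximately the i/numPartitions quantile of the keys:
#
#   samples <- sort(c(5, 12, 3, 9, 21, 7, 15, 1))            # 8 sampled keys
#   sapply(1:3, function(i) { samples[ceiling(8 * i / 4)] }) # 4 partitions: 3 7 12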

#' Subtract a pair RDD with another pair RDD.
#'
#' Return an RDD with the pairs from x whose keys are not in other.
#'
#' @param x An RDD.
#' @param other An RDD.
#' @param numPartitions Number of the partitions in the result RDD.
#' @return An RDD with the pairs from x whose keys are not in other.
#' @examples
# nolint start
#'\dontrun{
#' sc <- sparkR.init()
#' rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4),
#'                              list("b", 5), list("a", 2)))
#' rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
#' collectRDD(subtractByKey(rdd1, rdd2))
#' # list(list("b", 4), list("b", 5))
#'}
# nolint end
#' @rdname subtractByKey
#' @aliases subtractByKey,RDD
#' @noRd
setMethod("subtractByKey",
          signature(x = "RDD", other = "RDD"),
          function(x, other, numPartitions = SparkR:::getNumPartitionsRDD(x)) {
            filterFunction <- function(elem) {
              iters <- elem[[2]]
              (length(iters[[1]]) > 0) && (length(iters[[2]]) == 0)
            }

            flatMapValues(filterRDD(cogroup(x,
                                            other,
                                            numPartitions = numPartitions),
                                    filterFunction),
                          function(v) { v[[1]] })
          })

#' Return a subset of this RDD sampled by key.
#'
#' @description
#' \code{sampleByKey} Create a sample of this RDD using variable sampling rates
#' for different keys as specified by fractions, a key to sampling rate map.
#'
#' @param x The RDD to sample elements by key, where each element is
#'             list(K, V) or c(K, V).
#' @param withReplacement Sampling with replacement or not
#' @param fractions A named list or vector mapping keys to (rough) sampling rates
#' @param seed Randomness seed value
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, 1:3000)
#' pairs <- lapply(rdd, function(x) { if (x %% 3 == 0) list("a", x)
#'                                    else { if (x %% 3 == 1) list("b", x) else list("c", x) }})
#' fractions <- list(a = 0.2, b = 0.1, c = 0.3)
#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L)
#' 100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")) # TRUE
#' 50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")) # TRUE
#' 200 < length(lookup(sample, "c")) && 400 > length(lookup(sample, "c")) # TRUE
#' lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0 # TRUE
#' lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000 # TRUE
#' lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0 # TRUE
#' lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000 # TRUE
#' lookup(sample, "c")[which.min(lookup(sample, "c"))] >= 0 # TRUE
#' lookup(sample, "c")[which.max(lookup(sample, "c"))] <= 2000 # TRUE
#' fractions <- list(a = 0.2, b = 0.1, c = 0.3, d = 0.4)
#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # Key "d" will be ignored
#' fractions <- list(a = 0.2, b = 0.1)
#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # KeyError: "c"
#'}
#' @rdname sampleByKey
#' @aliases sampleByKey,RDD-method
#' @noRd
setMethod("sampleByKey",
          signature(x = "RDD", withReplacement = "logical",
                    fractions = "vector", seed = "integer"),
          function(x, withReplacement, fractions, seed) {

            for (elem in fractions) {
              if (elem < 0.0) {
                stop("Negative fraction value ", fractions[which(fractions == elem)])
              }
            }

            # The sampler: takes a partition and returns its sampled version.
            samplingFunc <- function(partIndex, part) {
              set.seed(bitwXor(seed, partIndex))
              res <- vector("list", length(part))
              len <- 0

              # mixing because the initial seeds are close to each other
              stats::runif(10)

              for (elem in part) {
                if (elem[[1]] %in% names(fractions)) {
                  frac <- as.numeric(fractions[which(elem[[1]] == names(fractions))])
                  if (withReplacement) {
                    count <- stats::rpois(1, frac)
                    if (count > 0) {
                      res[(len + 1) : (len + count)] <- rep(list(elem), count)
                      len <- len + count
                    }
                  } else {
                    if (stats::runif(1) < frac) {
                      len <- len + 1
                      res[[len]] <- elem
                    }
                  }
                } else {
                  stop("KeyError: \"", elem[[1]], "\"")
                }
              }

              # TODO(zongheng): look into the performance of the current
              # implementation. Look into some iterator package? Note that
              # Scala avoids many calls to creating an empty list and PySpark
              # similarly achieves this using `yield'. (duplicated from sampleRDD)
              if (len > 0) {
                res[1:len]
              } else {
                list()
              }
            }

            lapplyPartitionsWithIndex(x, samplingFunc)
          })
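
# Note on the sampling above: without replacement, each element is kept by a
# single Bernoulli draw (stats::runif(1) < frac); with replacement, its
# multiplicity is drawn from a Poisson distribution (stats::rpois(1, frac)),
# so an element may appear several times while its expected count stays frac.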
