# R/predNewLocs.R

#' Prediction at new locations based on the fitting results of original dataset
#'
#' @param newdata
#'     A data.frame includes all locations' longitude, latitude, and elevation,
#'     where the prediction is to be calculated.
#' @param fitted
#'     Can be a data.frame in memory, a \code{ddf} object, or an HDFS path which
#'     contains all fitting results of the original dataset.
#' @param output
#'     The output path of the results on HDFS. It must be specified when
#'     \code{fitted} is an HDFS path. If \code{fitted} is a data.frame object,
#'     the output should be left as the default NULL, since the function will
#'     return the results in memory.
#' @param stat_info
#'     The RData on HDFS which contains all station metadata. Make sure to
#'     copy the RData of station_info to HDFS first using rhput.
#' @param model_control
#'     Should be a list object generated from \code{spacetime.control} function.
#'     The list including all necessary smoothing parameters of nonparametric fitting.
#' @param cluster_control
#'     Should be a list object generated from \code{mapreduce.control} function.
#'     The list includes all necessary Rhipe parameters and also user tunable
#'     MapReduce parameters. It is required only when \code{fitted} is an HDFS
#'     path. If \code{fitted} is a data.frame in memory, this parameter should be
#'     kept as the default NULL.
#' @author
#'     Xiaosu Tong
#' @export
#' @seealso
#'     \code{\link{spacetime.control}}, \code{\link{mapreduce.control}}
#'
#' @examples
#' \dontrun{
#'     mcontrol <- spacetime.control(
#'       vari="resp", time="date", n=576, n.p=12, stat_n=7738, surf = "interpolate",
#'       s.window="periodic", t.window = 241, degree=2, span=0.015, Edeg=2
#'     )
#'     ccontrol <- mapreduce.control(
#'       libLoc= NULL, reduceTask=169, io_sort=128, slow_starts = 0.5,
#'       map_jvm = "-Xmx200m", reduce_jvm = "-Xmx200m",
#'       map_memory = 1024, reduce_memory = 1024,
#'       reduce_input_buffer_percent=0.4, reduce_parallelcopies=10,
#'       reduce_merge_inmem=0, task_io_sort_factor=100,
#'       spill_percent=0.9, reduce_shuffle_input_buffer_percent = 0.8,
#'       reduce_shuffle_merge_percent = 0.4
#'     )
#'     new.grid <- expand.grid(
#'       lon = seq(-126, -67, by = 0.5),
#'       lat = seq(25, 49, by = 0.5)
#'     )
#'     instate <- !is.na(map.where("state", new.grid$lon, new.grid$lat))
#'     new.grid <- new.grid[instate, ]
#'
#'     elev.fit <- spaloess( elev ~ lon + lat,
#'       data = station_info,
#'       degree = 2,
#'       span = 0.015,
#'       distance = "Latlong",
#'       normalize = FALSE,
#'       napred = FALSE,
#'       alltree = FALSE,
#'       family="symmetric",
#'       control=loess.control(surface = "direct")
#'     )
#'     grid.fit <- predloess(
#'       object = elev.fit,
#'       newdata = data.frame(
#'         lon = new.grid$lon,
#'         lat = new.grid$lat
#'       )
#'     )
#'     new.grid$elev2 <- log2(grid.fit + 128)
#'
#'     #if the original fitting results are in memory
#'     fitted <- drsstl(
#'       data=tmax_all,
#'       output=NULL,
#'       stat_info="station_info",
#'       model_control=mcontrol
#'     )
#'     predNewLocs(
#'       fitted = fitted, newdata = new.grid, model_control = mcontrol
#'     )
#'
#'     #if the fitting results are on HDFS
#'     predNewLocs(
#'       fitted="/tmp/output/output_bymth", newdata=new.grid, output = "/tmp",
#'       stat_info="/tmp/station_info.RData", model_control = mcontrol,
#'       cluster_control = ccontrol
#'     )
#' }
predNewLocs <- function(fitted, newdata, output = NULL, stat_info=NULL, model_control=spacetime.control(), cluster_control=NULL) {

  # Dispatch on the kind of `fitted`. inherits()/is.character() are used instead
  # of `class(fitted) == "..."` because class() can return several entries
  # (e.g. a data.frame subclass, or a matrix), which would make the `if`
  # condition longer than one element and fail before reaching the intended
  # branch or error message.

  # ddf input: the object is passed through recombine(fitted, combRbind) and the
  # result is handed to the in-memory prediction routine.
  # NOTE(review): presumably this row-binds all subsets into one data.frame --
  # confirm against the recombine/combRbind documentation.
  if (inherits(fitted, "ddf")) {
    return(
      predNew_local(
        original = recombine(fitted, combRbind),
        newdata = newdata,
        mlcontrol = model_control
      )
    )
  }

  # data.frame input (including subclasses): predict in memory and return.
  if (inherits(fitted, "data.frame")) {
    return(
      predNew_local(original = fitted, newdata = newdata, mlcontrol = model_control)
    )
  }

  # Anything that is neither a ddf, a data.frame nor a character path is rejected.
  if (!is.character(fitted)) {
    stop("The input data should be either a data.frame in memory or a HDFS path of input data")
  }

  # Character input is treated as an HDFS path; this branch needs both an
  # output location and the cluster settings.
  if (is.null(output)) {
    stop("An output path on HDFS should be specified")
  }
  if (is.null(cluster_control)) {
    stop("A cluster control must be specified for data on HDFS")
  }

  # Kept as the tail expression so the value (and its visibility) of
  # predNew_mr() is returned exactly as before.
  predNew_mr(
    newdata = newdata,
    input = fitted,
    output = output,
    info = stat_info,
    mlcontrol = model_control,
    clcontrol = cluster_control
  )

}
# XiaosuTong/drSpaceTime documentation built on May 9, 2019, 11:06 p.m.