R/ds.mice.pmm.R

#' @title Calculates imputations for univariate missing data by predictive mean matching
#' @description This function performs imputation by predictive mean matching by executing the pmmDS function on the server-side.
#'@param y Vector to be imputed
#'@param ry Logical vector of length \code{length(y)} indicating the 
#'the subset \code{y[ry]} of elements in \code{y} to which the imputation 
#'model is fitted. The \code{ry} generally distinguishes the observed 
#'(\code{TRUE}) and missing values (\code{FALSE}) in \code{y}.
#'@param x Numeric design matrix with \code{length(y)} rows with predictors for 
#'\code{y}. Matrix \code{x} may have no missing values.
#'@param wy Logical vector of length \code{length(y)}. A \code{TRUE} value 
#'indicates locations in \code{y} for which imputations are created.
#'@param donors The size of the donor pool among which a draw is made. 
#'The default is \code{donors = 5L}. Setting \code{donors = 1L} always selects 
#'the closest match, but is not recommended. Values between 3L and 10L 
#'provide the best results in most cases (Morris et al, 2015).
#'@param matchtype Type of matching distance. The default choice 
#'(\code{matchtype = 1L}) calculates the distance between 
#'the \emph{predicted} value of \code{yobs} and 
#'the \emph{drawn} values of \code{ymis} (called type-1 matching). 
#'Other choices are \code{matchtype = 0L} 
#'(distance between predicted values) and \code{matchtype = 2L} 
#'(distance between drawn values).
#'@param ridge The ridge penalty used in \code{.norm.draw()} to prevent 
#'problems with multicollinearity. The default is \code{ridge = 1e-05}, 
#'which means that 0.01 percent of the diagonal is added to the cross-product. 
#'Larger ridges may result in more biased estimates. For highly noisy data 
#'(e.g. many junk variables), set \code{ridge = 1e-06} or even lower to 
#'reduce bias. For highly collinear data, set \code{ridge = 1e-04} or higher.
#'@param \dots Other named arguments.
#'@return Vector with imputed data, same type as \code{y}, and of length 
#'\code{sum(wy)}
#'@examples 
#'
#'# In this example, we assume that the Opal server to which we are connecting, 
#'# has a table that contains the 'boys' data from the original mice package.
#'
#'# Load DataSHIELD libraries
#'library(dsBaseClient)
#'library(dsMiceClient)
#'
#'# Build login information
#'server <- c("server_name")
#'url <- c("opal_url")
#'user <- "username"
#'password <- "password"
#'table <- c("project_name.table_name")
#'logindata <- data.frame(server,url,user,password,table)
#'
#'# Login and assign the 'boys' dataset to varable 'D' on the server-side
#'opals <- datashield.login(logins=logindata, assign=TRUE)
#'
#'datashield.assign(opals, symbol="xname", value=as.symbol("c('age', 'hgt', 'wgt')"))
#'datashield.assign(opals, symbol="r", value=as.symbol("complete.cases(D[, xname])"))
#'datashield.assign(opals, symbol="x", value=as.symbol("D[r, xname]"))
#'datashield.assign(opals, symbol="y", value=as.symbol("D[r, 'tv']"))
#'datashield.assign(opals, symbol="ry", value=as.symbol("notNaDS(y)"))
#'
#'# Impute missing tv data
#'yimp <- ds.mice.pmm('y','ry','x')
#'length(yimp)
#'table(yimp)
#'hist(table(yimp), xlab = 'Imputed missing tv')
#'
#'@export

ds.mice.pmm = function(y = NULL, ry = NULL, x = NULL, wy = NULL, donors = 5, 
                       matchtype = 1L, ridge = 1e-05, checks=TRUE, datasources=NULL, ...) {
  
  # check that y, ry and x are not NULL
  if (is.null(y) || is.null(ry) || is.null(x)) {
    stop("Please provide (at least) the following arguments:\n",
         "\ty: the vector to be imputed\n",
         "\try: the logical vector to which the imputation model is fitted\n",
         "\tx: the predictors matrix\n", call.=FALSE)
  }
  
  # if no opal login details are provided look for 'opal' objects in the environment
  if (is.null(datasources)) {
    datasources <- dsBaseClient:::findLoginObjects()
  }
  
  # check that y, ry and x are defined in the remote server environment
  params <- list(y=y, ry=ry, x=x)
  
  for (name in names(params)) {
    param <- params[[name]]
    
    xnames <- dsBaseClient:::extract(param)
    varname <- xnames$elements
    obj2lookfor <- xnames$holders
    
    if (checks) {
      if (is.na(obj2lookfor)) {
        dsBaseClient:::isDefined(datasources, varname)
      } else {
        dsBaseClient:::isDefined(datasources, obj2lookfor)
      }
    }
  }
  
  # call the server-side function pmmDS
  cally <- paste0("pmmDS(", y, ",", ry , ",", x, ")")
  result <- datashield.aggregate(datasources, as.symbol(cally))

  return(result)
}
gflcampos/dsMiceClient documentation built on May 3, 2019, 4:33 p.m.