R/estimate_matches.R

#' Run lasso estimation on the prepared matrixes
#'
#' @param source_x_mat,source_y_mat,source_w_mat matrixes of independent, dependent variables and weights
#' @param target_x_mat matrixes of independent variables for the prediction sample
#' @param reduced if TRUE terurns reduced outpur without specific regression details.
#' @param n_folds number of folds for cross-validation
#' @param force_lambda allows to specify one lamda value. Shoul be 0, when we
#'          want to switch to the linear regression.
#' @inheritParams find_near
#'
#' @return In both \code{reduced=TRUE} and \code{reduced=FALSE} forms, the function returns
#'     a list with elements. In the form \code{reduced=TRUE} only results of matching
#'     are returned:
#'     \itemize{
#'       \item \code{source_y_hat} is the vecrtor of predicted values based on the result of
#'             the lasso regression with \code{nfolds} cross validation and "mse"
#'             measure for identifying minimizing value of lambda. It uses \code{source_x_mat},
#'             \code{source_y_mat}, and \code{source_w_mat} for running regression and predictind \code{source_y_hat}.
#'       \item \code{target_y_hat} is the vetor of predicted values produced using the
#'             \code{source_y_hat} regression results and \code{target_x_mat} independent variables matrix.
#'       \item \code{match} is the dataframe with: columns \code{target_id} - index of each
#'              \code{target_y_hat} value; column \code{target_y_hat} - its' value; column \code{source_id} -
#'              position of the nearest match from the \code{source_y_hat} vector and
#'              column \code{source_y_hat} - value of the nearest match
#'     }
#'
#'     In the not reduced form the list contains more elements:
#'     \itemize{
#'       \item items \code{source_x_mat}, \code{source_y_mat}, \code{source_w_mat} and \code{target_x_mat} from the inputs to the
#'             functions.
#'       \item \code{lambda_cv} - result of the \code{\link[glmnet]{cv.glmnet}}.
#'       \item \code{fit}  - result of the \code{\link[glmnet]{glmnet}}
#'     }
#'
#' @export
#'
#' @examples
#' library(dplyr)
#' library(purrr)
#' library(glmnet)
#' library(lassopmm)
#'
#' # Run estimate of a fake data
#' XX <- as.matrix(mtcars[, !names(mtcars) %in% "hp"])
#' YY <- as.matrix(mtcars[, "hp"])
#' WW <- matrix(rep(1, nrow(mtcars)), ncol = 1, nrow = nrow(mtcars))
#' XX1 <- as.matrix(mtcars[1:10, !names(mtcars) %in% "hp"])
#'
#' # Running simple estimation and returning
#' a <- estimate_matches(source_x_mat = XX, source_y_mat = YY,
#'                       source_w_mat = WW, target_x_mat = XX1,
#'                       reduced = FALSE, n_near = 5)
#'
#' # Extract regression coefficients
#' a$fit %>% coef()
#' str(a, max.level = 1)
#'
#' # Developing a sample bootsrtap vector
#' perm_example <- purrr::map(1:5, ~ sample(1:nrow(YY), nrow(YY), TRUE))
#'
#' # Running estimations on every single bootstrap permutation vector
#' a_boot <-
#'   perm_example %>%
#'   purrr::map(~ lassopmm::estimate_matches(
#'     source_x_mat = XX[.x, ],
#'     source_y_mat = YY[.x, ],
#'     source_w_mat = WW[.x, ],
#'     target_x_mat = XX1,
#'     reduced = FALSE,
#'     n_near = 5
#'   ))
#'
#' # Exploring the structure
#' a_boot %>%
#'   str(max.level = 1)
#'
#' # Accesssing fit of a specific bootstrap iteration
#' a_boot[[3]]$fit
#'
#' # Doing the same with each single bootstrap iteration
#' a_boot %>%
#'   map("fit")
#'
#' # Extracting coefficinets from each single bootstrap iteration
#' a_boot %>%
#'   map("fit") %>%
#'   map(coefficients)
#'
#' # Combine all coefficients in a table
#' aa <-
#'   a_boot %>%
#'   map("fit") %>%
#'   map(broom::tidy) %>%
#'   map(~ select(.x, term, estimate))
#' map(seq_along(aa), .f = function(x) {
#'   rename_at(aa[[x]], vars(estimate), list(~ paste0(., "_", x)))}) %>%
#'   reduce(full_join, by = "term")
#'
estimate_matches <-
  function(source_x_mat,
             source_y_mat,
             source_w_mat,
             target_x_mat,
             n_near = 5,
             n_folds = 10,
             force_lambda = NULL,
             reduced = TRUE) {
    if (is.null(force_lambda) || is.na(force_lambda)) {
      lambda_cv <-
        glmnet::cv.glmnet(
          x = source_x_mat,
          y = source_y_mat,
          weights = source_w_mat,
          type.measure = "mse",
          nfolds = n_folds
        )
    } else {
      lambda_cv <- list()
      lambda_cv$lambda.min <- force_lambda
    }

    fit <-
      glmnet::glmnet(
        x = source_x_mat,
        y = source_y_mat,
        weights = source_w_mat,
        alpha = 1,
        lambda = lambda_cv$lambda.min
      )

    source_y_hat <- stats::predict(fit, newx = source_x_mat)[, 1]
    target_y_hat <- stats::predict(fit, newx = target_x_mat)[, 1]

    # Check that predict works the right way
    # t(as.matrix(coef(fit))) %*% t(cbind(rep(1, nrow(target_x_mat)), target_x_mat))

    match <- find_near(target_y_hat, source_y_hat, n_near)

    if (!reduced) {
      return(
        list(
          source_x_mat = source_x_mat,
          source_w_mat = source_w_mat,
          source_y_mat = source_y_mat,
          target_x_mat = target_x_mat,
          lambda_cv = lambda_cv,
          fit = fit,
          source_y_hat = source_y_hat,
          target_y_hat = target_y_hat,
          match = match
        )
      )
    } else {
      return(list(
        source_y_hat = source_y_hat,
        target_y_hat = target_y_hat,
        match = match
      ))
    }
  }
EBukin/lassopmm documentation built on June 12, 2019, 9:51 a.m.