# ==============================================================================
# quickmatch -- Quick Generalized Full Matching
# https://github.com/fsavje/quickmatch
#
# Copyright (C) 2017 Fredrik Savje and Jasjeet S. Sekhon
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/
# ==============================================================================
#' Derive generalized full matchings
#'
#' \code{quickmatch} constructs near-optimal generalized full matchings. The
#' function expects the user to provide distances measuring the similarity of
#' units and a set of matching constraints. It then constructs a matching
#' so that units assigned to the same group are as similar as possible while
#' satisfying the matching constraints.
#'
#' The \code{treatment_constraints} parameter should be a named vector with
#' treatment-specific constraints. For example, in a sample with treatment
#' conditions "A", "B" and "C", the vector \code{c("A" = 1, "B" = 2, "C" = 0)}
#' specifies that each matched group should contain at least one unit with
#' treatment "A", at least two units with treatment "B" and any number of units
#' with treatment "C". Treatments not specified in the vector defaults to zero.
#' For example, the vector \code{c("A" = 1, "B" = 2)} is identical to the
#' previous one. When \code{treatment_constraints} is \code{NULL}, the function
#' requires at least one unit for each treatment in each group. In our current
#' example, \code{NULL} would be shorthand for \code{c("A" = 1, "B" = 1, "C" = 1)}.
#'
#' The \code{size_constraint} parameter can be used to constrain the matched
#' groups to contain at least a certain number of units in total (independently
#' of treatment assignment). For example, if \code{treatment_constraints =
#' c("A" = 1, "B" = 2)} and \code{total_size_constraint = 4}, each matched
#' group will contain at least one unit assigned to "A", at least two units
#' assigned to "B" and at least four units in total, where the fourth unit can
#' be from any treatment condition.
#'
#' The \code{target} parameter can be used to control which units are included
#' in the matching. When \code{target} is \code{NULL} (the default), all units
#' will be assigned to a matched group. When not \code{NULL}, the parameter
#' indicates that some units must be assigned to matched group and that the
#' remaining units can safely be ignored. This can be useful, for example,
#' when one is interested in estimating treatment effects only for a certain
#' type of units (e.g., the average treatment effect for the treated, ATT). It
#' is particularly useful when units of interested are not represented in the
#' whole covariate space (i.e., an one-sided overlap problem). Without the
#' \code{target} parameter, the function would in such cases try to assign every
#' unit to a group, including units in sparse regions that we are not interested
#' in. This could lead to unnecessarily large and diverse matched groups. By
#' specifying that some units are of interest only insofar as they help us satisfy
#' the matching constraints (i.e., setting the \code{target} parameter to the
#' appropriate value), we can avoid such situations.
#'
#' Consider, as an example, a study with two treatment conditions, "A" and "B".
#' Units assigned to "B" are more numerous and tend to have more extreme
#' covariate values. We are, however, only interested in estimating the
#' treatment effect for units assigned to "A". By specifying \code{target = "A"},
#' the function ensures that all "A" units are assigned to matched groups. Some
#' units assigned to treatment "B" -- in particular the units with extreme
#' covariate values -- will be left unassigned. However, as those units are not
#' of interest, they can safely be ignored, and we avoid groups of poor quality.
#'
#' Even if some of the units that can be ignored are not needed to satisfy the
#' matching constraints, it is rarely beneficial to discard them blindly; they can
#' occasionally provide useful information. The default behavior when \code{target}
#' is non-NULL is to assign as many of the ignorable units as possible given that
#' the within-group distances do not increase too much
#' (using \code{secondary_unassigned_method = "estimated_radius"}). This behavior
#' might, however, reduce covariate balance in some instances. If called with
#' \code{secondary_unassigned_method = "ignore"}, units not specified in
#' \code{target} will be discarded unless they are absolutely needed to satisfying
#' the matching constraints. This tends to reduce bias since the within-group
#' distances are minimized, but it could increase variance since we ignore
#' potentially useful information in the sample. An intermediate alternative
#' is to specify an aggressive caliper for the ignorable units, which is done
#' with the \code{secondary_radius} parameter. (These parameters are part of the
#' \code{\link[scclust]{sc_clustering}} function that \code{quickmatch} calls.
#' The \code{target} parameter corresponds to the \code{primary_data_points}
#' parameter in that function.)
#'
#' The \code{caliper} parameter constrains the maximum distance between units
#' assigned to the same matched group. This is implemented by restricting the
#' edge weight in the graph used to construct the matched groups (see
#' \code{\link[scclust]{sc_clustering}} for details). As a result, the caliper
#' will affect all groups in the matching and, in general, make it harder for
#' the function to find good matches even for groups where the caliper is not
#' binding. In particular, a too tight \code{caliper} can lead to discarded
#' units that otherwise would be assigned to a group satisfying both the
#' matching constraints and the caliper. For this reason, it is recommended
#' to set the \code{caliper} value quite high and only use it to avoid particularly
#' poor matches. It strongly recommended to use the \code{caliper} parameter only
#' when \code{primary_unassigned_method = "closest_seed"} in the underlying
#' \code{\link[scclust]{sc_clustering}} function (which is the default
#' behavior).
#'
#' \code{quickmatch} calls \code{\link[scclust]{sc_clustering}} with
#' \code{seed_method = "inwards_updating"}. The \code{seed_method} parameter
#' governs how the seeds are selected in the nearest neighborhood graph that
#' is used to construct the matched groups (see \code{\link[scclust]{sc_clustering}}
#' for details). The \code{"inwards_updating"} option generally works well
#' and is safe with most datasets. Using \code{seed_method = "exclusion_updating"}
#' often leads to better performance (in the sense of matched groups with more
#' similar units), but it may increase run time. Discrete data (or more generally
#' when units tend to be at equal distance to many other units) will lead to
#' particularly poor run time with this option. If the data set has at least one
#' continuous covariate, \code{"exclusion_updating"} is typically reasonably
#' quick. A third option is \code{seed_method = "lexical"}, which decreases the
#' run time relative to \code{"inwards_updating"} (sometimes considerably) at
#' the cost of performance. \code{quickmatch} passes parameters on to
#' \code{\link[scclust]{sc_clustering}}, so to change \code{seed_method}, call
#' \code{quickmatch} with the parameter specified as usual:
#' \code{quickmatch(..., seed_method = "exclusion_updating")}.
#'
#' @param distances
#' \code{\link[distances]{distances}} object or a numeric vector, matrix
#' or data frame. The parameter describes the similarity of the units to be
#' matched. It can either be preprocessed distance information using a
#' \code{\link[distances]{distances}} object, or raw covariate data. When
#' called with covariate data, Euclidean distances are calculated unless
#' otherwise specified.
#' @param treatments
#' factor specifying the units' treatment assignments.
#' @param treatment_constraints
#' named integer vector with the treatment constraints. If \code{NULL}, the
#' function ensures that each matched group contains one unit from each
#' treatment condition.
#' @param size_constraint
#' integer with the required total number of units in each group. Must be
#' greater or equal to the sum of \code{treatment_constraints}. If NULL, no
#' constraints other than the treatment constraints are imposed.
#' @param target
#' units to target the matching for. All units indicated by \code{target} are
#' ensured to be assigned to a matched group (disregarding eventual
#' \code{caliper} setting). Units not indicated by \code{target} could be
#' left unassigned if they are not necessary to satisfy the matching
#' constraints. If \code{NULL}, \code{quickmatch} targets the complete sample
#' and ensures that all units are assigned to a group. If \code{target} is a
#' logical vector with the same length as the sample size, units indicated
#' with \code{TRUE} will be targeted. If \code{target} is an integer vector,
#' the units with indices in \code{target} are targeted. Indices starts at 1
#' and \code{target} must be sorted. If \code{target} is a character vector,
#' it should contain treatment labels, and the corresponding units (as given
#' by \code{treatments}) will be targeted.
#' @param caliper
#' restrict the maximum within-group distance.
#' @param ...
#' additional parameters to be sent either to the \code{\link[distances]{distances}}
#' function when the \code{distances} parameter contains covariate data, or
#' to the underlying \code{\link[scclust]{sc_clustering}} function.
#'
#' @return
#' Returns a \code{\link{qm_matching}} object with the matched groups.
#'
#' @seealso
#' See \code{\link[scclust]{sc_clustering}} for the underlying function used
#' to construct the matched groups.
#'
#' @references
#' Sävje, Fredrik, Michael J. Higgins and Jasjeet S. Sekhon (2017),
#' \sQuote{Generalized Full Matching}, arXiv 1703.03882.
#' \url{https://arxiv.org/abs/1703.03882}
#'
#' @examples
#' # Construct example data
#' my_data <- data.frame(y = rnorm(100),
#' x1 = runif(100),
#' x2 = runif(100),
#' treatment = factor(sample(rep(c("T1", "T2", "C"), c(25, 25, 50)))))
#'
#' # Make distances
#' my_distances <- distances(my_data, dist_variables = c("x1", "x2"))
#'
#' # Make matching with one unit from "T1", "T2" and "C" in each matched group
#' quickmatch(my_distances, my_data$treatment)
#'
#' # Require at least two "C" in each group
#' quickmatch(my_distances,
#' my_data$treatment,
#' treatment_constraints = c("T1" = 1, "T2" = 1, "C" = 2))
#'
#' # Require groups with at least six units in total
#' quickmatch(my_distances,
#' my_data$treatment,
#' treatment_constraints = c("T1" = 1, "T2" = 1, "C" = 2),
#' size_constraint = 6)
#'
#' # Focus the matching to units assigned to "T1" and "T2" (i.e., all
#' # units assigned to "T1" or T2 will be assigned to a matched group).
#' # Units assigned to treatment "C" will be assigned to groups so to
#' # ensure that each group contains at least one unit of each treatment
#' # condition. Remaining "C" units could be left unassigned.
#' quickmatch(my_distances,
#' my_data$treatment,
#' target = c("T1", "T2"))
#'
#' # Impose caliper
#' quickmatch(my_distances,
#' my_data$treatment,
#' caliper = 0.25)
#'
#' # Call `quickmatch` directly with covariate data (ie., not pre-calculating distances)
#' quickmatch(my_data[c("x1", "x2")], my_data$treatment)
#'
#' # Call `quickmatch` directly with covariate data using Mahalanobis distances
#' quickmatch(my_data[c("x1", "x2")],
#' my_data$treatment,
#' normalize = "mahalanobize")
#'
#' @export
quickmatch <- function(distances,
treatments,
treatment_constraints = NULL,
size_constraint = NULL,
target = NULL,
caliper = NULL,
...) {
dots <- eval(substitute(alist(...)))
if (!distances::is.distances(distances)) {
dist_call <- dots[names(dots) %in% names(formals(distances::distances))]
dist_call$data <- distances
distances <- do.call(distances::distances, dist_call)
}
ensure_distances(distances)
num_observations <- length(distances)
treatments <- coerce_treatments(treatments, num_observations, FALSE)
if (is.null(treatment_constraints)) {
treatment_constraints <- rep(1L, nlevels(treatments))
names(treatment_constraints) <- levels(treatments)
}
treatment_constraints <- coerce_treatment_constraints(treatment_constraints,
levels(treatments))
size_constraint <- coerce_size_constraint(size_constraint,
sum(treatment_constraints),
num_observations)
target <- coerce_target(target, treatments, FALSE)
ensure_caliper(caliper)
sc_call <- dots[names(dots) %in% names(formals(scclust::sc_clustering))]
if (!is.null(sc_call$type_labels)) {
stop("`type_labels` is ignored, please use the `treatments` parameter instead.")
}
if (!is.null(sc_call$type_constraints)) {
stop("`type_constraints` is ignored, please use the `treatment_constraints` parameter instead.")
}
if (!is.null(sc_call$primary_data_points)) {
stop("`primary_data_points` is ignored, please use the `target` parameter instead.")
}
if (is.null(sc_call$seed_method)) {
sc_call$seed_method <- "inwards_updating"
}
if (is.null(sc_call$primary_unassigned_method)) {
sc_call$primary_unassigned_method <- "closest_seed"
}
if (is.null(sc_call$secondary_unassigned_method)) {
secondary_unassigned_method_default <- TRUE
sc_call$secondary_unassigned_method <- "closest_seed"
} else {
secondary_unassigned_method_default <- FALSE
}
if (is.null(sc_call$primary_radius)) {
sc_call$primary_radius <- "seed_radius"
}
if (is.null(sc_call$secondary_radius)) {
if (is.null(caliper)) {
sc_call$secondary_radius <- "estimated_radius"
} else {
sc_call$secondary_radius <- "seed_radius"
}
}
# If `caliper` is NULL, do nothing
# If `sc_call$seed_radius` is NULL, use `caliper`
if (is.null(sc_call$seed_radius) && !is.null(caliper)) {
if (sc_call$primary_unassigned_method %in% c("ignore", "closest_seed")) {
sc_call$seed_radius <- as.numeric(caliper) / 2.0
if (sc_call$secondary_unassigned_method == "closest_assigned") {
warning("Caliper is not properly enforced when `secondary_unassigned_method`==\"closest_assigned\".")
}
} else if (sc_call$primary_unassigned_method %in% c("any_neighbor", "closest_assigned")) {
sc_call$seed_radius <- as.numeric(caliper) / 4.0
warning("Caliper might perform poorly unless `primary_unassigned_method`==\"closest_seed\".")
}
if (sc_call$primary_radius != "seed_radius") {
warning("Caliper is not properly enforced unless `primary_radius`==\"seed_radius\".")
}
if (sc_call$secondary_radius != "seed_radius") {
warning("Caliper is not properly enforced unless `secondary_radius`==\"seed_radius\".")
}
} else if (!is.null(sc_call$seed_radius) && !is.null(caliper)) {
warning("`caliper` is ignored when `seed_radius` is specified.")
}
sc_call$distances <- distances
sc_call$size_constraint <- size_constraint
sc_call$type_labels <- treatments
sc_call$type_constraints <- treatment_constraints
sc_call$primary_data_points <- target
out_matching <- tryCatch({
do.call(scclust::sc_clustering, sc_call)
},
error = function(x) {
if (grepl("\\(scclust:src/nng_clustering.c:[0-9]+\\) Infeasible radius constraint.", x$message)) {
if (secondary_unassigned_method_default) {
new_warning("Most seeds are at zero distance to their neighbors in the clustering NNG. This typically happens with discrete low-dimensional covariates. Consider using exact matching with such data. Running with `secondary_unassigned_method == \"ignore\"`.", level = -5)
sc_call$secondary_unassigned_method <- "ignore"
do.call(scclust::sc_clustering, sc_call)
} else {
new_error("** Error in scclust: ", x$message, "\n ** Explanation of error: Most seeds are at zero distance to their neighbors in the clustering NNG. This typically happens with discrete low-dimensional covariates. Consider using exact matching with such data.", level = -5)
}
} else {
new_error("** Error in scclust: ", x$message, level = -5)
}
})
class(out_matching) <- c("qm_matching", class(out_matching))
out_matching
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.