# The 'match_subjects' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-11-14
# Table of contents
# 1) match_subjects
# 2) Methods for 'match_subjects'
# 2.1) is.match_subjects
# 2.2) print.match_subjects
# 2.3) print.match_subjects
# 2.4) summary.match_subjects
# 2.5) print.summary.match_subjects
###
### 1) match_subjects
###
#' Helper Function for the MatchIt Command
#'
#' A helper function that simplifies the steps and
#' commands needed to find a subset of individuals
#' from a test and control group that match over
#' a collection of covariates when using the
#' \code{matchit} function from the \pkg{MatchIt}.
#'
#' @param df A data frame with a column denoting
#' group membership (coded as 1 for the test group
#' and 0 otherwise), columns for all covariates to
#' match over, and an optional column with the
#' identifier for each individual in the dataset.
#' @param DV A character string with the column
#' name for the binary variable denoting group
#' membership.
#' @param ID An optional character string giving the
#' column name with the identifier for each
#' individual. If \code{NULL}, default identifiers
#' will be assigned to all individuals, saved in
#' the column \code{ID}.
#'
#' @details Forthcoming.
#'
#' @references
#' Daniel E. Ho, Kosuke Imai, Gary King, Elizabeth A. Stuart (2011).
#' MatchIt: Nonparametric Preprocessing for Parametric Causal Inference.
#' Journal of Statistical Software, Vol. 42, No. 8, pp. 1-28.
#' URL http://www.jstatsoft.org/v42/i08/
#'
#' @return Forthcoming.
#'
#' @export
#' @examples
#' # Example dataset from 'MatchIt' package
#' data( 'lalonde', package = 'MatchIt' )
#'
#' # Extract data of interest
#' df = lalonde[,c('treat','age','educ','black','hispan','married','nodegree')]
#' mo = match_subjects( df, DV = 'treat' )
#' mo # Brief summary of matching
#' # Extract subset of data that was matched
#' md = subset( mo )
#' # Summary of results with tests for comparison of groups
#' # (Can take some time to compute)
#' sm = summary( mo )
#' sm
#' # Can display results for full original sample as well
#' print( sm, full = T )
match_subjects = function( df, DV, ID = NULL ) {
# Specify subject IDs
if ( is.null( ID ) ) {
df$ID = rep( 1:nrow( df ) )
ID = 'ID'
}
# Variable names in data frame
cn = colnames( df )
# Determine variables to
# match over
IVs = cn[ !grepl( DV, cn ) & !grepl( ID, cn ) ]
# Remove rows with any missing values
no_na = apply( df, 1, function(x) all( !is.na( x ) ) )
dtbf = df[ no_na, ]
# Create formula for variables to match over
frm = paste(
DV, '~',
paste( IVs, collapse = ' + ' )
)
frm = as.formula( frm )
# Initialize output
nc =
1 + # ID
1 + # DV
length( IVs ) +
7
output = matrix( NA, 0, nc )
colnames( output ) = c(
ID, DV, IVs,
'Test', 'Matched', 'Excluded',
'Index_Test', 'Index_Control',
'Weight', 'Propensity_score'
)
output = as.data.frame( output )
# Attempt exact matching
match_output = tryCatch(
{
MatchIt::matchit( frm,
# Ancillary options for procedure
method = 'exact', data = dtbf )
},
error = function(e) return( NULL )
)
if ( !is.null( match_output ) ) {
# Extract matched rows
m_df_exact = MatchIt::match.data( match_output )
# Rearrange results and concatenate to
# final output data frame
tmp = data.frame(
Test = m_df_exact[[ DV ]],
Matched = T,
Excluded = F,
Index_Test = 0,
Index_Control = 0,
Weight = m_df_exact$weights,
Propensity_score = NA,
stringsAsFactors = F
)
tmp$Index_Test[ tmp$Test == 1 ] =
m_df_exact$subclass[ tmp$Test == 1 ]
tmp$Index_Control[ tmp$Test == 0 ] =
m_df_exact$subclass[ tmp$Test == 0 ]
output = rbind( output,
cbind( m_df_exact[,c(ID,DV,IVs)], tmp ) )
# Remove matched results from data to analyze
exclude = dtbf[[ID]] %in% m_df_exact[[ID]]
dtbf = dtbf[ !exclude, ]
}
# Attempt matching based on using nearest-neighbours
# and the propensity scores
match_output = tryCatch(
{
MatchIt::matchit( frm,
# Ancillary options for procedure
method = 'nearest', data = dtbf )
},
error = function(e) return( NULL )
)
if ( !is.null( match_output ) ) {
# Extract matched rows
m_df_nearest = MatchIt::match.data( match_output )
# With the nearest neighbours approach, the 'matchit'
# function returns a matrix with a column giving the
# row index for the matched control subjects. The
# row names of the matrix are the row names of the
# associated test subjects.
mm = match_output$match.matrix
if ( nrow( output ) == 0 ) {
adj = 0
} else {
adj = max( output$Index_Control )
}
# Rearrange results
tmp = data.frame(
Test = m_df_nearest[[ DV ]],
Matched = T,
Excluded = F,
Index_Test = 0,
Index_Control = 0,
Weight = m_df_nearest$weights,
Propensity_score = m_df_nearest$distance,
stringsAsFactors = F
)
# Determine which subjects match each other
# approximately across groups
# Loop over subjects in the test group
inc = 1
for ( s in 1:nrow( mm ) ) {
# Identify test subject
id_test = dtbf[ rownames( mm )[s], ID ]
sel_test = m_df_nearest[[ ID ]] %in% id_test
# Identify matched control subject(s)
id_con = dtbf[ mm[s,], ID ]
sel_con = m_df_nearest[[ ID ]] %in% id_con
tmp$Index_Test[sel_test] = inc + adj
tmp$Index_Control[sel_con] = inc + adj
inc = inc + 1
}
# Concatenate to final data frame
output = rbind(
output,
cbind(
m_df_nearest[,c(ID,DV,IVs)],
tmp )
)
# Remove matched results from data to analyze
exclude = dtbf[[ID]] %in% m_df_nearest[[ID]]
dtbf = dtbf[ !exclude, ]
}
# Concatenate remaining observations to final
# data frame
# Non-matched values
if ( nrow( dtbf ) != 0 ) {
# Rearrange results
tmp = data.frame(
Test = dtbf[[ DV ]],
Matched = F,
Excluded = F,
Index_Test = 0,
Index_Control = 0,
Weight = NA,
Propensity_score = NA,
stringsAsFactors = F
)
# Concatenate to final data frame
output = rbind(
output,
cbind( dtbf[,c(ID,DV,IVs)],
tmp
)
)
}
# Excluded values
if ( any( !no_na ) ) {
# Rearrange results
tmp = data.frame(
Test = df[[ DV ]][!no_na],
Matched = F,
Excluded = T,
Index_Test = 0,
Index_Control = 0,
Weight = NA,
Propensity_score = NA,
stringsAsFactors = F
)
# Concatenate to final data frame
output = rbind(
output,
cbind( df[!no_na,c(ID,DV,IVs)],
tmp
)
)
}
# Create final output
out = list(
Data = output,
DV = DV,
ID = ID,
IVs = IVs
)
# Create S3 class
class( out ) = "match_subjects"
return( out )
}
###
### 2) Methods for 'match_subjects'
###
# 2.1)
#' @rdname match_subjects
#' @export
is.match_subjects = function( x ) {
# Purpose:
# ...
# Arguments:
# x - An R object
# Returns:
# ...
out = inherits(x, "match_subjects")
return( out )
}
# 2.2)
#' @rdname match_subjects
#' @export
subset.match_subjects = function( x ) {
# Purpose:
# ...
# Arguments:
# An R object of class 'match_subject'
# Returns:
# ...
# Extract data
dat = x$Data
# Select only matched cases
sel = x$Data$Matched
out = dat[ sel, ]
# Sort data by
# matched indices for
# control and test groups,
# respectively
o = order( out$Index_Test, out$Index_Control )
out = out[ o, ]
return( out )
}
# 2.3)
#' @rdname match_subjects
#' @export
print.match_subjects = function( x ) {
# Purpose:
# ...
# Arguments:
# An R object of class 'match_subject'
# Returns:
# ...
# Extract results
dat = x$Data
# Helper function to quickly print a string
# to the console window
qcat = function( x ) {
cat( c( x, '\n' ) )
}
# Total number of subjects
Total_N = nrow( dat )
qcat( paste( 'Total N:', Total_N ) )
# Details for matched subjects in test group
qcat( 'Matched subjects (test group)' )
N = sum( x$Data$Test == 1 )
P_group = round( 100*(N/Total_N) )
Matched = sum( dat$Test[ dat$Matched ] == 1 )
P = round( 100*(Matched/N) )
string = paste( 'N = ', N, ' (', P_group, '%); ',
'Matched = ', Matched, ', ',
P, '%', sep = '' )
qcat( string )
N_test = N
# Details for matched subjects in control group
qcat( 'Matched subjects (control group)' )
N = sum( dat$Test == 0 )
P_group = round( 100*(N/Total_N) )
Matched = sum( dat$Test[ dat$Matched ] == 0 )
P = round( 100*(Matched/N_test) )
string = paste( 'N = ', N, ' (', P_group, '%); ',
'Matched = ', Matched, ', ',
P, '%', sep = '' )
qcat( string )
# Average and range for the standardized difference
# scores
std_d = apply(
x$Data[ x$Data$Matched ,x$IVs ], 2,
std_difference,
test = x$Data$Test[ x$Data$Matched ] )
std_d_m = mean( std_d )
std_d_r = range( std_d )
# Details for matched subjects in control group
qcat( 'Average and range - standardized difference' )
string = paste(
round( std_d_m, 2 ), ' (',
round( std_d_r[1], 2 ), ' - ',
round( std_d_r[2], 2 ), ')',
sep = '' )
qcat( string )
}
# 2.4)
#' @rdname match_subjects
#' @export
summary.match_subjects = function( x ) {
# Purpose:
# ...
# Arguments:
# x
# Returns:
# ...
# Extract results
dat = x$Data
# Extract indices
DV = x$DV
IVs = x$IVs
keep = !dat$Excluded
match = dat$Matched
comparisons_full = apply( dat[ , IVs ], 2, compare_groups,
group = dat$Test, sel = keep )
comparisons_matched = apply( dat[ , IVs ], 2, compare_groups,
dat$Test, sel = keep & match )
out = list(
comparisons_full = comparisons_full,
comparisons_matched = comparisons_matched
)
class( out ) = 'summary.match_subjects'
return( out )
}
# 2.5)
#' @rdname match_subjects
#' @export
print.summary.match_subjects = function( x, full = F ) {
# Purpose:
# ...
# Arguments:
# x -
# full -
# Returns:
# ...
# Define function to format table of results for
# comparison of groups
df_res = function( cmp ) {
res = data.frame(
Difference = round( cmp[7,], 2 ),
p_val_diff = round( cmp[8,], 3 ),
sig_diff = ' ',
Test = ' ',
Statistic = round( cmp[1,], 2 ),
p_val = round( cmp[3,], 3 ),
sig = ' ',
BF = round( cmp[4,], 1 ),
pos = ' ',
post_pv = round( cmp[5,], 3 ),
sig2 = ' ',
stringsAsFactors = F
)
# Label test types
res$Test[ cmp[6,] == 2 ] = 'T-test'
res$Test[ cmp[6,] == 1 ] = 'Chi-sq.'
# Label significance
res$sig_diff[ res$p_val_diff < .05 ] = '* '
res$sig[ res$p_val < .05 ] = '* '
res$sig2[ res$post_pv < .05 ] = '* '
# Label positive evidence for null
res$pos[ res$BF > 3 ] = "' "
# Add meaningful column names
colnames( res ) = c(
'Std. diff.',
'p-value',
' ',
'Test',
'Stat',
'p-value',
' ',
'BF (Null)',
' ',
'Post. p-value',
' '
)
return( res )
}
# Display results
cat( 'Matched groups', '\n' )
print( df_res( x$comparisons_matched ) )
if ( full ) {
cat( 'Full sample', '\n' )
print( df_res( x$comparisons_full ) )
}
cat( "* = p < .05", "\n" )
cat( "' = Positive evidence for null hyp.", "\n" )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.