R/match_subjects.R

# The 'match_subjects' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-11-14

# Table of contents
# 1) match_subjects
# 2) Methods for 'match_subjects'
#   2.1) is.match_subjects
#   2.2) print.match_subjects
#   2.3) print.match_subjects
#   2.4) summary.match_subjects
#   2.5) print.summary.match_subjects

###
### 1) match_subjects
###

#' Helper Function for the MatchIt Command
#'
#' A helper function that simplifies the steps and
#' commands needed to find a subset of individuals
#' from a test and control group that match over
#' a collection of covariates when using the
#' \code{matchit} function from the \pkg{MatchIt}.
#'
#' @param df A data frame with a column denoting
#'   group membership (coded as 1 for the test group
#'   and 0 otherwise), columns for all covariates to
#'   match over, and an optional column with the
#'   identifier for each individual in the dataset.
#' @param DV A character string with the column
#'   name for the binary variable denoting group
#'   membership.
#' @param ID An optional character string giving the
#'   column name with the identifier for each
#'   individual. If \code{NULL}, default identifiers
#'   will be assigned to all individuals, saved in
#'   the column \code{ID}.
#'
#' @details Forthcoming.
#'
#' @references
#' Daniel E. Ho, Kosuke Imai, Gary King, Elizabeth A. Stuart (2011).
#'   MatchIt: Nonparametric Preprocessing for Parametric Causal Inference.
#'   Journal of Statistical Software, Vol. 42, No. 8, pp. 1-28.
#'   URL http://www.jstatsoft.org/v42/i08/
#'
#' @return Forthcoming.
#'
#' @export
#' @examples
#' # Example dataset from 'MatchIt' package
#' data( 'lalonde', package = 'MatchIt' )
#'
#' # Extract data of interest
#' df = lalonde[,c('treat','age','educ','black','hispan','married','nodegree')]
#' mo = match_subjects( df, DV = 'treat' )
#' mo # Brief summary of matching
#' # Extract subset of data that was matched
#' md = subset( mo )
#' # Summary of results with tests for comparison of groups
#' # (Can take some time to compute)
#' sm = summary( mo )
#' sm
#' # Can display results for full original sample as well
#' print( sm, full = T )

match_subjects = function( df, DV, ID = NULL ) {

  # Specify subject IDs
  if ( is.null( ID ) ) {
    df$ID = rep( 1:nrow( df ) )
    ID = 'ID'
  }

  # Variable names in data frame
  cn = colnames( df )
  # Determine variables to
  # match over
  IVs = cn[ !grepl( DV, cn ) & !grepl( ID, cn ) ]

  # Remove rows with any missing values
  no_na = apply( df, 1, function(x) all( !is.na( x ) ) )
  dtbf = df[ no_na, ]

  # Create formula for variables to match over
  frm = paste(
    DV, '~',
    paste( IVs, collapse = ' + ' )
  )
  frm = as.formula( frm )

  # Initialize output
  nc =
    1 + # ID
    1 + # DV
    length( IVs ) +
    7
  output = matrix( NA, 0, nc )
  colnames( output ) = c(
    ID, DV, IVs,
    'Test', 'Matched', 'Excluded',
    'Index_Test', 'Index_Control',
    'Weight', 'Propensity_score'
  )
  output = as.data.frame( output )

  # Attempt exact matching
  match_output = tryCatch(
    {
      MatchIt::matchit( frm,
                        # Ancillary options for procedure
                        method = 'exact', data = dtbf )
    },
    error = function(e) return( NULL )
  )

  if ( !is.null( match_output ) ) {

    # Extract matched rows
    m_df_exact = MatchIt::match.data( match_output )

    # Rearrange results and concatenate to
    # final output data frame
    tmp = data.frame(
      Test = m_df_exact[[ DV ]],
      Matched = T,
      Excluded = F,
      Index_Test = 0,
      Index_Control = 0,
      Weight = m_df_exact$weights,
      Propensity_score = NA,
      stringsAsFactors = F
    )
    tmp$Index_Test[ tmp$Test == 1 ] =
      m_df_exact$subclass[ tmp$Test == 1 ]
    tmp$Index_Control[ tmp$Test == 0 ] =
      m_df_exact$subclass[ tmp$Test == 0 ]
    output = rbind( output,
                    cbind( m_df_exact[,c(ID,DV,IVs)], tmp ) )

    # Remove matched results from data to analyze
    exclude = dtbf[[ID]] %in% m_df_exact[[ID]]
    dtbf = dtbf[ !exclude, ]

  }

  # Attempt matching based on using nearest-neighbours
  # and the propensity scores
  match_output = tryCatch(
    {
      MatchIt::matchit( frm,
                        # Ancillary options for procedure
                        method = 'nearest', data = dtbf )
    },
    error = function(e) return( NULL )
  )

  if ( !is.null( match_output ) ) {

    # Extract matched rows
    m_df_nearest = MatchIt::match.data( match_output )

    # With the nearest neighbours approach, the 'matchit'
    # function returns a matrix with a column giving the
    # row index for the matched control subjects. The
    # row names of the matrix are the row names of the
    # associated test subjects.
    mm = match_output$match.matrix

    if ( nrow( output ) == 0 ) {
      adj = 0
    } else {
      adj = max( output$Index_Control )
    }

    # Rearrange results
    tmp = data.frame(
      Test = m_df_nearest[[ DV ]],
      Matched = T,
      Excluded = F,
      Index_Test = 0,
      Index_Control = 0,
      Weight = m_df_nearest$weights,
      Propensity_score = m_df_nearest$distance,
      stringsAsFactors = F
    )
    # Determine which subjects match each other
    # approximately across groups
    # Loop over subjects in the test group
    inc = 1
    for ( s in 1:nrow( mm ) ) {
      # Identify test subject
      id_test = dtbf[ rownames( mm )[s], ID ]
      sel_test = m_df_nearest[[ ID ]] %in% id_test
      # Identify matched control subject(s)
      id_con = dtbf[ mm[s,], ID ]
      sel_con = m_df_nearest[[ ID ]] %in% id_con
      tmp$Index_Test[sel_test] = inc + adj
      tmp$Index_Control[sel_con] = inc + adj
      inc = inc + 1
    }
    # Concatenate to final data frame
    output = rbind(
      output,
      cbind(
        m_df_nearest[,c(ID,DV,IVs)],
        tmp )
    )

    # Remove matched results from data to analyze
    exclude = dtbf[[ID]] %in% m_df_nearest[[ID]]
    dtbf = dtbf[ !exclude, ]
  }

  # Concatenate remaining observations to final
  # data frame

  # Non-matched values
  if ( nrow( dtbf ) != 0 ) {

    # Rearrange results
    tmp = data.frame(
      Test = dtbf[[ DV ]],
      Matched = F,
      Excluded = F,
      Index_Test = 0,
      Index_Control = 0,
      Weight = NA,
      Propensity_score = NA,
      stringsAsFactors = F
    )

    # Concatenate to final data frame
    output = rbind(
      output,
      cbind( dtbf[,c(ID,DV,IVs)],
             tmp
      )
    )

  }

  # Excluded values
  if ( any( !no_na ) ) {

    # Rearrange results
    tmp = data.frame(
      Test = df[[ DV ]][!no_na],
      Matched = F,
      Excluded = T,
      Index_Test = 0,
      Index_Control = 0,
      Weight = NA,
      Propensity_score = NA,
      stringsAsFactors = F
    )

    # Concatenate to final data frame
    output = rbind(
      output,
      cbind( df[!no_na,c(ID,DV,IVs)],
             tmp
      )
    )

  }

  # Create final output
  out = list(
    Data = output,
    DV = DV,
    ID = ID,
    IVs = IVs
  )

  # Create S3 class
  class( out ) = "match_subjects"

  return( out )
}

###
### 2) Methods for 'match_subjects'
###

# 2.1)
#' @rdname match_subjects
#' @export

is.match_subjects = function( x ) {
  # Purpose:
  # ...
  # Arguments:
  # x - An R object
  # Returns:
  # ...

  out = inherits(x, "match_subjects")

  return( out )
}

# 2.2)
#' @rdname match_subjects
#' @export

subset.match_subjects = function( x ) {
  # Purpose:
  # ...
  # Arguments:
  # An R object of class 'match_subject'
  # Returns:
  # ...

  # Extract data
  dat = x$Data

  # Select only matched cases
  sel = x$Data$Matched
  out = dat[ sel, ]

  # Sort data by
  # matched indices for
  # control and test groups,
  # respectively
  o = order( out$Index_Test, out$Index_Control )
  out = out[ o, ]

  return( out )
}

# 2.3)
#' @rdname match_subjects
#' @export

print.match_subjects = function( x ) {
  # Purpose:
  # ...
  # Arguments:
  # An R object of class 'match_subject'
  # Returns:
  # ...

  # Extract results
  dat = x$Data

  # Helper function to quickly print a string
  # to the console window
  qcat = function( x ) {
    cat( c( x, '\n' ) )
  }

  # Total number of subjects
  Total_N = nrow( dat )
  qcat( paste( 'Total N:', Total_N ) )

  # Details for matched subjects in test group
  qcat( 'Matched subjects (test group)' )

  N = sum( x$Data$Test == 1 )
  P_group = round( 100*(N/Total_N) )
  Matched = sum( dat$Test[ dat$Matched ] == 1 )
  P = round( 100*(Matched/N) )
  string = paste( 'N = ', N, ' (', P_group, '%); ',
                  'Matched = ', Matched, ', ',
                  P, '%', sep = '' )
  qcat( string )
  N_test = N

  # Details for matched subjects in control group
  qcat( 'Matched subjects (control group)' )

  N = sum( dat$Test == 0 )
  P_group = round( 100*(N/Total_N) )
  Matched = sum( dat$Test[ dat$Matched ] == 0 )
  P = round( 100*(Matched/N_test) )
  string = paste( 'N = ', N, ' (', P_group, '%); ',
                  'Matched = ', Matched, ', ',
                  P, '%', sep = '' )
  qcat( string )

  # Average and range for the standardized difference
  # scores
  std_d = apply(
    x$Data[ x$Data$Matched ,x$IVs ], 2,
    std_difference,
    test = x$Data$Test[ x$Data$Matched ] )

  std_d_m = mean( std_d )
  std_d_r = range( std_d )

  # Details for matched subjects in control group
  qcat( 'Average and range - standardized difference' )
  string = paste(
    round( std_d_m, 2 ), ' (',
    round( std_d_r[1], 2 ), ' - ',
    round( std_d_r[2], 2 ), ')',
    sep = '' )
  qcat( string )

}

# 2.4)
#' @rdname match_subjects
#' @export

summary.match_subjects = function( x ) {
  # Purpose:
  # ...
  # Arguments:
  # x
  # Returns:
  # ...

  # Extract results
  dat = x$Data

  # Extract indices
  DV = x$DV
  IVs = x$IVs
  keep = !dat$Excluded
  match = dat$Matched

  comparisons_full = apply( dat[ , IVs ], 2, compare_groups,
                            group = dat$Test, sel = keep )

  comparisons_matched = apply( dat[ , IVs ], 2, compare_groups,
                               dat$Test, sel = keep & match )

  out = list(
    comparisons_full = comparisons_full,
    comparisons_matched = comparisons_matched
  )

  class( out ) = 'summary.match_subjects'

  return( out )
}

# 2.5)
#' @rdname match_subjects
#' @export

print.summary.match_subjects = function( x, full = F ) {
  # Purpose:
  # ...
  # Arguments:
  # x    -
  # full -
  # Returns:
  # ...

  # Define function to format table of results for
  # comparison of groups
  df_res = function( cmp ) {

    res = data.frame(
      Difference = round( cmp[7,], 2 ),
      p_val_diff = round( cmp[8,], 3 ),
      sig_diff = ' ',
      Test = ' ',
      Statistic = round( cmp[1,], 2 ),
      p_val = round( cmp[3,], 3 ),
      sig = ' ',
      BF = round( cmp[4,], 1 ),
      pos = ' ',
      post_pv = round( cmp[5,], 3 ),
      sig2 = ' ',
      stringsAsFactors = F
    )

    # Label test types
    res$Test[ cmp[6,] == 2 ] = 'T-test'
    res$Test[ cmp[6,] == 1 ] = 'Chi-sq.'

    # Label significance
    res$sig_diff[ res$p_val_diff < .05 ] = '* '
    res$sig[ res$p_val < .05 ] = '* '
    res$sig2[ res$post_pv < .05 ] = '* '
    # Label positive evidence for null
    res$pos[ res$BF > 3 ] = "' "

    # Add meaningful column names
    colnames( res ) = c(
      'Std. diff.',
      'p-value',
      '  ',
      'Test',
      'Stat',
      'p-value',
      '  ',
      'BF (Null)',
      '  ',
      'Post. p-value',
      '  '
    )

    return( res )
  }

  # Display results
  cat( 'Matched groups', '\n' )
  print( df_res( x$comparisons_matched ) )

  if ( full ) {

    cat( 'Full sample', '\n' )
    print( df_res( x$comparisons_full ) )
  }

  cat( "* = p < .05", "\n" )
  cat( "' = Positive evidence for null hyp.", "\n" )
}
rettopnivek/guiMatchIt documentation built on May 17, 2019, 10:39 a.m.