R/designmatrix.R

Defines functions marginal_means_algebra designmatrix is.designmatrix print.designmatrix summary.designmatrix plot.designmatrix subset.designmatrix `subset<-`

Documented in designmatrix is.designmatrix plot.designmatrix print.designmatrix subset.designmatrix summary.designmatrix

# The 'designmatrix' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2019-03-15

# Table of contents
# 1) Internal functions
#   1.1) marginal_means_algebra
# 2) designmatrix
# 3) Methods
#   3.1) is.designmatrix
#   3.2) print.designmatrix
#   3.3) summary.designmatrix
#   3.4) plot.designmatrix
#   3.5) subset.designmatrix
#   3.6) subset<-
#   3.7) data.frame.designmatrix

###
### 1) Internal functions
###

# 1.1)
marginal_means_algebra = function( dm ) {
  # Purpose:
  # Creates a table showing the algebra used
  # to compute the marginal group means based
  # on a specified design matrix.
  # Arguments:
  # dm - An object of class 'designmatrix'
  # Returns:
  # A data frame.

  # Extract summary matrix
  sm = dm$summary_matrix
  # Define total number of possible regression
  # coefficients
  cf = paste( 'B', 0:( ncol( sm ) - 1 ), sep = '' )

  # Initialize matrix showing algebra for
  # marginal group means
  mm = matrix( " ", nrow(sm), ncol(sm) )
  colnames( mm ) = colnames( sm )

  # Loop over rows
  for ( i in 1:nrow( mm ) ) {

    # Identify coefficients to exclude
    sel = sm[i,] == 0
    # Create display for weights/coefficient values
    # (round to one decimal place)
    val = paste( '(',round(sm[i,],1),')', sep = '' )
    # Add coefficient label
    res = paste( val, cf )
    # Specify additive operator when appropriate
    sel2 = cumsum( !sel ) > 1
    if ( any( sel2 ) ) {
      res[sel2] = paste( '+', res[sel2] )
    }
    mm[i,!sel] = res[!sel]

    # Remove weights of 1
    mm[i,] = gsub( '(1) ', '', mm[i,], fixed = T )
    # Remove weights of -1 and update operator
    mm[i,] = gsub( '+ (-1) ', '- ', mm[i,], fixed = T )

  }
  # Convert to data frame
  mm = data.frame( mm, stringsAsFactors = F )


  # Determine labels and weights for overall average
  avg = colSums( sm )/nrow( sm )
  # Identify coefficients to exclude
  sel = avg == 0
  # Round to one decimal place
  avg = round( avg, 1 )
  # Create display for weights/coefficient values
  avg = paste( '(', avg, ')', sep = '' )
  # Add coefficient label
  fin_avg = paste( avg, cf )
  # Specify additive operator when appropriate
  sel2 = cumsum( !sel ) > 1
  if ( any( sel2 ) ) {
    fin_avg[sel2] = paste( '+', fin_avg[sel2] )
  }
  fin_avg[sel] = " "

  # Remove weights of 1
  fin_avg = gsub( '(1) ', '', fin_avg, fixed = T )
  # Remove weights of -1 and update operator
  fin_avg = gsub( '+ (-1) ', '- ', fin_avg, fixed = T )

  # Add grouping variables
  mm = cbind( dm$group_means[,dm$variables],
              mm )
  # Convert everything to character strings
  for ( i in 1:ncol(mm) ) mm[[i]] = as.character( mm[[i]] )
  # Add overall average
  tmp = character( length( dm$variables ) )
  tmp[ length( tmp ) ] = 'Avg.'
  tmp = c( tmp, fin_avg )
  names( tmp ) = colnames( mm )
  # Add separator for overall average
  tmp2 = rep( '-', ncol( mm ) )
  names( tmp2 ) = colnames( mm )
  mm = rbind( mm, tmp2, tmp )

  if ( length( dm$variables ) == 1 )
    colnames( mm )[1] = dm$variables[1]

  return( mm )
}

###
### 2)
###

#' Initialize and Update Design Matrices
#'
#' Either 1) initialize a design matrix given a data frame and
#' a list specifying the dependent variable and associated
#' grouping variables, or 2) update a design matrix after
#' modification of of the element summarizing the design matrix
#' over all possible combinations of levels for the grouping
#' variables.
#'
#' @param df A data frame with the observations for
#'   the dependent variable and associated grouping variables.
#' @param select A list with...
#'   \enumerate{
#'     \item The column name for the dependent variable.
#'     \item The set of column names for the grouping variables.
#'   }
#'   If only the column name for the dependent variable is provided,
#'   the function assumes all remaining columns are the set of
#'   grouping variables.
#' @param dm An object of class \code{designmatrix}. If provided,
#'   the function will update the object based on the specifications
#'   of the \code{summary_matrix} element.
#' @param digits The number of digits to round to when
#'   computing the group means.
#'
#' @return An object of class \code{designmatrix}. The \code{print}
#'   method displays the combination of levels and associated
#'   rows of the design matrix. The \code{summary} method
#'   displays the combination of levels and their associated
#'   descriptive statistics (mean, standard deviation,
#'   standard error of the mean, and sample size). The method
#'   \code{subset} can be used to access the summary matrix
#'   (or the full design matrix if \code{summary} is \code{TRUE}),
#'   or to replace elements of the summary matrix.
#'
#'   The \code{plot} method generates a simple plot of the group
#'   means (filled circles). If elements of the design matrix are
#'   non-zero, the method fits a simple linear model to the data
#'   using the specified design matrix and shows the resulting
#'   estimates for the group means (empty circles). If the option
#'   \code{intercept} is set to \code{TRUE} (the default), the method
#'   assumes the intercept term is defined in the user-submitted
#'   design matrix. The option \code{error_bars}, if \code{TRUE},
#'   adds approximate 95% uncertainty intervals with boundaries
#'   equal to +/- 2 standard errors relative to the observed means.
#'   The option \code{exclude_effects} takes a string character
#'   with the column names for the design matrix. The method
#'   then displays what the estimated means would be were these
#'   columns excluded from from the design matrix (empty blue circles).
#'   Additional plotting options for \code{\link[graphics]{plot}},
#'   excluding \code{xlab}, \code{ylab}, \code{xaxt}, \code{pch},
#'   and \code{bty}, can be supplied.
#'
#' @examples
#' # Example with R data set 'PlantGrowth'
#' dm = designmatrix( PlantGrowth, list( 'weight', 'group' ) )
#' # Summarize group means
#' summary( dm )
#'
#' # Specify design matrix
#' # Main effect of treatment 1
#' subset( dm )[1:2,2] = c(-1,1)
#' # Main effect of treatment 2
#' subset( dm )[c(1,3),3] = c(-1,1)
#'
#' # Update summaries and full design matrix
#' dm = designmatrix( dm )
#'
#' # Example of methods
#' print( dm )
#' plot( dm, intercept = T, error_bars = T, exclude_effects = c( 'X2', 'X3' ) )
#'
#' # Example analysis
#' dtba = getdata( dm )
#' lmf = lm( DV ~ -1 + ., data = dtba )
#'
#' @export

designmatrix = function( df = NULL,
                         select = NULL,
                         dm = NULL,
                         digits = 2 ) {

  # Labels for descriptive statistics
  ds_vrb = c( 'M', 'SD', 'SEM', 'N' )

  # Check if 'df' input is provided
  if ( !is.null( df ) ) { # Open (1)

    # If is of class 'designmatrix' proceed to next segment
    if ( is.designmatrix( df ) ) {

      dm = df

    } else { # Open (2)

      # Check input is a data frame
      if ( !is.data.frame( df ) ) {
        stop( paste(
          "Either need 1) a data frame of conditions and observations",
          "(and a list with at least the column name for the dependent",
          "variable), or 2) an object of class 'designmatrix'"
        ), call. = FALSE )

      }

      # Define error message to return for issues related
      # to the 'select' input
      err_msg = function( type = 1 ) {

        if ( type == 1 ) {
          stop( paste(
            "Provide a list giving 1) the dependent variable name,",
            "and 2) the set of names for the grouping variables" ),
            call. = F )
        }
        if ( type == 2 ) {
          warning( paste(
            "Only first element in character string was used",
            "as column name for dependent variable"
          ), call. = FALSE )
        }
        if ( type == 3 ) {
          stop( paste(
            "At least some provided column names are",
            "not present in data frame"
          ), call. = FALSE )
        }
      }

      # Initialize variables
      DV = NULL
      vrb = NULL

      # If no 'select' input is provided
      if ( is.null( select ) ) {
        err_msg()
      } else { # Open (3)

        # If a character string is provided
        # convert it to a list
        if ( is.character( select ) ) {
          if ( length( select ) == 1 ) {
            select = list( select )
          } else {
            select = list( select[1] )
            err_msg( type = 2 )
          }
        }

        # If 'select' input is not a list
        if ( !is.list( select ) ) {
          err_msg()
        } else { # Open (4)

          # If excess list elements are provided
          if ( length( select ) > 2 ) {
            warning( paste(
              "Excess list elements provided when",
              "specifying dependent and grouping",
              "variables"
            ), call. = FALSE )
          }

          # Extract dependent variable
          DV = select[[1]][1]
          if ( length( select[[1]] ) > 1 ) {
            err_msg( type = 2 )
          }

          cn = colnames( df )

          # If no other elements provided in data frame
          if ( length( cn ) == 1 ) {
            stop( paste(
              "Data frame should have both a dependent",
              "variable and at least one grouping variable"
            ), call. = FALSE )
          }

          # If provided name is not in column names
          if ( !( DV %in% cn ) ) {
            err_msg( type = 3 )
          } else { # Open (5)

            # If only one element, assume
            # remaining columns are grouping variables
            if ( length( select ) == 1 ) {
              vrb = cn[ cn != DV ]
            } else {
              vrb = select[[2]]
            }

            # If grouping variables are not in data frame
            if ( !all( vrb %in% cn ) ) {
              err_msg( type = 3 )
            }

          } # Close (5)

        } # Close (4)

        # Final check to make sure a dependent variable
        # and set of grouping variables were extracted
        if ( is.null( DV ) | is.null( vrb ) ) {
          err_msg()
        }

      } # Close (3)

      # Check if any of the selected columns have
      # names that overlap with the descriptive statistics
      comp = c( ds_vrb, 'DV' )

      # For dependent variable
      if ( DV %in% comp ) {

        orig_DV = DV
        sel = colnames( df ) %in% DV
        colnames( df )[sel] = paste( DV, '2', sep = '.' )
        DV = paste( DV, '2', sep = '.' )

        warning( paste(
          "Changed", orig_DV, "to", DV,
          "to avoid conflict with labels for descriptive statistics"
        ), call. = FALSE )

      }

      # For grouping variables
      if ( any( vrb %in% comp ) ) {

        orig_vrb = vrb
        sel1 = vrb %in% comp
        sel2 = colnames( df ) %in% vrb[sel1]
        colnames( df )[sel2] = paste( vrb[sel1], '2', sep = '.' )
        vrb[sel1] = paste( vrb[sel1], '2', sep = '.' )

        if ( sum(sel1) > 1 ) {

          string = c(
            paste( '{', paste( orig_vrb[sel1], collapse = ', ' ), '}', sep = '' ),
            paste( '{', paste( vrb[sel1], collapse = ', ' ), '}', sep = '' )
          )

          warning( paste(
            "Changed", string[1], "to", string[2],
            "to avoid conflict with labels for descriptive statistics"
          ), call. = FALSE )
        } else {
          warning( paste(
            "Changed", orig_vrb[sel1], "to", vrb[sel1],
            "to avoid conflict with labels for descriptive statistics"
          ), call. = FALSE )
        }

      }

      # Generic name for dependent variable
      df$DV = df[[ DV ]]

      # Summary of group means based on
      # subset of grouping variables
      sm = df %>%
        group_by_at( vars(one_of(vrb)) ) %>%
        summarize(
          M = round( mean( DV ), digits = digits ),
          SD = round( sd( DV ), digits = digits ),
          SEM = round( sd( DV )/sqrt( length( DV ) ), digits = digits ),
          N = length( DV )
        ) %>%
        as.data.frame()

      # Initialize design matrix
      DM = matrix( 0, nrow( df ), nrow( sm ) )
      DM[,1] = 1 # By default, set first column as an intercept
      colnames( DM ) = paste( 'X', 1:nrow( sm ), sep = '' )
      # Initialize summary matrix
      X = matrix( 0, nrow( sm ), nrow( sm ) )
      X[,1] = 1 # By default, set first column as an intercept
      colnames( X ) = paste( 'X', 1:nrow( sm ), sep = '' )

      out = list(
        group_means = sm,
        summary_matrix = X,
        design_matrix = DM,
        combined = cbind( sm, X ),
        data = df,
        dv = DV,
        variables = vrb,
        algebra_group_means = NULL
      )
      out$combined = out$combined[ , !(colnames( out$combined ) %in% ds_vrb) ]
      # Determine algebra for marginal means
      out$algebra_group_means =
        marginal_means_algebra( out )
      class( out ) = "designmatrix"

      return( out )

    } # Close (2)

  } # Close (1)

  # If a object of class 'designmatrix' is provided,
  # update based on summary matrix
  if ( !is.null( dm ) ) { # Open (1)

    if ( is.designmatrix( dm ) ) { # Open (2)

      df = dm$data
      X = dm$summary_matrix
      DM = dm$design_matrix
      sm = dm$group_means
      nc = ncol( sm ) - length( ds_vrb )
      vrb = colnames( sm )[1:nc]

      nrow_sm = nrow( sm )
      nrow_df = nrow( df )
      n_vrb = length( vrb )
      log_val = rep( F, n_vrb )

      sm = as.list( sm )
      df = as.list( df )

      # Loop over rows of summary matrix
      for ( i in 1:nrow_sm ) {

        sel = logical( nrow_df )

        # Loop over rows for data
        for ( j in 1:nrow_df ) {

          # Loop over subset of columns
          for ( k in 1:n_vrb ) {
            # Identify which row of summary matrix current
            # column matches
            log_val[k] = df[[ vrb[k] ]][j] == sm[[ vrb[k] ]][i]
          }
          sel[j] = all( log_val )

        }
        DM[sel,] = matrix( X[i,], sum(sel), ncol( X ), byrow = T )

      }
      colnames( DM ) = colnames( X )

      # Convert back to data frames
      sm = data.frame( sm, stringsAsFactors = F )
      df = data.frame( df, stringsAsFactors = F )

      dm$design_matrix = DM
      dm$combined = cbind( sm, X )
      dm$combined = dm$combined[ , !(colnames( dm$combined ) %in% ds_vrb) ]
      dm$algebra_group_means =
        marginal_means_algebra( dm )

      return( dm )

    } else {
      stop( "Input must be a 'designmatrix' object.",
            call. = F )
    } # Close (2)

  } # Close (1)
}

###
### 3) Methods
###

# 3.1)
#' @rdname designmatrix
#' @export

is.designmatrix = function( x )
  inherits( x, "designmatrix" )

# 3.2)
#' @rdname designmatrix
#' @export

print.designmatrix = function( x ) {
  print( x$combined )
}

# 3.3)
#' @rdname designmatrix
#' @export

summary.designmatrix = function( x ) {
  return( x$group_means )
}

# 3.4)
#' @rdname designmatrix
#' @export

plot.designmatrix = function( x,
                              intercept = T,
                              exclude_effects = NULL,
                              error_bars = F,
                              average = T,
                              ... ) {

  df = x$data
  DM = x$design_matrix
  sel = apply( DM, 2, function(x) any( x != 0 ) )

  if ( any( sel ) ) {

    dtba = data.frame(
      y = df[[ x$dv ]]
    )

    cur_DM = as.matrix( DM[,sel] )
    colnames( cur_DM ) = colnames( DM )[sel]
    nd = x$combined
    nd_exclude = NULL
    if ( !is.null( exclude_effects ) ) {
      nd_exclude = nd
      nd_exclude[, colnames(nd) %in% exclude_effects ] = 0
    }

    dtba = cbind( dtba, cur_DM )

    if ( !intercept ) {
      lmf = lm( y ~ ., data = dtba )
    } else {
      lmf = lm( y ~ -1 + ., data = dtba )
    }
    prd = predict( lmf, newdata = nd )
    prd_exclude = NULL
    if ( !is.null( exclude_effects ) ) {
      prd_exclude = predict( lmf, newdata = nd_exclude )
    }
  }

  if ( error_bars ) {
    yl = range( c( x$group_means$M - 2*x$group_means$SEM,
                   x$group_means$M + 2*x$group_means$SEM ) )

    plot( 1:nrow( x$combined ),
          x$group_means$M,
          pch = 19,
          xlab = 'Conditions',
          ylab = 'Group means',
          bty = 'n',
          xaxt = 'n',
          ylim = yl,
          ...
    )
    axis( 1, 1:nrow( x$combined ) )

    segments( 1:nrow( x$group_means ),
              x$group_means$M - 2*x$group_means$SEM,
              1:nrow( x$group_means ),
              x$group_means$M + 2*x$group_means$SEM
    )

  } else {

    plot( 1:nrow( x$combined ),
          x$group_means$M,
          pch = 19,
          xlab = 'Conditions',
          ylab = 'Group means',
          bty = 'n',
          xaxt = 'n',
          ...
    )
    axis( 1, 1:nrow( x$combined ) )
  }

  if ( average ) {
    abline( h = mean( x$group_means$M ), lty = 2 )
  }

  if ( any( sel ) ) {

    points( 1:length( prd ),
            prd,
            pch = 21,
            bg = NA,
            cex = 2 )

    if ( !is.null( prd_exclude ) ) {
      points( 1:length( prd_exclude ),
              prd_exclude,
              pch = 21,
              col = 'blue',
              bg = NA,
              cex = 2 )
    }


  }

  # Extract plotting dimensions
  plt_dm = par('usr')

  # Add legend
  legend( plt_dm[1],
          plt_dm[4] + diff( plt_dm[3:4] )*.1,
          c( 'Observed',
             'Predicted',
             'Predicted - excluded'
          ),
          pch = c( 19, 21, 21 ),
          col = c( 'black',
                   'black',
                   'blue' ),
          horiz = T,
          bty = 'n',
          xpd = T
  )

  # legend(
  # )

}

# 3.5)
#' @rdname designmatrix
#' @export

subset.designmatrix = function( x, summary = T ) {

  if ( summary ) {
    # Return the summary matrix
    return( x$summary_matrix )

  } else {
    # Return the full design matrix
    return( x$design_matrix )

  }
}

# 3.6)
#' @rdname designmatrix
#' @export

`subset<-` = function( x, value, update = F ) {

  if ( !is.designmatrix( x ) ) {
    err_msg = paste( "Object being subsetted",
                     "must be of class",
                     "'designmatrix'" )
    stop( err_msg, call. = F )
  }

  if ( !is.matrix( value ) ) {
    err_msg = paste( "Input must be a matrix" )
    stop( err_msg, call. = F )
  }

  sm.nr = nrow( x$summary_matrix )
  sm.nc = ncol( x$summary_matrix )

  nr = nrow( value )
  nc = ncol( value )

  if ( nr != sm.nr ) {
    err_msg = paste( "Input must have same number of rows" )
    stop( err_msg, call. = F )
  }

  if ( nc > sm.nc ) {
    err_msg = paste( "Input cannot have more columns" )
    stop( err_msg, call. = F )
  }

  if ( nc == sm.nc ) {
    x$summary_matrix <- value
    if ( update ) x = designmatrix( x )
    return(x)
  }

  if ( nc < sm.nc ) {

    cn = colnames( value )
    sm.cn = colnames( x$summary_matrix )

    if ( all( cn %in% sm.cn ) ) {
      x$summary_matrix[,cn] <- value
      if ( update ) x = designmatrix( x )
      return(x)
    } else {
      err_msg = paste( "Column names must match" )
      stop( err_msg, call. = F )
    }

  }

}
rettopnivek/designmatrix documentation built on Jan. 13, 2020, 7:09 p.m.