R/cv.trans.psa.R
In PSAgraphics: Propensity Score Analysis Graphics

Documented in cv.trans.psa

#' Transformation of Factors to Individual Levels
#'
#' The function \code{cv.trans.psa} takes a covariate data frame and replaces
#' each categorical covariate of n >=3 levels with \code{n} new binary
#' covariate columns, one for each level.  Transforms covariate dataframe for
#' use with the function \code{cv.bal.psa}.
#'
#'
#' @param covariates A dataframe of covariates, presumably some factors.
#' @param fcol An optional vector containing the factor columns in the
#' covariate dataframe.  In NULL (default) routine to identfy factors
#' internally.
#' @return Returns a dataframe \code{covariates.transformed} containing new
#' columns for each level of more than binary factors.  The rest of the
#' covariate dataframe stays unchanged.
#' @author James E. Helmreich \email{James.Helmreich@@Marist.edu}
#'
#' Robert M. Pruzek \email{RMPruzek@@yahoo.com}
#'
#' KuangNan Xiong \email{harryxkn@@yahoo.com}
#' @seealso \code{\link{cv.bal.psa}}, \code{\link{loess.psa}},
#' \code{\link{cstrata.psa}}, \code{\link{cv.trans.psa}}
#' @examples
#'
#' #Note reordering of columns, binary factor and numeric column are unchanged.
#' f2 <- factor(sample(c(0, 1), 20, replace = TRUE))
#' f4 <- factor(sample(c("a", "b", "c", "d"), 20, replace = TRUE))
#' cv <- rnorm(20)
#' X <- data.frame(f2, f4, cv)
#' cv.trans.psa(X)
#' #
#' f2 <- factor(sample(c('c', 'C'), 20, replace = TRUE))
#' f4 <- factor(sample(c("b", "A", "d", "CC"), 20, replace = TRUE))
#' cv <- rnorm(20)
#' X <- data.frame(f2, f4, cv)
#' cv.trans.psa(X)
#'
#' @export cv.trans.psa
cv.trans.psa <- function(covariates, fcol = NULL) {
   #cv.trans.psa takes the covariate data frame and replaces each categorical
   #covariate with n >=3 levels with n new binary covariate columns, one for each level.
   # covariates: dataframe of covariates
   # fcol: columns containing categorical covariates to be exploded.  If 0 then routine identifies
   # categorical columns.
   X <- covariates

   #J: col2 will be vector with columns that are factors, either user defined or found internally.

   n.rows <- dim(X)[1]
   col2 <- fcol
   if (is.null(fcol))
   {
      xclass <- sapply(X, is.numeric)
      for (i in 1:dim(X)[2]) {
         if (!xclass[i]) {
            col2[i] <- i
         } else{
            col2[i] <- 0
         }
      }
   }

   #J: get rid of possible 0s in col2
   col2 <- col2[col2[] != 0]
   if (sum(col2) == 0) {
      stop("No Categorical/Factor Columns Identified")
   }


   #J: Binary factors become 0:1.  n-level factors (n >= 3) become n distinct columns,
   #coded 0:1 for each level in the obvious way.

   fac.num <- length(col2)
   d <- matrix(0, nrow = dim(X)[1], ncol = fac.num)
   fac.size <- NULL
   fac.codes <- NULL

   for (i in 1:fac.num)
   {
      # First find the levels of each factor, create the names for each of the new columns,
      # and code (in the matrix 'd') the factors numerically if they weren't already so coded.
      fac.i <- X[, col2[i]]
      fac.levels <- matrix(sort(unique(fac.i)))
      num.levels <- length(fac.levels)
      colnames(fac.levels) <- dimnames(X)[[2]][col2[i]]
      rownames(fac.levels) <-
         paste(dimnames(X)[[2]][col2[i]], 1:num.levels)
      fac.codes <- rbind(fac.codes, fac.levels)
      colnames(fac.codes) <- c("Levels")

      # fac.size will be 1 for a binary factor, number of levels for other factors.
      if (num.levels == 2) {
         fac.size[i] = 1
      } else {
         fac.size[i] = num.levels
      }
      for (j in 1:n.rows)
      {
         for (k in 1:num.levels)
         {
            if (fac.levels[k] == fac.i[j])
               d[j, i] = k - 1
         }
      }
   }

   # dd has one column for each binary factor, and one column for each *level* of other factors.  Same for dlabel
   dd = matrix(0, nrow = n.rows, ncol = sum(fac.size))
   dlabel = matrix(0, nrow = 1, ncol = sum(fac.size))
   ko = 0
   for (p in 1:fac.num)
   {
      h1 = ko + 1
      ko = ko + fac.size[p]
      h2 = ko

      if (fac.size[p] > 2)
      {
         fac.p <- X[, col2[p]]
         mmdp <- matrix(0, n.rows, length(unique(d[, p])))
         dd[, h1:h2]  <- ifelse(d[, p] == (col(mmdp) - 1), 1, 0)
         dlabel[, h1:h2] = paste(dimnames(X)[[2]][col2[p]], sort(unique(fac.p)), sep =
                                    '_')
      }
      if (fac.size[p] == 1)
      {
         fac.p <- X[, col2[p]]
         dd[, h2] = d[, p]
         dlabel[, h2] = paste(dimnames(X)[[2]][col2[p]], sort(unique(fac.p))[2], sep =
                                 '_')
      }

      #  if (fac.size[p] > 2)
      #     {mmdp <- matrix(0, n.rows, length(unique(d[,p])))
      #      dd[ ,h1:h2]  <- ifelse(d[,p] == (col(mmdp)-1), 1, 0)
      #      dlabel[ ,h1:h2] = paste(dimnames(X)[[2]][col2[p]], 1:fac.size[p], sep='_')
      #     }
      #   if (fac.size[p] == 1)
      #     {dd[,h2] = d[,p]
      #      dlabel[,h2] = dimnames(X)[[2]][col2[p]]
      #     }


      X[, col2[p]] = d[, p]
   }

   colnames(dd) = dlabel[1, ]
   X2 = cbind(X, dd)
   X = X2[, -col2]


   out <- list(X)
   names(out) <- c("covariates.transformed")
   return(out)

}