# R/DataFrameMaker.R

#' Create a data frame of artificial health data (categorical variables only)
#'
#' Builds a toy data set of \code{V} three-level categorical explanatory
#' variables, with some values set to \code{NA} at random, plus a binary
#' response column named \code{Condition}.
#'
#' @param N number of rows for the data frame (a single non-negative whole
#'   number).
#' @param V number of explanatory variables, a whole number from 1 to 26.
#'   Each has three levels a, b and c.
#' @param pNA the proportion of values to make NA at random, between 0 and 1.
#'   Exactly \code{round(N * pNA)} values are set to \code{NA} in each
#'   explanatory column.
#' @param pd the probability, between 0 and 1, that a row of the binary
#'   response variable is 'Diseased' (so the expected proportion of diseased
#'   rows).
#'
#' @return A data frame with \code{N} rows and \code{V + 1} columns. The
#'   explanatory columns are factors with levels \code{a}, \code{b} and
#'   \code{c}, named with the last \code{V} capital letters of the alphabet
#'   (e.g. \code{V} to \code{Z} for \code{V = 5}). The final column,
#'   \code{Condition}, is a factor with levels \code{Clear} and
#'   \code{Diseased} and contains no \code{NA}.
#'
#' @details It can be useful to create a toy data frame of categorical
#' explanatory variables with NAs included. This will produce any number of
#' columns up to a maximum of 26 categorical explanatory variables, each of
#' three levels.
#'
#' @examples
#' DF <- dfMaker(N = 5000, V = 5, pNA = 0.02, pd = 0.1)
#' str(DF)
dfMaker <- function(N = 10000, V = 12, pNA = 0.1, pd = 0.5) {
  # Fail early with a clear message instead of an obscure sampling or
  # indexing error further down.
  stopifnot(
    is.numeric(N), length(N) == 1L, !is.na(N), N >= 0, N == round(N),
    is.numeric(V), length(V) == 1L, !is.na(V), V >= 1, V <= 26,
    V == round(V),
    is.numeric(pNA), length(pNA) == 1L, !is.na(pNA), pNA >= 0, pNA <= 1,
    is.numeric(pd), length(pd) == 1L, !is.na(pd), pd >= 0, pd <= 1
  )

  lvls <- c("a", "b", "c")
  n_missing <- round(N * pNA)

  # Draw the raw category values for every explanatory column first.
  # seq_len() keeps V = 1 from iterating backwards the way 2:V would.
  values <- lapply(seq_len(V), function(i) sample(lvls, N, replace = TRUE))

  # Blank out n_missing positions per column, then fix the level set so a
  # level that happens not to be drawn is still present in the factor.
  cols <- lapply(values, function(x) {
    x[sample.int(N, n_missing)] <- NA
    factor(x, levels = lvls)
  })

  # Columns take the last V capital letters, so the final one is always "Z".
  names(cols) <- LETTERS[seq.int(27 - V, 26)]
  out <- as.data.frame(cols, stringsAsFactors = FALSE)

  # Explicit levels keep both outcomes available even when pd is 0 or 1.
  out$Condition <- factor(
    sample(c("Diseased", "Clear"), N, replace = TRUE, prob = c(pd, 1 - pd)),
    levels = c("Clear", "Diseased")
  )
  out
}
# helophilus/ColsTools documentation built on May 30, 2019, 4:03 p.m.