R/syn.strata.r

Defines functions syn.strata

Documented in syn.strata

# Strata can be provided as a vector (=flag) or as variable names 
# If all stratifying values have the same values within strata they will be
# automatically removed; if not they will be taken into account

# ????? 
#! probably NOT possible and syn.strata needs to be kept as a separate FUN
# creating generic syn function
# changing syn to syn.default
# and if strata != NULL calling syn.strata
# ?????

# Other issues:
# - multiple printout (per strata, per m)
# - compare() by strata?
# - k as a sum?

syn.strata <- function(data, strata = NULL, 
                minstratumsize = 10 + 10 * length(visit.sequence), 
                tab.strataobs = TRUE, tab.stratasyn = FALSE,
                method = "cart", visit.sequence = (1:ncol(data)),
                predictor.matrix = NULL,
                m = 1, k = nrow(data), proper = FALSE,
                minnumlevels = 1, maxfaclevels = 60,
                rules = NULL, rvalues = NULL,
                cont.na = NULL, semicont = NULL,
                smoothing = NULL, event = NULL, denom = NULL,
                drop.not.used = FALSE, drop.pred.only = FALSE,
                default.method = c("normrank","logreg","polyreg","polr"),
                numtocat = NULL, catgroups = rep(5,length(numtocat)), 
                models = FALSE,
                print.flag = TRUE,
                seed = "sample",
                ...){
 
 m0 <- max(1, m)

 if (!is.na(seed) & seed == "sample") {
   seed <- sample.int(1e9,1)
 }
 if (!is.na(seed)) set.seed(seed)
 
 # CHECKS
 #--------
 if (is.null(strata)) stop("Argument strata is missing.", call. = FALSE) 
 # If strata given as variable names (check if they exist) 
 # -> change into one factor with strata names 
 if (is.character(strata) & any(!duplicated(strata))) {
   varindex <- match(strata, colnames(data))
   if (any(is.na(varindex))) stop("Unrecognized variable(s) in strata: ", 
     paste(strata[is.na(varindex)],collapse=", "), call. = FALSE)
   else {
     dstrataNA  <- lapply(data[, strata, drop = FALSE], addNA, ifany = TRUE) # change NA to a factor level
     strata.lab <- interaction(dstrataNA, drop = TRUE, sep = "_")
     #strata.varnames <- paste0(strata, collapse="_") 
   }
 } else {
 # check length of strata vector if given as vector; check for missing
   if (length(strata) != nrow(data)) stop(paste("The length of strata index (",
     length(strata), ") does not match the number of rows in the data (",
     nrow(data),").",sep=""), call. = FALSE)
   if (any(is.na(strata))) stop("Strata indicator cannot have missing values.", 
     call. = FALSE)
   strata.lab <- factor(strata)
 }
 #--------
 
 # make sure stratification variables are included in visit.sequence
 # important when drop.not.used==T
 if (is.character(strata) & any(!duplicated(strata))){   #GR-20/10/2016 drop.not.used == TRUE removed from the condition
   strata <- match(strata, colnames(data))
   if (is.character(visit.sequence)) visit.sequence <- match(visit.sequence, colnames(data))
   if (any(is.na(visit.sequence))) stop("Unrecognized variable(s) in visit.sequence.", call. = FALSE)
   visit.sequence <- c(visit.sequence, strata[!(strata %in% visit.sequence)])
 }
 
 stratalev.lab <- levels(strata.lab) 
 strata.n.obs  <- table(strata.lab)   
 nstrata       <- dim(strata.n.obs)

 if (tab.strataobs == TRUE) {
   cat("Number of observations in strata (original data):")
   print(table(strata.lab, deparse.level = 0))
 }

 # check min number of observations in strata
 stratasize.stop <- minstratumsize         
 stratasize.warn <- 100 + 10*length(visit.sequence)

 smallstrata <- sum(strata.n.obs < stratasize.stop)
 if (smallstrata > 5) stop("In the original data multiple strata have fewer than the recommended\nnumber of observations. We advise that each should have at least ",
   stratasize.stop, " observations\n('minstratumsize' which by default is 10 + 10 * no. of variables used in prediction).\n",
   "You can override this by setting the parameter 'minstratumsize' to a lower value.\n", 
   sep = "", call. = FALSE)
 if (smallstrata > 0) stop("In the original data some strata (", 
   paste(stratalev.lab[strata.n.obs < stratasize.stop], collapse = ", "), 
   ") have fewer than the recommended\nnumber of observations. We advise that each should have at least ",
   stratasize.stop, " observations\n('minstratumsize' which by default is 10 + 10 * no. of variables used in prediction).\n",
   "You can override this by setting the parameter 'minstratumsize' to a lower value.\n", 
   sep = "", call. = FALSE)
 if (any(strata.n.obs < stratasize.warn) & print.flag == TRUE) {
   cat("CAUTION: In the original data some strata (", 
   paste(stratalev.lab[strata.n.obs < stratasize.warn], collapse=", "), 
   ") have limited numbers of observations.\nWe advise that there should be at least ", 
   stratasize.warn, 
   " observations (100 + 10 * no. of variables\nused in prediction).\n", sep = "")
 }
 
 synds.names <- c("call", "m", "syn", "method", "visit.sequence", 
   "predictor.matrix", "smoothing", "event", "denom", "proper", "n", "k", 
   "rules", "rvalues", "cont.na", "semicont", "drop.not.used", "drop.pred.only", 
   "models", "seed", "var.lab", "val.lab", "obs.vars", "strata.syn", "strata.lab","numtocat","catgroups")
 synds <- list(setNames(vector("list",length(synds.names)),synds.names)) 
 synds <- rep(synds, m0)
 sel.names <- match(c("call", "m", "predictor.matrix", "proper", "strata.syn",
   "strata.lab", "seed", "numtocat", "catgroups", "models"), synds.names)
 same.by.m <- c("call", "m", "method", "visit.sequence", "predictor.matrix", 
   "smoothing", "event", "denom", "proper", "n", "rules", "rvalues", "cont.na", 
   "semicont", "drop.not.used", "drop.pred.only",  "seed", "var.lab", 
   "val.lab", "obs.vars", "strata.lab","numtocat","catgroups")
 same.by.m.idx <- match(same.by.m, synds.names) 
  
 syn.args <- as.list(match.call()[-1])
   
 strata.n.syn <- vector("list", m0) 
 for (j in 1:m0){ 
   synds[[j]]$strata.syn <- sort(factor(sample(stratalev.lab, k, replace = TRUE, 
     prob = strata.n.obs), levels = stratalev.lab, exclude = NULL))   
   synds[[j]]$strata.lab <- stratalev.lab
   strata.n.syn[[j]] <- table(synds[[j]]$strata.syn, deparse.level = 0)
   if (tab.stratasyn == TRUE) {  
     cat("\nNumber of observations in strata (synthetic data, m = ", j, "):", sep="")
     print(strata.n.syn[[j]])
   }
 }
 #Different way of printing (all syn in one table)  
 #cat("\nNumber of observations in strata (synthetic data):\n")
 #starta.n.syn.df <- do.call("rbind", strata.n.syn) 
 #rownames(starta.n.syn.df) <- paste0("m = ", 1:m)
 #print(starta.n.syn.df)

 for (j in 1:m0){  
   synds.ind <- vector("list", nstrata) # results by stratum  
   # exclude args that are not in syn()
   syn.args$strata <- syn.args$tab.stratasyn <- syn.args$tab.strataobs <- 
     syn.args$minstratumsize <- NULL
   syn.args$m <- 1
   syn.args$visit.sequence <- visit.sequence
   # vs <- visit.sequence
   # syn.args$visit.sequence <- c(vs[(vs %in% strata)],vs[!(vs %in% strata)])  #!GR 9/17 move strata to start of visit sequence
  
   
   for (i in 1:nstrata) {
     if (print.flag) cat("\nm = ",j,", strata = ", stratalev.lab[i],
       "\n-----------------------------------------------------\n", sep="")
     if (!is.na(stratalev.lab[i])) {
       syn.args$data <- data[strata.lab == stratalev.lab[i],]  
     } else {
       syn.args$data <- data[is.na(as.character(strata.lab)),]
     }
     syn.args$k <- strata.n.syn[[j]][i]; names(syn.args$k) <- "strata"
     syn.args$seed <- NA 
     if (syn.args$k == 0 | m==0) syn.args$m <- 0 else syn.args$m <- 1
     synds.ind[[i]] <- do.call("syn", syn.args)
   }

   synds[[j]]$call <- match.call()
   synds[[j]]$m <- m
   synds[[j]]$proper <- proper
   synds[[j]]$predictor.matrix <- eval(parse(text = paste0("list(", 
     paste0("synds.ind[[", 1:nstrata, "]]$predictor.matrix", collapse = ", "),")")))
   synds[[j]]$models <- eval(parse(text = paste0("list(", 
     paste0("synds.ind[[", 1:nstrata, "]]$models", collapse = ", "),")")))
   synds[[j]]$seed <- seed
   synds[[j]][-sel.names] <- eval(parse(text = paste0("Map(rbind, ", 
     paste0("synds.ind[[", 1:nstrata, "]][-sel.names]", collapse = ", "),")")))
   if (k > 0 & m > 0) rownames(synds[[j]]$syn) <- 1:k
 }
 
 if (m==1 | m==0) {
   synds <- synds[[1]] 
 } else {
   synds <- eval(parse(text = paste0("Map(list, ", paste0("synds[[", 1:m, "]]", 
     collapse = ", "),")")))
   synds[same.by.m.idx] <- lapply(same.by.m.idx, function(x) synds[[x]][[1]])
 }

 if (m==0) cat("\nCAUTION: method, visit.sequence and predictor.matrix are lists that may vary by stratum and\nshould not be used to initialise syn. For initialising rerun with m = 0 without stratification.\n\n") 
 
 class(synds) <- "synds"
 return(synds) 
}

Try the synthpop package in your browser

Any scripts or data that you put into this service are public.

synthpop documentation built on Aug. 31, 2022, 5:06 p.m.