R/compare.syn.r

Defines functions compare.CI dfCI compare.fit.synds dfNA ggnum ggfac pertable compare.list compare.synds compare.default compare

Documented in compare compare.fit.synds compare.list compare.synds

###-----compare------------------------------------------------------------
compare <- function(object, data, ...) UseMethod("compare")

###-----compare.default----------------------------------------------------
compare.default <- function(object, ...)
  stop("No compare method associated with class ", class(object), call. = FALSE)


###-----compare.synds------------------------------------------------------
compare.synds <- function(object, data, vars = NULL, msel = NULL,  
                          stat = "percents", breaks = 20, 
                          nrow = 2, ncol = 2, rel.size.x = 1,
                          utility.stats = c("pMSE", "S_pMSE", "df"),
                          utility.for.plot = "S_pMSE",  
                          cols = c("#1A3C5A","#4187BF"),
                          plot = TRUE, table = FALSE, ...){    
                                                                         
 if (is.null(data)) stop("Requires parameter data to give name of the real data.\n", call. = FALSE)
 if (!is.data.frame(data)) stop("Argument data must be a data frame.\n", call. = FALSE) 
  
 if (any(class(data) %in% c("tbl", "tbl_df"))) data <- as.data.frame(data)  
  
 if (!inherits(object, "synds")) stop("Object must have class synds.\n", call. = FALSE )                                                                    
 if (!is.null(msel) & !all(msel %in% (1:object$m))) stop("Invalid synthesis number(s).", call. = FALSE)
 if (!all(utility.stats %in% c("VW", "FT", "JSD", "SPECKS", "WMabsDD", "U", "G", "pMSE", "PO50", "MabsDD", "dBhatt",
                              "S_VW", "S_FT", "S_JSD", "S_WMabsDD", "S_G", "S_pMSE", "df", "all"))) 
  stop('utility.stats must be set to "all" or selected from "VW", "FT", "JSD", "SPECKS", "WMabsDD", "U", "G", "pMSE", "PO50", "MabsDD", "dBhatt", "S_VW", "S_FT", "S_JSD", "S_WMabsDD", "S_G", "S_pMSE" or "df".\n', call. = FALSE)  
 
  if (!is.null(utility.for.plot) &&
    !(length(utility.for.plot) == 1 & utility.for.plot %in% c("VW", "FT", "JSD", "SPECKS", "WMabsDD", "U", "G", "pMSE", "PO50", "MabsDD", "dBhatt",
                                "S_VW", "S_FT", "S_JSD", "S_WMabsDD", "S_G", "S_pMSE"))) 
    stop('utility.for.plot must be one of "VW", "FT", "JSD", "SPECKS", "WMabsDD", "U", "G", "pMSE", "PO50", "MabsDD", "dBhatt", "S_VW", "S_FT", "S_JSD", "S_WMabsDD", "S_G" or "S_pMSE" .\n', call. = FALSE)  
  if (!is.null(utility.for.plot) && !utility.for.plot %in% utility.stats ) cat(utility.for.plot, "used on plots added to table of results.\n")   
  utility.stats <- unique(c(utility.stats, utility.for.plot))
   
 if (!(length(stat) == 1 & stat %in% c("percents", "counts"))) { 
   cat('Parameter stat must be "percents" or "counts".\n' )
   stat <- "percents"
   cat('Changed to default value of "percents".\n')
 }

 # single / pooled synthetic data sets                                  
 if (object$m == 1) {
   syndsall <- object$syn 
 } else if (length(msel) == 1) {
   syndsall <- object$syn[[msel]]
 } else if (length(msel) > 1 | is.null(msel)) {                               
   syndsall <- do.call(rbind,object$syn)
 }
 # list of synthetic data sets for non-pooled results 
 if (length(msel) > 1) {
   synds <- vector("list",length(msel))
   for (i in 1:length(msel)) synds[[i]] <- object$syn[[msel[i]]]
 }
 synnames    <- names(syndsall)
 realnames   <- names(data)
 commonnames <- synnames[match(realnames,synnames)]
 commonnames <- commonnames[!is.na(commonnames)]

 if (!is.null(vars)) {
   if (!(all(vars %in% synnames))) stop("Variable(s) ", 
     paste0(vars[is.na(match(vars,synnames))], collapse = ", "),
     " not in synthetic data \n", call. = FALSE)
   if (!(all(vars %in% realnames))) stop("Variable(s) ", 
     paste0(vars[is.na(match(vars,realnames))], collapse = ", "),
     " not in observed data \n", call. = FALSE)
   commonnames <- commonnames[match(vars,commonnames)]
 }
 
 if (!(all(synnames %in% realnames))) cat("Warning: Variable(s)", 
   paste0(synnames[is.na(match(synnames,realnames))], collapse = ", "),
   "in synthetic object but not in observed data\n",
   " Looks like you might not have the correct data for comparison\n")
 
 if ((length(commonnames) == 0) && (typeof(commonnames) == "character"))        #! when would it apply?
   stop("None of variables selected for comparison in data", call. = FALSE)

 df.obs    <- data[, commonnames, drop = FALSE]
 df.synall <- syndsall[, commonnames, drop = FALSE]
 if (length(msel) > 1) {
   df.syn <- vector("list",length(msel))
   for (i in 1:length(msel)) df.syn[[i]] <- synds[[i]][, commonnames, drop = FALSE]
 }
 
 # change any numeric variables with < 6 distinct values to factors
 for (i in 1:length(commonnames)) {
   if (is.numeric(df.obs[,i]) && length(table(df.obs[,i])) < 6){
     df.obs[,i] <- as.factor(df.obs[,i])
     df.synall[,i] <- as.factor(df.synall[,i])
   }
 }

 num <- sapply(df.synall, is.numeric) | sapply(df.synall, is.integer)  
 fac <- sapply(df.synall, function(x) is.factor(x) | is.logical(x))  

 if (is.null(vars)) {
   if (object$m == 1) vars <- names(object$syn)
   else vars <- names(object$syn[[1]])
 }
 
 if (!is.null(utility.stats) | !is.null(utility.for.plot)) {
   utility.list <- as.list(1:length(vars))
   names(utility.list) <- vars
   if (utility.stats[1] == "all") utility.stats <- 
     c("VW", "FT", "JSD", "SPECKS", "WMabsDD", "U", "G", "pMSE", "PO50", 
       "MabsDD", "dBhatt","S_VW", "S_FT", "S_JSD", "S_WMabsDD", "S_G", 
       "S_pMSE", "df")
   tab.utility <- matrix(NA, length(vars), length(unique(c(utility.stats, utility.for.plot))))
   dimnames(tab.utility) <- list(vars, unique(c(utility.stats, utility.for.plot)))

   for (i in 1:length(vars)) {
     utility.list[[i]] <- utility.tab(object, data, vars = vars[i])
     if (i == 1) tab.ind <- match(utility.stats, names(utility.list[[i]]))
     tab.utility[i, ] <- sapply(utility.list[[i]][tab.ind], mean)
   }
   if (!is.null(utility.for.plot)) utilvals.for.plot <- tab.utility[, match(utility.for.plot, dimnames(tab.utility)[[2]])]
 } else {
   if (is.null(utility.stats)) tab.utility <- NULL
   if (is.null(utility.for.plot)) utilvals.for.plot <- NULL
 }
 # frequency tables for factors
 if (sum(fac) > 0) {
   any.fac.na <- unlist(apply(df.obs[,fac,drop = FALSE],2,function(x) any(is.na(x))))    
   per.obs.fac <- ggfac(df.obs[, fac, drop = FALSE], anyna = any.fac.na, stat = stat)  
   if (length(msel) <= 1) {
     per.syn.facall <- ggfac(df.synall[, fac, drop = FALSE], 
     name = "synthetic", anyna = any.fac.na, stat = stat)
     if (stat == "counts") {
       per.syn.facall$perdf$Count <- per.syn.facall$perdf$Count/object$m
       per.syn.facall$perlist <- lapply(per.syn.facall$perlist,"/",object$m)
     }
   }
   if (length(msel) > 1) {
     per.syn.fac <- vector("list",length(msel))
     for (i in 1:length(msel)) per.syn.fac[[i]] <- ggfac(df.syn[[i]][, fac, drop = FALSE], 
       name = paste0("syn=", msel[i]), anyna = any.fac.na, stat = stat)  
   }
 } else {                                       
   per.obs.fac    <- NULL
   per.syn.facall <- NULL
 }

 # frequency tables for numeric variables
 if (sum(num) > 0) {
   cont.index <- match(colnames(df.obs[, num, drop = FALSE]), colnames(syndsall))    
   na <- object$cont.na[cont.index] 
  # to exclude from summaries if no missing in data
   any.na <- unlist(apply(df.obs[, num, drop = FALSE], 2, function(x) any(is.na(x))))  

   lbreaks    <- as.list(rep(breaks, length(na)))  

  ## get limits(=breaks) from both observed and synthetic
  #--
   df.both  <- rbind.data.frame(df.obs, df.synall)                
   per.both <- ggnum(df.both[, num, drop = FALSE], na = na, 
     breaks = lbreaks, anyna = any.na, stat = stat) ##GR stat added 
  #--  
   per.obs.num <- ggnum(df.obs[, num, drop = FALSE], na = na, 
     breaks = per.both$hbreaks, anyna = any.na, stat = stat) ##GR stat added
   
   if (length(msel) <= 1) {
     per.syn.numall <- ggnum(df.synall[, num, drop = FALSE], 
       name = "synthetic", na = na, breaks = per.both$hbreaks, 
       anyna = any.na, stat = stat) 
       if (stat == "counts") {
         per.syn.numall$perdf$Count <- per.syn.numall$perdf$Count/object$m
         per.syn.numall$perlist <- lapply(per.syn.numall$perlist,"/",object$m)
       }
   } 
   if (length(msel) > 1) {
     per.syn.num <- vector("list",length(msel))
     for (i in 1:length(msel)) per.syn.num[[i]] <- ggnum(df.syn[[i]][, num, drop = FALSE], 
       name =  paste0("syn=",msel[i]), na = na, breaks = per.both$hbreaks, 
       anyna = any.na, stat = stat) 
   } 
 } else {
   per.obs.num <- NULL
   per.syn.numall <- NULL
 }

 # data frame for plotting 
 if (length(msel) <= 1) per.fac <- rbind.data.frame(per.obs.fac$perdf, 
   per.obs.num$perdf, per.syn.facall$perdf, per.syn.numall$perdf )

 if (length(msel) > 1) {
   per.fac <- rbind.data.frame(per.obs.fac$perdf, per.obs.num$perdf)
   for (i in 1:length(msel)) {
     if (sum(fac) > 0) temp.fac <- per.syn.fac[[i]]$perdf else temp.fac <- NULL
     if (sum(num) > 0) temp.num <- per.syn.num[[i]]$perdf else temp.num <- NULL  
     per.fac <- rbind.data.frame(per.fac, temp.fac, temp.num)
   }
 }
  per.fac$Variable <- factor(per.fac$Variable, levels = commonnames, 
   ordered = T, exclude = NULL)
  per.fac$Value <- factor(per.fac$Value, levels = unique(per.fac$Value), 
   ordered = T, exclude = NULL)
 
 # list of result tables
 if (length(msel) <= 1) {
   os.table.fac <- mapply(rbind, obs = per.obs.fac$perlist, 
     syn = per.syn.facall$perlist, SIMPLIFY = FALSE)  
   os.table.num <- mapply(rbind, obs = per.obs.num$perlist, 
     syn = per.syn.numall$perlist, SIMPLIFY = FALSE)  
 }
 if (length(msel) > 1) {
   os.table.fac <- per.obs.fac$perlist 
   os.table.num <- per.obs.num$perlist 
   for (i in 1:length(msel)) {
     if (sum(fac) > 0) temp.fac <- per.syn.fac[[i]]$perlist else temp.fac <- NULL
     if (sum(num) > 0) temp.num <- per.syn.num[[i]]$perlist else temp.num <- NULL
     os.table.fac <- mapply(rbind, os.table.fac, temp.fac, SIMPLIFY = FALSE)    
     os.table.num <- mapply(rbind, os.table.num, temp.num, SIMPLIFY = FALSE)     
   }
  }
 os.table <- c(os.table.fac, os.table.num)
 if (is.null(msel)) {
   for (i in 1:length(os.table)) dimnames(os.table[[i]])[[1]] <- c("observed","synthetic")
 } else {
   for (i in 1:length(os.table)) dimnames(os.table[[i]])[[1]] <- c("observed", paste0("syn=", msel))
 }
 Value <- Percent <- Count <- Data <- NULL    ## otherwise 'no visible binding for global variables'
 # sorts the factor labels in the right order for numeric vars
 per.fac$Value <- as.character(per.fac$Value)
 vals <- unique(per.fac$Value)
 valsnum <- unique(per.fac$Value[per.fac$Variable %in% names(num[num == TRUE])])
 valsnum.nonmiss <- sort(as.numeric(vals[vals %in% valsnum & substr(vals, 1, 4) != "miss"]))
 valsnum.nonmiss <- format(valsnum.nonmiss, scientific = FALSE,
                           trim = TRUE, drop0trailing = TRUE)
 valsnum.miss <- sort(vals[vals %in% valsnum & substr(vals, 1, 4) == "miss"])
 vals[vals %in% valsnum] <- c(valsnum.nonmiss, valsnum.miss)
 per.fac$Value <- factor(as.character(per.fac$Value), levels = vals)
 
 if (!is.null(utility.for.plot)){
   levels(per.fac$Variable) <- paste0(levels(per.fac$Variable), ": ", 
     utility.for.plot, " = ", round(utilvals.for.plot, 2))
   commonnames_lab <- paste0(commonnames, ": ",
     utility.for.plot, " = ", round(utilvals.for.plot, 2))
 } else {
   commonnames_lab <- commonnames
 }

 # get different plots in order of data
 nperplot <- nrow*ncol
 nplots   <- ceiling(length(commonnames)/nperplot)
 plots    <- vector("list", nplots)
 tables   <- vector("list", nplots)
 
 for (i in 1:nplots) {
   min <- (i - 1)*nperplot + 1
   max <- min(length(commonnames), (i - 1)*nperplot + nperplot)
   # tables
   ttables <- vector("list", (max - min + 1))
   names(ttables) <- commonnames[min:max]
   for (j in commonnames[min:max]) {
     ttables[[j]] <- os.table[[j]]
   } 
   tables[[i]] <- ttables

  # plots
   per.fact <- per.fac[per.fac$Variable %in% commonnames_lab[min:max],]
   if (stat == "percents") p <- ggplot(data = per.fact, aes(x = Value, y = Percent, fill = Data))
   else p <- ggplot(data = per.fact, aes(x = Value, y = Count, fill = Data))
   p <- p + geom_bar(position = "dodge", colour = cols[1], stat = "identity") + 
      facet_wrap(~ Variable, scales = "free", ncol = ncol)  
   p <- p + guides(fill = guide_legend(override.aes = list(colour = NULL))) + 
        theme(axis.text.x = element_text(angle = -30, hjust = 0, vjust = 1, size = rel(rel.size.x)), 
              legend.position = "top", 
              legend.key = element_rect(colour = cols[1])) 
   p <- p + theme(legend.title = element_blank())
   if (length(msel) > 1) p <- p + scale_fill_manual(values = c(cols[1], rep(cols[2], length(msel))))
   if (length(msel) <= 1) p <- p + scale_fill_manual(values = cols)
   plots[[i]] <- p
 }
 
 if (length(tables) == 1) {
   tables <- tables[[1]]
   plots  <- plots[[1]]
 }

 res <- list(tables = tables, plots = plots, stat = stat, vars = vars,
             tab.utility = tab.utility, table = table, plot = plot)

 class(res) <- "compare.synds"
 return(res)
}


###-----compare.data.frame---compare.list----------------------------------    
compare.data.frame <- compare.list <- function(object, data, vars = NULL, cont.na = NULL,         
                                               msel = NULL, stat = "percents", breaks = 20, 
                                               nrow = 2, ncol = 2, rel.size.x = 1,
                                               utility.stats = c("pMSE", "S_pMSE", "df"),
                                               utility.for.plot = "S_pMSE",
                                               cols = c("#1A3C5A","#4187BF"),   
                                               plot = TRUE, table = FALSE, ...){
  
  if (is.null(data)) stop("Requires parameter 'data' to give name of the real data.\n\n",  call. = FALSE)
  if (is.null(object)) stop("Requires parameter 'object' to give name of the synthetic data.\n\n",  call. = FALSE)   
  
  if (is.list(object) & !is.data.frame(object)) m <- length(object)
  else if (is.data.frame(object)) m <- 1
  else stop("object must be a data frame or a list of data frames.\n", call. = FALSE)
 
  # sort out cont.na to make it into a complete named list
  cna <- cont.na
  cont.na <- as.list(rep(NA, length(data)))
  names(cont.na) <- names(data)
  if (!is.null(cna)) {
    if (!is.list(cna) | any(names(cna) == "") | is.null(names(cna))) 
      stop("Argument 'cont.na' must be a named list with names of selected variables.", call. = FALSE)  
    if (any(!names(cna) %in% names(data))) stop("Names of the list cont.na must be variables in data.\n", call. = FALSE)
    for (i in 1:length(cna)) {
      j <- (1:length(data))[names(cna)[i] == names(data)]
      cont.na[[j]] <- unique(c(NA,cna[[i]]))
    }
  }

  object <- list(syn = object, m = m, cont.na = cont.na)
  class(object ) <- "synds"
  
  res <- compare.synds(object = object, data = data, vars = vars, 
                       msel = msel, stat = stat, breaks = breaks, 
                       nrow = nrow, ncol = ncol, rel.size.x = rel.size.x,
                       utility.stats = utility.stats,
                       utility.for.plot = utility.for.plot,
                       cols = cols, plot = plot, table = table, ...) 
  res$call <- match.call()
  return(res)
}


###-----pertable-----------------------------------------------------------
# calculate counts/percentages
pertable <- function(x, stat, ...) { ##GR stat added
 res <- table(x, useNA = "always")   
 if (stat == "percents") res <- res*100/sum(res) 
 return(res)
}


###-----ggfac--------------------------------------------------------------
# calculate percentages for factors and store in a data frame (a long format)
ggfac <- function(data, name = "observed", anyna, stat){ 
  data <- as.data.frame(data)
  perlist  <- lapply(data, pertable, stat = stat)
  for (i in 1:length(perlist)) {
    if (anyna[i] == FALSE) perlist[[i]] <- perlist[[i]][-length(perlist[[i]])]  
  }
  if (stat == "percents") Percent  <- unlist(perlist, use.names = FALSE) 
  else Count <- unlist(perlist, use.names = FALSE)
  Value <- unlist(lapply(perlist,names), use.names = FALSE)
  Variable <- rep(names(perlist), sapply(perlist,length))
  if (stat == "percents") perdf <- cbind.data.frame(Percent, Value, Variable) 
  else  perdf <- cbind.data.frame(Count, Value, Variable)
  
  perdf$Data <- name
  return(list(perdf = perdf, perlist = perlist))
} 

###-----ggnum--------------------------------------------------------------
# calculate percentages for numeric variables (non-missing values and 
# missing data categories seperately) and store in a data frame (a long format)
ggnum <- function(data, name = "observed", na = as.list(rep(NA,ncol(data))), 
                  breaks = as.list(rep(30,ncol(data))), anyna, 
                  stat = stat){ 
  data <- as.data.frame(data)

# non-missing values  
  nvar <- ncol(data)
  perlist <- vector("list", nvar)
  hbreaks <- vector("list", nvar)
  
  for (i in 1:nvar) {
  ## counts for non-missing values  
    vardata <- data[!(data[,i] %in% na[[i]]),i]                  
    hh      <- hist(vardata, breaks = breaks[[i]], plot = FALSE)
    counts  <- hh$counts
    names(counts) <- format(hh$breaks[-length(hh$breaks)],
                            scientific = FALSE, trim = TRUE,
                            drop0trailing = TRUE)
    hbreaks[[i]]  <- hh$breaks
  ## counts for missing values  
    dataNA <- data[data[,i] %in% c(NA,na[[i]]),i]
    
    if (length(dataNA) == 0) {
      if (anyna[i] == TRUE) {
        NAcounts <- 0   
        names(NAcounts) <- NA 
      } else {
        NAcounts <- NULL
      }
    } else {
      if (anyna[i] == TRUE) {
        NAcounts <- table(dataNA, useNA = "always")
      } else {
        NAcounts <- table(dataNA)  
      }
    } 
    if (!is.null(NAcounts)) names(NAcounts) <- paste("miss", names(NAcounts), sep = ".") 
    counts <- c(counts, NAcounts)
    if (stat == "percents") perlist[[i]] <- counts*100/length(data[,i])
    else perlist[[i]] <- counts
  }
  names(perlist) <- colnames(data)

# create data frame in a long format  
  if (stat == "percents") Percent  <- unlist(perlist, use.names = F) 
  else Count  <- unlist(perlist, use.names = F)
  Value    <- unlist(lapply(perlist,names), use.names = F)
  Variable <- rep(names(perlist),sapply(perlist,length))
  if (stat == "percents") perdf    <- cbind.data.frame(Percent, Value, Variable)
  else perdf    <- cbind.data.frame(Count, Value, Variable)
  perdf$Data <- name
  
return(list(perdf = perdf, perlist = perlist, hbreaks = hbreaks))
}


###-----dfNA---------------------------------------------------------------
dfNA <- function(data, na){
  all  <- length(data)
  data <- data[data %in% unlist(na)]
  NA.counts <- table(data, exclude = NULL)*100/all
  return(NA.counts)
}


###-----compare.fit.synds--------------------------------------------------
compare.fit.synds <- function(object, data, plot = "Z",
  print.coef = FALSE, return.plot = TRUE, plot.intercept = FALSE, 
  lwd = 1, lty = 1, lcol = c("#1A3C5A","#4187BF"),
  dodge.height = .5, point.size = 2.5, 
  population.inference = FALSE, ci.level = 0.95, ...) {   # c("#132B43", "#56B1F7")

 # Compares and plots fits to synthetic and original data
 # First parameter must be a fit to synthetic data from glm.synds(),
 # lm.synds(), polr.synds() or multinom.synds()

 value <- "Value"
 coefficient <- c("Coefficient", "Model")  
  
 if (!inherits(object, "fit.synds")) stop("Object must have class fit.synds.\n")
 if (!is.data.frame(data)) stop("Data must be a data frame.\n")  # theoretically can be a matrix (?)
 if (ci.level <= 0 | ci.level > 1) stop("ci.level must be beteen 0 and 1.\n")

 m <- object$m
 n <- object$n
 k <- object$k
 fitting.function <- object$fitting.function

##?? syn.coef    <- object$mcoefavg
 
 call <- match.call()

 # get fit to real data
 if (fitting.function %in% c("multinom", "polr")) {
   real.fit.0 <- do.call(object$fitting.function,
                         args = list(formula = formula(object),
                         Hess = TRUE, data = call$data))
   real.fit <- summary(real.fit.0)
 } else if (fitting.function %in% c("lm")) {
   real.fit <- summary(do.call(object$fitting.function,
                       args = list(formula = formula(object), 
                       data = call$data)))
 } else { ##  for glm
   real.fit <- summary(do.call(object$fitting.function,
                       args = list(formula = formula(object),
                       family = object$call$family, 
                       data = call$data)))
 }
 
 if (object$fitting.function == "multinom") { 
   real.varcov <- vcov(real.fit.0) 
   dd <- dimnames(t(real.fit$coefficients))
   real.fit$coefficients <- cbind(as.vector(t(real.fit$coefficients)), 
                                  as.vector(t(real.fit$standard.errors)),
                                  as.vector(t(real.fit$coefficients))/as.vector(t(real.fit$standard.errors))) 
   dimnames(real.fit$coefficients) <- list(
                                   paste(rep(dd[[2]], each = length(dd[[1]])), 
                                 rep(dd[[1]], length(dd[[2]])), sep = " : "), c("Estimate","Std error","t stat"))
 } else if (object$fitting.function == "polr") { 
   real.varcov <- vcov(real.fit.0) 
 } else if (object$fitting.function == "lm") {
   real.varcov <- real.fit$cov.unscaled * real.fit$sigma^2  ##??
 } else { # for "glm"
   real.varcov <- real.fit$cov.scaled   
 }

 syn.fit  <- summary.fit.synds(object, real.varcov = real.varcov,
                               population.inference = population.inference)
 incomplete <- syn.fit$incomplete

 # detailed results
 res.obs <- real.fit$coefficients[,1:3]
 colnames(res.obs) <- c("Beta","se(Beta)","Z")
 res.syn  <- syn.fit$coefficients[,1:3] 
 res.syn  <- res.syn[order(match(rownames(res.syn), rownames(res.obs))), ]
 res.overlap <- compare.CI(res.syn, res.obs, ci.level = ci.level, intercept = TRUE)
 ncoef <- nrow(res.obs) 

 res.diff <-  cbind(res.syn[,1], res.obs[,1], 
		    res.syn[,1] - res.obs[,1], 
 		   (res.syn[,1] - res.obs[,1]) / res.obs[,2])
 dimnames(res.diff)[[2]] <- c("Synthetic", "Observed", "Diff", "Std. coef diff")
#? pval <- round(2 * (1 - pnorm(abs(res.syn[,1] - res.obs[,1])/sqrt(diag(lof.varcov)))), 3)  # "p value"
 
 if (incomplete == TRUE) {   
   if (object$m < ncoef) {
     cat("\n\nWnen some variables are not synthesised m  (= ",m,") must exceed number of",
         "\ncoefficients (= ",ncoef,") for lack of fit test. No test can be reported.\n\n")  
     lack.of.fit <- NULL
     lof.pvalue  <- NULL
   } else {
   QB <- matrix(NA, m, length(object$mcoefavg))
     for (i in 1:m) {
       QB[i,] <- object$mcoef[i,] - object$mcoefavg
     }
     lof.varcov <- t(QB) %*% QB/(m - 1)/m
     lack.of.fit <- t(res.diff[,3]) %*% ginv(lof.varcov) %*% res.diff[,3]
     lack.of.fit <- lack.of.fit * (object$m - ncoef)/ncoef/(object$m - 1) # Hotellings T square
     lof.pvalue  <- 1 - pf(lack.of.fit, ncoef, object$m - ncoef)          
   }
 } else {
   lof.varcov <- real.varcov*n/k/m
   lack.of.fit <- t(res.diff[,3]) %*% ginv(lof.varcov) %*% res.diff[,3]  #! multiply by m for combined test
   lof.pvalue  <- 1 - pchisq(lack.of.fit, ncoef)
 }

##?? if (object$proper == TRUE) lof.varcov <- real.varcov * (1 + n/k)/m

# Calculate summary measures
 mean.ci.overlap   <-  mean(res.overlap[,1])
 mean.abs.std.diff <-  mean(abs(res.diff[,4]))
 
 if (return.plot == TRUE) {
   yvar <- as.character(formula(object)[[2]])

 if (plot == "Z") {
     BetaCI <- dfCI(real.fit, Z = TRUE, ci.level = ci.level)
     
     if (population.inference == FALSE) {  ## get interval from real var
       BsynCI <- BetaCI
       for (i in c(1,3,4)) BsynCI[,i] <- BsynCI[,i] + (res.syn[,1] - res.obs[,1])/res.obs[,2]
       BsynCI[,5] <- "synthetic"
     } else {
       BsynCI <- dfCI(syn.fit, Z = TRUE, name.Z = "Z.syn", 
                      model.name = "synthetic", ci.level = ci.level)
     }
     xlab = "Z value"
     title = paste0("Z values for fit to ",yvar)
   
   } else {
     BetaCI <- dfCI(real.fit, Z = FALSE, ci.level = ci.level)  
     
     if (population.inference == FALSE) {  ## get interval from real var
       BsynCI <- BetaCI
       for (i in c(1,3,4)) BsynCI[,i] <- BsynCI[,i] + (res.syn[,1] - res.obs[,1])
       BsynCI[,5] <- "synthetic"
     } else {
       BsynCI <- dfCI(syn.fit, Z = FALSE, name.Z = "syn.coef", 
                      model.name = "synthetic", ci.level = ci.level)
     }

     xlab = "Value"
     title = paste0("Coefficients for fit to ",yvar)
   }
   
   modelCI <- rbind.data.frame(BetaCI, BsynCI)
   rownames(modelCI) <- 1:nrow(modelCI)
   
   if (!plot.intercept) modelCI <- modelCI[modelCI$Coefficient != "(Intercept)",]

   CI.geom <- geom_errorbar(aes_string(ymin = "LowCI", ymax = "HighCI",
     color = "Model", linetype = "Model"), data = modelCI, width = 0, 
     lwd = lwd, lty = lty, position = position_dodge(width = dodge.height))

   point.geom <- geom_point(aes_string(#ymin = value, ymax = value,  #BN-03/02/2017 commented
     color = "Model", shape = "Model"), data = modelCI, 
     size = point.size, position = position_dodge(width = dodge.height))

   p <- ggplot(data = modelCI, aes_string(x = "Coefficient", y = "Value"))
   p <- p + geom_hline(yintercept = 0, colour = "grey", linetype = 2, lwd = 1)
   p <- p + CI.geom + point.geom + labs(title = title, y = xlab)
   p <- p + scale_shape_manual(values = c(17:16), breaks = c("synthetic","observed")) +
            scale_colour_manual(values = lcol[2:1], breaks = c("synthetic", "observed"))
   p <- p + coord_flip()
   # p <- p + theme_bw()
   # scale_colour_manual(values = rev(brewer.pal(3,"Blues")))
   # scale_colour_grey(start = 0, end = .6)
   p
 } else p <- NULL

 res <- list(call = object$call, coef.obs = res.obs, coef.syn = res.syn, 
   coef.diff = res.diff, mean.abs.std.diff = mean.abs.std.diff,
   ci.overlap = res.overlap, mean.ci.overlap =  mean.ci.overlap, 
   lack.of.fit = lack.of.fit, lof.pvalue = lof.pvalue, 
   ci.plot = p, print.coef = print.coef,       
   m = object$m, ncoef = ncoef,
   incomplete = incomplete,
   population.inference = population.inference)  

 class(res) <- "compare.fit.synds"
 return(res)
}


###-----dfCI---------------------------------------------------------------
# extract info for plotting confidence intervals
dfCI <- function(modelsummary, names.est.se = c("Estimate","Std. Error"),
          model.name = "observed",  ci.level = 0.95, Z = FALSE, 
          name.Z = colnames(modelsummary$coefficients)[3]){
  
  CI <- qnorm(1 - (1 - ci.level)/2)
  if (!Z) {
    #msCI <- as.data.frame(modelsummary$coefficients[,names.est.se])
    msCI <- as.data.frame(modelsummary$coefficients[,1:2]) 
    names(msCI) <- c("Value", "SE")
  } else {
    # msCI <- as.data.frame(modelsummary$coefficients[,name.Z])
    msCI <- as.data.frame(modelsummary$coefficients[,3]) 
    names(msCI) <- c("Value")
    msCI$SE <- 1
  }  
  msCI$Coefficient <- rownames(msCI)

  msCI$HighCI <- msCI$Value + CI*msCI$SE
  msCI$LowCI  <- msCI$Value - CI*msCI$SE
  msCI$SE     <- NULL
  msCI$Model  <- model.name
  msCI$Coefficient <- factor(msCI$Coefficient, levels = rev(msCI$Coefficient))  #!BN290416, rev added 

  return(msCI)
}


###-----compare.CI---------------------------------------------------------
compare.CI <- function(synthetic, observed, ci.level, intercept, ...)
{
 CI <- qnorm(1- (1 - ci.level)/2)
 ##Initiate
 if(nrow(observed) > nrow(synthetic)){
   numVar <- nrow(synthetic); rNames <- rownames(synthetic)
 } else{
   numVar <- nrow(observed); rNames <- rownames(observed)
 } 
 CIoverlap <- matrix(NA, nrow = numVar, ncol = 1, 
   dimnames = list(rNames, "CI overlap"))
	
##Calculate CIoverlap
 for(i in 1:numVar) {

   ##Store CIs
   syn.upper <- synthetic[i, 1] + (CI * synthetic[i, 2])
   syn.lower <- synthetic[i, 1] - (CI * synthetic[i, 2])
   obs.upper <- observed[i, 1]  + (CI * observed[i, 2])
   obs.lower <- observed[i, 1]  - (CI * observed[i, 2])
	  
   ## CI overlap
   overlap.lower <- max(obs.lower, syn.lower)       
   overlap.upper <- min(obs.upper, syn.upper)       

   CIoverlap[i, 1] <- 0.5 * 
     (((overlap.upper - overlap.lower) / (obs.upper - obs.lower)) + 
	    ((overlap.upper - overlap.lower) / (syn.upper - syn.lower)))
 }

 if(intercept == FALSE) {
   CIoverlap = CIoverlap[-1, , drop = FALSE]
 }
  
 return(as.data.frame(CIoverlap))
}

 

Try the synthpop package in your browser

Any scripts or data that you put into this service are public.

synthpop documentation built on Aug. 31, 2022, 5:06 p.m.