R/R_functions.R

Defines functions plot.synMicro_object .Beta_cube_fn print.readData_passed summary.synMicro_object print.synMicro_object multipleSyn createModel readData

Documented in createModel multipleSyn plot.synMicro_object print.readData_passed print.synMicro_object readData summary.synMicro_object

readData <- function( Y_input, X_input, RandomSeed = 99){
  
  set.seed(RandomSeed)
  
	if (is.null(Y_input)) stop("Y_input is NULL") 
	NA_Y_mat = is.na( Y_input )
	Y_input <- data.frame(Y_input)

	if (is.null(X_input)) stop("X_input is NULL") 
	NA_X_mat = is.na( X_input )
	X_input <- data.frame(X_input)
	
	n_sample = dim(Y_input)[[1]] ; p_Y = dim(Y_input)[[2]]
	Y_mat_std = array(0,c(n_sample,p_Y))
	mean_Y_input = sd_Y_input = rep(0,p_Y)
	for (l in 1:p_Y){
		mean_Y_input[l] = mean(Y_input[,l], na.rm=TRUE)
		sd_Y_input[l] = sd(Y_input[,l], na.rm=TRUE)
		Y_mat_std[,l] = ( Y_input[,l] - mean_Y_input[l] ) / sd_Y_input[l]
	}
	
	p_X = dim(X_input)[[2]] ; D_l_vec = rep(0,p_X)
	X_mat_std = array(0,c(n_sample,p_X))
	levels_X_input = list()
	for (l in 1:p_X){
		# unique_l = sort(unique(X_input[,l]))
	  levels_X_input[[l]] = unique_l = levels(X_input[,l])
		D_l_vec[l] = length(unique_l)
		for (i in 1:n_sample){
			if (is.na(X_input[i,l])){ 
				X_mat_std[i,l] = X_input[i,l]
			} else {
				X_mat_std[i,l] = which( X_input[i,l] == unique_l ) - 1
			}
		}		
	} # for (l)
	
	var_names <- list(y=names(Y_input), x=names(X_input))
	names(levels_X_input) <- names(X_input)
	orig_data <- cbind(Y_input, X_input) #### new
	HCMM_input = list(n_sample=n_sample, p_Y=p_Y, Y_mat_std=Y_mat_std, mean_Y_input=mean_Y_input, sd_Y_input=sd_Y_input, NA_Y_mat = NA_Y_mat, p_X=p_X, D_l_vec=D_l_vec, X_mat_std=X_mat_std, levels_X_input=levels_X_input, NA_X_mat = NA_X_mat, var_names = var_names, orig_data = orig_data)
	
	class(HCMM_input) <- "readData_passed"
	return(HCMM_input)	
	
} # readData <- function

createModel <- function(data_obj, max_R_S_K=c(30,50,20)){ # , r_i_vec, s_i_vec, k_i_vec
  
  if ( !is(data_obj, "readData_passed") ) stop("data_obj should be generated by 'readData' function") ;
  
  model <- new(modelobject, max_R_S_K)
  
  Y_mat = data_obj$Y_mat_std ; X_mat = data_obj$X_mat_std ; 
  Y_NA_mat = is.na(Y_mat) ; X_NA_mat = is.na(X_mat) ; 
  Y_mat[is.na(Y_mat)] = 0.0 ; X_mat[is.na(X_mat)] = 0 ;
  # print(head(Y_in)) ; print(head(X_in)) ; 
  model$Y_mat = as.matrix(Y_mat) ; model$.Y_NA_mat = as.matrix(Y_NA_mat) ; 
  model$X_mat = as.matrix(X_mat) ; model$.X_NA_mat = as.matrix(X_NA_mat) 
  model$.D_l_vec = data_obj$D_l_vec
  
  model$msg_level = 2
  # 0: errors; 1: error and warnings; 2: errors, warnings and info
  
  model$.Initialization() 
  
  return(model)
  
} # Initialize <- function

### multipleSyn 
multipleSyn <- function(data_obj, model_obj, n_burnin, m, interval_btw_Syn, show_iter=TRUE){
  int_print = min(200,floor(interval_btw_Syn/2)) ; 
  
  if ( !is(model_obj, "Rcpp_modelobject") ) stop("model_obj needs to be prepared by 'createModel' function") ;
  n_sample = dim(model_obj$Y_mat)[[1]] ; p_y = dim(model_obj$Y_mat)[[2]] ; p_x = dim(model_obj$X_mat)[[2]]
  r_i_cube = array(0,c(m,n_sample)) ; s_i_cube = array(0,c(m,n_sample)) ; k_i_cube = array(0,c(m,n_sample)) ;
  
  Synt_data <- replicate(m, 
                         list(Y_Synt=array(0, c(n_sample, p_y)), 
                              X_Synt=array(0, c(n_sample, p_x))), simplify=FALSE)

  total_iter = n_burnin + m * interval_btw_Syn
  
  count_iter = 0
  
  if (show_iter==TRUE){
    start_time = current_time = Sys.time(); message(paste0("Current time, ",current_time)) ; prev_time = current_time # 1.3.4
  }
  message( paste0("Total iteration = ", (total_iter) ) )
  message("Burn-in ..................................")
  
  if ( n_burnin > int_print ){
    n_repeat_burnin = floor( n_burnin / int_print ) ; resid_n = n_burnin - n_repeat_burnin * int_print ;
    for (i_repeat in 1:n_repeat_burnin){
      for (i_run in 1:int_print){
        model_obj$.Iterate()
        count_iter = count_iter + 1
      }
      current_time = Sys.time()
      if (show_iter==TRUE){
        int_time = round( as.numeric(difftime(current_time, prev_time, units = "mins")), 1 )
        message(paste("Current time, ",current_time))
        message(paste0("Iter=",(i_repeat*int_print),", ",int_time," min for previous ", int_print, " iterations"))
        cat("table(r_i_vec)=\n") ; print(table(model_obj$r_i_vec+1))
        cat("\ntable(s_i_vec)=\n") ; print(table(model_obj$s_i_vec+1))
        cat("\ntable(k_i_vec)=\n") ; print(table(model_obj$k_i_vec+1)); cat("\n")
      }
      prev_time = current_time
    } # for (i_repeat)
    if (resid_n>0){
      for (i_run in 1:resid_n){
        model_obj$.Iterate()
        count_iter = count_iter + 1
      }
    } 
  } else {
    for (i_run in 1:n_burnin){
      model_obj$.Iterate()
      count_iter = count_iter + 1
    }
  } # if (n_burn_in)
  
  message("Drawing completed datasets ............")
  
  for ( i_imp in 1:m ){
    
    ## Run (interval_btw_Syn) iterations for parameter updates ## 
    for (i_run in 1:interval_btw_Syn){
      model_obj$.Iterate()
      count_iter = count_iter + 1
    }
    
    ## Draw an imputed dataset for every (interval_btw_Syn) iterations ## 
     r_i_cube[i_imp,] = model_obj$r_i_vec+1 ; s_i_cube[i_imp,] = model_obj$s_i_vec+1 ; k_i_cube[i_imp,] = model_obj$k_i_vec+1
    
    # Synthetic dataset #
    model_obj$.Synthesis()
    
    for (i_sample in 1:n_sample){
      Synt_data[[i_imp]]$Y_Synt[i_sample,] = data_obj$mean_Y_input + data_obj$sd_Y_input * model_obj$Synt_Y_mat[i_sample,]
    }
    for(l in 1:p_x){
      Synt_data[[i_imp]]$X_Synt[,l] <- data_obj$levels_X_input[[l]][model_obj$Synt_X_mat[,l] + 1]
    }
    Synt_data[[i_imp]]$Y_Synt <- data.frame(Synt_data[[i_imp]]$Y_Synt); names(Synt_data[[i_imp]]$Y_Synt) <- data_obj$var_names$y
    Synt_data[[i_imp]]$X_Synt <- data.frame(Synt_data[[i_imp]]$X_Synt); names(Synt_data[[i_imp]]$X_Synt) <- data_obj$var_names$x
    
    for(l in 1:p_x){
      Synt_data[[i_imp]]$X_Synt[,l] <- factor(Synt_data[[i_imp]]$X_Synt[,l], levels=data_obj$levels_X_input[[l]])
    }
    
    Synt_data[[i_imp]] <- cbind(Synt_data[[i_imp]]$Y_Synt, Synt_data[[i_imp]]$X_Synt)

  } # for ( i_imp )
  
  Synthetic_result = list(synt_data = Synt_data, comp_mat = list(r_mat = r_i_cube, s_mat = s_i_cube, k_mat = k_i_cube), orig_data = data_obj$orig_data) # The last two objects are newly added

  message("Finished...............................")
  if (show_iter==TRUE){
    current_time = Sys.time(); message(paste("Current time,",current_time))
    message( paste0("Total time for the ",total_iter," iterations = ", round(current_time-start_time, 3) ) )
  }
  class(Synthetic_result) <- "synMicro_object"
  return(Synthetic_result)
  
}

print.synMicro_object <- function(x, ...){
  ## print multipleSyn results
  cat(paste0("Number of synthetic data sets: ", length(x$synt_data), "\n"))
  cat(paste0("Data size: ", nrow(x$orig_data), " obs. of ", ncol(x$orig_data), " variables \n"))
  cat("-------------------------------------------\n")

  cat("First synthetic dataset:\n")
  print(head(x$synt_data[[1]]))

  cat("\nMixture component of the first synthetic dataset:\n")
  cat(" $r_mat: ", head(x$comp_mat$r_mat[1,]), " ...\n")
  cat(" $s_mat: ", head(x$comp_mat$s_mat[1,]), " ...\n")
  cat(" $k_mat: ", head(x$comp_mat$k_mat[1,]), " ...\n")
}

summary.synMicro_object <- function(object, max_print = 4, ...){
  ## print multipleSyn results
  is_Y <- sapply(object$orig_data, is.numeric)
  is_X <- !is_Y
  
  n_print <- ifelse(length(object$synt_data)>max_print, max_print, length(object$synt_data))
  
  results <- list() ## final output
  
  results$continuous <- list()
  
  ## continuous
  for(i in which(is_Y)){
    orig <- summary(object$orig_data[,i])[1:6]
    synt <- sapply(object$synt_data, function(s) summary(s[,i]))
    qbar <- rowMeans(synt)
    bm <- apply(synt, MARGIN=1, sd)
    
    # res_tmp  <- data.frame(cbind(orig, qbar, bm, synt[,1:n_print]))
    # names(res_tmp) <- c("Orig.", "Q_bar", "B_m", paste0("Synt.", 1:n_print))
    res_tmp  <- data.frame(cbind(orig, qbar, bm, synt))
    names(res_tmp) <- c("Orig.", "Q_bar", "B_m", paste0("Synt.", 1:ncol(synt)))
    
    results$continuous[[names(object$orig_data)[i]]] <- res_tmp
  }
  
  ## categorical
  results$categorical <- list()
  for(i in which(is_X)){
    orig <- table(object$orig_data[,i])
    synt <- sapply(object$synt_data, function(s) table(s[,i]))
    
    qbar <- rowMeans(synt)
    
    res_tmp <- data.frame(cbind(orig, qbar, synt))
    names(res_tmp) <- c("Orig.", "Q_bar", paste0("Synt.", 1:ncol(synt)))
    
    results$categorical[[names(object$orig_data)[i]]] <- res_tmp
  }

  ## print
  cat("Number of synthetic data sets:", length(object$synt_data), "\n")
  cat("Data size:", nrow(object$orig_data), "obs. of", ncol(object$orig_data), "variables \n")
  cat("-------------------------------------------\n")
  cat("Continuous:", names(object$orig_data)[is_Y], "\n")
  print(lapply(results$continuous, function(sss) sss[,1:(3+n_print)]))
  cat("-------------------------------------------\n")
  cat("Categorical:", names(object$orig_data)[is_X], "\n")
  print(lapply(results$categorical, function(sss) sss[,1:(2+n_print)]))
}


print.readData_passed <- function(x, ...){
  ## print readData results
  cat("Original data: ", x$n_sample, "obs. of", (x$p_Y+x$p_X), "variables \n")
  print(head(x$orig_data))
  cat("-------------------------------------------\n")
  cat("Number of missing values:\n")
  cat(" Continiuous: \n ")
  print(colSums(x$NA_Y_mat))
  cat("\n Categorical: \n ")
  print(colSums(x$NA_X_mat))
}

.Beta_cube_fn = function(data_obj, model_obj){
  
  Beta_with_std_y = model_obj$.Beta_cube
  p_y = dim(model_obj$.Beta_cube)[[1]] ; p_x_star = dim(model_obj$.Beta_cube)[[2]] ; R = dim(model_obj$.Beta_cube)[[3]]
  mean_y = data_obj$mean_Y_input ; sd_y = data_obj$sd_Y_input
  
  Beta_orig = Beta_with_std_y
  for ( r in 1:R ){
    l = 1
    Beta_orig[,l,r] = mean_y + sd_y * Beta_with_std_y[,l,r] # mean_y should be included. it is tested
    for ( l in 2:p_x_star){
      Beta_orig[,l,r] = sd_y * Beta_with_std_y[,l,r] 	
    } # for (l)
  } # for (r)
  
  return(Beta_orig)
  
} # Beta_cube_orig_scale

plot.synMicro_object <- function(x, vars, plot_num=NULL, ...){
  ### Check input objects ####
  if(is.numeric(vars)) vars <- names(x$orig_data)[vars] ## index -> variable names
  if(!all(vars %in% names(x$orig_data))) stop("invalid variable names") ## variable name error
  
  var_y <- names(x$orig_data)[sapply(x$orig_data, class)=="numeric"]
  var_x <- names(x$orig_data)[sapply(x$orig_data, class)=="factor"]
  
  y_test <- vars %in% var_y
  x_test <- vars %in% var_x
  
  if(all(any(y_test),any(x_test))) stop("Only one can be selected between categorical and continuous")
  
  if(!is.null(plot_num)){ ## plot num: number of synthetic data...
    if(!(plot_num %in% 1:length(x$synt_data))) stop("invalid synthetic data number")
    
    ## continuous-type
    if(any(y_test)){
      switch(sum(y_test),
             '1'={
               plot_data <- data.frame(Original=x$orig_data[,vars], Synthetic = x$synt_data[[plot_num]][,vars])
               boxplot(plot_data, main=paste0(vars, ": Synthetic ", plot_num), col=c("grey", "royalblue"), lwd=2, ylim = range(plot_data))
             },
             '2'={
               plot(formula(paste0(vars, collapse="~")), data=x$orig_data, pch=16, col="grey", main=paste0("Synthetic ", plot_num))
               points(formula(paste0(vars, collapse="~")), data=x$synt_data[[plot_num]], col="royalblue")
               legend("topright", c("Original", "Synthetic"), pch=c(16, 1), col = c("grey", "royalblue"))
             },
             {stop("only one plot can be output")})
    }
    
    if(any(x_test)){
      if(sum(x_test)==1){
        plot_data <- t(cbind(table(x$orig_data[,vars]),table(x$synt_data[[plot_num]][,vars])))
        barplot(plot_data,
                main=paste0(vars, ": Synthetic ", plot_num),
                beside = TRUE, col = c("grey", "royalblue"), ylim = c(0, max(plot_data)*1.5),
                legend.text = c("Original", "Synthetic"),
                args.legend = list(bty="n"))
      }else{
        stop("only one plot can be output")
      }
    }
    
  }else{
    ## continuous-type ####
    if(any(y_test)){
      if(sum(y_test)==1){   ## box-plot: one variable
        synth_data <- data.frame(sapply(x$synt_data, function(s) s[vars]))
        
        for(m_rep in 1:length(x$synt_data)){
          readline(prompt="Press [enter] to continue...")
          plot_data <- data.frame(Original=x$orig_data[,vars], Synthetic = synth_data[,m_rep])
          boxplot(plot_data, main=paste0(vars, ": Synthetic ", m_rep), col=c("grey", "royalblue"), lwd=2, ylim = range(x$orig_data[,vars],synth_data))
        }
        
      }else{  ## scatter plot: more than 2
        
        var_list <- apply(combn(vars, 2), MARGIN=2, function(vvv) paste0(vvv, collapse = " ~ "))
        
        for(var_rep in 1:length(var_list)){
          for(m_rep in 1:length(x$synt_data)){
            readline(prompt="Press [enter] to continue...")
            plot(formula(var_list[var_rep]), data=x$orig_data, pch=16, col="grey", main=paste0("Synthetic ", m_rep))
            points(formula(var_list[var_rep]), data=x$synt_data[[m_rep]], col="royalblue")
            legend("topright", c("Original", "Synthetic"), pch=c(16, 1), col = c("grey", "royalblue"))
          }
        }
      }
    }
    
    ## categorical-type ####
    if(any(x_test)){
      for(var_rep in vars){
        synth_data <- sapply(x$synt_data, function(s) table(s[,var_rep]))
        
        for(m_rep in 1:length(x$synt_data)){
          readline(prompt="Press [enter] to continue...")
          barplot(t(cbind(table(x$orig_data[,var_rep]), synth_data[,m_rep])), main=paste0(var_rep, ": Synthetic ", m_rep),
                  beside = TRUE, col = c("grey", "royalblue"), ylim = c(0, ceiling(max(synth_data, table(x$orig_data[,var_rep]))*1.5)),
                  legend.text = c("Original", "Synthetic"),
                  args.legend = list(bty="n"))
        }
      }
    }
  }
}

Try the synMicrodata package in your browser

Any scripts or data that you put into this service are public.

synMicrodata documentation built on April 12, 2025, 9:14 a.m.