Nothing
#' This function performs data augmentation on the provided dataset.
#' @param theta A data frame containing the theta.
#' @param data A data where theta to be merged.
#' @param MissingVars A variable with missing data
#' @return A data frame containing the augmented data.
#' @keywords internal
#' @importFrom dplyr group_keys
#' @importFrom data.table as.data.table := .I setorder
theta_back_2_data <- function(theta,data, MissingVars){
data$row.num <- seq(1:nrow(data))
group.data <- dplyr::group_by(data, !!!dplyr::syms(MissingVars))
ptrn <- dplyr::group_keys(group.data)
ptrnTheta <- cbind(ptrn,theta)
MergedData <- merge(data,ptrnTheta, by.x=MissingVars, by.y=MissingVars)
MergedData1 <- MergedData[order(MergedData$row.num),]
MergedData1$row.num <- NULL
return(list("data" = MergedData1))
}
#' Data Augmentation Function
#'
#' This function performs data augmentation on the provided dataset.
#' @keywords internal
#' @importFrom stats na.omit
dataAugmentation <-function(SourceMisData, formula, adtnlCovforR=NULL)
{
#SourceMisData=fn_data
#Get column names with missing values
missing_cols <- colSums(is.na(SourceMisData))
if (sum(missing_cols)>0){
#sub-setting the data based on the supplied formula
formula_vars <- all.vars(formula)
if (length(adtnlCovforR) > 0) {
df_data <- subset(SourceMisData, select = c(formula_vars, adtnlCovforR))
} else {
df_data <- subset(SourceMisData, select = formula_vars)
}
resp <- all.vars(formula)[1]
#predictors containing missing data
predictor_vars <- attr(terms(formula), "term.labels")
nMissCov=sum(colSums(is.na(SourceMisData[, predictor_vars])))
if (nMissCov !=0){
VarWithMissingVal <- predictor_vars[colSums(is.na(SourceMisData[, predictor_vars])) > 0]
#creating a flag m=1 if there is a missing in any of the covariates
df_data$m <- ifelse(rowSums(is.na(data.frame(df_data[, VarWithMissingVal]))) > 0, 1, 0)
}
#creating a flag R=1 if there is a missing in response variable
if (any(is.na(df_data[, resp]))){
df_data$R <- ifelse(rowSums(is.na(data.frame(df_data[, resp]))) > 0, 1, 0)
}
#creating a group variable grp to compute weight using Bayes theorem later
df_data$grp<- seq(1:nrow(df_data))
}
else{
df_data <- SourceMisData
}
cols_with_missing <- names(missing_cols[missing_cols>0])
tt.miss=df_data
tt=tt.miss
# table(tt.miss$m, exclude=NULL)
# augmenting with all possible data available in the column
col.name=colnames(tt)
for (i in 1:length(col.name))
{
#identifies all missing columns, if a column with missing is not of interest
#need to be dropped before this stage
if (sum(which(is.na(tt.miss[,i])))>0){
t=tt[,i]
t.2=tt %>% dplyr::filter(as.vector(is.na(t)))
# nrow(t.2)
fl.1=!is.na(t)
temp.1=tt[fl.1,]
zz=as.matrix(unique(temp.1[,i]))
for(j in 1:nrow(zz)){
# print (c(i,j))
t1=t.2
t1[,i]=zz[j,]
#-----end of debug-------
# Print column names before rbind
temp.1<-rbind(temp.1,t1)
}
tt=temp.1
}
tt.1=tt
}
data=tt[order(tt$grp),]
if (nMissCov !=0){
#predictors containing missing data
predictor_vars <- attr(terms(formula), "term.labels")
cols_with_missing <- predictor_vars[colSums(is.na(tt.miss[, predictor_vars])) > 0]
# Ensure unique_sorted_df is a data frame, even if cols_with_missing has one column
unique_sorted_df <- tt.miss[!duplicated(tt.miss[, cols_with_missing, drop = FALSE]), cols_with_missing, drop = FALSE]
# Apply order function to multiple columns dynamically
unique_sorted_df <- unique_sorted_df[do.call(order, unique_sorted_df[cols_with_missing]), , drop = FALSE]
#unique non-missing pattern
ptrn<-na.omit(unique_sorted_df)
# from the augmented data keeping only those rows that are observed in the observed data
dataWithObjMiss<-merge(x=data, y=ptrn, by=cols_with_missing, sort=FALSE)
dataWithObjMiss <- dataWithObjMiss[order(dataWithObjMiss$grp),]
dn=nrow(ptrn)
}
else {
dataWithObjMiss=data
ptrn=NULL
dn=NULL
}
return(list(augData=dataWithObjMiss, ObjPattern=ptrn, distptrn=dn))
}
#' formula generation
#'
#' This function is for formula generation.
#' @keywords internal
#' @importFrom stats as.formula
form_gen<-function(resp, pred){
form<-paste(resp,"~")
l<-length(pred)
if(l != 1){
for (i in 1:(l-1)){
form<-paste(form, pred[i], "+")
}
i<-i+1
form<-paste(form, pred[i])
}else{
form<-paste(form, pred)
}
return(as.formula(form))
}
#' Function to check if any character variables exist in a formula and show an error
#' @keywords internal
checkCharacterVariablesInFormula <- function(formula, data) {
# Extract variables from the formula
vars_in_formula <- all.vars(formula)
# Loop through each variable in the formula
for (var in vars_in_formula) {
if (is.character(data[[var]])) {
message(paste("Error: The variable", var, "is a character. It needs to be declared as a numeric factor in the model."))
}
}
return(NULL)
}
#' @keywords internal
charToFactor<- function( DF){
DFNames<- names(DF)
for( columnName in DFNames){
if( all(is.character( DF[,columnName]))){
DF[,columnName]<- as.factor( DF[,columnName])
}
}
return(DF)
}
#' @keywords internal
dataAugmentationDWN <- function(sourceDF,
verbose = FALSE,
maxNumberLevels=10)
{
# assume that sourceDF has already been reduced to the covariates
# in the formula
# and there are no missing in the response.
#
#
# convert factors in data frame to characters
# add missing logical and group ID
sourceDF<- factorToChar(sourceDF)
# missing flag
sourceDF$m<- apply( is.na(sourceDF), 1, any)
# group ID
sourceDF$grp<- 1: nrow( sourceDF)
DFNames <- names(sourceDF)
N <- nrow(sourceDF)
# observed combinations of covariates in the complete data
# columns of DF with any missing values
missingIndexDF <- apply(is.na(sourceDF), 2, any)
# names of these columns
missingNamesAll <- DFNames[missingIndexDF]
#
# new DF that are all rows of sourceDF with _complete_ rows
# for the columns that do have some missing values.
# Rename to be sure names are aligned correctly
workingDF <- as.data.frame(sourceDF[!sourceDF$m, missingIndexDF],
col.names = missingNamesAll)
# make a table of the covariate combinations that appear among the complete
# rows and for the covariates that have some missing values
# output for factors that have more than maxLevel will be set to NA.
levelInfo<- findUniqueLevels(workingDF, maxLevel=maxNumberLevels)
# Check that the missing pattern levels do not exceed the maximum allowed
if (any(levelInfo$nLevels > maxNumberLevels)) {
message("unique values in each factor: ", paste(levelInfo$nLevels, collapse = " "))
stop(paste("number of unique values for some covariates exceed", maxNumberLevels))
}
# all possible combinations of values for
# covariates that have missing values
tmpCombinations<- expand.grid(levelInfo$levels)
# restrict these to just the combinations that actually appear in
# workingDF
# the code below is equivalent to looking at the NA cell counts
# in table(workingDF) but we can not control the order of
# levels in table -- so we compute this explicilty
indexAllCombinations <- pasteColumns(tmpCombinations)
indexCombinationsDF<- pasteColumns(workingDF)
whichCombinations<- match( indexAllCombinations,indexCombinationsDF )
# row indices for a combination that appears in data set
whichCombinations<- which(!is.na(whichCombinations ))
# Now here is a dataframe that has only combinations that
# appear in the dataset.
ObjPattern <- tmpCombinations[ whichCombinations, ]
if (verbose) {
message("covariate patterns from complete data: ", paste(missingNamesAll, collapse=" "))
message("Missing pattern: ", paste(ObjPattern, collapse=" "))
}
# handy way to keep track of all the combinations of covariates
# appearing in workingDF
ObjPatternString <- pasteColumns(ObjPattern)
#
if (verbose) {
message("Object Pattern string: ", paste(ObjPatternString, collapse = " "))
}
# sort the missing pattern strings so that order will be consistent.
iOrderString<- order( ObjPatternString)
ObjPatternString<- ObjPatternString[iOrderString]
ObjPattern<- ObjPattern[iOrderString,]
#
# Now a big loop over all rows expanding missing covariates when needed
#
rowNames <- row.names(sourceDF)
tmpDF <- NULL
# N is the number of rows of the original data frame.
# for rows with a missing covariate(s) this will be expanded into the
# same "y" response and all combinations of the missing covariates
# that align with the observed covariates
# case == original row in the data frame
for (case in 1:N) {
# a single row of the source data frame
currentRow <- as.data.frame(sourceDF[case, ])
row.names(currentRow) <- as.character(case)
if (verbose) {
message("Row ", case, ": ", currentRow)
}
# does this row have a missing covariate?
if (sourceDF$m[case]) {
# expand row with all possible cases for missing covariates
missingIndex <- c(is.na(currentRow)) # missing covariates
#names of covariates in this row that are missing
missingNames <- DFNames[missingIndex]
# and names of nonmissing covariates!
presentNames <- intersect( missingNamesAll,
DFNames[!missingIndex])
if (length(presentNames) == 0) {
# all covariates are missing!
# fill in with all possible combinations
dataPattern <- ObjPattern
indexPattern<- 1:nrow( ObjPattern)
}
else{
# only repeat those patterns that match the nonmissing covariates
# note union of missingNames and presentNames should be
# the names of all covariates that have at least one missing value
# covariate string present
presentRowString <-
paste(currentRow[, presentNames], collapse = "")
# missing pattern for this row
ObjPatternStringRow <- pasteColumns(ObjPattern[, presentNames])
# find all covariate patterns that match the covariates
# that are present in this row
indexPattern <- which(presentRowString == ObjPatternStringRow)
dataPattern <- ObjPattern[indexPattern, ]
}
# cbind will automatically repeat the nonMissing current row values.
# very handy!
tmpExpandedRow <- dataPattern
tmpExpandedRow <- cbind(dataPattern,currentRow[, !missingIndex])
# reshuffle columns to be in DFNames order
tmpExpandedRow <- cbind(tmpExpandedRow[, DFNames])
# add informative row names as the original row number a "_"
# and the number of the expanded result
row.names(tmpExpandedRow) <- paste0(row.names(currentRow), "_", 1:nrow(tmpExpandedRow))
if (verbose) {
# For indexPattern, assuming it's a vector:
message("indexPattern: ", paste(indexPattern, collapse = " "))
# For tmpExpandedRow, which is likely a data frame:
# Format the data frame as a character matrix, then collapse rows.
message("expanded Row:\n",
paste(apply(format(tmpExpandedRow), 1, paste, collapse = " "), collapse = "\n"))
}
# accumulate this case onto the growing data frame.
tmpDF <- rbind(tmpDF,
cbind(tmpExpandedRow))
}
else{
# no missing so just append row -- this is the easy one!
tmpDF <- rbind(tmpDF,
currentRow)
}
} # end loop over cases (rows)
# add patternID to each row.
# this will be the row of ObjPattern that has been
# used for this row in the augmented dataset
patternDF <- pasteColumns(tmpDF[, missingIndexDF])
# patternID is used to index the estimated
# probability of a missing pattern
patternID <- match(patternDF , ObjPatternString)
tmpDF <- cbind(tmpDF, patternID = patternID)
# convert to factors if column is a character vector
# if numeric -- leave it unchanged.
tmpDF<- charToFactor( tmpDF)
return(
list(
DF = tmpDF,
ObjPattern = ObjPattern,
objPatternString = ObjPatternString,
distptrn = nrow(ObjPattern),
missingNamesAll= missingNamesAll
)
)
}
#' @keywords internal
factorToChar<- function( DF){
DFNames<- names(DF)
for( columnName in DFNames){
if( any(is.factor( DF[,columnName]))){
DF[,columnName]<- as.character( DF[,columnName])
}
}
return(DF)
}
#' @keywords internal
pasteColumns<- function(A, sep=""){
A<- as.matrix(A)
N<- ncol( A)
colStrings<- as.character(A[,1])
if( N>1){
for( k in 2:N ){
colStrings<- paste( colStrings, as.character(A[,k]), sep=sep)
}
}
return( colStrings)
}
#' @keywords internal
findUniqueLevels<- function( DF, maxLevel=10){
M<- ncol(DF)
factorLevels<- list()
nLevels<- rep( NA, M)
for( j in 1:M){
colWork<- DF[,j]
# convert to character if a factor
if (is.factor(colWork)){
colWork<- as.character(colWork)
}
# find unique values
tmpLevel<- unique(colWork)
# unlist to strip out from a tibble or daa frame
tmpLevel<- c(unlist(unique(colWork)))
# names added are confusing- revmoe these
#record the size
nLevels[j]<- length( tmpLevel)
names(tmpLevel)<- NULL
# if many levels then set to NA
# this column is likely just a numerical vector
# with few common values
if( length(tmpLevel)>maxLevel){
tmpLevel<- NA
}
# accumulate into a list
factorLevels<- c( factorLevels, list(tmpLevel))
}
names( factorLevels)<- names(DF)
#
# # now determine unique combinations that appear
# levelTags<- matrix( NA, nrow(DF), M)
# for( j in 1:M){
# colWork<- DF[,j]
# levelTags[,j]<- match(factorLevels[[j]], colWork )
#
# }
return(
list(levels=factorLevels, nLevels=nLevels)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.