R/SupportingFunctionsforDataGenerationsandMiscv02.R
In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates

Documented in checkCharacterVariablesInFormula dataAugmentation form_gen theta_back_2_data

#' This function performs data augmentation on the provided dataset.
#' @param theta A data frame containing the theta.
#' @param data A data where theta to be merged.
#' @param MissingVars A variable with missing data
#' @return A data frame containing the augmented data.
#' @keywords internal
#' @importFrom dplyr group_keys
#' @importFrom data.table as.data.table := .I setorder
theta_back_2_data <- function(theta,data, MissingVars){
  data$row.num <- seq(1:nrow(data))
  group.data <- dplyr::group_by(data, !!!dplyr::syms(MissingVars))
  ptrn <- dplyr::group_keys(group.data)
  ptrnTheta <- cbind(ptrn,theta)
  MergedData <- merge(data,ptrnTheta, by.x=MissingVars, by.y=MissingVars)
  MergedData1 <- MergedData[order(MergedData$row.num),]
  MergedData1$row.num <- NULL
  return(list("data" = MergedData1))
}

#' Data Augmentation Function
#'
#' This function performs data augmentation on the provided dataset.
#' @keywords internal
#' @importFrom stats na.omit
dataAugmentation <-function(SourceMisData, formula, adtnlCovforR=NULL)
{

  #SourceMisData=fn_data

  #Get column names with missing values
  missing_cols <- colSums(is.na(SourceMisData))

  if (sum(missing_cols)>0){
    #sub-setting the data based on the supplied formula
    formula_vars <- all.vars(formula)

    if (length(adtnlCovforR) > 0) {
      df_data <- subset(SourceMisData, select = c(formula_vars, adtnlCovforR))
    } else {
      df_data <- subset(SourceMisData, select = formula_vars)
    }


    resp <- all.vars(formula)[1]

    #predictors containing missing data
    predictor_vars <- attr(terms(formula), "term.labels")
    nMissCov=sum(colSums(is.na(SourceMisData[, predictor_vars])))
    if (nMissCov !=0){
      VarWithMissingVal <- predictor_vars[colSums(is.na(SourceMisData[, predictor_vars])) > 0]

      #creating a flag m=1 if there is a missing in any of the covariates
      df_data$m <- ifelse(rowSums(is.na(data.frame(df_data[, VarWithMissingVal]))) > 0, 1, 0)
    }


    #creating a flag R=1 if there is a missing in response variable
    if (any(is.na(df_data[, resp]))){

      df_data$R <- ifelse(rowSums(is.na(data.frame(df_data[, resp]))) > 0, 1, 0)
    }


    #creating a group variable grp to compute weight using Bayes theorem later
    df_data$grp<- seq(1:nrow(df_data))

  }
  else{
    df_data <- SourceMisData
  }

  cols_with_missing <- names(missing_cols[missing_cols>0])

  tt.miss=df_data

  tt=tt.miss
  # table(tt.miss$m, exclude=NULL)
  # augmenting with all possible data available in the column
  col.name=colnames(tt)
  for (i in 1:length(col.name))
  {
    #identifies all missing columns, if a column with missing is not of interest
    #need to be dropped before this stage
    if (sum(which(is.na(tt.miss[,i])))>0){

      t=tt[,i]
      t.2=tt %>% dplyr::filter(as.vector(is.na(t)))
      # nrow(t.2)
      fl.1=!is.na(t)
      temp.1=tt[fl.1,]

      zz=as.matrix(unique(temp.1[,i]))
      for(j in 1:nrow(zz)){
        # print (c(i,j))
        t1=t.2
        t1[,i]=zz[j,]

        #-----end of debug-------
        # Print column names before rbind
        temp.1<-rbind(temp.1,t1)

      }
      tt=temp.1

    }
    tt.1=tt

  }

  data=tt[order(tt$grp),]

  if (nMissCov !=0){
    #predictors containing missing data
    predictor_vars <- attr(terms(formula), "term.labels")
    cols_with_missing <- predictor_vars[colSums(is.na(tt.miss[, predictor_vars])) > 0]

    # Ensure unique_sorted_df is a data frame, even if cols_with_missing has one column
    unique_sorted_df <- tt.miss[!duplicated(tt.miss[, cols_with_missing, drop = FALSE]), cols_with_missing, drop = FALSE]

    # Apply order function to multiple columns dynamically
    unique_sorted_df <- unique_sorted_df[do.call(order, unique_sorted_df[cols_with_missing]), , drop = FALSE]

    #unique non-missing pattern
    ptrn<-na.omit(unique_sorted_df)
    # from the augmented data keeping only those rows that are observed in the observed data
    dataWithObjMiss<-merge(x=data, y=ptrn, by=cols_with_missing, sort=FALSE)
    dataWithObjMiss <- dataWithObjMiss[order(dataWithObjMiss$grp),]

    dn=nrow(ptrn)
  }
  else {
    dataWithObjMiss=data
    ptrn=NULL
    dn=NULL
  }
  return(list(augData=dataWithObjMiss, ObjPattern=ptrn, distptrn=dn))
}

#' formula generation
#'
#' This function is for formula generation.
#' @keywords internal
#' @importFrom stats as.formula
form_gen<-function(resp, pred){
  form<-paste(resp,"~")
  l<-length(pred)
  if(l != 1){
    for (i in 1:(l-1)){
      form<-paste(form, pred[i], "+")
    }
    i<-i+1
    form<-paste(form, pred[i])
  }else{
    form<-paste(form, pred)
  }
  return(as.formula(form))
}

#' Function to check if any character variables exist in a formula and show an error
#' @keywords internal
checkCharacterVariablesInFormula <- function(formula, data) {
  # Extract variables from the formula
  vars_in_formula <- all.vars(formula)

  # Loop through each variable in the formula
  for (var in vars_in_formula) {
    if (is.character(data[[var]])) {
      message(paste("Error: The variable", var, "is a character. It needs to be declared as a numeric factor in the model."))
    }
  }
  return(NULL)
}


#' @keywords internal
charToFactor<- function( DF){
  DFNames<- names(DF)
  for( columnName in DFNames){
    if( all(is.character( DF[,columnName]))){
      DF[,columnName]<- as.factor( DF[,columnName])
    }
  }
  return(DF)
}

#' @keywords internal
dataAugmentationDWN <- function(sourceDF,
                                verbose = FALSE,
                                maxNumberLevels=10)
{
  # assume that sourceDF has already been reduced to the covariates
  # in the formula
  # and there are no missing in the response.
  #
  #
  # convert factors in data frame to characters
  # add missing logical and group ID
  sourceDF<- factorToChar(sourceDF)
  # missing flag
  sourceDF$m<- apply( is.na(sourceDF), 1, any)
  # group ID
  sourceDF$grp<- 1: nrow( sourceDF)
  DFNames <- names(sourceDF)
  N <- nrow(sourceDF)
  # observed combinations of covariates in the complete data
  # columns of DF with any missing values
  missingIndexDF <-  apply(is.na(sourceDF), 2, any)
  # names of these columns
  missingNamesAll <- DFNames[missingIndexDF]
  #
  # new DF  that are all rows of sourceDF with _complete_ rows
  # for the columns that do have some missing values.
  # Rename to be sure names are aligned correctly
  workingDF <- as.data.frame(sourceDF[!sourceDF$m, missingIndexDF],
                             col.names = missingNamesAll)
  # make a table of the covariate combinations that appear among the complete
  # rows and for the covariates that have some missing values
  # output for factors that have more than maxLevel will be set to NA.
  levelInfo<- findUniqueLevels(workingDF, maxLevel=maxNumberLevels)
  # Check that the missing pattern levels do not exceed the maximum allowed
  if (any(levelInfo$nLevels > maxNumberLevels)) {
    message("unique values in each factor: ", paste(levelInfo$nLevels, collapse = " "))
    stop(paste("number of unique values for some covariates exceed", maxNumberLevels))
  }
  # all possible combinations of values for
  # covariates that have missing values
  tmpCombinations<- expand.grid(levelInfo$levels)
  # restrict these to just the combinations that actually appear in
  # workingDF
  # the code below is equivalent to looking at the NA cell counts
  # in table(workingDF) but we can not control the order of
  # levels in table -- so we compute this explicilty
  indexAllCombinations <- pasteColumns(tmpCombinations)
  indexCombinationsDF<- pasteColumns(workingDF)
  whichCombinations<- match( indexAllCombinations,indexCombinationsDF )
  # row indices for a combination that appears in data set
  whichCombinations<- which(!is.na(whichCombinations ))
  # Now here is a dataframe that has only combinations that
  # appear in the dataset.
  ObjPattern <- tmpCombinations[ whichCombinations, ]
  if (verbose) {
    message("covariate patterns from complete data: ", paste(missingNamesAll, collapse=" "))
    message("Missing pattern: ", paste(ObjPattern, collapse=" "))
  }
  # handy way to keep track of all the combinations of covariates
  # appearing in workingDF
  ObjPatternString <- pasteColumns(ObjPattern)
  #
  if (verbose) {
    message("Object Pattern string: ", paste(ObjPatternString, collapse = " "))
  }
  # sort the missing pattern strings so that order will be consistent.
  iOrderString<- order( ObjPatternString)
  ObjPatternString<- ObjPatternString[iOrderString]
  ObjPattern<- ObjPattern[iOrderString,]
  #
  # Now a big loop over all rows expanding missing covariates when needed
  #
  rowNames <- row.names(sourceDF)
  tmpDF <- NULL
  # N is the number of rows of the original data frame.
  # for rows with a missing covariate(s) this will be expanded into the
  # same "y" response and all combinations of the missing covariates
  # that align with the observed covariates
  # case == original row in the data frame
  for (case in 1:N) {
    # a single row of the source data frame
    currentRow <- as.data.frame(sourceDF[case, ])
    row.names(currentRow) <- as.character(case)
    if (verbose) {
      message("Row ", case, ": ", currentRow)
    }
    # does this row have a missing covariate?
    if (sourceDF$m[case]) {
      # expand row with all possible cases for missing covariates
      missingIndex <- c(is.na(currentRow)) # missing covariates
      #names of covariates in this row that are missing
      missingNames <- DFNames[missingIndex]
      # and names of nonmissing covariates!
      presentNames <- intersect(   missingNamesAll,
                                   DFNames[!missingIndex])
      if (length(presentNames) == 0) {
        # all covariates are missing!
        # fill in with all possible combinations
        dataPattern <- ObjPattern
        indexPattern<- 1:nrow( ObjPattern)
      }
      else{
        # only repeat those patterns that match the nonmissing covariates
        # note union of missingNames and presentNames should be
        # the names of all covariates that have at least one missing value
        # covariate string present
        presentRowString <-
          paste(currentRow[, presentNames], collapse = "")
        # missing pattern for this row
        ObjPatternStringRow <- pasteColumns(ObjPattern[, presentNames])
        # find all covariate patterns that match the covariates
        # that are present in this row
        indexPattern <- which(presentRowString == ObjPatternStringRow)
        dataPattern <- ObjPattern[indexPattern, ]
      }
      # cbind will automatically repeat the nonMissing current row values.
      # very handy!
      tmpExpandedRow <- dataPattern
      tmpExpandedRow <- cbind(dataPattern,currentRow[, !missingIndex])
      # reshuffle columns to be in DFNames order
      tmpExpandedRow <- cbind(tmpExpandedRow[, DFNames])
      # add informative row names as the original row number a "_"
      # and the number of the expanded result
      row.names(tmpExpandedRow) <- paste0(row.names(currentRow), "_", 1:nrow(tmpExpandedRow))
      if (verbose) {
        # For indexPattern, assuming it's a vector:
        message("indexPattern: ", paste(indexPattern, collapse = " "))
        
        # For tmpExpandedRow, which is likely a data frame:
        # Format the data frame as a character matrix, then collapse rows.
        message("expanded Row:\n", 
                paste(apply(format(tmpExpandedRow), 1, paste, collapse = " "), collapse = "\n"))
      }
      # accumulate this case onto the growing data frame.
      tmpDF <- rbind(tmpDF,
                     cbind(tmpExpandedRow))

    }
    else{
      # no missing so just append row -- this is the easy one!
      tmpDF <- rbind(tmpDF,
                     currentRow)
    }
  } # end loop over cases (rows)
  # add patternID  to each row.
  # this will be the row of ObjPattern that has been
  # used for this row in the augmented dataset
  patternDF <- pasteColumns(tmpDF[, missingIndexDF])
  # patternID is used to index the estimated
  # probability of a missing pattern
  patternID <- match(patternDF , ObjPatternString)
  tmpDF <- cbind(tmpDF, patternID = patternID)
  # convert to factors if column is a character vector
  # if numeric -- leave it unchanged.
  tmpDF<- charToFactor( tmpDF)
  return(
    list(
      DF = tmpDF,
      ObjPattern = ObjPattern,
      objPatternString = ObjPatternString,
      distptrn = nrow(ObjPattern),
      missingNamesAll= missingNamesAll
    )
  )
}


#' @keywords internal
factorToChar<- function( DF){
  DFNames<- names(DF)
  for( columnName in DFNames){
    if( any(is.factor( DF[,columnName]))){
      DF[,columnName]<- as.character( DF[,columnName])
    }
  }
  return(DF)
}

#' @keywords internal
pasteColumns<- function(A, sep=""){
  A<- as.matrix(A)
  N<- ncol( A)
  colStrings<- as.character(A[,1])
  if( N>1){
    for( k in 2:N ){
      colStrings<- paste( colStrings, as.character(A[,k]), sep=sep)
    }
  }
  return( colStrings)
}

#' @keywords internal
findUniqueLevels<- function( DF, maxLevel=10){
  M<- ncol(DF)
  factorLevels<- list()
  nLevels<- rep( NA, M)
  
  for(  j in 1:M){
    colWork<- DF[,j]
    # convert to character if a factor
    if (is.factor(colWork)){
      colWork<- as.character(colWork)
    }
    # find unique values
    tmpLevel<- unique(colWork)
    # unlist to strip out from a tibble or daa frame
    tmpLevel<-   c(unlist(unique(colWork)))
    # names added are confusing- revmoe these
    #record the size
    nLevels[j]<- length( tmpLevel)
    names(tmpLevel)<- NULL
    # if  many levels then set to NA
    # this column is likely just a numerical vector
    # with few common values
    if( length(tmpLevel)>maxLevel){
      tmpLevel<- NA
    }
    # accumulate into a list
    factorLevels<- c( factorLevels, list(tmpLevel))
  }
  names( factorLevels)<- names(DF)
  #
  # # now determine unique combinations that appear
  # levelTags<- matrix( NA, nrow(DF), M)
  # for(  j in 1:M){
  #   colWork<- DF[,j]
  #   levelTags[,j]<- match(factorLevels[[j]], colWork )
  #
  # }
  
  
  return(
    list(levels=factorLevels, nLevels=nLevels)
  )
}

Any scripts or data that you put into this service are public.

glmfitmiss documentation built on June 8, 2025, 1:59 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

glmfitmiss
Fitting GLMs with Missing Data in Both Responses and Covariates

R/SupportingFunctionsforDataGenerationsandMiscv02.R
In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates

Defines functions findUniqueLevels pasteColumns factorToChar dataAugmentationDWN charToFactor checkCharacterVariablesInFormula form_gen dataAugmentation theta_back_2_data

Documented in checkCharacterVariablesInFormula dataAugmentation form_gen theta_back_2_data

Try the glmfitmiss package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glmfitmiss Fitting GLMs with Missing Data in Both Responses and Covariates

R/SupportingFunctionsforDataGenerationsandMiscv02.R In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates

Defines functions findUniqueLevels pasteColumns factorToChar dataAugmentationDWN charToFactor checkCharacterVariablesInFormula form_gen dataAugmentation theta_back_2_data

Documented in checkCharacterVariablesInFormula dataAugmentation form_gen theta_back_2_data

Try the glmfitmiss package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glmfitmiss
Fitting GLMs with Missing Data in Both Responses and Covariates

R/SupportingFunctionsforDataGenerationsandMiscv02.R
In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates