R/SupportingFunctionsforDataGenerationsandMiscv02.R

Defines functions findUniqueLevels pasteColumns factorToChar dataAugmentationDWN charToFactor checkCharacterVariablesInFormula form_gen dataAugmentation theta_back_2_data

Documented in checkCharacterVariablesInFormula dataAugmentation form_gen theta_back_2_data

#' This function performs data augmentation on the provided dataset.
#' @param theta A data frame containing the theta.
#' @param data A data where theta to be merged.
#' @param MissingVars A variable with missing data
#' @return A data frame containing the augmented data.
#' @keywords internal
#' @importFrom dplyr group_keys
#' @importFrom data.table as.data.table := .I setorder
theta_back_2_data <- function(theta,data, MissingVars){
  data$row.num <- seq(1:nrow(data))
  group.data <- dplyr::group_by(data, !!!dplyr::syms(MissingVars))
  ptrn <- dplyr::group_keys(group.data)
  ptrnTheta <- cbind(ptrn,theta)
  MergedData <- merge(data,ptrnTheta, by.x=MissingVars, by.y=MissingVars)
  MergedData1 <- MergedData[order(MergedData$row.num),]
  MergedData1$row.num <- NULL
  return(list("data" = MergedData1))
}

#' Data Augmentation Function
#'
#' This function performs data augmentation on the provided dataset.
#' @keywords internal
#' @importFrom stats na.omit
dataAugmentation <-function(SourceMisData, formula, adtnlCovforR=NULL)
{

  #SourceMisData=fn_data

  #Get column names with missing values
  missing_cols <- colSums(is.na(SourceMisData))

  if (sum(missing_cols)>0){
    #sub-setting the data based on the supplied formula
    formula_vars <- all.vars(formula)

    if (length(adtnlCovforR) > 0) {
      df_data <- subset(SourceMisData, select = c(formula_vars, adtnlCovforR))
    } else {
      df_data <- subset(SourceMisData, select = formula_vars)
    }


    resp <- all.vars(formula)[1]

    #predictors containing missing data
    predictor_vars <- attr(terms(formula), "term.labels")
    nMissCov=sum(colSums(is.na(SourceMisData[, predictor_vars])))
    if (nMissCov !=0){
      VarWithMissingVal <- predictor_vars[colSums(is.na(SourceMisData[, predictor_vars])) > 0]

      #creating a flag m=1 if there is a missing in any of the covariates
      df_data$m <- ifelse(rowSums(is.na(data.frame(df_data[, VarWithMissingVal]))) > 0, 1, 0)
    }


    #creating a flag R=1 if there is a missing in response variable
    if (any(is.na(df_data[, resp]))){

      df_data$R <- ifelse(rowSums(is.na(data.frame(df_data[, resp]))) > 0, 1, 0)
    }


    #creating a group variable grp to compute weight using Bayes theorem later
    df_data$grp<- seq(1:nrow(df_data))

  }
  else{
    df_data <- SourceMisData
  }

  cols_with_missing <- names(missing_cols[missing_cols>0])

  tt.miss=df_data

  tt=tt.miss
  # table(tt.miss$m, exclude=NULL)
  # augmenting with all possible data available in the column
  col.name=colnames(tt)
  for (i in 1:length(col.name))
  {
    #identifies all missing columns, if a column with missing is not of interest
    #need to be dropped before this stage
    if (sum(which(is.na(tt.miss[,i])))>0){

      t=tt[,i]
      t.2=tt %>% dplyr::filter(as.vector(is.na(t)))
      # nrow(t.2)
      fl.1=!is.na(t)
      temp.1=tt[fl.1,]

      zz=as.matrix(unique(temp.1[,i]))
      for(j in 1:nrow(zz)){
        # print (c(i,j))
        t1=t.2
        t1[,i]=zz[j,]

        #-----end of debug-------
        # Print column names before rbind
        temp.1<-rbind(temp.1,t1)

      }
      tt=temp.1

    }
    tt.1=tt

  }

  data=tt[order(tt$grp),]

  if (nMissCov !=0){
    #predictors containing missing data
    predictor_vars <- attr(terms(formula), "term.labels")
    cols_with_missing <- predictor_vars[colSums(is.na(tt.miss[, predictor_vars])) > 0]

    # Ensure unique_sorted_df is a data frame, even if cols_with_missing has one column
    unique_sorted_df <- tt.miss[!duplicated(tt.miss[, cols_with_missing, drop = FALSE]), cols_with_missing, drop = FALSE]

    # Apply order function to multiple columns dynamically
    unique_sorted_df <- unique_sorted_df[do.call(order, unique_sorted_df[cols_with_missing]), , drop = FALSE]

    #unique non-missing pattern
    ptrn<-na.omit(unique_sorted_df)
    # from the augmented data keeping only those rows that are observed in the observed data
    dataWithObjMiss<-merge(x=data, y=ptrn, by=cols_with_missing, sort=FALSE)
    dataWithObjMiss <- dataWithObjMiss[order(dataWithObjMiss$grp),]

    dn=nrow(ptrn)
  }
  else {
    dataWithObjMiss=data
    ptrn=NULL
    dn=NULL
  }
  return(list(augData=dataWithObjMiss, ObjPattern=ptrn, distptrn=dn))
}

#' formula generation
#'
#' This function is for formula generation.
#' @keywords internal
#' @importFrom stats as.formula
form_gen<-function(resp, pred){
  form<-paste(resp,"~")
  l<-length(pred)
  if(l != 1){
    for (i in 1:(l-1)){
      form<-paste(form, pred[i], "+")
    }
    i<-i+1
    form<-paste(form, pred[i])
  }else{
    form<-paste(form, pred)
  }
  return(as.formula(form))
}

#' Function to check if any character variables exist in a formula and show an error
#' @keywords internal
checkCharacterVariablesInFormula <- function(formula, data) {
  # Extract variables from the formula
  vars_in_formula <- all.vars(formula)

  # Loop through each variable in the formula
  for (var in vars_in_formula) {
    if (is.character(data[[var]])) {
      message(paste("Error: The variable", var, "is a character. It needs to be declared as a numeric factor in the model."))
    }
  }
  return(NULL)
}


#' @keywords internal
charToFactor<- function( DF){
  DFNames<- names(DF)
  for( columnName in DFNames){
    if( all(is.character( DF[,columnName]))){
      DF[,columnName]<- as.factor( DF[,columnName])
    }
  }
  return(DF)
}

#' @keywords internal
dataAugmentationDWN <- function(sourceDF,
                                verbose = FALSE,
                                maxNumberLevels=10)
{
  # assume that sourceDF has already been reduced to the covariates
  # in the formula
  # and there are no missing in the response.
  #
  #
  # convert factors in data frame to characters
  # add missing logical and group ID
  sourceDF<- factorToChar(sourceDF)
  # missing flag
  sourceDF$m<- apply( is.na(sourceDF), 1, any)
  # group ID
  sourceDF$grp<- 1: nrow( sourceDF)
  DFNames <- names(sourceDF)
  N <- nrow(sourceDF)
  # observed combinations of covariates in the complete data
  # columns of DF with any missing values
  missingIndexDF <-  apply(is.na(sourceDF), 2, any)
  # names of these columns
  missingNamesAll <- DFNames[missingIndexDF]
  #
  # new DF  that are all rows of sourceDF with _complete_ rows
  # for the columns that do have some missing values.
  # Rename to be sure names are aligned correctly
  workingDF <- as.data.frame(sourceDF[!sourceDF$m, missingIndexDF],
                             col.names = missingNamesAll)
  # make a table of the covariate combinations that appear among the complete
  # rows and for the covariates that have some missing values
  # output for factors that have more than maxLevel will be set to NA.
  levelInfo<- findUniqueLevels(workingDF, maxLevel=maxNumberLevels)
  # Check that the missing pattern levels do not exceed the maximum allowed
  if (any(levelInfo$nLevels > maxNumberLevels)) {
    message("unique values in each factor: ", paste(levelInfo$nLevels, collapse = " "))
    stop(paste("number of unique values for some covariates exceed", maxNumberLevels))
  }
  # all possible combinations of values for
  # covariates that have missing values
  tmpCombinations<- expand.grid(levelInfo$levels)
  # restrict these to just the combinations that actually appear in
  # workingDF
  # the code below is equivalent to looking at the NA cell counts
  # in table(workingDF) but we can not control the order of
  # levels in table -- so we compute this explicilty
  indexAllCombinations <- pasteColumns(tmpCombinations)
  indexCombinationsDF<- pasteColumns(workingDF)
  whichCombinations<- match( indexAllCombinations,indexCombinationsDF )
  # row indices for a combination that appears in data set
  whichCombinations<- which(!is.na(whichCombinations ))
  # Now here is a dataframe that has only combinations that
  # appear in the dataset.
  ObjPattern <- tmpCombinations[ whichCombinations, ]
  if (verbose) {
    message("covariate patterns from complete data: ", paste(missingNamesAll, collapse=" "))
    message("Missing pattern: ", paste(ObjPattern, collapse=" "))
  }
  # handy way to keep track of all the combinations of covariates
  # appearing in workingDF
  ObjPatternString <- pasteColumns(ObjPattern)
  #
  if (verbose) {
    message("Object Pattern string: ", paste(ObjPatternString, collapse = " "))
  }
  # sort the missing pattern strings so that order will be consistent.
  iOrderString<- order( ObjPatternString)
  ObjPatternString<- ObjPatternString[iOrderString]
  ObjPattern<- ObjPattern[iOrderString,]
  #
  # Now a big loop over all rows expanding missing covariates when needed
  #
  rowNames <- row.names(sourceDF)
  tmpDF <- NULL
  # N is the number of rows of the original data frame.
  # for rows with a missing covariate(s) this will be expanded into the
  # same "y" response and all combinations of the missing covariates
  # that align with the observed covariates
  # case == original row in the data frame
  for (case in 1:N) {
    # a single row of the source data frame
    currentRow <- as.data.frame(sourceDF[case, ])
    row.names(currentRow) <- as.character(case)
    if (verbose) {
      message("Row ", case, ": ", currentRow)
    }
    # does this row have a missing covariate?
    if (sourceDF$m[case]) {
      # expand row with all possible cases for missing covariates
      missingIndex <- c(is.na(currentRow)) # missing covariates
      #names of covariates in this row that are missing
      missingNames <- DFNames[missingIndex]
      # and names of nonmissing covariates!
      presentNames <- intersect(   missingNamesAll,
                                   DFNames[!missingIndex])
      if (length(presentNames) == 0) {
        # all covariates are missing!
        # fill in with all possible combinations
        dataPattern <- ObjPattern
        indexPattern<- 1:nrow( ObjPattern)
      }
      else{
        # only repeat those patterns that match the nonmissing covariates
        # note union of missingNames and presentNames should be
        # the names of all covariates that have at least one missing value
        # covariate string present
        presentRowString <-
          paste(currentRow[, presentNames], collapse = "")
        # missing pattern for this row
        ObjPatternStringRow <- pasteColumns(ObjPattern[, presentNames])
        # find all covariate patterns that match the covariates
        # that are present in this row
        indexPattern <- which(presentRowString == ObjPatternStringRow)
        dataPattern <- ObjPattern[indexPattern, ]
      }
      # cbind will automatically repeat the nonMissing current row values.
      # very handy!
      tmpExpandedRow <- dataPattern
      tmpExpandedRow <- cbind(dataPattern,currentRow[, !missingIndex])
      # reshuffle columns to be in DFNames order
      tmpExpandedRow <- cbind(tmpExpandedRow[, DFNames])
      # add informative row names as the original row number a "_"
      # and the number of the expanded result
      row.names(tmpExpandedRow) <- paste0(row.names(currentRow), "_", 1:nrow(tmpExpandedRow))
      if (verbose) {
        # For indexPattern, assuming it's a vector:
        message("indexPattern: ", paste(indexPattern, collapse = " "))
        
        # For tmpExpandedRow, which is likely a data frame:
        # Format the data frame as a character matrix, then collapse rows.
        message("expanded Row:\n", 
                paste(apply(format(tmpExpandedRow), 1, paste, collapse = " "), collapse = "\n"))
      }
      # accumulate this case onto the growing data frame.
      tmpDF <- rbind(tmpDF,
                     cbind(tmpExpandedRow))

    }
    else{
      # no missing so just append row -- this is the easy one!
      tmpDF <- rbind(tmpDF,
                     currentRow)
    }
  } # end loop over cases (rows)
  # add patternID  to each row.
  # this will be the row of ObjPattern that has been
  # used for this row in the augmented dataset
  patternDF <- pasteColumns(tmpDF[, missingIndexDF])
  # patternID is used to index the estimated
  # probability of a missing pattern
  patternID <- match(patternDF , ObjPatternString)
  tmpDF <- cbind(tmpDF, patternID = patternID)
  # convert to factors if column is a character vector
  # if numeric -- leave it unchanged.
  tmpDF<- charToFactor( tmpDF)
  return(
    list(
      DF = tmpDF,
      ObjPattern = ObjPattern,
      objPatternString = ObjPatternString,
      distptrn = nrow(ObjPattern),
      missingNamesAll= missingNamesAll
    )
  )
}


#' @keywords internal
factorToChar<- function( DF){
  DFNames<- names(DF)
  for( columnName in DFNames){
    if( any(is.factor( DF[,columnName]))){
      DF[,columnName]<- as.character( DF[,columnName])
    }
  }
  return(DF)
}

#' @keywords internal
pasteColumns<- function(A, sep=""){
  A<- as.matrix(A)
  N<- ncol( A)
  colStrings<- as.character(A[,1])
  if( N>1){
    for( k in 2:N ){
      colStrings<- paste( colStrings, as.character(A[,k]), sep=sep)
    }
  }
  return( colStrings)
}

#' @keywords internal
findUniqueLevels<- function( DF, maxLevel=10){
  M<- ncol(DF)
  factorLevels<- list()
  nLevels<- rep( NA, M)
  
  for(  j in 1:M){
    colWork<- DF[,j]
    # convert to character if a factor
    if (is.factor(colWork)){
      colWork<- as.character(colWork)
    }
    # find unique values
    tmpLevel<- unique(colWork)
    # unlist to strip out from a tibble or daa frame
    tmpLevel<-   c(unlist(unique(colWork)))
    # names added are confusing- revmoe these
    #record the size
    nLevels[j]<- length( tmpLevel)
    names(tmpLevel)<- NULL
    # if  many levels then set to NA
    # this column is likely just a numerical vector
    # with few common values
    if( length(tmpLevel)>maxLevel){
      tmpLevel<- NA
    }
    # accumulate into a list
    factorLevels<- c( factorLevels, list(tmpLevel))
  }
  names( factorLevels)<- names(DF)
  #
  # # now determine unique combinations that appear
  # levelTags<- matrix( NA, nrow(DF), M)
  # for(  j in 1:M){
  #   colWork<- DF[,j]
  #   levelTags[,j]<- match(factorLevels[[j]], colWork )
  #
  # }
  
  
  return(
    list(levels=factorLevels, nLevels=nLevels)
  )
}

Try the glmfitmiss package in your browser

Any scripts or data that you put into this service are public.

glmfitmiss documentation built on June 8, 2025, 1:59 p.m.