R/identify.R

Defines functions identify_properties identify_differences identify_att_differences identify_class_differences identify_mode_differences identify_unsupported_cols identify_matching_cols identify_extra_cols identify_extra_rows

Documented in identify_att_differences identify_class_differences identify_differences identify_extra_cols identify_extra_rows identify_matching_cols identify_mode_differences identify_properties identify_unsupported_cols

#' identify_extra_rows
#'
#' Identifies rows that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
identify_extra_rows <- function(DS1, DS2, KEYS) {
    DS2[["..FLAG.."]] <- "Y"
    dat <- merge(
        subset(DS1, select = KEYS),
        subset(DS2, select = c(KEYS, "..FLAG..")),
        by = KEYS, all.x = TRUE,
        sort = TRUE
    )
    dat <- dat[do.call("order", dat[KEYS]), ]

    dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
}



#' identify_extra_cols
#'
#' Identifies columns that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @importFrom tibble tibble
identify_extra_cols <- function(DS1, DS2) {
    match.cols <- sapply(names(DS1), "%in%", names(DS2))
    if (!all(is.logical(match.cols))) {
        stop("Assumption of logical return type is not true")
    }
    tibble(
        COLUMNS = names(DS1)[!match.cols]
    )
}







#' identify_matching_cols
#'
#' Identifies columns with the same name in two data frames
#' @param DS1 Input dataset 1 (data frame)
#' @param DS2 Input dataset 2 (data frame)
#' @param EXCLUDE Columns to ignore
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
    match_cols   <- sapply(names(DS1), "%in%", names(DS2))
    exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
    names(DS1)[match_cols & !exclude_cols]
}





#' identify_unsupported_cols
#'
#' Identifies any columns for which the package is not setup to handle
#' @param dsin input dataset
identify_unsupported_cols <- function(dsin) {
    dat <- subset(
        identify_properties(dsin),
        select = c("VARIABLE", "MODE")
    )

    dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]

}



#' identify_mode_differences
#'
#' Identifies any mode differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_mode_differences <- function(BASE, COMP) {

    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        suffixes = c(".BASE", ".COMP"),
        sort = TRUE
    )
    dat <-  subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]

    dat[KEEP1 & KEEP2, , drop = FALSE]

}



#' identify_class_differences
#'
#' Identifies any class differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_class_differences <- function(BASE, COMP) {

    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes =  c(".BASE", ".COMP")
    )

    dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- !mapply(
        identical,
        dat[["CLASS.BASE"]],
        dat[["CLASS.COMP"]]
    )

    dat[KEEP1 & KEEP2, , drop = FALSE]

}



#' Identify differences in attributes
#'
#' Identifies any attribute differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param exclude_cols Columns to exclude from comparison
#' @importFrom tibble tibble
identify_att_differences <- function(BASE, COMP, exclude_cols = "") {

    matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)

    PROPS <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    PROPS <- subset(PROPS, select =  c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))

    PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]


    ### Setup dummy return value
    RETURN <- tibble(
        VARIABLE = character(),
        ATTR_NAME = character(),
        VALUES.BASE = list(),
        VALUES.COMP = list()
    )

    for (i in  PROPS[["VARIABLE"]]) {

        PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]

        ### Get a vector of all available attributes across both variables
        ATTRIB_NAMES <- unique(c(
            names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
            names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
        ))

        ### If variable has no attributes move onto the next variable
        if (is.null(ATTRIB_NAMES)) next()

        ### Loop over each attribute checking if they are identical and outputing
        ### anyones that arn't
        for (j in ATTRIB_NAMES){

            ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
            ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]

            if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {

                ATT_DIFFS <- tibble(
                    VARIABLE = i,
                    ATTR_NAME = j,
                    VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
                    VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
                )

                RETURN <- rbind(RETURN, ATT_DIFFS)
            }
        }
    }
    return(RETURN)
}





#' identify_differences
#'
#' Compares each column within 2 datasets to identify any values which they
#' mismatch on.
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @param exclude_cols Columns to exclude from comparison
#' @param tolerance Level of tolerance for numeric differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
identify_differences <- function(
    BASE,
    COMP,
    KEYS,
    exclude_cols,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))

    if (length(matching_cols) == 0) return(tibble())

    DAT <- merge(
        x = BASE,
        y = COMP,
        by = KEYS,
        suffix = c(".x", ".y"),
        sort = TRUE
    )
    DAT <- DAT[do.call("order", DAT[KEYS]), ]

    matching_list <- mapply(
        is_variable_different,
        matching_cols,
        MoreArgs = list(
            keynames = KEYS,
            datain = DAT,
            tolerance = tolerance,
            scale = scale
        ),
        SIMPLIFY = FALSE
    )

    matching_list
}









#' identify_properties
#'
#' Returns a dataframe of metadata for a given dataset.
#' Returned values include variable names , class , mode , type & attributes
#' @param dsin input dataframe that you want to get the metadata from
#' @importFrom tibble tibble
identify_properties <- function(dsin) {

    ### If missing or null return empty dataset
    if (is.null(dsin)) {
        x <- tibble(
            VARIABLE = character(),
            CLASS     = list(),
            MODE      = character(),
            TYPE      = character(),
            ATTRIBS   = list()
        )
        return(x)
    }

    tibble(
        VARIABLE = names(dsin),
        CLASS     = lapply(dsin, class),
        MODE      = sapply(dsin, mode),
        TYPE      = sapply(dsin, typeof),
        ATTRIBS   = lapply(dsin, attributes)
    )
}
gowerc/diffdf documentation built on April 9, 2024, 9:34 p.m.