R/fillByGroup.R

Defines functions groupMode .fillByGroup fillByGroup

Documented in fillByGroup groupMode

# fillByGroup ----
#' Fill NAs in data.frame by grouping
#'
#'Wrapper of tidyr::fill with checks for NAs in grouping values and the option
#'to fill with the majority value if there is more than one value per group.
#'If an error is raised, an example group that causes an error is returned
#'
#'@param df A data.frame or tibble with missing (NA) values to be filled
#'@param group (character(n)) Names of column(s) to group by
#'@param fill (character(n)) Name(s) of column(s) to fill
#'@param multiple (Default: "stop") How should multiple values in columns to
#'be filled be handled?  Either "stop" (raise an error), "mode" (select
#'the most common value) or "ignore" (entries with multiple possible modes are
#'set to NA).
#'@param method Either "only_na", of only missing entries should be filled,
#' or "all", if all entries should be replaced with their group mode
#'
#'@author Helen Lindsay
#'@returns df, with NA entries filled where possible by grouping and taking
#'the most common value within each column and group.
#'@importFrom tidyr fill
#'@importFrom stats complete.cases
#'@importFrom dplyr group_by ungroup n_distinct coalesce all_of
#'@importFrom utils capture.output
#'@export
#'
#'@examples
#'
#'df <- data.frame(A = rep(c("A", "B"), each = 3),
#'                 B = c(NA, "C", "D", "E", NA, "E"))
#'# Setting multiple = "mode" means that the most common value will be
#'# used for filling, or the first if there are ties.
#'fillByGroup(df, group = "A", fill = "B", multiple = "mode")
#'
#'# Setting multiple = "ignore" means that groups with multiple values
#'# will not be filled.
#'fillByGroup(df, group = "A", fill = "B", multiple = "ignore")
fillByGroup <- function(df, group, fill, method=c("only_na", "all"),
                        multiple=c("stop", "mode", "ignore")){
    .stopIfColExists(df, sprintf("n%s", fill))

    multiple <- match.arg(multiple)
    overwrite <- match.arg(method) == "all"
    original_nrows <- nrow(df)

    df <- AbNames::splitMerge(df, complete.cases(!!!syms(group)),
                              .fillByGroup, group = group, fill = fill,
                              multiple = multiple, overwrite = overwrite)

    if (! nrow(df) == original_nrows){
        warning("Rows have been lost or gained when merging NA rows")
    }

    return(df)
}


# .fillByGroup ----
#
#'@keywords internal
#'@importFrom dplyr all_of if_any
.fillByGroup <- function(df, group, fill, multiple, overwrite){
    # To do?  Fill with best option matching a condition e.g. no punctuation?
    # Could be done by creating a duplicate column with no punctuation, filling
    # by antigen

    # Group data frame, check if there are multiple values per group
    cn <- colnames(df)
    df <- .addNPerGroup(df, group, fill)
    n_per_gp <- setdiff(colnames(df), cn)

    # Case: multiple possible values in a fill group and we should stop
    if (multiple == "stop" & any(df[, n_per_gp] > 1)){
        msg <- "Some fill columns have multiple values per group, e.g. \n"
        fg <- .printGroupMatch(df, if_any(all_of(n_per_gp), ~">"(.x, 1)))
        stop(msg, fg)
    }

    # Case: only one value per group
    if (all(df[, n_per_gp] <= 1)){
        df <- tidyr::fill(df, !!!syms(fill), .direction = "updown") %>%
            dplyr::ungroup()
    }

    # Case: multiple values per group, select the most frequent one
    if (multiple %in% c("mode", "ignore")){
        df <- .freducePartial(df, groupMode, cls = "cl", cl = fill, gp = group,
                              keep_first = multiple == "mode",
                              overwrite = overwrite)
    }

    df <- dplyr::select(df, -all_of(n_per_gp))

    return(df)
}


# groupMode ----
#
#' Find the most common value per group
#'
#' Given a grouped data.frame, count values per group and return a data.frame
#' with an additional column containing the most common value for each group.
#' If there are several equally common values, the first will be chosen.
#' Used by [fillByGroup()].
#'
#'@param df a data.frame or tibble
#'@param cl Name of column to find mode (character(1)).
#'@param new_cl Name of column to create.  If NA (default), col is modified
#'@param gp Name(s) of columns to group by
#'@param min_n (integer(1), default NA) Minimum number of occurrences of
#'majority value.  If provided, the majority value will be set to NA when it
#'occurs less than min_n times.
#'@param keep_first (logical(1), default TRUE) If there are multiple modes,
#'should the first (in order of the data.frame) be selected?  If FALSE,
#'entries are set to NA
#'@param overwrite (logical(1), default FALSE) If FALSE, new_cl will only
#'replace NAs.  If TRUE, new_cl will contain the mode.  Note that
#'keep_first = FALSE with overwrite = TRUE may lead to values being replaced
#'with NA.
#'@importFrom dplyr n
#'@importFrom rlang .data
#'@returns df with group mode values either replacing values in column "cl",
#'or in a new column new_cl.
#'@examples
#'df <- data.frame(Antigen = c("CD274", "CD274", "PD-L1"),
#'                 Clone = rep("29E.2A3", 3))
#'groupMode(df, cl="Antigen", gp="Clone", new_cl="Antigen_mode")
groupMode <- function(df, cl, gp, new_cl=NA, min_n=NA,
                      keep_first=TRUE, overwrite=FALSE){

    n <- .tempColName(df, 1, "n")
    tmp <- .tempColName(df, 1)

    if (! is.na(new_cl)) {
        overwrite = TRUE
    } else {
        new_cl <- cl
    }

    # Find the mode for column cl in group gp
    df <- df %>%
        dplyr::group_by(!!!syms(c(gp, cl))) %>%
        dplyr::mutate(!!n := n(),
                      !!n := ifelse(is.na(!!sym(cl)), -1, !!sym(n))) %>%
        dplyr::group_by(!!!syms(gp)) %>%
        dplyr::mutate(!!tmp := .data[[cl]][.data[[n]] == max(.data[[n]])][1])

    if (! is.na(min_n)){
        df <- df %>%
            dplyr::mutate(!!tmp := ifelse(max(!!sym(n)) < min_n,
                                          NA, !!sym(tmp)))
    }

    if (! isTRUE(keep_first)){
        # Set tmp to NA if there are multiple valid modes
        x <- .tempColName(df, 1, "x")
        df <- df %>%
            dplyr::mutate(!!x :=
                    sum(.data[[n]] == max(.data[[n]])) == max(.data[[n]]),
                    !!tmp := ifelse(.data[[x]] == TRUE, .data[[tmp]], NA)) %>%
        dplyr::select(-dplyr::any_of(x))
    }

    if (isFALSE(overwrite)){
        # Fill with the majority value if current value is NA
        df <- df %>%
            dplyr::mutate(!!new_cl :=
                              dplyr::coalesce(.data[[cl]], .data[[tmp]]))
    } else {
        # Set new_cl to the actual mode
        df <- df %>%
            dplyr::mutate(!!new_cl := .data[[tmp]])
    }

    df <- df %>%
        dplyr::select(-dplyr::any_of(c(tmp, n))) %>%
        dplyr::ungroup()
    return(df)
}
HelenLindsay/AbNames documentation built on June 6, 2023, 1:18 p.m.