R/data_manipulation.R

Defines functions reorder_levels discard_redundant.default discard_redundant any_not_na all_na discard_all_na rescale_unit rescale_centered z z_trans update_by left_union go_arrange go_first expand_grid

Documented in discard_all_na discard_redundant discard_redundant.default expand_grid go_arrange go_first left_union reorder_levels rescale_centered rescale_unit update_by z_trans

#' dplyrified expand.grid
#'
#' works like expand.grid, but returns a tibble data_frame
#'
#' @param ... factors
#' @return data_frame
#'
#' @author Martin Schmettow
#' @export


expand_grid <-
  function(...)
    expand.grid(stringsAsFactors = F, ...) %>% tibble::as_tibble()


#' Moving column to first position
#'
#' The referred column becomes the first, keeping the order of the others
#'
#' @param D data frame
#' @param ... formulas, like in select, but with ~
#' @return data frame
#'
#' go_first and go_arrange expect a column specification, similar to
#' dplyr::select, but as a formula
#'
#' @examples
#' D <- data_frame(x = 1:3, y = 6:4, z = c(8,9,7))
#' go_first(D, ~y)
#' go_first(D, ~y:z)
#' go_arrange(D, ~y)
#'
#'
#'
#' @author Martin Schmettow
#' @export


go_first <-
  function(D, ...){
    class <- class(D)
    cols <- quos(...)
    df1 <- dplyr::select(D, !!!cols)
    out <- mascutils::left_union(df1, D)
    class(out) <- class
    out
  }


#' @rdname go_first
#' @export

go_arrange <-
  function(D, ...){
    class <- class(D)
    cols <- quos(...)
    df1 <- dplyr::select(D, !!!cols)
    out <- mascutils::left_union(df1, D) %>%
      dplyr::arrange_(names(df1))
    class(out) <- class
    out
  }



#' adding columns that don't exist yet
#'
#'
#' @param df1 master data frame, where columns are kept
#' @param df2 data frame where additional columns are taken from
#' @return data frame
#'
#'
#'
#' @author Martin Schmettow
#' @export

left_union <-
  function(df1, df2){
    class <- class(df1)
    if(nrow(df1) != nrow(df2)) stop("dataframes have different number of rows")
    added_cols <- setdiff(names(df2), names(df1))
    out <- dplyr::bind_cols(df1, df2[,added_cols])
    class(out) <- class
    out
  }


#' update columns conditionally
#'
#'
#' @param D data frame
#' @param filter predicate function (like dplyr::filter)
#' @param ... expressions
#' @return data frame
#'
#' Applies mutations to the filtered group, only.
#'
#' @examples
#' D <- tribble(~group, ~value,  1, 4, 1, 9, 2, -4, 2, -9)
#'
#' D %>% mutate(value = if_else(group == 1, sqrt(value), value))
#' ## Produces NaNs, because sqrt() is evaluated before selection
#'
#' D %>% mutate_by(group == 1, value = sqrt(value))
#' ## sqrt() is only evaluated
#'
#' @author Martin Schmettow
#' @export



update_by <-
  function(D, by, ...){
    flt <- enquo(by)
    args <- enquos(...)

    missing_cols <- dplyr::setdiff(names(args), names(D))
    if (length(missing_cols > 0)) stop(paste0("\nColumn does not exist: ",
                                              missing_cols))

    D <- dplyr::mutate(D, .tmp_idx = dplyr::row_number())

    mut <- D %>%
      dplyr::filter(!!flt) %>%
      dplyr::mutate(!!!args)

    rmn <- D %>%
      dplyr::filter(!(!!flt))

    bind_rows(mut, rmn) %>%
      dplyr::arrange(.tmp_idx) %>%
      dplyr::select(-.tmp_idx)
  }





#' Adding z-transformed scores
#'
#' The (simple) range of columns are z-transformed and added to the data frame (foo_z)
#'
#' @param D data frame
#' @param ... dplyr::select range of variables
#' @return data frame
#'
#'
#' @author Martin Schmettow
#' @import dplyr
#' @import tidyr
#' @export


z_trans <- function(D,  ...){
  col_spec <- quos(...)
  df_z <- dplyr::select(D, !!!col_spec) %>%
    transmute_all(z)
  names(df_z) <- stringr::str_c("z", names(df_z), sep = "")
  bind_cols(D, df_z)
}

z <- function(x) (x - mean(x, na.rm = T))/sd(x, na.rm = T)

#' @rdname z_trans
#' @export

z_score <- z_trans


#' centered rescaling
#'
#' symmetrically scales a variable up or down, maintaining the center of the scale
#'
#' @param x numerical vector
#' @param scale rescale factor
#' @return numerical vector
#'
#'
#' @author Martin Schmettow
#' @import dplyr
#' @import tidyr
#' @export


rescale_centered <- function(x, scale = .999){
  mean_x <- mean(x, na.rm = T)
  x_center <- x - mean_x
  x_shrink <- x_center * scale
  out <- x_shrink + mean_x
  out
}

#' rescaling to unit interval
#'
#' scales a variable to the interval [0;1]
#'
#' @param x numerical vector
#' @param lower lowest possible value
#' @param upper highest possible value
#' @return numerical vector
#'
#'
#' @author Martin Schmettow
#' @import dplyr
#' @import tidyr
#' @export


rescale_unit     <- function(x,
                             lower = min(x, na.rm = T),
                             upper = max(x, na.rm = T), scale = 1){
  x_to_zero <- x - lower
  x_to_one <- x_to_zero/(upper -lower)
  out <- rescale_centered(x_to_one, scale = scale)
  out <- x_to_one
  out
}

#' @rdname rescale_unit
#' @export

rescale_zero_one <- rescale_unit
## TODO:
# check out dplyr:::select_.data.frame for how to use dplyrs column expansion



#' Removing completely unmeasured variables
#'
#' all columns that are completely NA are discarded
#'
#' @param D data frame
#' @return data frame
#'
#'
#' @author Martin Schmettow
#' @export

discard_all_na <-
  function(D){
    filter = which(plyr::aaply(as.matrix(D), 2, any_not_na))
    var_non_na = colnames(D)[filter]
    out = select(D, one_of(var_non_na))
    out
  }

all_na <-
  function(x) all(is.na(x))

any_not_na <-
  function(x) !all_na(x)


#' Remove redundant variables
#'
#' all variables that have a constant value are removed from a data frame
#'
#' @param D data frame
#' @param except vector of column names to keep
#' @return data frame
#'
#'
#' @author Martin Schmettow
#' @export

discard_redundant <-
  function(D, except, ...) UseMethod("discard_redundant", D)


#' @rdname discard_redundant
#' @export
#'
discard_redundant.default <- function(D, except = c()){
  if(nrow(D) < 2) return(D)
  colnames <- colnames(D)
  cols_except <- colnames %in% except
  cols_nonred <- plyr::aaply(as.matrix(D), 2, function(v) length(unique(v)) > 1)
  cols_keep   <- cols_except | cols_nonred

  D[, c(cols_keep)]
}


#' Re-order levels of a factor
#'
#' Re-orders the levels of a factor by a vector of new positions.
#'
#' @param x factor
#' @param positions vector of positions
#' @return factor
#'
#'
#' @author Martin Schmettow
#' @export


reorder_levels <- function(x, positions){
  levels <- unique(x)
  if(length(positions) != length(levels))
    stop(stringr::str_c("Number of positions (",length(positions),")
                           does not match number of levels (", length(levels) ,")"))
  missing <- setdiff(1:length(levels), positions)
  above <- setdiff(positions, 1:length(levels))
  if(length(missing) | length(above)) {
    stop(stringr::str_c("Invalid positions vector. Must contain exactly the values 1:", length(levels)))
  }
  out <- factor(x, levels = levels[positions])
  out
}

# x <- c("A", "B", "C")
# reorder_levels(x, c(3,2,4))

# discard_redundant <- function(D){
#   if(nrow(D) < 2) return(D)
#
#   a <- as.matrix(D)
#   nonred <- plyr::aaply(a, 2, function(v) length(unique(v)) > 1)
#   D[, c(nonred)]
# }
schmettow/mascutils documentation built on May 12, 2020, 11:39 a.m.