R/pivot_longer_multicol.R

Defines functions pivot_longer_multicol

Documented in pivot_longer_multicol

#---------------------------------
# External Dependencies:
# tidyr
# stringr
#
# Internal Dependencies:
# extract_groups
#---------------------------------


#' pivot_longer_multicol
#'
#' This function chains two pivots from \code{\link[tidyr]{pivot_longer}} with
#' one pivot from \code{\link[tidyr]{pivot_wider}} to pivot from wide to long
#' on multiple columns. The columns to be pivoted must all have a "tag" in
#' in the original column name (e.g., "sch_1" and "sch_2" have a "_1" and a "_2"
#' tag indicating that data from two schools is present in the dataset). The
#' names must all have some common formatting/symbols before the tag in order
#' to allow all variables to be manipulated at the same time using this
#' function.
#'
#' @param .dat A dataframe containing the data to be pivoted.
#'
#' @param .cols Either a string vector listing the names of the columns to be
#' pivoted or a \code{\link[tidyselect]{starts_with}} selection helper.
#'
#' @param .tag_group Numeric scalar. Indicates the capture group (as defined
#' by the regular expression in \code{.capture_groups}) that contains the
#' commonly formatted "tag". Defaults to 2 (e.g., "sch_1" contains the tag in
#' the second group, while "t1_sch" contains a tag in the first). The tags will
#' eventually be passed to a variable (named by \code{.tag_name}), so it is
#' helpful if the tags are meaningful. For example, for variables
#' "sch_1" and "sch_2", a new variable will be created that will take values of
#' "1" and "2".
#'
#' @param .capture_groups A string containing a regular expression defining
#' two capture groups, one capturing the name of the variable to be saved and
#' one group capturing the commonly formatted "tag" of the variables. Defaults
#' to \code{"(.+)_(.+)"}, which splits the variable names at the last "_"
#' character present in the variable name (the final "_" is removed).
#'
#' @param .tag_name String. Indicates the variable name that should be given to
#' the new variable containing the variable tags. Because this function was
#' created to manage the longitudinal school mobility process, the argument
#' defaults to "time".
#'
#' @return This function returns a dataframe with the columns in \code{.cols}
#' pivoted to have a single column per unique value of the non-tag in
#' the capture group and a number of rows equal to the number of unique
#' values in the tag. NOTE: If the variables to be pivoted contain different
#' tag lengths (e.g., "sch_1" and "sch_2" have a tag with length 2 while
#' "id_1", "id_2", "id_3", and "id_4" have a tag with length 4), the variable
#' with the shorter tag length will have missing values where the tags do not
#' overlap.
#'
#' @export
#'
#' @examples \dontrun{
#'
#' # prepare some data
#'
#' temp_dat <- tibble(
#'   x = rnorm(100),
#'   y = rnorm(100),
#'   z_1 = rep(c(1, 0), 50),
#'   z_2 = rep(c(5, 6, 7, 8), 25),
#'   g_1 = rnorm(100),
#'   g_2 = rnorm(100),
#'   g_3 = rnorm(100),
#'   g_4 = rnorm(100)
#' )
#'
#' pivot_longer_multicol(
#'   .dat = temp_dat,
#'   .cols = tidyr::matches("_")
#' )
#'
#' }
pivot_longer_multicol <-
  function(
    .dat,
    .cols,
    .tag_group = 2,
    .capture_groups = "(.+)_(.+)",
    .tag_name = "time"
  ) {

    ##--setup--##

    # use enquo and eval_select to evaluate the .cols argument
    .cols <-
      tidyselect::eval_select(
        expr = dplyr::enquo(.cols),
        data = .dat[unique(names(.dat))]
      )

    # Determine the new variable names after applying the regex with
    # .capture_groups to the first pivot_longer step
    # NOTE: we can only have two capture groups for pivot_longer -
    # one to capture the variable name and one to capture the repetition tag
    # (e.g., "sch_id" and "1" or "2").

    value_nms <-
      .dat %>%
      dplyr::select(., !!.cols) %>%
      names(.) %>%
      extract_groups(
        .strings = .,
        .capture_groups = .capture_groups,
        .tag_group = .tag_group
      )

    # use .tag_group (the position of the "tag" or the repeated indicator
    # for the variable names) to set up the input for the "names_to" argument
    # in pivot_longer. with two capture groups, names_to needs to take a
    # vector input of length 2.

    # One of the elements should be the special
    # ".value" sentinel, which uses the values in extract_groups for the
    # opposide .tag_group (i.e., if .tag_group is 1, then it uses the values of
    # 2) to set names for the new variables.

    # the other element should be "names" (could be anything, but "names"
    # makes the most sense in our context). This refers to the variable names
    # that we want to save for later (e.g., "sch_id", "sch_wt", "z_predictor").
    # the order of the "names_to" vector input should be set by .tag_group.
    # if the tag is in capture group 2, then ".value" should be in position
    # 2 and "names" should be in position 1. If the tag is in capture group 1,
    # then the order should be swapped accordingly.

    # to accomplish this step, create an empty character vector, assign
    # ".value" to the .tag_group index, and then assign "names" to the
    # remaining index (which can be identified by is.na(namesto_input))

    namesto_input <- character()
    namesto_input[.tag_group] <- ".value"
    namesto_input[is.na(namesto_input)] <- "names"


    ##--convert mobility data from wide to long

    # because there are three groups of wide variables to pivot
    # (sch_id*, z_predictor*, sch_wt*), we need multiple steps:
    #
    # 1. pivot_longer, taking all of the *_1 & *_2 variables
    # and sending their names to one column ("names") and sending their
    # values to two different columns ("1" and "2")
    # 2. pivot longer again, reducing the names of the two values columns
    # ("1" and "2") to one column, given by .tag_name, and the values of
    # those columns to a new column called "value"
    # 3. pivot wider, expanding the variable names in "names" to three different
    # columns, "sch_id", "z_predictor", and "sch_wt". The values for these
    # new variables are pulled from the "values" column. There should now
    # be two rows for each person in the data, and the "values" and
    # "names" columns are dropped.

    .dat %>%
      tidyr::pivot_longer(
        data = .,
        cols = .cols,
        names_to = namesto_input,
        names_pattern = .capture_groups
      ) %>%
      tidyr::pivot_longer(
        data = .,
        cols = value_nms,
        names_to = .tag_name,
        values_to = "values"
      ) %>%
      tidyr::pivot_wider(
        data = .,
        names_from = "names",
        values_from = "values"
      )

  }
tessaleejohnson/corclus documentation built on Oct. 11, 2022, 3:46 a.m.