R/shapetools.R

Defines functions pull_vector crop unfiscalize sheet2var unclusterize extract_a_cluster rm_nacols headerize make_ascii rm_matchcol rm_matchrow gather_cols append_info make_rect

Documented in append_info crop extract_a_cluster gather_cols headerize make_ascii make_rect pull_vector rm_matchcol rm_matchrow rm_nacols sheet2var unclusterize unfiscalize

#' Make loaded df as rectangular-shape df
#'
#' @inheritParams readxl::read_excel
#' @param df Data frame to be processed
#' @param range Cell range to be extracted in Excel-format("A1:Z10")
#' @export
make_rect <- function(df, range) {
  from  <- stringr::str_extract(range, "^[A-Z]+[0-9]+") %>%
    cellranger::as.cell_addr(strict = FALSE) %>%
    unclass()
  to <- stringr::str_extract(range, "[A-Z]+[0-9]+$") %>%
    cellranger::as.cell_addr(strict = FALSE) %>%
    unclass()
  t <- min(from$row, to$row)
  b <- max(from$row, to$row)
  l <- min(from$col, to$col)
  r <- max(from$col, to$col)
  df[t:b, l:r]
}

#' Append information stored in list to data frame
#'
#' @param info List conposed of `key = value` pairs
#' @param headerized If FALSE, allow appending to data frame with
#'   tentative colnames
#' @inheritParams make_rect
#' @export
append_info <- function(df, info, headerized = FALSE) {
  df_info <- list2df(info, nrow = nrow(df))
  if (headerized == FALSE) {
    df_info[1, ] <- names(info)
    tentative_name <- as.character(seq(ncol(df) + 1, ncol(df) + length(info)))
    colnames(df_info) <- tentative_name
  }
  cbind(df, df_info)
}

#' Gather columns to variable
#'
#' @inheritParams make_rect
#' @param regex Regex to match columns to be gathered
#' @param newname New name for colnames to be gatherd
#' @param varname New name for values to be gatherd
#' @export
gather_cols <- function(df, regex, newname, varname) {
  cols2gather <- stringr::str_extract(colnames(df), regex) %>%
    stats::na.omit()
  df %>%
    tidyr::gather_(cols2gather, key = newname, value = varname)
}

#' Remove rows matched to key
#'
#' @param key Name of rows to be removed
#' @param colpos Position of the colmun contains key
#' @param regex If TRUE, \code{key} was recognized as regular expression
#' @inheritParams make_rect
#' @export
rm_matchrow <- function(df, key, colpos, regex) {
  df <- as.data.frame(df)
  target <- dplyr::pull(df, colpos)
  if (regex) {
    df[-stringr::str_which(target, key), ]
  } else {
    key_noregex <- paste0("^", key, "$")
    df[-stringr::str_which(target, key_noregex), ]
  }
}

#' Remove columns matched to key
#'
#' @param key Name of columns to be removed
#' @param rowpos Position of the row contains \code{key}
#' @param regex If TRUE, \code{key} was recognized as regular expression
#' @inheritParams make_rect
#' @export
rm_matchcol <- function(df, key, rowpos, regex) {
  target <- dplyr::slice(df, rowpos) %>%
    unlist() %>%
    as.vector()
  if (regex) {
    df[, -stringr::str_which(target, key)]
  } else {
    key_noregex <- paste0("^", key, "$")
    df[, -stringr::str_which(target, pattern = key_noregex)]
  }
}

#' Convert full-width numbers in df into ASCII numbers
#'
#' @param x Data frame or vector to be processed
#' @param col Number of the target column
#' @param row Number of the target row
#' @param numerize If TRUE, remove characters convert column to numeric
#' @param headerized If FALSE (default), allow df with tentative colnames
#' @export
make_ascii <- function(x, col = NULL, row = NULL,
                       numerize = FALSE, headerized = FALSE) {
  if (!is.data.frame(x) & !is.list(x)) {
    string <- x
  } else {
    row_offset <- 0
    x <- as.data.frame(x)
    if (headerized) {
      header <- colnames(x)
      body   <- x
    } else {
      header     <- vectorize_row(x, 1)
      body       <- x[-1, ]
      row_offset <- -1
    }
    if (is.null(col) & is.null(row)) {
      rlang::abort(message = "Give me at least 'col' or 'row'.",
                   .subclass = "make_ascii_error")
    } else {
      edit_row    <- !is.null(row) && (row > 1 | headerized == TRUE)
      edit_col    <- !is.null(col)
      edit_header <- !is.null(row) && row == 1 && headerized == FALSE
    }
    if (edit_col) {
      string <- dplyr::pull(body, col)
    } else if (edit_header) {
      string <- header
    } else if (edit_row) {
      string <- vectorize_row(body, row + row_offset)
    }
  }

  ascii <- purrr::map_chr(string, Nippon::zen2han)

  if (numerize) {
    ascii <- ascii %>%
      stringr::str_remove_all("\\D")
  }

  if (is.vector(x)) return(ascii)

  if (edit_col) {
    body[, col] <- ascii
  } else if (edit_header) {
    header <- ascii
  } else if (edit_row) {
    body[row + row_offset, ] <- ascii
  }
  if (headerized) {
    colnames(body) <- header
    out <- body
  } else {
    out <- rbind(header, body)
  }
  out
}

#' Change specific row into df header
#'
#' @inheritParams make_rect
#' @param row Position of the row to make df header
#' @export
headerize <- function(df, row) {
  df   <- as.data.frame(df)
  body <- df[-row, ]
  head <- df[row, ] %>%
    as.character() %>%
    make.unique()
  if (head[1] == "NA") head[1] <- "topleft"
  magrittr::set_colnames(body, head)
}

#' Remove column whose colname is NA
#'
#' @inheritParams make_rect
#' @export
rm_nacols <- function(df) {
  df_leftmost <- df[, 1]
  df_right    <- df[, -1]
  name_left   <- colnames(df)[1]
  name_right  <- colnames(df)[-1]
  not_na <- !stringr::str_detect(colnames(df_right), "^NA(.[1-9]+)?$")
  if (length(not_na) == 0) {
    df
  } else {
    not_na <- tidyr::replace_na(not_na, FALSE)
    not_na
    out <- cbind(df_leftmost, df_right[, not_na]) %>%
      data.frame(stringsAsFactors = FALSE)
    colnames(out) <- c(name_left, name_right[not_na])
    out[, 1] <- as.character(out[, 1])
    out
  }
}

#' Extract a cluster from df using the keyword
#'
#' This function is the substancial function of \code{unclusterize}.
#' @inheritParams make_rect
#' @param direction The direction to which data clusters distribute
#' @param find_from The row or column position
#'   which \code{excract_cluster()} search key
#' @param pos_key Position where the \code{regex} of \code{unclusterize}
#'   matched the keyword
#' @param offset The offset (\code{c(row, pos})) of the cluster topleft from
#'   the coordination of keyword
#' @param ends List of regex to locate row- and column- ends of each cluster
#'   Form should be like \code{ends = list(row = "2019", col = "[Dd]ecember$")}.
#' Regex \code{row = } must specify the end of 'left most' columnn of df,
#'  not that of the column with key matched by \code{regex}
#' @param info Parameters to make key:value list such as
#' \describe{
#'  \item{key_offset}{Offset \code{c(row, col)} of \code{key} topleft
#'   from df topleft. If \code{NULL}, automatically set to \code{keyn}}
#'  \item{key_dim}{Dimension \code{c(row, col)} of \code{key}}
#'  \item{value_offset}{Offset \code{c(row, col)} of \code{value} topleft from
#'   df topleft}
#'  \item{value_dim}{Dimension \code{c(row, col)} of \code{value}}
#' }
extract_a_cluster <- function(pos_key, find_from, direction, df,
                              offset = c(0, 0), ends,
                              info = NULL) {
  rofst <- offset[1]
  cofst <- offset[2]

  if (direction == "v") {
    row <- pos_key + rofst
    col <- find_from + cofst
    maxrow <- locate_matchend(dplyr::pull(df, col)[row:nrow(df)],
                              ends[["row"]]) + row - 1
    maxcol <- locate_matchend(vectorize_row(df, row)[find_from:ncol(df)],
                              ends[["col"]])
    nrow <- maxrow - row + 1
    ncol <- maxcol - cofst
  } else if (direction == "h"){
    row <- find_from + rofst
    col <- pos_key + cofst
    maxrow <- locate_matchend(dplyr::pull(df, col)[row:nrow(df)],
                              ends[["row"]]) + row - 1
    maxcol <- locate_matchend(vectorize_row(df, row)[col:ncol(df)],
                              ends[["col"]]) + col - 1
    nrow <- maxrow - row + 1
    ncol <- maxcol - pos_key - cofst + 1
  }

  out <- df[row:(row + nrow - 1), col:(col + ncol - 1)] %>%
    make_ascii(row = 1)

  if (offset[1] == -1 && offset[2] == 0) {
    out[1, 1] <- out[2, 1]
    out <- out[-2, ]
  }

  if (is.null(info)) return(out)

  value_offset <- info$value_offset
  value_dim    <- info$value_dim
  rvalue       <- row + value_offset[1]
  cvalue       <- col + value_offset[2]
  value        <- df[rvalue:(rvalue + value_dim[1] - 1),
                     cvalue:(cvalue + value_dim[2] - 1)] %>%
    unlist() %>%
    as.vector()
  if (is.null(info$key_offset)) {
    key <- paste0("key", 1:max(value_dim))
  } else {
    key_offset   <- info$key_offset
    key_dim      <- info$key_dim
    rkey <- row + key_offset[1]
    ckey <- col + key_offset[2]
    key  <- df[rkey:(rkey + key_dim[1] - 1),
               ckey:(ckey + key_dim[2] - 1)] %>%
      unlist() %>% as.vector()
  }

  info_list <- as.list(stats::setNames(value, key))
  out %>%
    append_info(info = info_list, headerized = FALSE)
}

#' Extract data clusters from data frame using the keyword
#'
#' This function extracts data clusters from single Excel sheet.
#' @inheritParams make_rect
#' @inheritParams extract_a_cluster
#' @inheritParams rebel
#' @param regex Regular expression to match keywords
#' @param direction Directoin of the cluster revolution
#' @param pos Positon of row/column to scan using \code{regex}
#' @export
unclusterize <- function(df, regex, direction, pos,
                         offset = c(0, 0), ends,
                         info = NULL, crop = NULL) {

  cropped <- crop(df, direction = crop$direction,
                  pos = crop$pos, regex = crop$regex,
                  use_after = crop$use_after)
  if (direction == "h") {
    pos_key <- locate_keys(df = cropped, row = pos, regex = regex)
    purrr::map(pos_key, extract_a_cluster, find_from = pos,
               direction = "h", df = cropped,
               offset = offset, ends = ends, info = info)
  } else if (direction == "v") {
    pos_key <- locate_keys(df = cropped, col = pos, regex = regex)
    purrr::map(pos_key, extract_a_cluster, find_from = pos,
               direction = "v", df = cropped,
               offset = offset, ends = ends, info = info)
  } else {
    warning("Set 'direction' correctly")
    cropped
  }
}

#' Convert sheetname to variable
#'
#' @inheritParams make_rect
#' @param as Name of the new column which contains sheetnames
sheet2var <- function(df, as) {
  sheetname <- attr(df, "sheetname")
  out <- df %>%
    dplyr::mutate(!! as := sheetname)
}

#' Convert fiscal year column into true year
#'
#' @inheritParams make_rect
#' @inheritParams unfiscalize_vec
#' @param ycol Position of fiscal year column
#' @param mcol Position of month column
#' @export
unfiscalize <- function(df, ycol, mcol, month_start, rule) {
  df         <- as.data.frame(df)
  df[, ycol] <- as.integer(df[, ycol])
  df[, mcol] <- as.integer(df[, mcol])
  plist <- list(fisyr = df[, ycol],
                month = df[, mcol],
                month_start = month_start,
                rule = rule)
  trueyr <- purrr::pmap_int(plist, unfiscalize_vec)
  if (any(stringr::str_detect(colnames(df), "year"))) {
    df$trueyr <- trueyr
  } else {
    df$year   <- trueyr
  }
  tibble::as_tibble(df)
}

#' Crop data frame at specific keyword
#'
#' @inheritParams make_rect
#' @inheritParams unclusterize
#' @param use_after If TRUE, use part after match
crop <- function(df, direction = NULL, pos = NULL, regex = NULL,
                 use_after = FALSE) {
  if (is.null(direction)) return(df)

  match <- pull_vector(df, direction = direction, pos = pos) %>%
    stringr::str_which(regex)

  if (direction == "h") {
    before <- df[, 1:match]
    after  <- df[, match:ncol(df)]
  } else {
    before <- df[1:match, ]
    after  <- df[match:nrow(df), ]
  }

  if (use_after == TRUE) {
    return(after)
  }
  before
}

#' Pull vector out of data frame
#'
#' @inheritParams make_rect
#' @param direction Direction of the vector to be pulled from df
#' #' \describe{
#'  \item{"h"}{specific row of the df will be returned as avector}
#'  \item{"v"}{specific column of the df will be returned as a vector}
#' }
#' @param pos Row- or column position to be searched
pull_vector <- function(df, direction, pos) {
  if (direction == "v") {
    return(dplyr::pull(df, pos))
  }
  vectorize_row(df, pos)
}
Rindrics/lucifer documentation built on Dec. 18, 2021, 10:50 a.m.