R/trim.R
In diffobj: Diffs for R Objects

# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.

# Detect and remove atomic headers

.pat.atom <- "^\\s*\\[[1-9][0-9]*\\]\\s"
.pat.mat <- "^\\s*\\[[1-9]+[0-9]*,\\]\\s"

# dfs/tables colon for data.table, SGR for tibble, starting to get
# dangerously broad; we should really split out the tibble business into its own
# method.
.pat.tbl <-
  "^(?:\033\\[[^m]*m)?\\s*[1-9]+[0-9]*:?(?:\033\\[[^m]*m)?\\s"
.pat.attr <- "^attr\\(,\"(\\\\\"|[^\"])*\")$"

# Find first attribute and drop everything after it

up_to_attr <- function(x) {

  attr.id <- grep(.pat.attr, x)
  if(length(attr.id) && attr.id[1L] > 1L) {
    y <- head(x, attr.id[1L] - 1L)
  } else {
    y <- x
  }
  y
}
# Get atomic content on a best-efforts basis
# Note that functionality for named vectors is turned off since they become
# fairly pathological when wrap periodicities are not the same (Issue #43);

which_atomic_cont <- function(x.chr, x) {
  # Limit to everything before attribute

  y <- up_to_attr(x.chr)

  res <- if(!is.null(nm <- names(x))) {
    integer(0L)
    # # name mode; find all lines from output that contain only names

    # nm.tar <- unlist(strsplit(names(x), "\\s+"))
    # y.split <- strsplit(sub("^\\s+", "", y), "\\s+")
    # only.nm <- vapply(y.split, function(z) all(z %in% nm.tar), logical(1L))

    # # Look for TF pattern starting with first TRUE

    # if(any(only.nm)) {
    #   first.t <- min(which(only.nm))
    #   only.nm.sub <- if(first.t > 1L) {
    #     tail(only.nm, -(first.t - 1L))
    #   } else only.nm
    #   only.nm.check <-
    #     only.nm.sub == rep(c(TRUE, FALSE), length.out=length(only.nm.sub))
    #   last.t <- which(!only.nm.check)
    #   last.t <- if(!length(last.t)) length(only.nm.check) + 1L else min(last.t)

    #   # Modulo check makes sure we have full T,F repeats
    #   if(length(last.t) && last.t %% 2L) {
    #     # Ensure that all names are present in the order they are supposed to be
    #     tar.seq <- first.t:(last.t + first.t - 2L)

    #     if(all(unlist(y.split[tar.seq][c(TRUE, FALSE)]) == nm.tar)) {
    #       tar.seq
    #     } else integer(0L)
    #   } else integer(0L)
    # } else integer(0L)
  } else which_atomic_rh(x.chr)
  res
}
# Identify elements that contain row headers, these are guaranteed to be
# sequential incrementing with no gaps, or zero length.

which_atomic_rh <- function(x) {
  stopifnot(is.character(x), !anyNA(x))

  # Now find the row headers if any prior to the attributes

  y <- up_to_attr(x)
  w.pat <- grepl(.pat.atom, y)

  # Grab first set that matches for checking, there could be more particularly
  # if the object in question has attributes, but we explicitly rule out
  # attributes

  w.pat.rle <- rle(w.pat)
  res <- if(any(w.pat.rle$values)) {
    # First get the indices of the patterns that match

    first.block <- min(which(w.pat.rle$values))
    w.pat.start <- sum(head(w.pat.rle$lengths, first.block - 1L), 0L) + 1L
    w.pat.ind <-
      seq(from=w.pat.start, length.out=w.pat.rle$lengths[first.block], by=1L)

    # Re extract those and run checks on them to make sure they truly are
    # what we think they are: width of headers is the same, and numbers
    # increment in equal increments starting at 1

    r.h.rows <- y[w.pat.ind]
    r.h.vals <- regmatches(r.h.rows, regexpr(.pat.atom, r.h.rows))
    r.h.lens.u <- length(unique(nchar(r.h.vals)))
    r.h.nums <- sub(".*?([0-9]+).*", "\\1", r.h.vals, perl=TRUE)
    r.h.nums.u <- length(unique(diff(as.numeric(r.h.nums))))

    if(
      r.h.nums.u <= 1L && r.h.lens.u == 1L && r.h.nums[[1L]] == "1" &&
      (length(w.pat.ind) < 2L || all(diff(w.pat.ind)) == 1L)
    ) {
      w.pat.ind
    } else integer(0L)
  } else integer(0L)
}
strip_atomic_rh <- function(x) {
  stopifnot(is.character(x), !anyNA(x))
  w.r.h <- which_atomic_rh(x)
  x[w.r.h] <- sub(.pat.atom, "", x[w.r.h])
  x
}
# Detect table row headers; a bit lazy, combining all table like into one
# function when in reality more subtlety is warranted; also, we only care about
# numeric row headers.
#
# Matrices used to be done here as well, but then got split off so the `pat`
# argument is legacy

wtr_help <- function(x, pat) {
  # Should expect to find pattern repeated some number of times, and then whole
  # pattern possibly repeated the same number of times separated by the same
  # gap each time if the table is too wide and wraps.

  w.pat <- grepl(pat, x)
  w.pat.rle <- rle(w.pat)

  # It must be the case that the first block of matches occurs after non-matches
  # since the first header should happen first

  res <- integer(0L)

  if(
    any(w.pat.rle$values) && length(w.pat.rle$values) > 1L &&
    w.pat.rle$values[2L]
  ) {
    tar.len <- w.pat.rle$lengths[2L]
    match.blocks <- w.pat.rle$values & w.pat.rle$lengths == tar.len

    # Only take matches they if alternate T/F

    match.break <-
      match.blocks != rep(c(FALSE, TRUE), length.out=length(match.blocks))

    match.valid <- if(any(match.break)) {
      # actually very difficult to test this; need a df like structure that is
      # wrapped and has some irregularity that crops up later, and we're not
      # actually able to generate these with vanilla structures
      head(match.blocks, min(which(match.break)) - 1L)
    } else match.blocks

    # Make sure that all interstitial blocks are same length and that they all
    # start with at least one space

    interstitial <- which(
      !match.valid & seq_along(match.valid) > 1L &
      seq_along(match.valid) != length(match.valid)
    )
    if(
      !length(interstitial) || (
        length(interstitial) &&
        length(unique(w.pat.rle$lengths[interstitial])) == 1L &&
        all(grepl("^\\s", x[unlist(rle_sub(w.pat.rle, interstitial))]))
      )
    ) {
      # Make sure row headers are the same for each repeating block; start by
      # extracting the actual headers; need to get a list of each sequence of
      # headers

      max.valid <- max(which(match.valid))
      ranges <- rle_sub(
        w.pat.rle, seq_along(w.pat.rle$lengths) <= max.valid & w.pat.rle$values
      )
      heads.l <- regmatches(x, regexec(pat, x))
      heads <- character(length(heads.l))
      heads[w.pat] <- as.character(heads.l[w.pat])

      heads.num <- as.integer(
        sub(".*?(?:\033\\[[^m]*m.*?)*([0-9]+).*", "\\1", heads, perl=TRUE)
      )
      head.ranges <- lapply(ranges, function(x) heads.num[x])

      all.identical <-
        all(vapply(head.ranges, identical, logical(1L), head.ranges[[1L]]))
      all.one.apart <-
        all(vapply(head.ranges, function(x) all(diff(x) == 1L), logical(1L)))

      if(all.identical && all.one.apart && head.ranges[[1L]][1L] == 1L) {
        res <- unlist(ranges)
  } } }
  res
}
which_table_rh <- function(x) {
  stopifnot(is.character(x), !anyNA(x))
  res <- wtr_help(x, .pat.tbl)
  if(length(res)) attr(res, "pat") <- .pat.tbl
  res
}
strip_table_rh <- function(x) {
  w <- which_table_rh(x)
  if(!length(w)) {
    x
  } else {
    pat <- attr(w, "pat")
    if(!is.chr.1L(pat))
      # nocov start
      stop("Logic Error: unexpected row header pattern; contact maintainer.")
      # nocov end
    x[w] <- sub(pat, "", x[w])
    x
  }
}
# Matrices; should really try to leverage logic in wtr_help, but not quite the
# same

which_matrix_rh <- function(x, dim.names.x) {
  guides <- detect_matrix_guides(x, dim.names.x)
  res <- integer(0L)
  if(length(guides)) {
    pieces <- split_by_guides(x, guides)
    if(!length(pieces)) stop("Logic Error: no matrix pieces") # nocov
    # Get all rows matching the matrix row header so long as they are adjacent;
    # this is only really different if there is an attribute in the last piece
    pat.ind <- lapply(
      pieces,
      function(y) {
        pat.match <- grep(.pat.mat, y)
        if(length(pat.match) > 1)
          pat.match[c(TRUE, !cumsum(diff(pat.match) != 1L))]
        else pat.match
      }
    )
    if(
      all(vapply(pat.ind, identical, logical(1L), pat.ind[[1L]])) &&
      (length(pat.ind[[1L]]) == 1L || all(diff(pat.ind[[1L]]) == 1L))
    ) {
      piece.nums <- as.integer(
        sub(".*?([0-9]+).*", "\\1", pieces[[1L]][pat.ind[[1L]]], perl=TRUE)
      )
      if(
        length(piece.nums) && piece.nums[1L] == 1L &&
        (length(piece.nums) == 1L || all(diff(piece.nums) == 1L))
      ) {
        res <- unlist(
          lapply(seq_along(pieces), function(i)
            attr(pieces[[i]], "idx")[pat.ind[[i]]]
  ) ) } } }
  res
}
strip_matrix_rh <- function(x, dim.names.x) {
  to.rep <- which_matrix_rh(x, dim.names.x)
  res <- x
  res[to.rep] <- sub(.pat.mat, "", x[to.rep])
  res
}
# Handle arrays

which_array_rh <- function(x, dim.names.x) {
  arr.h <- detect_array_guides(x, dim.names.x)
  dat <- split_by_guides(x, arr.h)

  # Look for the stuff between array guides; those should be matrix like
  # and have the same rows in each one

  m.h <- lapply(dat, which_matrix_rh, head(dim.names.x, 2L))

  res <- integer(0L)
  if(length(m.h) && all(vapply(m.h, identical, logical(1L), m.h[[1L]]))) {
    res <- unlist(Map(function(y, z) attr(y, "idx")[z], dat, m.h))
  }
  res
}
strip_array_rh <- function(x, dim.names.x) {
  inds <- which_array_rh(x, dim.names.x)
  res <- x
  res[inds] <- sub(.pat.mat, "", x[inds])
  res
}
# Lists, need to recurse through the various list components
#
# This is not done super rigorously; the main point of failure is if sub-objects
# produce patterns that match list sub-object headers which may cause confusion
#
# Super inefficient currently since we keep switching back and forth between
# index and trimmed formats so we can re-use `trimPrint`...
#
# Also, right now we are passing the list components with all the trailing
# new lines, and it isn't completely clear that is the right thing to do
#
# Note that we're not actually trimming the list headers themselves since unlike
# in atomics and matrices, etc, the list headers are on their own line and won't
# affect the matching diff of the actual contents of the list

strip_list_rh <- function(x, obj) {
  if(!length(obj)) {
    # empty list, nothing to do, and also if it is nested causes problems later
    x
  } else {
    # Split output into each list component

    list.h <- detect_list_guides(x)
    dat <- split_by_guides(x, list.h, drop.leading=FALSE)
    elements <- flatten_list(obj)

    # Special case where first element in list is deeper than one value, which
    # means there will be leading non-data elements in `dat` that we have to
    # reconstruct; note that if no len then rendered as `list()` so it doesn't
    # get a guide.

    offset <- if(
      is.list(obj[[1L]]) && !is.object(obj[[1L]]) && length(obj[[1L]])
    ) 1L else 0L

    if(length(elements) != length(dat) - offset) {
      # Something went wrong here, so return as is?
      x
    } else {
      # Use `trimPrint` to get indices, and trim back to stuff without row
      # header

      if(offset) {
        hd <- dat[[1L]]
        tl <- tail(dat, -offset)
      } else {
        hd <- NULL
        tl <- dat
      }
      dat.trim <- Map(trimPrint, elements, tl)
      dat.w.o.rh <- Map(
        function(chr, ind) substr(chr, ind[, 1], ind[, 2]), tl, dat.trim
      )
      unlist(
        c(
          list(hd),
          c(
            as.list(x[list.h]), dat.w.o.rh
          )[order(rep(seq_along(list.h), 2))]
      ) )
    }
  }
}
# Very similar logic to lists

strip_s4_rh <- function(x, obj) {
  stopifnot(isS4(obj))

  if(!length(slotNames(obj))) {
    # Not possible to have object without slots (would be virtual class)
    stop("Internal Error: s4 object w/o slots; contact maintainer") # nocov
  } else {
    # Split output into each list component

    s4.h <- detect_s4_guides(x, obj)
    dat <- split_by_guides(x, s4.h, drop.leading=FALSE)
    elements <- lapply(slotNames(obj), slot, object=obj)

    dat.trim <- Map(trimPrint, elements, dat)
    dat.w.o.rh <- unlist(
      Map(
        function(chr, ind) substr(chr, ind[, 1], ind[, 2]), dat, dat.trim
    ) )
    if(length(dat.w.o.rh) + length(s4.h) == length(x)) {
      res <- character(length(x))
      res[s4.h] <- x[s4.h]
      res[!seq_along(res) %in% s4.h] <- dat.w.o.rh
      res
    } else {
      # This should not happen, only a warning because operating without trimed
      # S4 guides is not a huge eissue
      # nocov start
      warning('Unable to detect S4 object guides.')
      x
      # nocov end
  } }
}
#' Methods to Remove Unsemantic Text Prior to Diff
#'
#' \code{\link[=diffPrint]{diff*}} methods, in particular \code{diffPrint},
#' modify the text representation of an object prior to running the diff to
#' reduce the incidence of spurious mismatches caused by unsemantic differences.
#' For example, we look to remove matrix row indices and atomic vector indices
#' (i.e. the \samp{[1,]} or \samp{[1]} strings at the beginning of each display
#' line).
#'
#' Consider: \preformatted{
#' > matrix(10:12)
#'      [,1]
#' [1,]   10
#' [2,]   11
#' [3,]   12
#' > matrix(11:12)
#'      [,1]
#' [1,]   11
#' [2,]   12
#' }
#' In this case, the line by line diff would find all rows of the matrix to
#' be mismatched because where the data matches (rows containing
#' 11 and 12) the indices do not.  By trimming out the row indices before
#' the diff, the diff can recognize that row 2 and 3  from the first matrix
#' should be matched to row 1 and 2 of the second.
#'
#' These methods follow a similar interface as the \code{\link[=guides]{guide*}}
#' methods, with one available for each \code{diff*} method except for
#' \code{diffCsv} since that one uses \code{diffPrint} internally.  The
#' unsemantic differences are added back after the diff for display purposes,
#' and are colored in grey to indicate they are ignored in the diff.
#'
#' Currently only \code{trimPrint} and \code{trimStr} do anything meaningful.
#' \code{trimPrint} removes row index headers provided that they are of the
#' default un-named variety.  If you add row names, or if numeric row indices
#' are not ascending from 1, they will not be stripped as those have meaning.
#' \code{trimStr} removes the \samp{..$}, \samp{..-}, and \samp{..@} tokens
#' to minimize spurious matches.
#'
#' You can modify how text is trimmed by providing your own functions to the
#' \code{trim} argument of the \code{diff*} methods, or by defining
#' \code{trim*} methods for your objects.  Note that the return value for these
#' functions is the start and end columns of the text that should be
#' \emph{kept} and used in the diff.
#'
#' As with guides, trimming is on a best efforts basis and may fail with
#' \dQuote{pathological} display representations.  Since the diff still works
#' even with failed trimming this is considered an acceptable compromise.
#' Trimming is more likely to fail with nested recursive structures.
#'
#' @note \code{obj.as.chr} will be as processed by
#'   \code{\link{strip_hz_control}} and as such will not be identical to the
#'   captured output if it contains tabs, newlines, or carriage returns.
#' @rdname trim
#' @name trim
#' @aliases trimPrint, trimStr, trimChr, trimDeparse, trimFile
#' @param obj the object
#' @param obj.as.chr character the \code{print}ed representation of the object
#' @return a \code{length(obj.as.chr)} row and 2 column integer matrix with the
#'   start (first column) and end (second column) character positions of the sub
#'   string to run diffs on.

NULL

#' @export
#' @rdname trim

setGeneric("trimPrint",
  function(obj, obj.as.chr) standardGeneric("trimPrint")
)
#' @rdname trim

setMethod(
  "trimPrint", c("ANY", "character"),
  function(obj, obj.as.chr) {
    # Remove the stuff we don't want

    stripped <- if(is.matrix(obj)) {
      strip_matrix_rh(obj.as.chr, dimnames(obj))
    } else if(
      length(dim(obj)) == 2L ||
      (is.ts(obj) && frequency(obj) > 1)
    ) {
      strip_table_rh(obj.as.chr)
    } else if (is.array(obj)) {
      strip_array_rh(obj.as.chr, dimnames(obj))
    } else if(is.atomic(obj)) {
      strip_atomic_rh(obj.as.chr)
    } else if(is.list(obj) && !is.object(obj)) {
      strip_list_rh(obj.as.chr, obj)
    } else if(isS4(obj) && is_default_show_obj(obj)) {
      strip_s4_rh(obj.as.chr, obj)
    } else obj.as.chr

    trim_sub(obj.as.chr, stripped)
  }
)
#' @export
#' @rdname trim

setGeneric("trimStr",
  function(obj, obj.as.chr) standardGeneric("trimStr")
)
#' @rdname trim

setMethod(
  "trimStr", c("ANY", "character"),
  function(obj, obj.as.chr) {
    # Remove the stuff we don't want

    pat <- "^ (?: \\.\\.)*(?:\\$|-|@) "
    stripped <- gsub(pat, "", obj.as.chr, perl=TRUE)

    # Figure out the indices that correspond to what we want, knowing that all
    # removals should have occured at front of string

    trim_sub(obj.as.chr, stripped)
  }
)
# Helper function; returns untrimmed objects

trim_identity <- function(obj, obj.as.chr)
  cbind(rep(1L, length(obj.as.chr)), nchar(obj.as.chr))

#' @export
#' @rdname trim

setGeneric(
  "trimChr", function(obj, obj.as.chr) standardGeneric("trimChr")
)

#' @rdname trim

setMethod("trimChr", c("ANY", "character"), trim_identity)

#' @export
#' @rdname trim

setGeneric(
  "trimDeparse",
  function(obj, obj.as.chr) standardGeneric("trimDeparse")
)
#' @rdname trim

setMethod("trimDeparse", c("ANY", "character"), trim_identity)

#' @export
#' @rdname trim

setGeneric(
  "trimFile", function(obj, obj.as.chr) standardGeneric("trimFile")
)

#' @rdname trim

setMethod("trimFile", c("ANY", "character"), trim_identity)

# Helper fun used by trim functions that remove front of strings and rely on
# string comparison to determine trim indices

trim_sub <- function(obj.as.chr, obj.stripped) {
  if(length(obj.as.chr) != length(obj.stripped))
    # nocov start
    stop(
      "Logic Error: trimmed string does not have same number of elements as ",
      "original; contact maintainer"
    )
    # nocov end
  stripped.chars <- nchar(obj.stripped)
  char.diff <- nchar(obj.as.chr) - stripped.chars
  sub.start <- char.diff + 1L
  sub.end <- sub.start - 1L + stripped.chars

  if(!all(substr(obj.as.chr, sub.start, sub.end) == obj.stripped))
    # nocov start
    stop(
      "Logic Error: trimmed string is not a substring of orginal, ",
      "contact maintainer"
    )
    # nocov end
  cbind(sub.start, sub.end)
}
# Re-insert the trimmed stuff back into the original string, note that we
# use normal string funs, not ANSI aware ones, because the row header stuff is
# done in an ANSI unaware manner.

untrim <- function(dat, word.c, etc) {
  fun <- etc@style@funs@trim
  res <- with(
    dat,
    paste0(
      fun(substr(raw, 0, trim.ind.start - 1L)), word.c,
      fun(substr(raw, trim.ind.end + 1L, nchar(raw) + 1L))
  ) )
  # substitute blanks

  res[!nzchar(dat$raw)] <- etc@style@blank.sub
  res
}

valid_trim_ind <- function(x)
  if(
    !is.integer(x) || !is.matrix(x) || anyNA(x) || !ncol(x) == 2L
  ) {
    "must be a two column integer matrix with no NAs"
  } else TRUE

apply_trim <- function(obj, obj.as.chr, trim_fun) {
  if(!isTRUE(two.arg <- is.two.arg.fun(trim_fun)))
    stop(
      "Invalid trim function (", two.arg, ").  If you did not customize the ",
      "trim function contact maintainer; see `?trim`"
    )
  trim <- try(trim_fun(obj, obj.as.chr))
  msg.extra <- paste0(
    "If you did not specify a `trim` function or define custom `trim*` ",
    "methods contact maintainer (see `?trim`).  Proceeding without trimming."
  )
  if(inherits(trim, "try-error")) {
    warning(
      "`trim*` method produced an error when attempting to trim ; ", msg.extra
    )
    trim <- cbind(rep(1L, length(obj.as.chr)), nchar(obj.as.chr))
  }
  if(!isTRUE(trim.check <- valid_trim_ind(trim)))
    stop("`trim*` method return value ", trim.check, "; ", msg.extra)
  if(nrow(trim) != length(obj.as.chr))
    stop(
      "`trim*` method output matrix must have as many rows as object ",
      "character representation has elements; ", msg.extra
    )
  trim
}