Nothing
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
# Used to initialize the word difference index lists; represents a non matching
# result for use with `regmatches`
.word.diff.atom <- -1L
attr(.word.diff.atom, "match.length") <- -1L
# Matches syntactically valid R variable names
.reg.r.ident <- "(?:\\.[[:alpha:]]|[[:alpha:]])[[:alnum:]_.]*"
# Helper function when lining up word in a word diff to the lines they came from
#
# This one used to be simple but grew out of control as we discovered corner
# cases; would be good to see if there is a better collapse algorithm that
# naturally handles the corner cases (note: we added general handling of the
# situation where many hunks share the same line, but have not yet removed
# specific handling for corner cases generated by that issue so there is
# redundant code in here).
#
# lines: is a list of what lines are in each hunk,
# cont: is a logical vector of same length as lines denoting whether a
# particular value in lines is context or diff
# hunk.diff: logical vector denoting if for the other object the hunk contains
# only differences (seemingly not used in the most recent algorithm)
#
# What do we do about lines that are fully context? These are flexible in as
# much as we can put them anyplace between the two diff hunks. We are trying to
# maximize overlapping context elements.
reassign_lines2 <- function(lines, cont, hunk.diff) {
# Find out what lines show up as duplicated
hunk.count <- length(cont)
hunk.len <- vapply(lines, length, integer(1L))
hunk.n <- seq_along(cont)
nums <- unlist(lines)
nums.l <- unlist(
lapply(seq_along(lines), function(x) rep(x, length(lines[[x]])))
)
nums.d <- unique(nums[duplicated(nums)])
# For each duplicated number, find range of hunks that contain it and remove
# it from inappropriate hunks / add it to proper ones
lines.p <- lines
for(n in nums.d) {
n.r <- range(nums.l[nums == n])
# If any of the non-empty hunks are diff hunks, remove line reference from
# every hunk except the first non-empty diff hunk, otherwise remove the
# reference from everything except the first non-empty matching hunk
b.w <- hunk.n >= n.r[[1L]] & hunk.n <= n.r[[2L]]
min.diff.h <- head(which(!cont & b.w & hunk.len), 1L)
min.mtch.h <- head(which(cont & b.w & hunk.len), 1L)
keep.h <- if(length(min.diff.h)) min.diff.h else min.mtch.h
if(length(keep.h))
for(i in n.r[[1L]]:n.r[[2L]])
if(i != keep.h) lines.p[[i]] <- lines.p[[i]][lines.p[[i]] != n]
}
lines.p
}
## Helper Function for Mapping Word Diffs to Lines
##
## Used when we're doing a wrapped diff for atomic vectors. We expect to
## receive `tar/cur.dat` with any meta data lines (e.g. factor levels, time
## series meta data) removed already. The challenge is we need to then be able
## to re-map back each word back to the line it was on originally before
## unwrapping. This may include adding padding line blanks in the case one hunk
## displays across more lines than another.
#
## This function does two things: inserts padding lines when hunks in one object
## end up longer than in the other and similar lines don't align, and computes
## mock character strings that will be used by to force alignments. We
## manufacture unique strings that either match or don't match across the two
## objects depending on the word contents of each line, and then pass those
## back as the `comp` component of the `tar.dat` and `cur.dat` returned. The
## subsequent line diff will use `comp` and cause the relevant lines to be lined
## up. This is inefficient and round-about, but has the huge benefit of
## allowing us to plug in the wrapped diff into our existing line diff
## infrastructure
##
## Note that in "word" mode the returned values may be longer than the input ones
## as it may be necessary to add lines to get things to match-up. Added lines
## are indicated by TRUE values in the `fill` component of the `*.dat` return
## values
##
## We have been through several iterations trying to get the most intuitive
## behavior and the result is a fairly non-intuitive and likely inefficient
## algorithm. It works for the most part, so we leave it as is, but is long,
## messy, and should be replaced by a more elegant solution.
##
## @param tar.ends and cur.ends are the indices of the last elements in each
## line of the vector
## @param tar.dat and cur.dat are the data, see `line_diff` body for detailed
## description of them (about 100 lines in). Note that the data has been
## subset to just the portion of it that has row headers (e.g. excluding
## factor levels, etc.)
## @param tar/cur.ends the position in the unwrapped vector of the last "word"
## in each line.
word_to_line_map <- function(
hunks, tar.dat, cur.dat, tar.ends, cur.ends
) {
# Once we've done all the replication and disambiguation, we need to make sure
# diff hunks have the same number of lines. Start mix lines or start/end mix
# lines should go at beginning (this includes full "context" lines where there
# is an insertion or deletion in middle. End mix lines should go at end.
#
# For each hunk, we need to identify what lines it contains, and whether the
# lines are contained in full or not
#
# If a diff hunk is empty for tar/cur, and the corresponding cur/tar hunk
# does not begin/end at beginning of line, then must add lines containing
# adjoining elements to the diff
find_word_line <- function(h.i, pos, ends.a, ends.b, hunks) {
inds_pos <- function(h) c(h$A, h$B)[c(h$A, h$B) > 0L]
inds_neg <- function(h) abs(c(h$A, h$B)[c(h$A, h$B) < 0L])
h <- hunks[[h.i]]
h.prev <- if(h.i > 1L) hunks[[h.i - 1L]]
h.next <- if(h.i < length(hunks)) hunks[[h.i + 1L]]
inds.a <- if(pos) inds_pos(h) else inds_neg(h)
inds.b <- if(pos) inds_neg(h) else inds_pos(h)
ints.a <- c(1L, head(ends.a, -1L) + 1L)
ints.b <- c(1L, head(ends.b, -1L) + 1L)
ends.b.m <- max(ends.b)
# If a diff hunk and empty, but the matching hunk isn't empty, then add
# the last element of prior hunk and first element of next hunk
if(!h$context && !length(inds.a) && length(inds.b)) {
inds.prev <- if(h.i > 1L) if(pos) inds_pos(h.prev) else inds_neg(h.prev)
inds.next <- if(h.i < length(hunks))
if(pos) inds_pos(h.next) else inds_neg(h.next)
ind.b.min <- min(inds.b)
ind.b.max <- max(inds.b)
add.left <- if(!ind.b.min %in% ints.b) max(inds.prev)
add.right <- if(!ind.b.max %in% ends.b) min(inds.next)
inds.a <- if(length(add.left) && length(add.right))
seq(from=add.left, to=add.right, by=1L) else c(add.left, add.right)
}
sort(unique(findInterval(inds.a, ints.a)))
}
find_full_diff_line <- function(dat, ends, diffs) {
w.t <- vapply(
dat$word.ind,
function(x) if(is.null(a.val <- attr(x, "word.count"))) -1L else a.val,
integer(1L)
)
inds.d.l <- findInterval(diffs, c(1L, head(ends, -1L) + 1L))
inds.tab <- tabulate(inds.d.l, length(ends))
diff.full <- which(inds.tab == w.t & inds.tab)
}
h.seq <- seq_along(hunks)
tar.lines <- lapply(h.seq, find_word_line, TRUE, tar.ends, cur.ends, hunks)
cur.lines <- lapply(h.seq, find_word_line, FALSE, cur.ends, tar.ends, hunks)
# which hunks are context hunks?
h.cont <- vapply(hunks, "[[", logical(1L), "context")
# Compute what indices are in each lines; we are going to use this to
# categorize what type of line this is; some of this might be duplicative with
# what we did earlier, but that was so long ago I don't want to get back into
# it.
tar.idx <- Map(seq, c(1L, head(tar.ends, -1L) + 1L), tar.ends, by=1L)
cur.idx <- Map(seq, c(1L, head(cur.ends, -1L) + 1L), cur.ends, by=1L)
tar.diff <- unlist(
lapply(hunks[!h.cont], function(x) with(x, abs(c(A, B))[c(A, B) > 0]))
)
cur.diff <- unlist(
lapply(hunks[!h.cont], function(x) with(x, abs(c(A, B))[c(A, B) < 0]))
)
# identify whether a line starts with context, ends with context, neither, or
# both
context_type <- function(idx, diffs) {
idx.in <- idx %in% diffs
if(!length(idx)) { # arbitrarily assign this case to both
"both"
} else {
if(head(idx.in, 1L) && tail(idx.in, 1L)) {
"neither"
} else if(head(idx.in, 1L)) {
"ends"
} else if(tail(idx.in, 1L)) {
"starts"
} else {
"both"
}
}
}
tar.end.mix <- vapply(tar.idx, context_type, character(1L), diffs=tar.diff)
cur.end.mix <- vapply(cur.idx, context_type, character(1L), diffs=cur.diff)
# Handle cases where line is shared by multiple hunks; also need to know which
# hunks contain only lines that are fully different (and by extension, are
# themselves fully different) as these don't need to have a line from the
# opposite object brought in for alignment
diff.inds <- unlist(lapply(hunks[!h.cont], "[", c("A", "B")))
if(is.null(diff.inds)) diff.inds <- integer()
tar.inds.d <- diff.inds[diff.inds > 0]
cur.inds.d <- abs(diff.inds[diff.inds < 0])
tar.tot.diff.l <- find_full_diff_line(tar.dat, tar.ends, tar.inds.d)
cur.tot.diff.l <- find_full_diff_line(cur.dat, cur.ends, cur.inds.d)
# Remove duplicated line references
tar.lines.u <- reassign_lines2(tar.lines, h.cont)
cur.lines.u <- reassign_lines2(cur.lines, h.cont)
# Search for aligned matching hunks that are empty, and if both those have
# adjacent empty diff hunks, remove the matched and diff hunks from both
# NOTE: this changes the number of hunks in the word diff!
len.orig <- length(tar.lines.u)
tar.lines.p <- tar.lines.u
cur.lines.p <- cur.lines.u
j <- if(h.cont[[1L]]) 1L else 2L
l.cont <- as.list(h.cont)
k <- 0
while(j < length(tar.lines.p)) {
if((k <- k + 1L) > len.orig) {
# nocov start
stop("Logic Error: infine loop in atomic hunk align; contact maintainer.")
# nocov end
}
if(!length(tar.lines.p[[j]]) && !length(cur.lines.p[[j]])) {
if(j > 1L) {
tar.lo <- !length(tar.lines.p[[j - 1L]])
cur.lo <- !length(cur.lines.p[[j - 1L]])
} else tar.lo <- cur.lo <- FALSE
tar.hi <- !length(tar.lines.p[[j + 1L]])
cur.hi <- !length(cur.lines.p[[j + 1L]])
# Need to remove paired empty match and diff; since we are shortening the
# list we don't need to increment J (note possible memory inefficiency
# here)
if((tar.lo || tar.hi) && (cur.lo || cur.hi)) {
if(tar.lo) tar.lines.p[(j - 1L):j] <- NULL else
tar.lines.p[j:(j + 1L)] <- NULL
if(cur.lo) cur.lines.p[(j - 1L):j] <- NULL else
cur.lines.p[j:(j + 1L)] <- NULL
l.cont[j:(j + 1L)] <- NULL
} else {
j <- j + 1L
}
} else j <- j + 1L
}
# Update our context vector since we have now possibly removed hunks
h.cont <- unlist(l.cont)
# If necessary, populate empty diff hunks with matching lines; this happens
# if one of tar/cur has differences but the other doesn't
steal_matching_line <- function(lines, i) {
lines.p <- lines
l.len <- length(lines)
if(l.len > i && length(lines[[i + 1L]])) {
lines.p[[i]] <- head(lines.p[[i + 1L]], 1L)
lines.p[[i + 1L]] <- tail(lines.p[[i + 1L]], -1L)
} else if (i > 1L && length(lines[[i - 1L]])) {
lines.p[[i]] <- tail(lines.p[[i - 1L]], 1L)
lines.p[[i - 1L]] <- head(lines.p[[i - 1L]], -1L)
}
lines.p
}
tar.lines.f <- tar.lines.p
cur.lines.f <- cur.lines.p
# lines that are all diffs
hunk_diff <- function(vec, tot.diffs) length(vec) && all(vec %in% tot.diffs)
tar.tot.diff.h <- vapply(tar.lines, hunk_diff, logical(1L), tar.tot.diff.l)
cur.tot.diff.h <- vapply(cur.lines, hunk_diff, logical(1L), cur.tot.diff.l)
for(i in seq_along(h.cont)) {
if(!h.cont[[i]]) {
t.i <- tar.lines.f[[i]]
c.i <- cur.lines.f[[i]]
if(!length(t.i) && length(c.i) && !cur.tot.diff.h[[i]]) {
tar.lines.f <- steal_matching_line(tar.lines.f, i)
} else if (!length(c.i) && length(t.i) && !tar.tot.diff.h[[i]]) {
cur.lines.f <- steal_matching_line(cur.lines.f, i)
}
}
}
# We now need to make sure that every hunk is the same length
if(
length(tar.lines.f) != length(cur.lines.f) ||
length(tar.lines.f) != length(h.cont)
)
# nocov start
stop(
"Logic error: mismatched hunk sizes when aligning words to lines; ",
"contact maintainer."
)
# nocov end
tar.lines.f2 <- tar.lines.f
cur.lines.f2 <- cur.lines.f
# add padding vector as close to middle of input vector as possible, except
# in special cases (only one short line, or first or last hunks)
pad_in_middle <- function(vec, pad)
c(
head(vec, ceiling(length(vec) / 2)),
pad,
tail(vec, floor(length(vec) / 2))
)
for(i in seq_along(tar.lines.f)) {
if(length(tar.lines.f[[i]]) != length(cur.lines.f[[i]])) {
tar.long <- length(tar.lines.f[[i]]) > length(cur.lines.f[[i]])
long <- if(tar.long) tar.lines.f[[i]] else cur.lines.f[[i]]
short <- if(!tar.long) tar.lines.f[[i]] else cur.lines.f[[i]]
long.type <- if(tar.long) tar.end.mix[long] else cur.end.mix[long]
short.type <- if(!tar.long) tar.end.mix[short] else cur.end.mix[short]
pad <- rep(NA, length(long) - length(short))
short.pad <- if(i == 1L && length(tar.lines.f) > 1L) {
c(pad, short)
} else if (i == length(tar.lines.f)) {
c(short, pad)
} else if(h.cont[[i]] || length(short) != 1L) {
pad_in_middle(short, pad)
} else {
if(
short.type == "ends" && (long.type[[1L]] %in% c("ends", "neither"))
) {
c(pad, short)
} else c(short, pad)
}
if(tar.long) cur.lines.f2[[i]] <- short.pad
else tar.lines.f2[[i]] <- short.pad
} }
# Augment the input vectors by the blanks we added; these blanks are
# represented by NAs in our index vector.
augment <- function(dat, lines) {
lines.u <- unlist(lines)
lines.len <- length(lines.u)
for(i in names(dat)) {
i.vec <- vector(typeof(dat[[i]]), length(lines.u))
i.vec[!is.na(lines.u)] <- dat[[i]]
if(i == "word.ind") {
i.vec[is.na(lines.u)] <- list(.word.diff.atom)
} else if (i == "fill") {
# warning: this is also used/subverted for augmenting the original
# indices so think before you change it
i.vec[is.na(lines.u)] <- TRUE
}
dat[[i]] <- i.vec
}
dat
}
tar.dat.aug <- augment(tar.dat, tar.lines.f2)
cur.dat.aug <- augment(cur.dat, cur.lines.f2)
# Generate the final vectors to do the diffs on; these should be unique
# and matching for the matches, and unique and mismatching for the
# mismatches
hunk_match <- function(i, l) rep(h.cont[i], length(l[[i]]))
tar.match <- unlist(lapply(seq_along(h.cont), hunk_match, l=tar.lines.f2))
cur.match <- unlist(lapply(seq_along(h.cont), hunk_match, l=cur.lines.f2))
pos.nums <- sum(tar.match)
if(pos.nums != length(unlist(cur.lines.f2[h.cont]))) {
# nocov start
stop("Logic Error: pos nums incorrect; contact maintainer")
# nocov end
}
neg.nums <- sum(!tar.match, !cur.match)
strings <- make_unique_strings(
pos.nums + neg.nums, c(tar.dat.aug$raw, cur.dat.aug$raw)
)
strings.pos <- strings[seq.int(pos.nums)]
strings.neg <- tail(strings, neg.nums)
if(neg.nums + pos.nums != length(strings)) {
# nocov start
stop("Logic Error: num-string maping failed; contact maintainer")
# nocov end
}
tar.dat.aug$comp[tar.match] <- strings.pos
cur.dat.aug$comp[cur.match] <- strings.pos
tar.dat.aug$comp[!tar.match] <- head(strings.neg, sum(!tar.match))
cur.dat.aug$comp[!cur.match] <- tail(strings.neg, sum(!cur.match))
list(tar.dat=tar.dat.aug, cur.dat=cur.dat.aug)
}
# Pull out mismatching words from the word regexec; helper functions
reg_pull <- function(ind, reg) {
reg.out <- reg[ind]
attr(reg.out, "match.length") <- attr(reg, "match.length")[ind]
attr(reg.out, "useBytes") <- attr(reg, "useBytes")
attr(reg.out, "word.count") <- length(reg)
reg.out
}
# Generate the indices in each row and apply the pulling functions
# - reg list produced by `gregexpr` and such
# - ends length of each line in words
# - mismatch index of mismatching words
#
reg_apply <- function(reg, ends, mismatch) {
if(!length(reg)) {
reg
} else {
use.bytes <- attr(reg[[1L]], "useBytes") # assume useBytes value unchanging
regs.fin <- reg
buckets <- head(c(0L, ends) + 1L, -1L)
mism.lines <- findInterval(mismatch, buckets)
mism.lines.u <- unique(mism.lines)
mtch.lines.u <- which(!seq_along(ends) %in% mism.lines.u )
# These don't have any mismatches
attr(.word.diff.atom, "useBytes") <- use.bytes
regs.fin[mtch.lines.u] <-
replicate(length(mtch.lines.u), .word.diff.atom, simplify=FALSE)
# These do have mismatches, we need to split them up in list elements and
# substract the starting index to identify position within each sub-list
if(length(mism.lines.u)) {
inds.msm <- Map(
"-", unname(split(mismatch, mism.lines)), buckets[mism.lines.u] - 1L
)
regs.fin[mism.lines.u] <- Map(reg_pull, inds.msm, reg[mism.lines.u])
}
regs.fin
}
}
# Modify `tar.dat` and `cur.dat` by generating `regmatches` indices for the
# words that are different
#
# If `diff.mode` is "wrap", then wrapped atomic vector output is unwrapped and
# the diff is carried out in the unwrapped form, and then re-assembled. See
# `word_to_line_map` for details in how its done. Return values may be longer
# than input in this mode.
#
# `match.quotes` will make "words" starting and ending with quotes; it should
# only be used with atomic character vectors or possibly deparsed objects.
diff_word2 <- function(
tar.dat, cur.dat, tar.ind, cur.ind, etc, match.quotes=FALSE, diff.mode,
warn=TRUE
) {
stopifnot(
is.TF(match.quotes), is.TF(warn)
# isTRUE(valid_dat(tar.dat)), isTRUE(valid_dat(cur.dat)) # too expensive
)
# Compute the char by char diffs for each line
reg <- paste0(
# grab leading spaces for each word; these will be stripped before actual
# word diff, but we want them to be part of mismatch so they are removed
# when we construct the equal strings as that allows better matching b/w
# strings with differences removed; could do trailing spaces instead
"\\s*(?:",
# Some attempt at matching R identifiers; note we explicitly chose not to
# match `.` or `..`, etc, since those could easily be punctuation
sprintf("%s|", .reg.r.ident),
# Not whitespaces that doesn't include quotes
"[^ \"]+|",
# Quoted phrases as structured in atomic character vectors
if(match.quotes) "(?:(?<= )|(?<=^))\"(?:[^\"]|\\\")*?\"(?:(?= )|(?=$))|",
# Other quoted phrases we might see in expressions or deparsed chr vecs,
# this is a bit lazy currently b/c we're not forcing precise matching b/w
# starting and ending delimiters
"(?:(?<=[ ([,{])|(?<=^))\"(?:[^\"]|\\\"|\"(?=[^ ]))*?",
"\"(?:(?=[ ,)\\]}])|(?=$))|",
# Other otherwise 'illegal' quotes that couldn't be matched to one of the
# known valid quote structures
"\")"
)
tar.chr <- tar.dat$trim[tar.ind]
cur.chr <- cur.dat$trim[cur.ind]
tar.reg <- gregexpr(reg, tar.chr, perl=TRUE)
cur.reg <- gregexpr(reg, cur.chr, perl=TRUE)
tar.split <- regmatches(tar.chr, tar.reg)
cur.split <- regmatches(cur.chr, cur.reg)
# Collapse into one line if to do the diff across lines, but record
# item counts so we can reconstitute the lines at the end
tar.lens <- vapply(tar.split, length, integer(1L))
cur.lens <- vapply(cur.split, length, integer(1L))
tar.unsplit <- unlist(tar.split)
cur.unsplit <- unlist(cur.split)
if(is.null(tar.unsplit)) tar.unsplit <- character(0L)
if(is.null(cur.unsplit)) cur.unsplit <- character(0L)
# Remove the leading spaces we grabbed for each word
tar.unsplit <- trimws(tar.unsplit, "left")
cur.unsplit <- trimws(cur.unsplit, "left")
# Run the word diff as a line diff configured in a manner compatible for the
# word diff
etc@line.limit <- etc@hunk.limit <- etc@context <- -1L
etc@mode <- "context"
diffs <- char_diff(
tar.unsplit, cur.unsplit, etc=etc, diff.mode=diff.mode, warn=warn
)
# Need to figure out which elements match, and which ones do not
hunks.flat <- diffs$hunks
tar.mism <- unlist(
lapply(hunks.flat, function(x) if(!x$context) x$A else integer(0L))
)
cur.mism <- abs(
unlist(lapply(hunks.flat, function(x) if(!x$context) x$B else integer(0L)))
)
# Figure out which line each of these elements came from, and what index
# in each of those lines they are; we use the recorded lengths in words of
# each line to reconstruct this; also record original line length so we
# can compute token ratios
tar.ends <- cumsum(tar.lens)
cur.ends <- cumsum(cur.lens)
tar.dat$word.ind[tar.ind] <- reg_apply(tar.reg, tar.ends, tar.mism)
cur.dat$word.ind[cur.ind] <- reg_apply(cur.reg, cur.ends, cur.mism)
# If in wrap mode (which is really atomic mode), generate a spoofed
# `comp` vector (see word_to_line_map)
#
# Note that we're only operating on a subset of the data via tar.ind and
# cur.ind, these are supposed to be the contiguous block of lines that have
# row headers.
tar.dat.fin <- tar.dat
cur.dat.fin <- cur.dat
if(diff.mode == "wrap") {
tar.dat.ind <- lapply(tar.dat, '[', tar.ind)
cur.dat.ind <- lapply(cur.dat, '[', cur.ind)
word.line.mapped <- word_to_line_map(
hunks.flat, tar.dat.ind, cur.dat.ind, tar.ends, cur.ends
)
# Merge back the mapped data, need to account for possiblity of padding
# lines being added.
tar.len.old <- length(tar.dat[[1L]])
cur.len.old <- length(cur.dat[[1L]])
tar.ind.lo <- seq_len(head(tar.ind, 1L) - 1L)
tar.ind.hi <- seq_len(tar.len.old - tail(tar.ind, 1L)) + tail(tar.ind, 1L)
cur.ind.lo <- seq_len(head(cur.ind, 1L) - 1L)
cur.ind.hi <- seq_len(cur.len.old - tail(cur.ind, 1L)) + tail(cur.ind, 1L)
interleave <- function(idx, new, old, lo, hi)
c(old[[idx]][lo], new[[idx]], old[[idx]][hi])
tar.dat.fin <- setNames(
lapply(
seq_along(tar.dat), interleave,
new=word.line.mapped[['tar.dat']], old=tar.dat,
lo=tar.ind.lo, hi=tar.ind.hi
),
names(tar.dat)
)
cur.dat.fin <- setNames(
lapply(
seq_along(cur.dat), interleave,
new=word.line.mapped[['cur.dat']], old=cur.dat,
lo=cur.ind.lo, hi=cur.ind.hi
),
names(cur.dat)
)
}
list(
tar.dat=tar.dat.fin, cur.dat=cur.dat.fin, hit.diffs.max=diffs$hit.diffs.max
)
}
# Make unique strings
#
# Makes gibberish strings that are 16 characters long, are unique, and don't
# overlap with `invalid`. This allows us to generate strings we can use to
# cause a specific diff outcome.
#
# n: how long the character vector should be
# invalid: what values cannot be contained in the returned values
make_unique_strings <- function(n, invalid) {
pool <- c(
letters, LETTERS, 0:9, "_", ".", "*", "+", "-", "=", "(", ")", "{",
"}", "~", "`", "!", "@", "#", "$", "%", "^", "&", ";", ":", "<", ">", "?",
",", "/"
)
cols <- 16 # use 16 character samples, should be more than big enough
dat <- matrix("", ncol=16, nrow=n)
rows <- 1:n
safety <- 0
repeat {
dat[rows, ] <-
matrix(sample(pool, cols * length(rows), replace=TRUE), ncol=cols)
dat.chr <- do.call(paste0, split(dat, col(dat)))
rows <- which(duplicated(dat.chr) | dat.chr %in% invalid)
if(!length(rows)) break
# nocov start
if(safety <- safety + 1 > 100)
stop(
"Logic Error: unable to generate unique strings; this should be ",
"incredibly rare as we are sampling from 10^31 elements, so try ",
"again and if it happens again contact maintainer"
)
# nocov end
}
dat.chr
}
# Add word diff highlighting
word_color <- function(txt, inds, fun) {
word.list <- regmatches(txt, inds)
word.lens <- vapply(word.list, length, integer(1L))
# remove leading space before coloring
words.u <- if(length(word.list)) unlist(word.list) else character(0L)
words.u.trim.ind <- regexpr("\\S.*", words.u)
words.u.trim <- regmatches(words.u, words.u.trim.ind)
# color and re-insert back into space
words.c.trim <- fun(words.u.trim)
regmatches(words.u, words.u.trim.ind) <- words.c.trim
# split back into original lines
words.res <- vector("list", length(word.list))
words.res[!!word.lens] <- split(
words.u, rep(seq_along(word.lens), times=word.lens)
)
words.res[!word.lens] <- list(character(0L))
regmatches(txt, inds) <- words.res
txt
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.