R/diffLines.R

Defines functions diffLines

Documented in diffLines

#' diffLines
#'
#' Diffs two sets of lines. It returns a data frame with three variables
#' \describe{
#' \item{\code{left}}{lines from \code{lines1}}
#' \item{\code{right}}{lines from \code{lines2}}
#' \item{\code{diff}}{symbols: \code{=} lines are identical, \code{|} lines are similar, \code{>} line exists only in \code{lines2}, and \code{<} line exists only in \code{lines1}} 
#' }
#'
#' @param lines1 character: vector for first set of lines
#' @param lines2 character: vector for second set of lines
#'
#' @return a data frame with several variables 
#' \describe{
#' \item{\code{left}}{line number of \code{lines1}, \code{NA} means no line in \code{lines1}}
#' \item{\code{lines1}}{line from \code{lines1}} 
#' \item{\code{diff}}{diff symbol}
#' \item{\code{lines2}}{line from \code{lines2}} 
#' \item{\code{right}}{line number of \code{lines2}, \code{NA} means no line in \code{lines2}}
#' \item{\code{dist}}{Levensthein distance between the line from \code{lines1} and \code{lines2}}
#' }
#' 
#' @importFrom utils adist
#' @export 
#'
#' @examples
#' old <- c('png("helloworld.png")', 'plot(c(0,1), c(0,1), type="n")',
#'          'text(0.25, 0.25, "Hello World!")', 'dev.off()')
#' d <- diffLines(readLines(system.file("helloworld.R", package="extpro")), old)
#' cat("\n", sprintf("%-60s %1s %-60s\n", d$lines1, d$diff, d$lines2))
#' cat("\n", sprintf("%3.0f %1s %3.0f %3.0f\n", d$left, d$diff, d$right, d$dist))
diffLines <- function(lines1, lines2) {
  do.match <- function(match1, lind) {
    if (length(match1)) {
      nind <- length(lind)
      if (is.null(dim(match1))) match1 <- matrix(match1, nrow=1)
      for (i in 1:nrow(match1)) {
        ci <- match1[i,'col']
        if (lind[ci]==0) {
          li <- lind[1:ci]
          cl <- which(li>0)
          rl <- if (length(cl)) max(li[cl]) else 0
          li <- lind[ci:nind]
          cu <- which(li>0)
          ru <- if (length(cu)) min(li[cu]) else 0   
          ri <-  match1[i,'row']
          ok <- TRUE
          if (rl) {
            ok <- if (ru) (ri>rl) & (ri<ru) else (ri>rl) 
          } else {
            if (ru) ok <- (ri<ru) 
          }
          if (ok) lind[ci] <- ri 
        }
      }
    }
    lind
  }
  # end of do.match
  #
  n1    <- nchar(lines1)
  n2    <- nchar(lines2)
  dist  <- adist(lines1, lines2)
  row   <- as.vector(matrix(1:length(lines1), ncol=length(lines2), nrow=length(lines1)))
  col   <- as.vector(matrix(1:length(lines2), ncol=length(lines2), nrow=length(lines1), byrow=TRUE))
  match <- cbind(dist   = as.vector(dist), row=row, col=col,
                 minlen = apply(cbind(n1[row], n2[col]), 1, min),
                 maxlen = apply(cbind(n1[row], n2[col]), 1, max)
  )
  lind  <- rep(0, length(lines2))
  # match non-empty lines
  match1 <- match[match[,'dist']<match[,'minlen'],]
  match1 <- match1[order(match1[,'dist'], -match1[,'minlen']),]
  lind   <- do.match(match1, lind)
  # match empty lines
  match1 <- match[match[,'minlen']==0,]
  match1 <- match1[order(match1[,'maxlen']),]
  lind   <- do.match(match1, lind)  
  #
  df <- list(left=integer(0), lines1=character(0), diff=character(0), lines2=character(0), right=integer(0), dist=integer(0))
  last <- 0
  #  browser()
  for (i in 1:length(lind)) {
    if (lind[i]) {
      if (last<lind[i]-1) {
        df$left    <- c(df$left, (last+1):(lind[i]-1))
        df$lines1  <- c(df$lines1,  lines1[(last+1):(lind[i]-1)])
        df$diff    <- c(df$diff, rep("<", lind[i]-last-1))
        df$lines2  <- c(df$lines2, rep("", lind[i]-last-1))
        df$right   <- c(df$right, rep(NA_integer_, lind[i]-last-1))
        df$dist    <- c(df$dist, rep(NA_integer_, lind[i]-last-1))
      }
      df$left   <- c(df$left, lind[i])
      df$lines1 <- c(df$lines1, lines1[lind[i]])
      df$diff   <- c(df$diff, if(dist[lind[i],i]) "|" else "=")
      df$lines2 <- c(df$lines2, lines2[i])
      df$right  <- c(df$right, i)
      df$dist   <- c(df$dist, dist[lind[i],i])
      last <- lind[i]
    } else {
      df$left   <- c(df$left, NA_integer_)
      df$lines1 <- c(df$lines1, "")
      df$diff   <- c(df$diff, ">")
      df$lines2 <- c(df$lines2, lines2[i])
      df$right  <- c(df$right, i)
      df$dist   <- c(df$dist, NA_integer_)
    }
  }
  if (last<length(lines1)) {
    df$left   <- c(df$left, (last+1):length(lines1))
    df$lines1 <- c(df$lines1, lines1[(last+1):length(lines1)])
    df$diff   <- c(df$diff,  rep("<", length(lines1)-last))
    df$lines2 <- c(df$lines2, rep("", length(lines1)-last))
    df$right  <- c(df$right, rep(NA_integer_, length(lines1)-last))
    df$dist   <- c(df$dist,  rep(NA_integer_, length(lines1)-last))
  } 
  data.frame(df, stringsAsFactors = FALSE)
}
sigbertklinke/extpro documentation built on Dec. 31, 2020, 7:26 a.m.