R/diff.R

match_seq <- function(a,b, min){
  
  y <- lapply(b, '==', a)
  
  cond <- lapply(seq_along(y), function(x){
    y[[x]][x:(length(a) - length(b) + x)]
  })
  
  cond <- Reduce('+', cond)
  
  res <- which(cond == length(b))
  res[res > min]
  
}

diff_text <- function(a, b) {
  
  # Voir si besoin de unlist
  cond <- lapply(tokenize_words(b), "==", tokenize_words(a)) %>%
    unlist
  
  if(any(cond)) {
    
    diff_lcs(a, b)
    
  } else {
    
    a <- tokenize_words(a)
    b <- tokenize_words(b)
    
    diff <- data.frame(
      a = seq_along(a),
      b = rep(NA, length(a)),
      mot = a,
      status = rep("-", length(a))
    )
    
    data.frame(
      a = rep(NA, length(b)),
      b = seq_along(b),
      mot = b,
      status = rep("+", length(b))
    ) %>% rbind(diff)
    
  }
  
}

diff_lcs <- function(a, b) {
  
  align <- align_local(a, b, progress = FALSE)
  
  a <- tokenize_words(a)
  b <- tokenize_words(b)
  
  lcs <- sapply(align[1:2], str_split, " ") %>%
    sapply(str_to_lower) %>%
    matrix(ncol = 2, byrow = F) %>%
    as.data.frame(stringsAsFactors = FALSE)
  
  names(lcs) <- c("a_edits", "b_edits")
  
  # Groupage de lcs par sous-sequences
  
  lcs$group <- cumsum((substr(lcs[, 1], 1, 1) == "#") | (substr(lcs[, 2], 1, 1) == "#"))
  tmp <- rle(lcs$group)
  tmp$values <- tmp$length > 1
  tmp$values[tmp$values] <- cumsum(tmp$values[tmp$values])
  lcs$group <- inverse.rle(tmp)
  
  lcs <- lcs[(substr(lcs[, 1], 1, 1) != "#") & (substr(lcs[, 2], 1, 1) != "#"), ]
  min <- 0
  
  lcsA <- foreach(i = unique(lcs$group)) %do% {
    
    text <- lcs$a_edits[lcs$group == i]
    
    m <- match_seq(a, text, min) %>% min
    res <- seq(m, m + length(text) - 1, by = 1)
    
    min <- max(res)
    
    res
    
  } %>% unlist
  
  min <- 0
  
  lcsB <- foreach(i = unique(lcs$group)) %do% {
    
    text <- lcs$a_edits[lcs$group == i]
    
    m <- match_seq(b, text, min) %>% min
    res <- seq(m, m + length(text) - 1, by = 1)
    
    min <- max(res)
    
    res
    
  } %>% unlist
  
  diff <- data.frame(
    a = lcsA,
    b = lcsB,
    mot = b[lcsB],
    status = rep("=", length(lcsA)),
    stringsAsFactors = FALSE
  )
  
  a_seq <- seq_along(a)[!seq_along(a) %in% lcsA]
  b_seq <- seq_along(b)[!seq_along(b) %in% lcsB]
  
  diff <- data.frame(
    a = a_seq,
    b = rep(NA, length(a_seq)),
    mot = a[a_seq],
    status = rep("-", length(a_seq)),
    stringsAsFactors = FALSE
  ) %>% rbind(diff)
  
  diff <- data.frame(
    a = rep(NA, length(b_seq)),
    b = b_seq,
    mot = b[b_seq],
    status = rep("+", length(b_seq)),
    stringsAsFactors = FALSE
  ) %>% rbind(diff)
  
  diff
  
}

set_id <- function(tbl_version) {
  
  tbl_version$id <- NA
  tbl_version$id[tbl_version$step == 1] <- seq(1, (tbl_version$step == 1) %>% cumsum %>% max, by = 1)
  tbl_past <- filter(tbl_version, tbl_version$step == 1)
  start_id <- max(tbl_version$id, na.rm = TRUE)
  
  tbl_version_id <- foreach(i = unique(tbl_version$step)[-1], .combine= rbind) %do% {
    
    tbl_now <- filter(tbl_version, step == i)
    
    tbl_now$id <- tbl_past$id[match(tbl_now$a, tbl_past$b)]
    
    if(anyNA(tbl_now$a)) {
      
      tbl_now[is.na(tbl_now$a), "id"] <- seq(1, cumsum(is.na(tbl_now$a)) %>% max, by = 1) + start_id + 1
      
    }
    
    tbl_past <- tbl_now
    
    max_id <- max(tbl_past$id, na.rm = TRUE)
    start_id <- ifelse(max_id > start_id, max_id, start_id)
    
    tbl_now
    
  }
  
  rbind(
    tbl_version[tbl_version$step == 1, ],
    tbl_version_id
  )
  
}
cafeine05/WikiSocio documentation built on May 13, 2019, 10:39 a.m.