R/fuzz_functions.R

Defines functions fuzz_token_set_ratio fuzz_token_sort_ratio fuzz_partial_ratio fuzz_m_ratio fuzzdist

Documented in fuzzdist fuzz_m_ratio fuzz_partial_ratio fuzz_token_set_ratio fuzz_token_sort_ratio

#' @rdname fuzz_
fuzzdist <- function(a, b, method = c(
  "fuzz_m_ratio", "fuzz_partial_ratio", "fuzz_token_sort_ratio", "fuzz_token_set_ratio")
){
  method <- match.arg(method)
  do.call(
    method,
    list(a, b)
  )
}

#' @rdname fuzz_
fuzz_m_ratio <- function(a, b){
  out <- lapply(b, function(b, a){
    z <- c(a, b)
    if(any(is.na(z))){
      return(NA)
    }else{
      z_list <- lapply(strsplit(z, ""),
        function(x, minval){x[1:minval]},
        minval = min(nchar(z))
        )
      z_match <- apply(
        do.call(cbind, z_list),
        1,
        function(x){x[1] == x[2]}
      )
      return(
        1 - (2 * length(which(z_match)) / sum(nchar(z)))
      )
    }
  },
  a = a)
  return(as.numeric(out))
}


#' @rdname fuzz_
fuzz_partial_ratio <- function(a, b){
  out <- lapply(b, function(b, a){
    z <- c(a, b)
    if(any(is.na(z))){
      return(NA)
    }else{
      zn <- nchar(z)
      n_reps <- (max(zn) - min(zn))
      z_list <- lapply(
        c(0: n_reps),
        function(x, lookup, keep){lookup[(keep + x)]},
        lookup = strsplit(z[which.max(zn)], "")[[1]],
        keep = seq_len(min(zn))
      )
      z_ratio <- lapply(z_list, function(x, comparison){
        match_value <- apply(
          cbind(x, comparison),
          1,
          function(y){y[1] == y[2]}
        )
        length(which(match_value))/length(x)
      },
      comparison = strsplit(z[which.min(zn)], "")[[1]]
      )
      return(1 - max(as.numeric(z_ratio)))
    }
  },
  a = a)
  return(as.numeric(out))
}


#' @rdname fuzz_
fuzz_token_sort_ratio <- function(a, b){
  out <- lapply(b, function(b, a){
    z <- c(a, b)
    if(any(is.na(z))){
      return(NA)
    }else{
      z_split <- strsplit(z, " ")
      z_split <- lapply(z_split, make.unique, sep="_XDUP_")
      in_check <- z_split[[1]] %in% z_split[[2]]
      intersection <- sort(z_split[[1]][which(in_check)])
      string_list <- list(
        t0 = intersection,
        t1 = c(intersection,
               sort(z_split[[1]][which(!in_check)])
        ),
        t2 = c(intersection,
               unlist(lapply(z_split[[2]][which(!(z_split[[2]] %in% intersection))], function(x){strsplit(x, "_XDUP_")[[1]][1]}))
        )
      )
      string_list <- lapply(string_list, function(x){
        if(length(x) < 1){
          return("")
        }else{
          return(paste(x, collapse = " "))
        }
      })
      result <- c(
        fuzz_m_ratio(string_list$t0, string_list$t1),
        fuzz_m_ratio(string_list$t0, string_list$t2),
        fuzz_m_ratio(string_list$t1, string_list$t2)
      )
      return(max(result))
    }
  },
  a = a)
  return(as.numeric(out))
  return(as.numeric(out))
}


#' @rdname fuzz_
fuzz_token_set_ratio <- function(a, b){
  out <- lapply(b, function(b, a){
    z <- c(a, b)
    if(any(is.na(z))){
      return(NA)
    }else{
      z_split <- strsplit(z, " ")
      in_check <- z_split[[1]] %in% z_split[[2]]
      intersection <- sort(z_split[[1]][which(in_check)])
      string_list <- list(
        t0 = intersection,
        t1 = c(intersection,
          sort(z_split[[1]][which(!in_check)])
        ),
        t2 = c(intersection,
          sort(z_split[[2]][which(!(z_split[[2]] %in% intersection))])
        )
      )
      string_list <- lapply(string_list, function(x){
        if(length(x) < 1){
          return("")
        }else{
          return(paste(x, collapse = " "))
        }
      })
      result <- c(
        fuzz_m_ratio(string_list$t0, string_list$t1),
        fuzz_m_ratio(string_list$t0, string_list$t2),
        fuzz_m_ratio(string_list$t1, string_list$t2)
        )
      return(max(result))
    }
  },
  a = a)
  return(as.numeric(out))
}

Try the synthesisr package in your browser

Any scripts or data that you put into this service are public.

synthesisr documentation built on July 2, 2020, 2:16 a.m.