R/sim_func.R

Defines functions sim_func

Documented in sim_func

#' Compute levenstein edit similarity between strings, accounting for ambiguous
#' characters and potentially aggregating similarity over multiple
#' transformations (e.g. pinyin for phonetic similarity and four corner code for
#' visual similarity)
#'
#' @param s_1 A vector or matrix of characters representing one set of original
#'   strings
#' @param s_2 A vector or matrix of characters representing a second set of
#'   original strings (must be the same length as s_1)
#' @param aggr A character function name or function object indicating how
#'   similarities calculated over multiple transformations should be aggregated
#' @param method A character input identifying the string similarity method to
#'   be used by stringsim from the 'stringdist' package
#' @param ... Other options to be passed to stringsim
#' @return A vector of similarities corresponding to the pairwise
#'   comparisons of elements of s_1 and s_2

sim_func <- function(s_1, s_2,aggr='mean',method='lv',q = 1,...){ #inputs may be string vectors

  if(any(c('data.frame','matrix') %in% class(s_1))){
    temp = lapply(1:ncol(s_1), function(c){
      sim_func(s_1[,c],s_2[,c])
    })

    temp = do.call(cbind,temp)
    return(apply(temp,1,match.fun(aggr)))
  }

  blankinds = which((nchar(s_1) + nchar(s_2)) == 0)

  one_blankinds = which((nchar(s_1) + nchar(s_2)) != 0 & (nchar(s_1) * nchar(s_2)) == 0)

  doinds = which((nchar(s_1) * nchar(s_2)) > 0)

  hominds = doinds[grep('\\[\\w*,\\w*\\]',paste(s_1[doinds],s_2[doinds]))]

  if(method %in% c('cosine','jaccard') & q > 1){s_1 = paste0('_',s_1,'_'); s_2 = paste0('_',s_2,'_')}

  res = rep(0,length(s_1))

    if(length(hominds)==0){
      res[doinds] = stringsim(s_1[doinds],s_2[doinds],method=method,q = q, ...)
      if(method == 'lcs'){
        totalchar = nchar(s_1[doinds]) + nchar(s_2[doinds])
        minchar = pmin(nchar(s_1[doinds]),nchar(s_2[doinds]))

        res[doinds] = res[doinds] * totalchar/2 / minchar
      }     #redefine denominator to be maximum shared string length(i.e. shorter of two strings)
      res[blankinds] = NA
      res[one_blankinds] = 0

      return(res)
    }

    doinds = setdiff(doinds,hominds)
    res[doinds] = stringsim(s_1[doinds],s_2[doinds],method=method,q = q, ...)
    if(method == 'lcs'){
      meanchar = (nchar(s_1[doinds]) + nchar(s_2[doinds])) / 2
      minchar = pmin(nchar(s_1[doinds]),nchar(s_2[doinds]))

      res[doinds] = res[doinds] * meanchar / minchar
    }

    torun = do.call(rbind,lapply(hominds,function(i){

      v1 = s_1[i]
      v2 = s_2[i]

      s1 = if(grepl('\\[\\w*,\\w*\\]',v1)){
        homonym(v1)
      }else v1
      s2 = if(grepl('\\[\\w*,\\w*\\]',v2)){
        homonym(v2)
      }else v2
      matrix(c(rep(s1,each=length(s2)),rep(s2,length(s1)),rep(i,length(s1)*length(s2))),ncol=3)
    }))

    simtemp = stringsim(torun[,1],torun[,2],method=method,q=q,...)

    if(method == 'lcs'){
      totalchar = rowMeans(nchar(torun[,1:2]))
      minchar = pmin(nchar(torun[,1]),nchar(torun[,2]))
      simtemp = simtemp * totalchar / minchar / 2
    }

    res[hominds] = tapply(simtemp,as.integer(torun[,3]),max)
    res[blankinds] = NA
    res[one_blankinds] = 0

    return(res)

  }
OPTI-SURVEIL/chinsimi documentation built on Oct. 27, 2019, 7:05 p.m.