R/private_functions.R

private_strReverse <- function(x)
  sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "")

########### Funciones ############

private_cruzarPDV<-function(mergeTMP,idt1,idt2,campoAprox,threshold,method,p=0,q=1){

  flush.console()
  print(paste0('Matching using ',method,'...'))
  distTMP<-stringdist::stringdist(mergeTMP[,paste(campoAprox,".x",sep='')],mergeTMP[,paste(campoAprox,".y",sep='')],
                      method = method,p=p,q=q)
  mergeTMP<-data.frame(mergeTMP[order(distTMP),],distancia=distTMP[order(distTMP)])

  mergeTMP<-mergeTMP[mergeTMP$distancia<threshold,]
  pdvCruzadosTMPRes<-mergeTMP[!duplicated(mergeTMP[,idt1]),]


  return(pdvCruzadosTMPRes)
}


# A<-sourcePivoteUC
# B<-sourcePivoteUC
# idA<-'idUC'
# idB<-'idUC'
# listadoCruce<-listadoCruce
# varAprox<-varAprox
# precisionAlta<- precisionAlta
# invertir = T


private_multiCruce<-function(A,B,idA,idB,listadoCruce,varAprox,precisionAlta=F,invertir=F){

  if(precisionAlta){
    Qthres<-2
    LCSThres<-2
    JWThres<-0.04
    JACCThres<-0.05
    cosineThres<-0.05
    q<-3
  }else{
    Qthres<-3
    LCSThres<-5
    JWThres<-0.05
    JACCThres<-0.167
    cosineThres<-0.2
    q<-1
  }



  flush.console()
  print('Generando tabla...')
  # mergeTMP<-merge(A,B,by=listadoCruce)
  # A<-censo_pivote
  # B<-dm_clientes
  mergeTMP<-dplyr::inner_join(A,B,by=listadoCruce)
  mergeTMP<-mergeTMP[complete.cases(mergeTMP[,listadoCruce]),]
  # mergeTMP<-mergeTMP[mergeTMP$idUC.x!=mergeTMP$idUC.y,]
  pdvCruzados<-NULL
  print(paste('Generada tabla de ',nrow(mergeTMP),' filas',sep=''))
  if(nrow(mergeTMP)>0){
    if(idA==idB){
      idA<-paste(idA,".x",sep='')
      idB<-paste(idB,".y",sep='')
    }
    if(identical(A,B)){
      mergeTMP<-mergeTMP[mergeTMP[,idA]!=mergeTMP[,idB],]
    }
    if(invertir){
      print('Invertir campos...')
      flush.console()
      varAInv<-as.vector(sapply(mergeTMP[,paste(varAprox,".x",sep='')], function(x) paste(rev(strsplit(x,' ')[[1]]),collapse=' ')))
      varBInv<-as.vector(sapply(mergeTMP[,paste(varAprox,".y",sep='')], function(x) paste(rev(strsplit(x,' ')[[1]]),collapse=' ')))
      varA<-mergeTMP[,paste(varAprox,".x",sep='')]
      varB<-mergeTMP[,paste(varAprox,".y",sep='')]
    }

    print('cruce normal...')
    pdvCruzadosTMPQ<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,Qthres,'qgram',q=q)
    pdvCruzadosTMPLCS<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,LCSThres,'lcs')
    pdvCruzadosTMPJW<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JWThres,'jw',p=0.2)
    pdvCruzadosTMPJACC<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JACCThres,'jaccard')
    pdvCruzadosTMPCosine<-NULL#cruzarPDV(A,B,listadoCruce,varAprox,JACCThres,'cosine')

    if(invertir){
      print('cruce invertidos...')
      flush.console()
      mergeTMP[,paste(varAprox,".x",sep='')]<-varAInv
      mergeTMP[,paste(varAprox,".y",sep='')]<-varBInv
      pdvCruzadosTMPQInv1<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,Qthres,'qgram',q=q)
      pdvCruzadosTMPLCSInv1<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,LCSThres,'lcs')
      pdvCruzadosTMPJWInv1<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JWThres,'jw',p=0.2)
      pdvCruzadosTMPJACCInv1<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JACCThres,'jaccard')
      pdvCruzadosTMPCosineInv1<-NULL#cruzarPDV(A,B,listadoCruce,varAprox,JACCThres,'cosine')

      print('cruce A invertido...')
      flush.console()
      mergeTMP[,paste(varAprox,".x",sep='')]<-varAInv
      mergeTMP[,paste(varAprox,".y",sep='')]<-varB
      pdvCruzadosTMPQInv2<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,Qthres,'qgram',q=q)
      pdvCruzadosTMPLCSInv2<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,LCSThres,'lcs')
      pdvCruzadosTMPJWInv2<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JWThres,'jw',p=0.2)
      pdvCruzadosTMPJACCInv2<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JACCThres,'jaccard')
      pdvCruzadosTMPCosineInv2<-NULL#cruzarPDV(A,B,listadoCruce,varAprox,JACCThres,'cosine')

      print('cruce B invertido...')
      flush.console()
      mergeTMP[,paste(varAprox,".x",sep='')]<-varA
      mergeTMP[,paste(varAprox,".y",sep='')]<-varBInv
      pdvCruzadosTMPQInv3<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,Qthres,'qgram',q=q)
      pdvCruzadosTMPLCSInv3<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,LCSThres,'lcs')
      pdvCruzadosTMPJWInv3<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JWThres,'jw',p=0.2)
      pdvCruzadosTMPJACCInv3<-private_cruzarPDV(mergeTMP,idA,idB,varAprox,JACCThres,'jaccard')
      pdvCruzadosTMPCosineInv3<-NULL#cruzarPDV(A,B,listadoCruce,varAprox,JACCThres,'cosine')
    }
    if(invertir){
      pdvCruzados<-rbind(pdvCruzadosTMPQ,pdvCruzadosTMPLCS,pdvCruzadosTMPJW,pdvCruzadosTMPJACC,pdvCruzadosTMPCosine,
                         pdvCruzadosTMPQInv1,pdvCruzadosTMPLCSInv1,pdvCruzadosTMPJWInv1,pdvCruzadosTMPJACCInv1,pdvCruzadosTMPCosineInv1,
                         pdvCruzadosTMPQInv2,pdvCruzadosTMPLCSInv2,pdvCruzadosTMPJWInv2,pdvCruzadosTMPJACCInv2,pdvCruzadosTMPCosineInv2,
                         pdvCruzadosTMPQInv3,pdvCruzadosTMPLCSInv3,pdvCruzadosTMPJWInv3,pdvCruzadosTMPJACCInv3,pdvCruzadosTMPCosineInv3
      )
    }else{
      pdvCruzados<-rbind(pdvCruzadosTMPQ,pdvCruzadosTMPLCS,pdvCruzadosTMPJW,pdvCruzadosTMPJACC,pdvCruzadosTMPCosine)
    }

  }




  pdvCruzados<-pdvCruzados[!duplicated(pdvCruzados[,idA]),]
  rm(mergeTMP)
  return(pdvCruzados)

}
albertferre/match-mergeR documentation built on May 10, 2019, 8:51 a.m.