R/verbatim_dedouble.R

#' Deduplicate a set of verbatims by clustering near-duplicate texts
#'
#' Collapses exact duplicates, then uses minhash signatures and locality
#' sensitive hashing (via \pkg{textreuse}) to find near-duplicate pairs, and
#' clusters them as the connected components of the resulting duplicate graph.
#'
#' @param txt Character vector of texts to deduplicate.
#' @param exact If \code{TRUE}, merge only (near-)exact duplicates
#'   (dissimilarity below \code{1e-6}) instead of using \code{threshold}.
#' @param mc.cores Number of cores for \pkg{textreuse}, \pkg{stringdist} and
#'   \code{mclapply} (forced to 1 core for \code{mclapply} on Windows).
#' @param n_minhashes Number of minhash functions.
#' @param bands Number of LSH bands; must divide \code{n_minhashes}.
#' @param threshold Maximum Jaccard dissimilarity for two texts to be
#'   considered duplicates; the default \code{1 - cos(pi/3)} equals 0.5.
#' @param progress If \code{TRUE}, display progress bars.
#' @param use_duplicated If \code{TRUE}, collapse exact string duplicates
#'   before the more expensive LSH step.
#' @return A list with \code{id}, the duplicate-cluster id of each element of
#'   \code{txt}, and \code{weight}, the size of each cluster ordered by
#'   cluster id.
#' @export verbatim_dedouble
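#' @examples
#' # Illustrative sketch only (not from the original file): texts sharing a
#' # cluster id are treated as duplicates; the ids themselves are arbitrary.
#' \dontrun{
#' txt <- c("the cat sat on the mat",
#'          "the cat sat on the mat",
#'          "the cat sat on the mat today",
#'          "something else entirely")
#' res <- verbatim_dedouble(txt, mc.cores = 1L)
#' res$id      # one cluster id per element of txt
#' res$weight  # number of texts in each cluster
#' }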
verbatim_dedouble <- function(txt, exact = FALSE, mc.cores = 4L,
                              n_minhashes = 50L, bands = 5L,
                              threshold = 1 - cos(pi/3), progress = FALSE,
                              use_duplicated = TRUE) {
  library(textreuse)
  library(dplyr)
  library(Matrix)
  library(igraph)
  library(data.table)
  names(txt)<-NULL
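  # Optionally pre-collapse exact string duplicates so the LSH step only sees
  # unique texts: 'id2' maps every original position to the position of the
  # first occurrence of the same string, and 'id' lists those first
  # occurrences (the representatives).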
  if (use_duplicated) {
    txt <- data.table(txt = txt, id = seq_along(txt), key = "txt")
    txt2 <- unique(txt)                  # one row per distinct text
    setnames(txt2, "id", "id2")          # id2 = representative's position
    txt2 <- txt[txt2]                    # join representatives back to all rows
    txt2 <- txt2[order(txt2$id), ]
    id2 <- txt2$id2
    id <- which(!duplicated(txt2$id2))
    txt <- txt2$txt[id]                  # the distinct texts, first-seen order
  }

  # textreuse and parallel read the "mc.cores" option: set it for this call
  # and restore the previous value on exit.
  np <- getOption("mc.cores")
  on.exit(options(mc.cores = np))
  options(mc.cores = mc.cores)


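  # Tokenize each text into 3-word shingles and compute one minhash signature
  # per text; LSH then proposes candidate near-duplicate pairs without
  # scanning all n^2 pairs. Texts too short to shingle are skipped by
  # TextReuseCorpus and handled separately below.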
  minhash <- minhash_generator(n = n_minhashes, seed = 3552)
  e20 <- suppressWarnings(TextReuseCorpus(
    text = txt, tokenizer = tokenizers::tokenize_ngrams, n = 3,
    lowercase = FALSE, simplify = TRUE,
    minhash_func = minhash, keep_tokens = TRUE,
    progress = progress
  ))
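  # All ordered pairs of skipped (too-short) documents, via a Cartesian
  # self-join; these pairs are scored later with a cosine string distance
  # since they have no shingles to compare.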
  a0 <- merge(data.table(a = skipped(e20), un = 1),
              data.table(b = skipped(e20), un = 1),
              by = "un", allow.cartesian = TRUE)
  a0 <- subset(a0, a != b)
  a0[, un := NULL]
  a0 <- as.data.frame(a0)
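  # Compare LSH candidate pairs on their token sets, keep pairs whose Jaccard
  # score reaches at least half the theoretical LSH detection threshold, then
  # filter on the Jaccard dissimilarity of the same token sets.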
  e2 <- tryCatch({
    buckets <- lsh(e20, bands = bands, progress = progress)
    candidates <- lsh_candidates(buckets)
    e2 <- lsh_compare(candidates, e20, jaccard_similarity, progress = progress)
    e2 <- as.data.frame(e2)
    e2 <- subset(e2, score >= lsh_threshold(n_minhashes, bands) / 2)

    # Document ids look like "doc-123": strip the "doc-" prefix.
    a <- as.numeric(substr(e2$a, 5, 1000))
    b <- as.numeric(substr(e2$b, 5, 1000))
    a2 <- paste0("doc-", a)
    b2 <- paste0("doc-", b)
    # mclapply relies on forking, which is unavailable on Windows.
    d2 <- unlist(parallel::mclapply(
      seq_along(a),
      function(t) jaccard_dissimilarity(e20[[a2[t]]]$tokens, e20[[b2[t]]]$tokens),
      mc.cores = if (.Platform$OS.type == "windows") 1 else mc.cores
    ))
    e2$d <- d2
    e2 <- subset(e2, d <= if (exact) 1e-6 else threshold)
    e2$a <- as.numeric(substr(e2$a, 5, 1000))
    e2$b <- as.numeric(substr(e2$b, 5, 1000))
    e2
  }, error = function(err) {
    print(err)
    NULL
  })
  # Skipped documents: keep pairs whose cosine string distance is small
  # enough (a fixed 0.05 unless exact matching is requested).
  a0$a <- as.numeric(substr(as.character(a0$a), 5, 1000))
  a0$b <- as.numeric(substr(as.character(a0$b), 5, 1000))
  d <- stringdist::stringdist(txt[a0$a], txt[a0$b], method = "cosine", nthread = mc.cores)
  a0$score <- 0
  a0$d <- d
  a0 <- a0[a0$d <= (if (exact) 1e-6 else 0.05), ]
  e2 <- rbind(e2, a0)

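  # Build an undirected graph whose vertices are the texts and whose edges
  # are the retained duplicate pairs; each connected component is one cluster
  # of duplicates, and its membership vector is the deduplication id.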
  a <- e2$a
  b <- e2$b
  M <- sparseMatrix(i = c(a, b), j = c(b, a), x = 1, dims = rep(length(txt), 2))
  g <- graph_from_adjacency_matrix(M, "undirected")
  cg <- components(g)
  id0 <- cg$membership

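  # When exact duplicates were collapsed up front, map every original
  # position back to its representative's cluster id before counting
  # cluster sizes on the full vector.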
  if (use_duplicated) {
    id0 <- id0[fastmatch::fmatch(id2, id)]
  }
  w <- (data.frame(id = id0) %>% group_by(id) %>% summarise(n = n()) %>% arrange(id))$n
  return(list(id = id0, weight = w))
}
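
# A hedged usage sketch (not part of the original source): keep one
# representative text per duplicate cluster and carry the cluster size as a
# weight. 'verbatims' is a hypothetical character vector of survey answers.
#
# res <- verbatim_dedouble(verbatims, mc.cores = 1L)
# first <- tapply(seq_along(verbatims), res$id, `[`, 1)  # first index per cluster id
# deduped <- data.frame(text = verbatims[first], weight = res$weight)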