# demo/backup/Untitled.R

library(data.table)
# Copy RS.data into a data.table.
# NOTE(review): RS.data is not defined in this file; presumably it is a data
# set shipped with the package -- confirm.
rs <- data.table(RS.data)

# ---- Build a test code ------------------------------------------------------
# Define a code over the excerpt text with two expressions, auto-code it, and
# attach a small hand-coded test set. In the test set, `ID` is later used as an
# excerpt index and `X1` as the 0/1 coding of that excerpt.
code <- create.code(
  excerpts = rs$text,
  expressions = c("data", "number")
)
code <- autocode(code, simplify = FALSE)
code$testSet <- cbind(
  ID = c(3476, 1679, 342, 1719, 651, 359, 179, 784, 728, 3364),
  X1 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 1)
)
# ---- End of test code setup -------------------------------------------------

# Placeholder with no behaviour implemented yet; calling it yields NULL.
pull.more <- function() {
  NULL
}
# For each element of `x`, collect the `V1` values of the corpus rows whose
# `docs` entry contains that element.
#
# Args:
#   x:      Vector of values to look up; each element is tested against every
#           entry of `corpus$docs` with `%in%`, so elements are expected to be
#           scalars.
#   corpus: data.table with a list column `docs` and a column `V1`.
#
# Returns:
#   The rbindlist() of one record per element of `x`, with columns `excerpt`
#   (the element) and `words` (list column holding the matching `V1` values).
filterOut <- function(x, corpus) {
  filtered <- lapply(x, function(y) {
    # vapply guarantees a logical mask even for a zero-row corpus, where
    # sapply would return list() and break the subset below.
    hits <- vapply(corpus[["docs"]], function(z) y %in% z, logical(1))
    list(excerpt = y, words = list(corpus[hits, V1]))
  })
  rbindlist(filtered)
}
   
# Tokenise every excerpt into cleaned, lower-case words.
# `dw` ends up with one row per word and two columns:
#   nrow - row index of the excerpt in `exDT` the word came from
#   V1   - the cleaned word
col <- c("text")
exDT <- data.table(text = code$excerpts)
dw <- exDT[, {
  # Every group is a single excerpt, so only the first split result is needed.
  wds <- strsplit(as.character(.SD[[col]]), " ")[[1]]
  # Strip punctuation and stray spaces, then lower-case.
  wds <- tolower(gsub("[[:punct:]]| ", "", wds))
  # Drop tokens that are empty after cleaning.
  wds[grep(x = wds, pattern = "^$", invert = TRUE)]
},
# Name the grouping column explicitly (later code relies on `nrow`) and use
# seq_len() so an empty excerpt table does not produce the bogus 1:0 sequence.
by = list(nrow = seq_len(nrow(exDT))), .SDcols = col]

# Per-word summary: how often the word occurs (`freq`), which excerpt rows it
# occurs in (`docs`, a list column), and a `seen` flag initialised to FALSE.
dd3 <- dw[
  ,
  list(freq = .N, docs = list(.SD[["nrow"]]), seen = FALSE),
  by = V1,
  .SDcols = "nrow"
]
# Most frequent words first (reorders dd3 by reference).
setorderv(dd3, cols = "freq", order = -1L)

# Split the hand-coded test set by its second column: 1 = coded yes, 0 = coded
# no. The first column holds the excerpt indices.
yesses <- code$testSet[which(code$testSet[, 2] == 1), 1]
yesWords <- unique(dw[nrow %in% yesses, V1])
nos <- code$testSet[which(code$testSet[, 2] == 0), 1]
noWords <- unique(dw[nrow %in% nos, V1])

# Excerpts that are not part of the test set. setdiff() is used instead of
# negative indexing because x[-integer(0)] selects nothing, which would make
# every excerpt look "seen" when the test set is empty.
unseens <- setdiff(seq_along(code$excerpts), code$testSet[, 1])
unseenWords <- unique(dw[nrow %in% unseens, V1])

# Suggest the excerpts that contain the most of the top candidate words.
# NOTE(review): `newWordsFiltered` is not defined anywhere in this file; it
# must already exist in the session when this runs -- confirm its origin.

# Up to 20 of the most frequent matching words (dd3 is ordered by frequency).
# head() avoids the all-NA padding rows that .SD[1:20] adds when fewer than 20
# words match.
topUnseen <- head(dd3[V1 %in% newWordsFiltered], 20)

# Count how many of those words each excerpt contains and keep at most the two
# best excerpts; bounding the index avoids NA entries when fewer than two
# excerpts are available.
freqUnseen <- sort(table(unlist(topUnseen$docs)), decreasing = TRUE)
freqUnseen <- freqUnseen[seq_len(min(2L, length(freqUnseen)))]
freqInds <- as.numeric(names(freqUnseen))

cat("Excerpts to include: ", freqInds, "\n")
print(code$excerpts[freqInds])
# epistemic-analytics/ncodeR documentation built on June 15, 2019, 12:03 a.m.