library(data.table)
library(ncodeR)  # provides RS.data, create.code(), autocode()

rs = data.table(RS.data)

# Placeholder for a helper that would pull additional excerpts for coding (not yet implemented).
pull.more = function() {

}

# Create a code from the excerpt text, autocode it with the supplied regular
# expressions, then attach a small hand-coded test set
# (ID = excerpt index, X1 = hand-assigned code value).
code = create.code(excerpts = rs$text, expressions = c("data", "number"))
code = autocode(code, simplify = F)
code$testSet = as.matrix(data.frame(
  ID = c(3476, 1679, 342, 1719, 651, 359, 179, 784, 728, 3364),
  X1 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 1)
))

col = c("text") exDT = data.table(text = code$excerpts) dw = exDT[, { wds = strsplit(as.character(.SD[[col]]), " ")[[1]] wds = tolower(gsub('[[:punct:]]| ', '', wds)) wds = wds[grep(x=wds, pattern="^$", invert=T)] # browser() wds }, by=1:nrow(exDT), .SDcols = col]

# Aggregate to one row per word: overall frequency, the list of excerpts
# (documents) the word appears in, and a `seen` flag. Sort by frequency.
dd3 = dw[, list(freq = .N, docs = list(.SD$nrow), seen = F), by = V1, .SDcols = c("nrow", "V1")]
setorder(dd3, -freq)

# For each excerpt index in x, collect the words in `corpus` whose document
# lists contain that excerpt.
filterOut = function(x, corpus) {
  # browser()
  filtered = lapply(x, function(y) {
    list(excerpt = y, words = list(corpus[sapply(corpus$docs, function(z) {
      y %in% z
    }), V1]))
  })
  fDT = rbindlist(filtered)
  fDT
}
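# Hypothetical usage (the indices below are illustrative, not from the original
# script): map a few excerpt indices to the words that occur in them, using the
# dd3 table built above as the corpus.
# filterOut(c(5, 12), dd3)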

# Excerpts hand-coded 1 (yes) and 0 (no) in the test set, and the words that
# appear in each group.
yesses = code$testSet[which(code$testSet[, 2] == 1), 1]
yesWords = unique(dw[(nrow %in% yesses)]$V1)
nos = code$testSet[which(code$testSet[, 2] == 0), 1]
noWords = unique(dw[(nrow %in% nos)]$V1)

# Excerpts not yet in the test set, and the words that appear in them.
unseens = (1:length(code$excerpts))[-code$testSet[, 1]]
unseenWords = unique(dw[(nrow %in% unseens)]$V1)
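# `newWordsFiltered` (used below) is not defined in this snippet. A plausible
# definition, assuming it means unseen words that do not already occur in any
# hand-coded excerpt:
newWordsFiltered = setdiff(unseenWords, union(yesWords, noWords))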

# Take the 20 most frequent words among the filtered new words, then find the
# two unseen excerpts that appear most often in those words' document lists.
topUnseen = dd3[V1 %in% newWordsFiltered, .SD[1:20]]
freqUnseen = sort(table(unlist(topUnseen$docs)), decreasing = T)[1:2]
freqInds = as.numeric(names(freqUnseen))

cat("Excerpts to include: ", freqInds, "\n")


