# Roxygen documentation
#' Bastiat, please process my text and create the model variables.
#'
#' @return A list including data frames for the estimation data, the word scores and a vector of variables.
#' @references www.globaltradealert.org
#' @author Johannes Fritz for GTA
# Function infos and parameters --------------------------------------------
bt_create_estimation_data <- function(bid=NULL,
evaluation=NULL,
word.score=NULL,
text=NULL,
acting.agency=NULL,
train.share=.82,
detective.name=NULL,
detective.number=NULL,
for.training=T,
max.input.rows = 20000
) {
#not mounting these packages here causes failure when bt is used elsewhere
library(tidytext)
library(dplyr)
library(stats)
library(data.table)
library(caret)
# bid = update.core$bid
# text=paste(update.core$act.title.en, update.core$act.description.en)
# acting.agency = update.core$acting.agency
#for stack migration
#lc.update = read.csv("C:/Users/c-cam/GTA data team Dropbox/Bastiat/0 projects/033 stack migration/bt-stack-migration/gn_update_core.csv")
# bid = lc.update$bid
# text = paste(lc.update$act.title.en, lc.update$act.description.en)
# evaluation = sample(c(0,1),nrow(lc.update), replace = T)
# acting.agency = lc.update$acting.agency
# train.share = .82
# for.training = T
#!!!VERY IMPORTANT!!!
#in R integer variables can only go up to ~+-2*10^9 which is exceeded later
#storing our vars as doubles solves this problem
#I was originally doing this ad hoc but it happens again and again, so here is a janky function.
cc_df_col_int2dbl = function(tgt.df){
#rooting out naughty integers
converted = 0
for(col in colnames(tgt.df)){
if(typeof(tgt.df[,col]) == "integer"){
tgt.df[,col] = as.numeric(tgt.df[,col])
converted = converted + 1
}
}
message(paste(converted, "cols converted to double"))
return(tgt.df)
}
#wrapper function to preserve original structure
cc_count_freq = function(tgt.df){
return(
tgt.df %>%
dplyr::count(word) %>%
cc_df_col_int2dbl()
)
}
detective.characteristics=bt_get_detective_characteristics(d.name=detective.name,
d.number=detective.number)
#data.downscale = 3
### word-level variables
if(is.null(evaluation)){
tf=data.frame(bid=bid,
text=text,
stringsAsFactors = F)
}else{
tf=data.frame(bid=bid,
evaluation=evaluation,
text=text,
stringsAsFactors = F)
}
if(for.training
& (max.input.rows > 0)
& (nrow(tf) > max.input.rows)){
message("input data is too large. to save your processor, it will be reduced in size. change max.input.rows to zero for no limit or increase it for a higher limit.")
tf = tf[sample(nrow(tf), max.input.rows),]
}
#testing
#tf1 = tf
#VERY IMPORTANT:
#REMOVE non alphanumerics from the tf. this is to alleviate problems caused by
#errant bits of punctuation, html, mojibake, etc
tf$text = gsub(pattern = "[^A-zÀ-ÿ]|_", #the regex engine in R treats _ as an alphanumeric, must include it explicitly
replacement = " ",
x = tf$text)
if(for.training){
train.split=sample(unique(tf$bid), ceiling(nrow(tf)*train.share))
}
variables=detective.characteristics$variables
estimation.variables=variables
## ensuring I have all acting agencies, if called for
if(detective.characteristics$vars.incl.acting.agency){
#commented out because there are so many AAs now it's impossible to train on all of them.
# if(for.training){
# while(length(setdiff(agency.dummies, unique(acting.agency[which(tf$bid %in% train.split)])))>0){
# rm(train.split)
# train.split=sample(unique(tf$bid), ceiling(nrow(tf)*train.share))
# print("resplitting to ensure presence of all agencies")
# }
#}
estimation.variables=c(estimation.variables[!variables %in% "acting.agency"],agency.dummies.col.names)
}
#tf1.5 = tf
#tf1.5$bid[!(tf1.5$bid %in% tf$bid)]
#warning: this can create a very large df (1 billion rows+)
tf=unnest_tokens(tf, word, text, drop=F)
#### FEATURE CLEANING
### removing numbers: do you ? or just the tags.
# tf$jibberish=apply(tf, 1, function(x) as.numeric(nchar(paste(unlist(str_extract_all(x[which(names(tf)=="word")], "\\w+")), collapse="")))/nchar(x[which(names(tf)=="word")]))
tf$word=as.character(tf$word)
tf=subset(tf, nchar(word)>=4 & nchar(word)<=30)
#test
#tf2 = tf
### EXPLORE UDPIPE
#I worry as a significant part of our corpus is not in proper english, or not in english at all.
# rake=keywords_rake(tf, "word", "bid", n_min=1, ngram_max = 4)
# coll=keywords_collocation(tf, "word", "bid", n_min=2, ngram_max = 4)
#
# x <- udpipe_download_model(language = "english")
# x$file_model
# ud_english <- udpipe_load_model(x$file_model)
#
# e=as.data.frame(udpipe_annotate(ud_english, x = training$text[1], tagger = "default", parser = "default"))
#
## CREATING THE METRICS
if(!is.null(evaluation)){
## simple frequencies
#word.freq=as.data.frame(table(tf$word))
word.freq = tf %>%
cc_count_freq()
names(word.freq)=c("word","frequency.corpus")
tf = merge(tf, word.freq, by="word", all.x=T)
#word.freq=as.data.frame(table(tf$word[tf$evaluation==1]))
word.freq = cc_count_freq(tf[tf$evaluation==1,])
names(word.freq)=c("word","frequency.relevant")
tf=merge(tf, word.freq, by="word", all.x=T)
#word.freq=as.data.frame(table(tf$word[tf$evaluation==0]))
word.freq = cc_count_freq(tf[tf$evaluation==1,])
names(word.freq)=c("word","frequency.irrelevant")
tf=merge(tf, word.freq, by="word", all.x=T)
tf[is.na(tf)]=0
#test
#tf3=tf
# GINI --------------------------------------------------------------------
### GINI ###
### WARNING: 23-04-2021
### THIS IS BROKEN due to integer overflow caused by all the massive dfs.
### this is because our datasets have grown!
### need to change this process to be less memory intensive. I think it needs a total redo.
### completely supersede with w2v maybe? will think about how to do this.
# I could exclude features by GINI value. Does not seem to do much, so I skip it here.
# If reconsidered, need to add those BIDs removed due to lack of high-GINI words into the estimation results by assigning 0/1 randomnly.
# if("gini.normalised" %in% unique(c(variables, detective.characteristics$dtmatrix.metric))){
#gini=as.data.frame(table(subset(tf, bid %in% train.split)$word)))
gini = subset(tf, bid %in% train.split) %>%
cc_count_freq()
names(gini)=c("word","freq.total")
#gini$freq.total = as.numeric(gini$freq.total)
#gini.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==0)$word))
gini.ir = subset(tf, bid %in% train.split & evaluation==0) %>%
cc_count_freq()
names(gini.ir)=c("word","freq.irrelevant")
gini=merge(gini, gini.ir, by="word", all.x=T)
#gini.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==1)$word))
gini.ir = subset(tf, bid %in% train.split & evaluation==1) %>%
cc_count_freq()
names(gini.ir)=c("word","freq.relevant")
gini=merge(gini, gini.ir, by="word", all.x=T)
rm(gini.ir)
gini[is.na(gini)]=0
gini$gini.simple=(gini$freq.relevant/gini$freq.total)^2+(gini$freq.irrelevant/gini$freq.total)^2
gini$gini.normalised.num=gini$freq.relevant/(gini$freq.total*length(unique(subset(tf, bid %in% train.split & evaluation==1)$bid)))
gini$gini.normalised.denom=gini$freq.irrelevant/(gini$freq.total*length(unique(subset(tf, bid %in% train.split & evaluation==0)$bid)))
gini$gini.normalised=(gini$gini.normalised.num/(gini$gini.normalised.num+ gini$gini.normalised.denom))^2 +
(gini$gini.normalised.denom/(gini$gini.normalised.denom+ gini$gini.normalised.num))^2
#the below was used to root out problems with overflow. I leave it here in case it happens again.
# tst = c()
# err = c()
# wrn = c()
# const = length(unique(subset(tf, bid %in% train.split & evaluation==0)$bid))
# for(i in 1:nrow(tf)){
# tryCatch(expr={
# tst=c(tst,
# (as.numeric(gini$freq.total)[i]*as.numeric(const) )
# )
#
#
# },error = function(e){
# print(paste("error with:", i))
# err = c(err, i)
# },
# warning = function(w){
# print(paste("warn with:", i))
# err = c(wrn, i)
# }
# )
# }
# }
### delta
# if(sum(as.numeric(grepl("gta.gini", unique(c(variables, detective.characteristics$dtmatrix.metric)))))>0){
gta.gini.threshold=100
nonsense=c("nbsp", "quot", "january", "february","march","april","may","june","july","august","september","october","november","december")
gta.words=gtabastiat::gta.corpus %>%
cc_df_col_int2dbl()
gta.words$word = tolower(gta.words$word)
stop.en=get_stopwords()$word
gini.result=data.frame(word=character(),
gta.text=character(),
gta.gini.ir=numeric(),
gta.gini.re=numeric(),
gta.gini.delta=numeric(),
stringsAsFactors = F)
## loop over GTA text types (all, description, title, source)
for(txt in unique(gta.words$text)){
## normalising gta frequency by vastly different total word count
gta.words$gta.freq.word=round(gta.words$gta.freq.word/sum(gta.words$gta.freq.word)*nrow(subset(tf, ! word %in% stop.en)),0)
## irrelevant cases
#gta.gini=as.data.frame(table(tolower(subset(tf, bid %in% train.split & evaluation==0)$word)))
gta.gini = subset(tf, bid %in% train.split & evaluation==0) %>%
cc_count_freq()
names(gta.gini)=c("word","gta.freq.word")
gta.gini=rbind(gta.gini, subset(gta.words, text==txt)[,c("word","gta.freq.word")])
# removing words also removed from GTA corpus
gta.gini=subset(gta.gini, ! tolower(word) %in% stop.en)
gta.gini=subset(gta.gini, grepl("[0-9]+", word)==F)
gta.gini=subset(gta.gini, ! word %in% nonsense)
gta.gini=aggregate(gta.freq.word ~word, gta.gini, sum)
setnames(gta.gini, "gta.freq.word","freq.total")
#dbg
#gta.gini1 = gta.gini
#gta.gini.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==0)$word))
gta.gini.ir = subset(tf, bid %in% train.split & evaluation==0) %>%
cc_count_freq()
names(gta.gini.ir)=c("word","freq.irrelevant")
gta.gini=merge(gta.gini, gta.gini.ir, by="word", all.x=T)
rm(gta.gini.ir)
gta.gini=merge(gta.gini, subset(gta.words, text==txt)[,c("word", "gta.freq.word")],by="word",all.x=T)
setnames(gta.gini, "gta.freq.word","freq.gta")
gta.gini[is.na(gta.gini)]=0
gta.gini=subset(gta.gini, freq.total>gta.gini.threshold)
gta.gini$gta.gini.ir=(gta.gini$freq.gta/gta.gini$freq.total)^2+(gta.gini$freq.irrelevant/gta.gini$freq.total)^2
gta.gini.delta=gta.gini[,c("word","gta.gini.ir")]
## relevant cases
gta.gini=as.data.frame(table(tolower(subset(tf, bid %in% train.split & evaluation==1)$word)))
names(gta.gini)=c("word","gta.freq.word")
gta.gini=rbind(gta.gini, subset(gta.words, text==txt)[,c("word","gta.freq.word")])
# removing words also removed from GTA corpus
gta.gini=subset(gta.gini, ! tolower(word) %in% stop.en)
gta.gini=subset(gta.gini, grepl("[0-9]+", word)==F)
gta.gini=subset(gta.gini, ! word %in% nonsense)
gta.gini=aggregate(gta.freq.word ~word, gta.gini, sum)
setnames(gta.gini, "gta.freq.word","freq.total")
#gta.gini.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==1)$word))
gta.gini.ir = subset(tf, bid %in% train.split & evaluation==1) %>%
cc_count_freq()
names(gta.gini.ir)=c("word","freq.relevant")
gta.gini=merge(gta.gini, gta.gini.ir, by="word", all.x=T)
rm(gta.gini.ir)
gta.gini=merge(gta.gini, subset(gta.words, text==txt)[,c("word", "gta.freq.word")],by="word",all.x=T)
setnames(gta.gini, "gta.freq.word","freq.gta")
gta.gini[is.na(gta.gini)]=0
gta.gini=subset(gta.gini, freq.total>gta.gini.threshold)
gta.gini$gta.gini.re=(gta.gini$freq.gta/gta.gini$freq.total)^2+(gta.gini$freq.relevant/gta.gini$freq.total)^2
gta.gini.delta=merge(gta.gini.delta, gta.gini[,c("word","gta.gini.re")], by="word", all.x=T)
gta.gini.delta$gta.gini.delta=gta.gini.delta$gta.gini.ir-gta.gini.delta$gta.gini.re
gta.gini.delta$gta.text=txt
gini.result=rbind(gini.result, gta.gini.delta[,names(gini.result)])
print(txt)
}
gta.gini=gini.result
rm(gini.result, gta.gini.delta)
# }
# ODDS RATIO --------------------------------------------------------------
# if(sum(as.numeric((c("odds.relevant","odds.irrelevant", "odds.ratio") %in% unique(c(variables, detective.characteristics$dtmatrix.metric)))))>0){
#odds=as.data.frame(table(subset(tf, bid %in% train.split)$word))
odds = subset(tf, bid %in% train.split) %>%
cc_count_freq()
names(odds)=c("word","freq.total")
#odds.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==0)$word))
odds.ir = subset(tf, bid %in% train.split & evaluation==0) %>%
cc_count_freq()
names(odds.ir)=c("word","freq.irrelevant")
odds=merge(odds, odds.ir, by="word", all.x=T)
#same variable name used for relevant results for memory saving - don't get confused
#odds.ir=as.data.frame(table(subset(tf, bid %in% train.split & evaluation==1)$word))
odds.ir = subset(tf, bid %in% train.split & evaluation==1) %>%
cc_count_freq()
names(odds.ir)=c("word","freq.relevant")
odds=merge(odds, odds.ir, by="word", all.x=T)
odds[is.na(odds)]=0
rm(odds.ir)
odds$odds.relevant=((odds$freq.relevant/sum(odds$freq.relevant))/(1-odds$freq.relevant/sum(odds$freq.relevant)))
odds$odds.irrelevant=((odds$freq.irrelevant/sum(odds$freq.irrelevant))/(1-odds$freq.irrelevant/sum(odds$freq.irrelevant)))
odds$odds.ratio=log(odds$odds.relevant/odds$odds.irrelevant)
odds$odds.ratio[odds$odds.irrelevant==0]=max(odds$odds.ratio[odds$odds.irrelevant!=0])*1.1
odds$odds.ratio[odds$odds.relevant==0]=min(odds$odds.ratio[odds$odds.relevant!=0])*1.1
# }
# INFORMATION GAIN --------------------------------------------------------
### information gain
# observations=length(unique(tf$bid))
# prob.relevant=length(unique(tf$bid[tf$evaluation==1]))/length(unique(tf$bid))
# prob.irrelevant=length(unique(tf$bid[tf$evaluation==0]))/length(unique(tf$bid))
#
# gain=data.frame(word=gini$word,
# info.gain.relevant=NA,
# info.gain.irrelevant=NA)
#
# for(i in 1:nrow(gain)){
# gain$info.gain.relevant[i]=prob.relevant-length(unique(tf$bid[tf$evaluation==1 & tf$word==gain$word[i]]))/length(unique(tf$bid[tf$word==gain$word[i]]))
# gain$info.gain.irrelevant[i]=prob.irrelevant-length(unique(tf$bid[tf$evaluation==0 & tf$word==gain$word[i]]))/length(unique(tf$bid[tf$word==gain$word[i]]))
# print(i)
# }
showcols = function(tgt.df){
for(col in colnames(tgt.df)){
print(typeof(tgt.df[,col]))
}
}
#### FEATURE TRANSFORMATION
### word2vec, LSI, PLSA, NMF here, if you want.
### NMF
# https://sites.google.com/site/mlshortcourse/home/data-sets/text-classification-in-r
# WORD FREQUENCIES --------------------------------------------------------
### word frequencies
#word.share.relevant=as.data.frame(table(subset(tf, evaluation==1 & bid %in% train.split)$word))
word.share.relevant = subset(tf, evaluation==1 & bid %in% train.split) %>%
cc_count_freq()
#word.share.relevant$share=word.share.relevant$Freq/sum(word.share.relevant$Freq)
word.share.relevant$share=word.share.relevant$n/sum(word.share.relevant$n)
#word.share.irrelevant=as.data.frame(table(subset(tf, evaluation==0 & bid %in% train.split)$word))
word.share.irrelevant = subset(tf, evaluation==0 & bid %in% train.split) %>%
cc_count_freq()
#word.share.irrelevant$share=word.share.irrelevant$Freq/sum(word.share.irrelevant$Freq)
word.share.irrelevant$share=word.share.irrelevant$n/sum(word.share.irrelevant$n)
### document frequencies
doc.share=aggregate(bid~word + evaluation, subset(tf, bid %in% train.split), function(x) length(unique(x)))
doc.share$d.share=doc.share$bid/nrow(subset(tf, evaluation==1 & bid %in% train.split))
doc.share$d.share[doc.share$evaluation==0] =doc.share$bid[doc.share$evaluation==0]/nrow(subset(tf, evaluation==0 & bid %in% train.split))
#setnames(word.share.irrelevant, "Var1", "word")
#setnames(word.share.relevant, "Var1", "word")
### word scores (tf-idf, tf-oidf)
word.share.relevant=merge(word.share.relevant, subset(doc.share, evaluation==1)[,c("word","d.share")], by="word", all.x=T)
word.share.relevant$score.r.idf=word.share.relevant$share/(word.share.relevant$d.share + min(word.share.relevant$d.share, na.rm = T))
word.share.relevant$d.share=NULL
word.share.relevant=merge(word.share.relevant, subset(doc.share, evaluation==0)[,c("word","d.share")], by="word", all.x=T)
word.share.relevant$d.share[is.na(word.share.relevant$d.share)]=min(subset(doc.share, evaluation==0)$d.share, na.rm=T)
word.share.relevant$score.r.oidf=word.share.relevant$share/(word.share.relevant$d.share + min(word.share.relevant$d.share, na.rm = T))
word.share.irrelevant=merge(word.share.irrelevant, subset(doc.share, evaluation==0)[,c("word","d.share")], by="word", all.x=T)
word.share.irrelevant$score.ir.idf=word.share.irrelevant$share/(word.share.irrelevant$d.share + min(word.share.irrelevant$d.share, na.rm = T))
word.share.irrelevant$d.share=NULL
word.share.irrelevant=merge(word.share.irrelevant, subset(doc.share, evaluation==1)[,c("word","d.share")], by="word", all.x=T)
word.share.irrelevant$d.share[is.na(word.share.irrelevant$d.share)]=min(subset(doc.share, evaluation==1)$d.share, na.rm=T)
word.share.irrelevant$score.ir.oidf=word.share.irrelevant$share/(word.share.irrelevant$d.share + min(word.share.irrelevant$d.share, na.rm = T))
word.score=merge(word.share.relevant[,c("word", "score.r.idf","score.r.oidf")], word.share.irrelevant[,c("word", "score.ir.idf","score.ir.oidf")], by="word", all=T)
word.score[is.na(word.score)]=0
word.score$score.delta=word.score$score.r.idf-word.score$score.ir.idf
word.score$score.delta.abs=abs(word.score$score.delta)
word.score$exclusive.r=as.numeric(word.score$word %in% subset(word.score, score.ir.idf==0)$word)
word.score$exclusive.ir=as.numeric(word.score$word %in% subset(word.score, score.r.idf==0)$word)
### ToDo: Adjust exclusiveness for probability of appearance (e.g. inverse tf-idf in own corpus (=it's really rare here, too))
#### Dampening features
## transform by natural log or square-root to smooth outliers
## packing word score DF
# if("gini.normalised" %in% unique(c(variables, detective.characteristics$dtmatrix.metric))){
word.score=merge(word.score, gini[,c("word","gini.normalised")], by="word", all=T)
# }
for(gg in c("gta.gini.all","gta.gini.source","gta.gini.title","gta.gini.description")){
# if(gg %in% unique(c(variables, detective.characteristics$dtmatrix.metric))){
g.var=gsub("gta.gini.","",gg)
word.score=merge(word.score, subset(gta.gini, gta.text==g.var)[,c("word","gta.gini.delta")], by="word", all=T)
setnames(word.score, "gta.gini.delta",gg)
print(gg)
# }
}
# if(sum(as.numeric((c("odds.relevant","odds.irrelevant", "odds.ratio") %in% unique(c(variables, detective.characteristics$dtmatrix.metric)))))>0){
word.score=merge(word.score, odds[,c("word","odds.relevant","odds.irrelevant", "odds.ratio" )], by="word", all=T)
# }
# if(length(intersect(c("gta.share.all","gta.share.source","gta.share.title", "gta.share.description"), unique(c(variables, detective.characteristics$dtmatrix.metric))))>0){
gta.words=gtabastiat::gta.corpus %>%
cc_df_col_int2dbl()
#ws1 = word.score
# for(gs in intersect(c("gta.share.all","gta.share.source","gta.share.title", "gta.share.description"), unique(c(variables, detective.characteristics$dtmatrix.metric)))){
for(gs in c("gta.share.all","gta.share.source","gta.share.title", "gta.share.description")){
g.var=gsub("gta.share.","",gs)
word.score=merge(word.score, subset(gta.words, text==g.var)[,c("word","gta.share.word")], by="word", all=T)
setnames(word.score, "gta.share.word",gs)
}
# }
}
#tf4 = tf
#### GENERATING THE VARIABLES
#### AGGRECGATE VARIABLES TO DOC LEVEL
tf=merge(tf, word.score, by="word", all.x = T)
#some NAs are created. THese bids are lost when aggregating so we must replace them with zero
#an example of this is LUX-LEX-733 (which was accidentally scraped in german):
#automobilbranche autopilot autopilot befindet bekannt entwicklung fuhrt irre knight munchen noch serie sich technologien uberholspur verbraucher werben
#obviously we shouldn't be trying to classify non en things, but sometimes they slip through so must deal with it
tf[is.na(tf)] = 0
tf.agg=aggregate(score.delta ~ bid, tf, sum)
if(is.null(evaluation)==F){
tf.agg=merge(tf.agg, unique(data.frame(bid=bid, evaluation=evaluation, stringsAsFactors = F)), by="bid", all.x=T)
}
#tfa1 = tf.agg
setnames(tf.agg, "score.delta","delta.sum")
tf.agg=merge(tf.agg, aggregate(score.delta ~ bid , tf, min), by="bid", all.x=T)
setnames(tf.agg, "score.delta","delta.min")
tf.agg=merge(tf.agg, aggregate(score.delta ~ bid , tf, max), by="bid", all.x=T)
setnames(tf.agg, "score.delta","delta.max")
tf.agg=merge(tf.agg, aggregate(score.r.idf ~ bid , tf, sum), by="bid", all.x=T)
tf.agg=merge(tf.agg, aggregate(score.r.oidf ~ bid , tf, sum), by="bid", all.x=T)
tf.agg=merge(tf.agg, aggregate(score.ir.idf ~ bid , tf, sum), by="bid", all.x=T)
tf.agg=merge(tf.agg, aggregate(score.ir.oidf ~ bid , tf, sum), by="bid", all.x=T)
tf.agg=merge(tf.agg, aggregate(exclusive.r ~ bid , tf, sum), by="bid", all.x=T)
setnames(tf.agg, "exclusive.r","exclusive.r.sum")
tf.agg=merge(tf.agg, aggregate(exclusive.r ~ bid , tf, max), by="bid", all.x=T)
setnames(tf.agg, "exclusive.r","exclusive.r.max")
tf.agg=merge(tf.agg, aggregate(exclusive.ir ~ bid , tf, sum), by="bid", all.x=T)
setnames(tf.agg, "exclusive.ir","exclusive.ir.sum")
tf.agg=merge(tf.agg, aggregate(exclusive.ir ~ bid , tf, max), by="bid", all.x=T)
setnames(tf.agg, "exclusive.ir","exclusive.ir.max")
#dbg
#tfa2 = tf.agg
#generate them all to avoid rerunning this again and again
for(var in intersect(estimation.variables, c("gta.share.all","gta.share.source","gta.share.title","gta.share.description","gta.gini.all","gta.gini.source","gta.gini.title","gta.gini.description","gini.normalised","odds.relevant","odds.irrelevant", "odds.ratio"))){
#for(var in c("gta.share.all","gta.share.source","gta.share.title","gta.share.description","gta.gini.all","gta.gini.source","gta.gini.title","gta.gini.description","gini.normalised","odds.relevant","odds.irrelevant", "odds.ratio")){
eval(parse(text=paste("tf.agg=merge(tf.agg, aggregate(",var," ~ bid , tf, function(x) mean(x, na.rm=T)), by='bid', all.x=T)",sep="")))
}
## adding aggregate variables, if called for
## text.level variables
aggregate.variables=data.frame(bid=bid,
text=text,
stringsAsFactors = F)
#av1 = aggregate.variables
##TODO I think this would work nicely as a w2v model.
## acting.agency
# acting agencies ---------------------------------------------------------
#TODO generate all the vars without having to run this function repeatedly as it is s l o w
if(detective.characteristics$vars.incl.acting.agency){
aggregate.variables$acting.agency=acting.agency
aggregate.variables$acting.agency=factor(aggregate.variables$acting.agency, levels=agency.dummies)
aggregate.variables$acting.agency[is.na(aggregate.variables$acting.agency)]="Other"
aa.dummies = as.data.frame(predict(caret::dummyVars(~ acting.agency, data = aggregate.variables), newdata = aggregate.variables))
names(aa.dummies)=tolower(gsub(" ","",gsub("acting.agency\\.?","",names(aa.dummies))))
aa.dummies$bid=bid
if(any(! agency.dummies.col.names %in% names(aa.dummies))){
for(miss.aa in setdiff(agency.dummies.col.names, names(aa.dummies))){
eval(parse(text=paste("aa.dummies$",miss.aa,"=0",sep=""))) }
}
aggregate.variables$acting.agency=NULL
aggregate.variables=merge(aggregate.variables, aa.dummies[,c("bid", agency.dummies.col.names)], by="bid", all.x=T)
rm(aa.dummies)
}
## is.td
if(detective.characteristics$vars.incl.td){
aggregate.variables$is.td=as.numeric(grepl("-[(TD)|(SG)|(AD)|(CVD)]+-",aggregate.variables$bid))
}
## keywords
if(detective.characteristics$vars.incl.keywords){
print("Generating keyword-related variables ...")
#this causes an error if you don't set the locale when run on the server due to some encoding problem.
Sys.setlocale(category = "LC_ALL", locale = "C")
keyword.variables=b_process_keywords(bid=aggregate.variables$bid,
text=aggregate.variables$text)
keyword.variables = keyword.variables[,c("bid","pos.word","pos.word.char", "neg.word", "neg.word.char")]
#the merge is the way to go here, but it causes memory overflow
# aggregate.variables=merge(aggregate.variables,
# keyword.variables[,c("bid","pos.word","pos.word.char", "neg.word", "neg.word.char")],
# by="bid", all.x=T)
aggregate.variables$pos.word = NA
aggregate.variables$pos.word.char = NA
aggregate.variables$neg.word = NA
aggregate.variables$neg.word.char = NA
pb = txtProgressBar(min = 0, max = nrow(keyword.variables), char = "~", style = 3)
for(i in 1:nrow(keyword.variables)){
#dont need second comparison in the kw.vars table as we are iterating through the rows sequentially anyway, i.e.
# keyword.variables$pos.word[keyword.variables$bid = keyword.variables$bid[i]] == keyword.variables$pos.word[i]
setTxtProgressBar(pb, i)
#aggregate.variables$pos.word[aggregate.variables$bid == keyword.variables$bid[i]] = keyword.variables$pos.word[keyword.variables$bid = keyword.variables$bid[i]]
aggregate.variables$pos.word[aggregate.variables$bid == keyword.variables$bid[i]] = keyword.variables$pos.word[i]
aggregate.variables$pos.word.char[aggregate.variables$bid == keyword.variables$bid[i]] = keyword.variables$pos.word.char[i]
aggregate.variables$neg.word[aggregate.variables$bid == keyword.variables$bid[i]] = keyword.variables$neg.word[i]
aggregate.variables$neg.word.char[aggregate.variables$bid == keyword.variables$bid[i]] = keyword.variables$neg.word.char[i]
}
close(pb)
print("Generating keyword-related variables ... complete.")
}
if(ncol(aggregate.variables)>2){
aggregate.variables=unique(aggregate.variables)
aggregate.variables=subset(aggregate.variables, bid %in% tf.agg$bid)
tf.agg=merge(tf.agg, aggregate.variables, by="bid", all.x=T)
}
## cleaning out tf.agg
if(is.null(evaluation)){
tf.agg=tf.agg[,c(estimation.variables, "bid")]
} else {
tf.agg=tf.agg[,c(estimation.variables, "bid","evaluation")]
}
## DTM, if called for
if(detective.characteristics$dtmatrix.included){
nr.terms=detective.characteristics$dtmatrix.term.count
eval(parse(text=paste("tf=tf[order(tf$",detective.characteristics$dtmatrix.metric,", decreasing = T),]", sep="")))
model.words=unique(tf$word)[1:nr.terms]
tf.dtm=dplyr::count(subset(tf, word %in% model.words), vars=c('bid', 'word'))
tf.dtm=as.data.frame(as.matrix(cast_dtm(tf.dtm, bid, word, freq)))
tf.dtm$bid=rownames(tf.dtm)
missing.bids=unique(tf$bid)[! unique(tf$bid) %in% tf.dtm$bid]
tf.dt2=as.data.frame(matrix(0,nrow=length(missing.bids), ncol=ncol(tf.dtm)))
names(tf.dt2)=names(tf.dtm)
tf.dt2$bid=missing.bids
tf.dtm=rbind(tf.dtm, tf.dt2)
tf.agg=merge(tf.agg, tf.dtm[,c("bid", model.words)], by="bid", all.x=T)
estimation.variables=c(estimation.variables, names(tf.agg)[!names(tf.agg) %in% c(estimation.variables, "bid","evaluation")])
}
output.list<- list("estimation.variables"=estimation.variables,
"word.score"=word.score,
"estimation.data"=tf.agg,
"detective.characteristics"=detective.characteristics)
return(output.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.