#data_generators/DataCreator.R

source('./R/Functions.R')
s3_bucket <- 'alpha-app-pq-tool'

#This script does the computationally heavy analysis needed to run Latent Semantic Analysis
#on the set of MoJ PQs.  The set itself can be generated by scraping from the web using the
#"MoJScraper.R" script.

#It creates and saves the normalised LSA space used to rank PQ documents by decreasing
#similarity to a search query.  This is saved as searchSpace.rda, which is loaded by
#global.R and server.R.

#It also saves three .csv files:
# All the data for the PQs, and the cluster that each belongs to (MoJwrittenPQs.csv)
# The top dozen words for each cluster (topDozenWordsPerTopic.csv)
# The top dozen words for each member (topDozenWordsPerMember.csv)

#CAVEAT: To make the eventual similarity query run fast, Andy 'sparsified' the normalised
#rank-reduced LSA space (currently done below by zeroing its middle 95% of values) and put
#it into simple-triplet format (which doesn't store zero values in memory).  The result
#is that |d| is no longer constant across all documents (i.e. they are no longer
#quite normalised to length 1).  They now vary slightly in length, but differ by at most
#0.036 (summary stats on the distribution of lengths are computed further down in the code)
#so it's not too much of a problem.
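#For intuition (illustrative only, not part of the pipeline): zeroing the small entries
#of a unit-length vector shrinks its norm slightly, e.g.
#  u <- c(0.99, 0.1, 0.08, 0.05); u <- u / sqrt(sum(u^2))   #unit length
#  u[abs(u) < 0.06] <- 0; sqrt(sum(u^2))                    #a little less than 1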

#To use, pass the name of the .csv containing the PQs (which may have been generated
#by MoJScraper.R) via the --input_file option below (or use --environment), and run the script.
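#Example invocations (values are illustrative):
#  Rscript data_generators/DataCreator.R -i archived_pqs.csv -x 100 -k 100 -o output/
#  Rscript data_generators/DataCreator.R --environment test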

#LIBRARIES#
library(tm)
library(lsa)
library(cluster)
library(dplyr)
library(slam)
library(stringr)
library(readr)
library(optparse)

#GRAB COMMAND LINE ARGS

option_list <- list(
  make_option(c("-i", "--input_file"),
    type    = "character",
    default = NULL,
    help    = "dataset file name",
    metavar = "character"
  ),
  make_option(c("-x", "--x_dims"),
    type    = "numeric",
    default = NULL,
    help    = "number of dimensions [default= %default]",
    metavar = "character"
  ),
  make_option(c("-k", "--k_clusters"),
    type    = "numeric",
    default = NULL,
    help    = "number of clusters [default= %default]",
    metavar = "character"
  ),
  make_option(c("-o", "--output_dir"),
    type    = "character",
    default = NULL,
    help    = "directory to which outputs are saved [default= %default]",
    metavar = "character"
  ),
  make_option(c("-e", "--environment"),
    type    = "character",
    default = NULL,
    help    = "Sets k_clusters, x_dims, input_file and output_dir to sensible values for the 'test' and 'prod' environments.  Values can be either 'test' or 'prod'",
    metavar = "character"
  )
)
 
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

## Override options if 'environment' is set
## (check is.null first: comparing NULL with a string gives a zero-length condition and an error)

if (!is.null(opt$environment) && opt$environment == 'test') {
  opt$input_file <- str_interp("${SHINY_ROOT}/tests/testthat/examples/data/lsa_training_sample.csv")
  opt$output_dir <- str_interp("${SHINY_ROOT}/tests/testthat/examples/data/")
  opt$k_clusters <- 100
  opt$x_dims <- 100
} else if (!is.null(opt$environment) && opt$environment == 'prod') {
  opt$input_file <- str_interp("${s3_bucket}/archived_pqs.csv")
  opt$output_dir <- str_interp("${s3_bucket}/")
  opt$k_clusters <- 1000
  opt$x_dims <- 2000
}
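
#Optional guard: fail early if the required options are still missing
#(i.e. no --environment preset and no explicit values supplied).
if (is.null(opt$input_file) || is.null(opt$output_dir) ||
    is.null(opt$x_dims) || is.null(opt$k_clusters)) {
  stop("Supply --input_file, --x_dims, --k_clusters and --output_dir, or use --environment test|prod")
}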



print(str_interp('X has been set to ${opt$x_dims}'))
print(str_interp('K has been set to ${opt$k_clusters}'))
print(str_interp('Reading from file ${opt$input_file}'))
print(str_interp('Saving to ${opt$output_dir} directory'))

#PARAMETERS
#The number of clusters (k_clusters) and the rank of the reduced LSA space (x_dims)
#are taken from the command-line options parsed above.

#SCRIPT

#GETTING DATA

#read in questions
print('Reading questions')
aPQ <- s3tools::read_using(readr::read_csv, opt$input_file)
aPQ <- aPQ[order(aPQ$Question_Date, decreasing = TRUE),] #reorder to have most recent first
questionsVec <- aPQ$Question_Text

#MAKE THE TERM-DOCUMENT MATRIX AND LATENT SEMANTIC ANALYSIS SPACE
print('Making the TDM')
#Create the corpus
PQCorp <- Corpus(VectorSource(questionsVec))
#Stem the corpus
PQCorp.stems <- tm_map(cleanCorpus(PQCorp), stemDocument)
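#To see what cleaning and stemming do to a single question (illustrative only; the text
#is a made-up example, and cleanCorpus comes from Functions.R):
#  inspect(tm_map(cleanCorpus(Corpus(VectorSource("Were sentencing guidelines reviewed last year?"))), stemDocument))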

#Create the term-document matrix.  With the SMART weighting "btn" each term in each
#document is scored as a binary term frequency (1 if the term appears in the document,
#0 if it is absent) multiplied by the inverse document frequency, so terms that appear
#in fewer documents score more highly; no within-weighting normalisation is applied
#(document lengths are normalised separately below).
#Details can be seen by inspecting the help documentation for the weightSMART function.
tdm <- TermDocumentMatrix(
         PQCorp.stems,
         control = list(
           weighting = function(x) weightSMART(x, spec = "btn"),
           wordLengths = c(2, Inf)))

tdm <- as.matrix(tdm) %>% normalize() %>% as.simple_triplet_matrix() #normalize doc lengths


#Create the latent semantic space.  Like a PCA, it gives a basis ordered by how much
#variation each dimension explains, which lets us cut down the number of dimensions we
#need.  dimcalc_raw() keeps every dimension here; the truncation to opt$x_dims happens below.
print('Making the LSA')
lsaAll <- lsa(tdm, dims = dimcalc_raw())

#CLUSTERING
print('Doing some clustering')
#We reduce the LSA space to rank x_dims, and then get the positions of our documents in
#this reduced latent semantic space (posns has opt$x_dims rows and one column per document).
posns <- diag(lsaAll$sk[1:opt$x_dims]) %*% t(lsaAll$dk[, 1:opt$x_dims])

#Distances between documents in this space, based on cosine similarity.
#First normalise the columns so they all have length 1.
normposns <- normalize(posns)
#Then, since a . b = |a| |b| cos(theta) and we have normalised so that |a| = |b| = 1,
#cos(theta) is just the dot product a . b, so the dissimilarity between each pair of
#documents is one minus their dot product.
diss <- 1 - (t(normposns) %*% normposns)
#we now take out anything 'small' to make sure things that should be 0 aren't
#rendered tiny and non-zero by floating point arithmetic
diss[which(abs(diss) < 10^-14)] <- 0
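#Quick sanity check on the dissimilarity formula (illustrative): identical unit vectors
#give 0, orthogonal ones give 1.
#  a <- c(1, 0); b <- c(0, 1)
#  1 - sum(a * a)   #0
#  1 - sum(a * b)   #1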

#a hierarchical clustering. At the moment we only use this to define our clusters,
#by taking a cut through it at the right stage.
hier <- hclust(as.dist(diss), method = "complete")

#We cut the tree into opt$k_clusters clusters of questions.
#See the appendix at the bottom of this file for the reasoning behind this choice.

klusters <- cutree(hier, opt$k_clusters)
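
#klusters is an integer vector with one cluster id per question.  A quick look at the
#largest clusters (illustrative):
#  sort(table(klusters), decreasing = TRUE)[1:5]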

m <- as.matrix(tdm)

print("summarising topics")
#this summarises the top 12 terms per cluster using the summarise function from Functions.R (sourced at the top).
topDozenWordsPerTopic <- data.frame(
  topic = unlist(lapply(seq(1, opt$k_clusters), function(x)rep(x, 12))),
  word = unlist(lapply(seq(1, opt$k_clusters),
           function(x) names(summarise("cluster", x, m, hier, 12, questionsVec, opt$k_clusters)))),
  
  freq = unlist(lapply(seq(1, opt$k_clusters),
           function(x) summarise("cluster", x, m, hier, 12, questionsVec, opt$k_clusters))),
  row.names = NULL, stringsAsFactors = F)
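
#A quick peek at the top words for cluster 1 (illustrative):
#  topDozenWordsPerTopic[topDozenWordsPerTopic$topic == 1, ]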

#keywords to describe clusters
clusterKeywords <- sapply(seq(opt$k_clusters),
                     function(x)
                       names(summarise("cluster", x, m, hier, 3, questionsVec, opt$k_clusters)))
clusterKeywordsVec <- sapply(seq_along(clusterKeywords[1, ]),
                        function(x)
                          paste0(clusterKeywords[, x], collapse = ", "))

#Member summaries
print("summarising members")

allMembers <- sort(unique(aPQ$Question_MP))

topDozenWordsPerMember <- data.frame(
  member = unlist(lapply(allMembers, function(x)rep(x, 12))),
  word = unlist(lapply(allMembers,
                       function(x) names(summarise("MP", x, m, aPQ$Question_MP, 12, questionsVec)))),
  freq = unlist(lapply(allMembers,
                       function(x) summarise("MP", x, m, aPQ$Question_MP, 12, questionsVec))),
  row.names = NULL, stringsAsFactors = F)


#MAKE SPACE FOR FAST QUERY SEARCHING
print('Making the search space')
#We build the rank-x_dims reconstruction of the term-document matrix (terms x documents),
#using the x_dims parameter above.
lsaOut <- lsaAll$tk[, 1:opt$x_dims] %*% posns
#normalise the space
search.space <- normalize(lsaOut)

#"sparsify" by setting middle 95% of values to zero
quant95s <- quantile(as.vector(search.space), c(0.025, 0.975))
search.space[which(search.space > quant95s[1] & search.space < quant95s[2])] <- 0

##This is just a check to see that this sparsification doesn't lead to wildly varying document lengths
collengths <- sapply(seq_along(aPQ$Question_ID),
                function(x) normVec(search.space[, x]))
print(summary(collengths))

#save disk space by saving as simple triplet matrix
search.space <- as.simple_triplet_matrix(search.space)
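
#Illustrative only - the real query code lives in global.R / server.R.  Given a query
#term-vector q (same term order and weighting as the rows of search.space), cosine
#similarity with every document is a single matrix product:
#  q <- q / normVec(q)
#  sims <- crossprod(q, as.matrix(search.space))
#  head(order(sims, decreasing = TRUE))   #indices of the most similar documents first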



#### SAVING ####
print('Saving the output')
#save(tdm, file = "tdm.rda")
#save(lsaOut,file = "lsaOut.rda")
#save(klusters,file = "klusters.rda")

save_location <- opt$output_dir

save(search.space, file = "searchSpace.rda")

s3tools::write_file_to_s3("searchSpace.rda", str_interp("${save_location}searchSpace.rda"), overwrite = TRUE)
file.remove("searchSpace.rda")



#Save data to be directly loaded into Tableau

#The questions and their data (including cluster)
savedf <- data.frame(
  Document_Number = seq_along(aPQ$Question_ID),
  Question_ID = aPQ$Question_ID,
  Question_Text = aPQ$Question_Text,
  Answer_Text = aPQ$Answer_Text,
  Question_MP = aPQ$Question_MP,
  MP_Party    = aPQ$Party,
  MP_Constituency = aPQ$MP_Constituency,
  Answer_MP = aPQ$Answer_MP,
  Date = aPQ$Question_Date,
  Answer_Date = aPQ$Answer_Date,
  #Corrected_Date = aPQ$Corrected_Date,
  Topic = klusters,
  Topic_Keywords = clusterKeywordsVec[klusters],
  stringsAsFactors = FALSE)
#write.csv(savedf, "MoJwrittenPQs.csv")
s3tools::write_df_to_csv_in_s3(savedf, str_interp("${save_location}MoJwrittenPQs.csv"), overwrite = TRUE)


#The information about the clusters
#write.csv(topDozenWordsPerTopic, "topDozenWordsPerTopic.csv")
s3tools::write_df_to_csv_in_s3(topDozenWordsPerTopic, str_interp("${save_location}topDozenWordsPerTopic.csv"), overwrite = TRUE)

#The information about the members
#write.csv(topDozenWordsPerMember, "topDozenWordsPerMember.csv")
s3tools::write_df_to_csv_in_s3(topDozenWordsPerMember, str_interp("${save_location}topDozenWordsPerMember.csv"), overwrite = TRUE)



print(str_interp("${nrow(aPQ)} questions incorporated"))

##### APPENDIX #####

#Here we see how many clusters is a good number for our data. We calculate the
#silhouette for each clustering - the higher the better. We also calculate the
#median number of questions per cluster given the total cluster number.
#If these calculations have already been done you can simply load the
#"silhouettewidths.rda" and "medianpercluster.rda" files. Otherwise you will
#have to regenerate the values by running the code below.

#load(file = "silhouettewidths.rda")
#load(file = "medianpercluster.rda")
#load(file = "clusterings.rda")

#if you want to regenerate the data run the following

#minClust <- 2
#maxClust <- 4000
#clusterings <- sapply(seq(minClust, maxClust),
#                 function(x) cutree(hier,x))

#kSilWidths <- sapply(seq(minClust, maxClust),
#                function(x)
#                  mean(
#                    silhouette(
#                      clusterings[, x + 1 - minClust], diss)[, 3]
#                  )
#               )
#names(kSilWidths) <- seq(minClust, maxClust)

#medianClusterMembership <- sapply(seq(minClust, maxClust),
#                             function(x){
#                               median(
#                                 sapply(
#                                   seq(minClust, x),
#                                     function(y) length(which(clusterings[, x + 1 - minClust] == y))
#                                 )
#                               )
#                             }
#                            )

#if you want to save it
#save(kSilWidths, file = "silhouettewidths.rda")
#save(medianClusterMembership, file = "medianpercluster.rda")
#save(clusterings, file = "clusterings.rda")

#plot(kSilWidths, type = "l")
#which.max(kSilWidths)
#max(kSilWidths)
#medianClusterMembership[which.max(kSilWidths)]
#you can see that the "best" number of clusters is around 2668. However, this results in
#a median of only two questions per cluster, and the silhouette is still pretty small, at ~0.23.
#So we probably want more questions per cluster on average, particularly as it's not like the
#clusterings are "good" anyway. Hence the arbitrary choice of 1000, which gives a silhouette
#of ~0.161 and a median of 4 questions per cluster.
#kSilWidths[2000]
#medianClusterMembership[2000]

#We might be able to do better than arbitrarily picking 1000 by defining some function of
#median and silhouette and maximising it (although then the function definition is still
#arbitrary).