# Script to cluster programmes according to item matrix
library(jsonlite)
library(RPostgreSQL)
library(tidyverse)
library(NbClust)
# Load the Redshift credentials from a local JSON file
redshift_creds <- fromJSON("~/Documents/Admin/redshift_credentials.json")
driver <- dbDriver("PostgreSQL")
# Use `[[` so each connection argument is the stored value itself rather than
# a one-element list wrapping it (which is what `[` returns)
db_connection <- dbConnect(
  driver,
  host = redshift_creds[["host_name"]],
  port = redshift_creds[["port_num"]],
  dbname = redshift_creds[["db_name"]],
  user = redshift_creds[["user_name"]],
  password = redshift_creds[["password"]]
)
# Fetch the item matrix; `finally` closes the connection even when the query
# fails, so an error does not leave the connection open
item_matrix <- tryCatch(
  dbGetQuery(db_connection, "
SELECT * FROM central_insights_sandbox.jf_item_matrix
"),
  finally = dbDisconnect(db_connection)
)
# Keep only the columns whose names match the pattern "f[0-9]"
item_matrix_reduced <- select(item_matrix, tidyselect::matches("f[0-9]"))

# Fix the RNG seed before the clustering calls below
set.seed(1234)

# Let NbClust choose the number of clusters using the silhouette index,
# considering at least 3 clusters
nbclust_k <- NbClust(
  data = item_matrix_reduced,
  method = "kmeans",
  index = "silhouette",
  min.nc = 3
)
optimum_k <- nbclust_k[["Best.nc"]][1]

# Fit k-means with the chosen k, using 25 random starts
kmodel <- kmeans(item_matrix_reduced, centers = optimum_k, nstart = 25)

# Label each row with its cluster, and keep the fitted centres and sizes
item_matrix_reduced$cluster <- as.factor(kmodel$cluster)
cluster_size <- kmodel$size
cluster_centres <- kmodel$centers
# Euclidean distance from every row to the centre of its assigned cluster.
#
# Arguments:
#   features  Data frame (or matrix) holding the feature columns; extra
#             columns such as `cluster` are ignored. Defaults to the global
#             `item_matrix_reduced`.
#   centres   Matrix with one row per cluster and one column per feature.
#             Defaults to the global `cluster_centres`.
#   clusters  Cluster label for each row of `features`: integers, or a factor
#             whose labels are integers, indexing the rows of `centres`.
#             Defaults to `features$cluster`.
#
# Returns an unnamed numeric vector with one distance per row of `features`,
# in the same order as the rows, so it can be attached directly as a column.
calculate_distance <- function(features = item_matrix_reduced,
                               centres = cluster_centres,
                               clusters = features$cluster) {
  centres <- as.matrix(centres)

  # Pick the feature columns by the names of the centre columns instead of a
  # hard-coded 1:20; fall back to position when the centres are unnamed
  feature_cols <- colnames(centres)
  if (is.null(feature_cols)) {
    feature_cols <- seq_len(ncol(centres))
  }
  feature_values <- as.matrix(features[, feature_cols, drop = FALSE])

  # Go through character so factor labels (not internal codes) are used
  cluster_ids <- as.integer(as.character(clusters))
  stopifnot(length(cluster_ids) == nrow(feature_values))

  # Row r of `assigned_centres` is the full centroid row for observation r
  assigned_centres <- centres[cluster_ids, , drop = FALSE]

  unname(sqrt(rowSums((feature_values - assigned_centres)^2)))
}
# Compute the distance of each item from a cluster centre
distances <- calculate_distance()

# Add the distance and the cluster label as new columns of the full item matrix
item_matrix[["dist"]] <- distances
item_matrix[["cluster"]] <- as.factor(kmodel$cluster)

# Write the annotated item matrix to a CSV file without row names
write.csv(
  item_matrix,
  file = "~/Desktop/latent-variables/item-matrix.csv",
  row.names = FALSE
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.