# R/cluster-users.R

# Script to cluster users according to the latent features of the user matrix

library(jsonlite)
library(RPostgreSQL)
library(tidyverse)
library(NbClust)

# Load the Redshift connection settings from a local JSON file.
# NOTE(review): the file is assumed to be a single JSON object providing the
# host_name, port_num, db_name, user_name and password entries read below.
redshift_creds <- fromJSON("~/Documents/Admin/redshift_credentials.json")

driver <- dbDriver("PostgreSQL")

# `[[` extracts the bare value of each setting; `[` would hand dbConnect() a
# one-element subset of the credentials object instead.
db_connection <- dbConnect(driver,
                           host = redshift_creds[["host_name"]],
                           port = redshift_creds[["port_num"]],
                           dbname = redshift_creds[["db_name"]],
                           user = redshift_creds[["user_name"]],
                           password = redshift_creds[["password"]])

# Pull the whole user matrix. The disconnect lives in `finally` so the
# connection is released even when the query fails; the error itself still
# propagates and stops the script.
user_matrix <- tryCatch(
  dbGetQuery(db_connection, "
SELECT * FROM central_insights_sandbox.dh_user_matrix
"),
  finally = dbDisconnect(db_connection)
)

# Keep only the latent-feature columns, i.e. the columns whose name matches
# the regular expression "f[0-9]"
user_matrix_reduced <- select(user_matrix, tidyselect::matches("f[0-9]"))

# Run the model
# Seed once, before any clustering call, so that both the NbClust search and
# the final k-means fit are reproducible.
set.seed(1234)

# Score the candidate numbers of clusters with the silhouette index.
# min.nc and max.nc are both 10, so only k = 10 is evaluated here; widen the
# range to let NbClust actually choose between several values of k.
nbclust_k <- NbClust(data = user_matrix_reduced,
                     distance = "euclidean",
                     method = "kmeans",
                     index = "silhouette",
                     min.nc = 10,
                     max.nc = 10)

# The first element of Best.nc is the recommended number of clusters;
# `[[` drops the element name so a plain number is passed on.
optimum_k <- nbclust_k$Best.nc[[1]]

# Fit the final model with the number of clusters NbClust selected rather
# than a separately hard-coded constant, so the two cannot drift apart when
# the search range above is changed.
kmodel <- kmeans(user_matrix_reduced, centers = optimum_k, nstart = 25)

# Append data frame with cluster
# The model was fitted on columns selected from user_matrix, so the cluster
# assignments line up row-for-row with user_matrix. (The previous code wrote
# to item_matrix, which is never created in this script.)
stopifnot(length(kmodel$cluster) == nrow(user_matrix))
user_matrix$cluster <- as.factor(kmodel$cluster)

# Save to disk
# Written under a user-specific file name so the output of the item
# clustering is not overwritten.
write.csv(user_matrix, "~/Desktop/latent-variables/user-matrix.csv", row.names = FALSE)
# bbc/insights-latent-feature-explorer documentation built on Nov. 3, 2019, 2:08 p.m.