# R/preprocess-itemmatrix.R
#
# Defines functions: calculate_distance

##############################
# STAGE 1: SETUP             #              
##############################

library(tidyverse)
library(dbplyr)
library(rjson)
library(aws.secrets)
library(httr)
library(RPostgreSQL)
library(NbClust)

# Request the IAM role's security credentials from the instance metadata
# endpoint and parse the JSON response
s3credentials <- content(
  GET("http://169.254.169.254/latest/meta-data/iam/security-credentials/live-r-server-r-server-ComponentRole-1P8OXESQT6GMR")
)
s3credentials2 <- fromJSON(s3credentials)

# Pull the individual credential fields out of the parsed response
access_key_id <- s3credentials2$AccessKeyId
secret_access_key <- s3credentials2$SecretAccessKey
token <- s3credentials2$Token

# AWS region used for the secrets lookup below
Sys.setenv(AWS_DEFAULT_REGION = "eu-west-1")

# Retrieve the Redshift login stored as a JSON secret string and parse it
secret <- get_secret_value("servers/r_server/prod/credentials/scv_redshift")$SecretString
secret <- fromJSON(secret)

# Open the Redshift connection using the username/password from the secret
redshift <- src_postgres(
  host = "live-idl-prod-redshift-component-redshiftcluster-1q6vyltqf8lth.ctm1v7db0ubd.eu-west-1.redshift.amazonaws.com",
  port = "5439",
  dbname = "redshiftdb",
  user = secret$redshift_username,
  password = secret$redshift_password
)

# Read the whole enriched item matrix table into a local data frame
item_matrix <- dbGetQuery(
  redshift$con,
  "SELECT * FROM central_insights_sandbox.dh_item_matrix_enriched"
)

##############################
# STAGE 2: DATA PROCESSING   #             
##############################

# Keep only the latent-feature columns (names matching "f[0-9]")
item_matrix_reduced <- select(item_matrix, tidyselect::matches("f[0-9]"))

# Fix the RNG seed before the randomised clustering steps below
set.seed(1234)

# Evaluate k-means solutions for 5 to 20 clusters using the silhouette index
nbclust_k <- NbClust(
  data = item_matrix_reduced,
  method = "kmeans",
  index = "silhouette",
  min.nc = 5,
  max.nc = 20
)

# The first element of Best.nc is used as the number of clusters
optimum_k <- nbclust_k$Best.nc[1]

# Fit the final k-means model and keep its cluster sizes and centres
kmodel <- kmeans(item_matrix_reduced, centers = optimum_k, nstart = 25)
cluster_size <- kmodel$size
cluster_centres <- kmodel$centers

# Record each item's cluster assignment alongside its features
item_matrix_reduced$cluster <- as.factor(kmodel$cluster)

# Calculate the Euclidean distance of every item from the centre of the
# cluster it is assigned to.
#
# Args:
#   items:   Data frame containing the feature columns plus a `cluster`
#            column holding each row's cluster label. Defaults to the
#            script-level `item_matrix_reduced`.
#   centres: Matrix of cluster centres with one row per cluster and one
#            column per feature (the shape of `kmeans()$centers`). Defaults
#            to the script-level `cluster_centres`.
#
# Returns:
#   Numeric vector with one distance per row of `items`, in the same order
#   as the rows of `items`, so it can be attached directly as a column.
calculate_distance <- function(items = item_matrix_reduced,
                               centres = cluster_centres) {

  # Take the feature columns from the centres themselves instead of assuming
  # a fixed number of features; fall back to position when unnamed
  feature_cols <- colnames(centres)
  if (is.null(feature_cols)) {
    feature_cols <- seq_len(ncol(centres))
  }
  features <- as.matrix(items[, feature_cols, drop = FALSE])

  # Map each row's cluster label to the matching row of `centres`. Going via
  # character avoids relying on the internal integer codes of a factor.
  cluster_labels <- as.character(items[["cluster"]])
  if (is.null(rownames(centres))) {
    cluster_index <- as.integer(cluster_labels)
  } else {
    cluster_index <- match(cluster_labels, rownames(centres))
  }
  stopifnot(!anyNA(cluster_index))

  # Expand the centres to one row per item (full centre row, not a single
  # element) and compute all row-wise Euclidean distances in one step
  item_centres <- centres[cluster_index, , drop = FALSE]
  unname(sqrt(rowSums((features - item_centres)^2)))
}

# Store the values returned by calculate_distance() as a column on the
# full item matrix
distances <- calculate_distance()
item_matrix[["dist"]] <- distances

# Store each item's k-means cluster label as a factor column
item_matrix[["cluster"]] <- as.factor(kmodel$cluster)

##############################
# STAGE 3: WRITE TO SERVER   #             
##############################

# Serialise the enriched item matrix to the Shiny app's data directory
saveRDS(
  object = item_matrix,
  file = "/efs/shiny-server/latent-feature-explorer/item_matrix.RDS"
)
# bbc/insights-latent-feature-explorer documentation built on Nov. 3, 2019, 2:08 p.m.