options(shiny.reactlog=TRUE)

library(flexdashboard)
library(tidyverse)
library(RColorBrewer)
library(scales)

item_matrix <- readRDS("/efs/shiny-server/hidden-variable-explorer/item_matrix.RDS")
# user_matrix <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_matrix.RDS")

user_aggbycluster <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_aggbycluster.RDS")
user_aggbygender <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_aggbygender.RDS")
user_aggbyage <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_aggbyage.RDS")
user_aggbyfrequency <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_aggbyfrequency.RDS")
user_favshows <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_favshows.RDS")
user_clustersizes <- readRDS("/efs/shiny-server/hidden-variable-explorer/user_clustersizes.RDS")

item_clustersizes <- item_matrix %>% count(cluster)

latent_features <- item_matrix %>% select(tidyselect::matches("f[0-9]")) %>% names()

item_nclusters <- length(unique(item_matrix$cluster))

item_cols <- hue_pal()(item_nclusters)
user_cols <- hue_pal()(nrow(user_aggbycluster))

clusters <- item_matrix$cluster %>% 
  as.numeric() %>% 
  unique() %>% 
  sort()

user_clusters <- sort(unique(as.numeric(user_aggbycluster$cluster)))

top_10 <- item_matrix %>%
  select(tleo, latent_features) %>% 
  gather(latent_feature, value, -tleo) %>%
  group_by(latent_feature) %>%
  top_n(10, value) %>%
  ungroup() %>%
  mutate(col = "positive")

bottom_10 <- item_matrix %>%
  select(tleo, latent_features) %>% 
  gather(latent_feature, value, -tleo) %>%
  group_by(latent_feature) %>%
  top_n(-10, value) %>%
  ungroup() %>%
  mutate(col = "negative")

total <- rbind(top_10, bottom_10)

th <- theme_minimal(base_size = 18) +
  theme(text = element_text(family = "BBC Reith Sans"))

theme_set(th)

Overview

Inputs {.sidebar}

Column

Overview {.no-title}

What is a hidden variable?

Hidden variables (also known as latent variables) are those that are not directly observed, but rather inferred from other, observed variables.

Here's a medical analogy. The common cold (the 'hidden variable') cannot be directly observed by a GP. However, the virus produces a number of symptoms (the 'observed variables') - a blocked nose, a sore throat and headaches - that enable us to make the diagnosis.

What would a hidden variable be in a BBC context?

Theoretically, a hidden variable could represent a characteristic of a programme, e.g suspensfulness or quirkiness. It could also represent a theme, e.g. friendship or dysfunctional family life.

However, the data does not directly tell us what a hidden variable represents. It is up to us to decide the meaning, based on the programmes that score highly on it. For example, if a hidden variable has a positive association with 'Radio 3 in Concert', 'BBC Proms' and 'Composer of the Week', we might decide that it represents classical music.

renderImage({
  list(src = "hidden-variables.png", width = 525, height = 250)
}, deleteFile = FALSE)

How are hidden variables extracted?

Hidden variables are extracted from a set of observed variables using some kind of statistical procedure. There are various such methods, depending on the application. Some common methods include principal components analysis and factor analysis, which is common in psychology and the behavioural sciences.

For this project, we extract hidden variables using a technique called matrix factorisation. The mathematics behind matrix factorisation is complex, but the method is useful because it allows us to explore hidden variables in relation to both our programmes and our users.

If you have any questions about the Hidden Variable Explorer, please contact Joshua Feldman (Joshua.Feldman\@bbc.co.uk).

1. Explore the hidden variables

Inputs {.sidebar}

When building recommendation models, we unearth information about 'hidden' variables of our shows. These hidden variables become available to us when we look at how the audience co-view different groups of shows. The problem is that these dimensions don't have names - we need to work out what they represent. If we do, we can identify different areas where we might have less content than intended.

To help us get a better understanding of this 'hidden' landscape, we have coloured shows that have roughly similar values across each dimension.

selectInput("column1", "Select the first hidden variable:",
            choices = latent_features)

selectInput("column2", "Select the second hidden variable:",
            choices = latent_features, selected = latent_features[2])

Column {data-width=600}

Compare two hidden variables

renderPlot({
  item_matrix %>% 
    ggplot(aes_string(x = input$column1, y = input$column2,
                      label = "tleo", col = "cluster")) +
    geom_text(check_overlap = TRUE, fontface = "bold", key_glyph = draw_key_point) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    guides(col = guide_legend(title = "Cluster")) +
    theme(legend.position = "bottom")
})

Column {data-width=400}

Top/bottom programmes of the x-variable

renderPlot({
  total %>%
    mutate(tleo = tidytext::reorder_within(tleo, value, latent_feature)) %>%
    filter(latent_feature == input$column1) %>%
    ggplot(aes(tleo, value, fill = col)) +
    geom_col() +
    coord_flip() +
    tidytext::scale_x_reordered() +
    guides(fill = FALSE) +
    theme(axis.title = element_blank())
})

Top/bottom programmes of the y-variable

renderPlot({
  total %>%
    mutate(tleo = tidytext::reorder_within(tleo, value, latent_feature)) %>%
    filter(latent_feature == input$column2) %>%
    ggplot(aes(tleo, value, fill = col)) +
    geom_col() +
    coord_flip() +
    tidytext::scale_x_reordered() +
    guides(fill = FALSE) +
    theme(axis.title = element_blank())
})

2. Clustering our programmes

Inputs {.sidebar}

We have clustered BBC shows according to their values for each of the hidden variables For example, if two shows have a very high score on Variables 1-10 but a very low score on Variables 11-20, they are likely to appear in the same cluster. In theory, this should mean that shows within each cluster should be similar – whether in genre, tone or audience.

The number of clusters is optimised so that the clusters are distinct from one another, but also so that the shows within each of them are similar.

selectInput("feature1", "Select a hidden variable:",
            choices = latent_features, selected = latent_features[1])

selectInput("feature2", "Select a hidden variable:",
            choices = latent_features, selected = latent_features[2])

selectInput("cluster1", "Select a cluster:",
            choices = clusters, selected = clusters[1])

# selectInput("cluster2", "Select a cluster:",
#             choices = clusters, selected = clusters[2])

Column {data-width=600}

Compare clusters by hidden variables

renderPlot({
  item_col <- item_cols[as.numeric(input$cluster1)]

  item_matrix %>% 
    select(cluster, latent_features) %>% 
    group_by(cluster) %>% 
    summarise_all(mean) %>% 
    left_join(item_clustersizes) %>% 
    mutate(flag = ifelse(cluster == input$cluster1, "1", "0")) %>% 
    mutate(label = paste("Cluster ", cluster, ":\n", n, " shows", sep = "")) %>% 
    ggplot(aes_string(input$feature1, input$feature2, col = "flag", label = "label")) +
    geom_text(vjust = 1.5, family = "BBC Reith Sans", fontface = "bold", size = 5,
              check_overlap = TRUE) +
    geom_point(aes_string(size = "n"), alpha = 0.25) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_size(range = c(5, 30)) +
    scale_color_manual(values = c("grey", item_col)) +
    guides(col = FALSE, size = FALSE)
})

Column {data-width=400 .tabset}

Genre

renderPlot({
  item_matrix %>% 
    filter(!is.na(genre)) %>% 
    filter(genre != "null") %>% 
    filter(cluster == input$cluster1) %>% 
    count(genre) %>% 
    mutate(prop = n / sum(n)) %>% 
    mutate(genre = reorder(genre, prop)) %>% 
    ggplot(aes(genre, prop, fill = prop)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    labs(x = "Genre", y = "Percentage of cluster")
})

Master Brand

renderPlot({
  item_matrix %>% 
    filter(!is.na(genre)) %>% 
    mutate(master_brand_name = as.factor(master_brand_name)) %>% 
    filter(cluster == input$cluster1) %>% 
    count(master_brand_name) %>% 
    mutate(prop = n / sum(n)) %>% 
    mutate(master_brand_name = reorder(master_brand_name, prop)) %>% 
    ggplot(aes(master_brand_name, prop, fill = prop)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    labs(x = "Master brand", y = "Percentage of cluster")
})

Top hidden variables

renderPlot({
  item_matrix %>% 
    filter(cluster == input$cluster1) %>% 
    select(cluster, latent_features) %>% 
    group_by(cluster) %>% 
    summarise_all(mean) %>% 
    gather(latent_feature, value, -cluster) %>% 
    mutate(latent_feature = reorder(latent_feature, value)) %>% 
    mutate(col = ifelse(value > 0, "positive", "negative")) %>% 
    ggplot(aes(latent_feature, value, fill = col)) +
    geom_col() +
    coord_flip() +
    guides(fill = FALSE) +
    labs(x = "Hidden variable", y = "Value")
})

Example programmes

# renderText({
#   paste(item_matrix$tleo[item_matrix$cluster==input$cluster1], collapse = " • ")
# })

renderPrint({
  item_matrix %>% 
    filter(cluster == input$cluster1) %>% 
    select(tleo, dist) %>% 
    arrange(dist) %>% 
    as.tibble() %>% 
    print(n = 50)
})

3. Clustering our users

Inputs {.sidebar}

We can cluster our users in the same way that we cluster our shows – that is, according to their values for each of the hidden variables.

Again, the number of clusters is optimised so that the clusters are distinct from one another, but also so that the users within each of them are similar in terms of the content they consume.

selectInput("user_feature1", "Select a hidden variable:",
            choices = latent_features, selected = latent_features[1])

selectInput("user_feature2", "Select a hidden variable:",
            choices = latent_features, selected = latent_features[2])

selectInput("user_cluster", "Select a cluster:",
            choices = user_clusters, selected = user_clusters[1])

Column {data-width=600}

Compare clusters by hidden variables

renderPlot({
  user_col <- user_cols[as.numeric(input$user_cluster)]

  user_aggbycluster %>% 
    mutate(flag = ifelse(cluster == input$user_cluster, "1", "0")) %>% 
    ggplot(aes_string(input$user_feature1, input$user_feature2, col = "flag", label = "label")) +
    geom_text(vjust = 1.5, family = "BBC Reith Sans", fontface = "bold", size = 5,
              check_overlap = TRUE) +
    geom_point(aes_string(size = "n"), alpha = 0.25) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_size(range = c(5, 30)) +
    scale_color_manual(values = c("grey", user_col)) +
    guides(col = FALSE, size = FALSE)

})

Column {data-width=400 .tabset}

Gender

renderPlot({
  user_aggbygender %>% 
    filter(cluster == input$user_cluster) %>% 
    mutate(gender = reorder(gender, prop)) %>% 
    ggplot(aes(gender, prop, fill = prop)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    labs(x = "Gender", y = "Percentage of cluster")
})

Age range

renderPlot({
  user_aggbyage %>% 
    filter(cluster == input$user_cluster) %>% 
    mutate(age_range = reorder(age_range, prop)) %>% 
    ggplot(aes(age_range, prop, fill = prop)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    labs(x = "Age range", y = "Percentage of cluster")
})

Frequency

renderPlot({
  user_aggbyfrequency %>% 
    filter(cluster == input$user_cluster) %>% 
    mutate(frequency_band = reorder(frequency_band, prop)) %>% 
    ggplot(aes(frequency_band, prop, fill = prop)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    labs(x = "Frequency band", y = "Percentage of cluster")
})

Top hidden variables

renderPlot({
  user_aggbycluster %>% 
    select(cluster, latent_features) %>% 
    filter(cluster == input$user_cluster) %>% 
    gather(latent_feature, value, -cluster) %>% 
    mutate(latent_feature = reorder(latent_feature, value)) %>% 
    mutate(col = ifelse(value > 0, "positive", "negative")) %>% 
    ggplot(aes(latent_feature, value, fill = col)) +
    geom_col() +
    coord_flip() +
    guides(fill = FALSE) +
    labs(x = "Hidden variable", y = "Value")
})

Favourite shows

renderPrint({
   user_favshows %>% 
    filter(cluster == input$user_cluster) %>% 
    left_join(user_clustersizes) %>% 
    mutate("percent_of_cluster" = paste(round((reach / n) * 100), "%", sep = "")) %>% 
    select(tleo, reach, percent_of_cluster) %>% 
    arrange(-reach) %>% 
    print(n = 50)
})


bbc/insights-latent-feature-explorer documentation built on Nov. 3, 2019, 2:08 p.m.