# scratch/taxonomies.R

library(tidyverse)
library(zoo)
library(viridis)
library(collapsibleTree)
library(R6)
library(readxl)
library(janitor)
library(svglite)
library(ggforce) #includes geom_circle (https://ggforce.data-imaginist.com/reference/geom_circle.html)
library(ggfittext)
library(ggrepel)


# ------------------------------------------------------------------------------
# Coursera skills taxonomy linked to the WEF taxonomy
# Load the Coursera-to-Forum skill mapping, then report how many rows have no
# Forum skill (NA in wef_skill) recorded against them.
cdata <- read.csv(file = "coursera-data-feb/coursera_wef_taxonomy_map.csv")
print(
  paste(
    "Of", length(cdata$wef_skill), "Coursera skills,",
    sum(is.na(cdata$wef_skill)), "have no Forum counterpart"
  )
)


# Arrange the data so it can be made a well-ordered dendrogram
# Keep only the level-2 rows and the three ID columns, then sort on every
# remaining column so the tree's branches come out in a stable order.
cdata <- cdata %>%
  filter(level == 2) %>%
  select(domain_id, competency_id, skill_id) %>%
  arrange(domain_id, competency_id, skill_id)

# Build and display the interactive dendrogram. The back-ticked data frame is
# reused further down the script, so its name must stay as it is.
`Coursera Skills Taxonomy` <- cdata
p <- collapsibleTree(
  df = `Coursera Skills Taxonomy`,
  hierarchy = c("domain_id", "competency_id", "skill_id")
)
p

# The following section is commented as the input data is incomplete -----------
# Make the diagram with names not IDs
#`Coursera Skills Taxonomy`$domain_name     <- data$skill_name[match(`Coursera Skills Taxonomy`$domain_id    , data$skill_id)]
#`Coursera Skills Taxonomy`$competency_name <- data$skill_name[match(`Coursera Skills Taxonomy`$competency_id, data$skill_id)]
#`Coursera Skills Taxonomy`$skill_name      <- data$skill_name[match(`Coursera Skills Taxonomy`$skill_id     , data$skill_id)]

# However quite a few competency names were not included in the "dictionary" (data)
# As a result they are NA here
# As a result the dendrogram doesn't quite work right
# Have asked Eric Karsten about this

# Print the interactive dendrogram
#p <- collapsibleTree(`Coursera Skills Taxonomy`, c("domain_name", "competency_name", "skill_name"))
#p
# ------------------------------------------------------------------------------

# Add mappings from each of domains competencies and skills to Forum level-4 skills
# Propagate the domain mappings and the competency mappings down to skills mappings
# Note that a single Coursera skill is often mapped by two Forum level-4 skills,
# as they may be mapped twice as e.g. domains and competencies
# Look up the Forum skill recorded against each domain, competency and skill ID.
#
# The original lookups indexed `data`, which is not a data frame at this point
# in the script (the name still resolves to the utils::data() function), and
# `cdata` has already been cut down to three ID columns. The unfiltered mapping
# file is therefore re-read to serve as the ID -> wef_skill dictionary.
# NOTE(review): presumably this CSV is the intended dictionary, since it is the
# file that carries both `skill_id` and `wef_skill` -- confirm.
coursera_lookup <- read.csv("coursera-data-feb/coursera_wef_taxonomy_map.csv")

# Return the wef_skill for each ID in `ids` (NA where the ID is not listed).
lookup_wef_skill <- function(ids) {
  coursera_lookup$wef_skill[match(ids, coursera_lookup$skill_id)]
}

`Coursera Skills Taxonomy`$wef_skill1 <- lookup_wef_skill(`Coursera Skills Taxonomy`$domain_id)
`Coursera Skills Taxonomy`$wef_skill2 <- lookup_wef_skill(`Coursera Skills Taxonomy`$competency_id)
`Coursera Skills Taxonomy`$wef_skill3 <- lookup_wef_skill(`Coursera Skills Taxonomy`$skill_id)

# ------------------------------------------------------------------------------


# ------------------------------------------------------------------------------
# WEF (Forum) skills taxonomy
# Read the Forum taxonomy workbook and keep an .RData copy of the raw import.
forum_skills <- read_excel("data/taxonomy-WEF.xlsx")
save(list = "forum_skills", file = "data/forum_skills.RData")

# Fill downwards as the Excel file's merged cells were made NA except at the top
# Don't do this for level 4 as some are intentionally blank
# Start from the freshly read workbook: `fdata` does not exist yet at this
# point, so piping it into itself fails with "object 'fdata' not found".
# Only L1-L3 are filled downwards; L4 is left untouched because some of its
# blanks are intentional.
fdata <- forum_skills %>%
  select(L1, L2, L3, L4) %>%
  fill(L1, L2, L3, .direction = "down")

# Make them factors now we have done necessary naming fixes to Excel lapses
# Convert each of the four level columns to a factor, keeping only those
# columns and their original names.
fdata <- fdata %>%
  transmute(across(c(L1, L2, L3, L4), factor))

# Print the interactive dendrogram
# Build and display the four-level interactive dendrogram for the Forum data.
`Forum Skills Taxonomy` <- fdata
p <- collapsibleTree(
  df = `Forum Skills Taxonomy`,
  hierarchy = c("L1", "L2", "L3", "L4")
)
p


# ------------------------------------------------------------------------------

# Let's start by mapping Coursera competencies (L2) to Forum level-3 skills

# First we need a level-2-granularity data frame for Coursera
# One row per distinct (domain, competency) pair, with the two ID columns
# relabelled L1 and L2; the skill-level column is dropped.
cdata2 <- cdata %>%
  select(L1 = domain_id, L2 = competency_id) %>%
  unique()

# Now make a level-3-granularity data frame for the Forum's taxonomy
# One row per distinct (L1, L2, L3) combination of the Forum taxonomy.
fdata3 <- unique(select(fdata, L1, L2, L3))

# As we are making a matrix to go with level-2 Coursera, add two empty rows
# Prepend two all-NA rows; once transposed they become two empty leading
# columns.
fdata3 <- bind_rows(
  data.frame(L1 = rep(NA, 2), L2 = rep(NA, 2), L3 = rep(NA, 2)),
  fdata3
)

# Now transpose it, so we can build the redistribution matrix
# Transpose so that every row of fdata3 becomes a column; the three rows of
# the result are the L1, L2 and L3 labels.
fdata3T <- t(fdata3)

# Bind the labelling parts of the matrix
library(data.table)
forum_header <- as.data.table(fdata3T)

# cdata2 has far fewer columns than the transposed Forum table, so it cannot
# take that table's full set of column names (and t() of a data frame with
# default row names typically carries no column names at all). Instead, give
# the Coursera columns the names of the leading header columns, so that
# rbindlist() stacks them underneath the padding columns.
stopifnot(ncol(forum_header) >= ncol(cdata2))
colnames(cdata2) <- colnames(forum_header)[seq_len(ncol(cdata2))]

# fill = TRUE matches columns by name and pads the remaining ones with NA.
cfmatrix <- rbindlist(list(forum_header, cdata2), fill = TRUE)

# Output a csv file to fill with 1s and 0s to redistribute enrollment numbers
# write.csv() ignores `col.names` (with a warning) and always emits a header
# row, so the header-less file that was asked for is written with
# write.table() using the same CSV conventions (comma separator, doubled
# quotes).
write.table(
  cfmatrix,
  file = "empty-matrix.csv",
  sep = ",",
  qmethod = "double",
  col.names = FALSE,
  row.names = FALSE
)


# !!!!
# To do:
#  - The enrollment data is mixed between level 1 and 2
#  - There is some duplication, with categories and subcategories sometimes both reported
#  - So you need to do a bespoke job; you can't just make a single level-X to level-Y aggregation matrix
#  - Start with the data, and choose which to use when there is duplication
#  - Manually sort what is available into level-3 categories
#  - ...or level-4 where it seems more appropriate
#  - ...while adding in sensible industry specialized categories where needed







# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Experimentation with OO taxonomies

# Toy parent/child edge list: each entry of `daughters` is paired with the
# entry of `mothers` at the same position.
daughters <- c("b", "c", "d", "e", "f", "g", "h")
mothers <- c("a", "a", "b", "b", "c", "c", "c")
data <- data.frame(daughters = daughters, mothers = mothers)

# Minimal R6 wrapper that stores a taxonomy's name alongside its data and
# exposes read-only accessors for both.
Taxonomy <- R6Class(
  classname = "Taxonomy",
  public = list(
    # Store the supplied name and data in the private fields.
    initialize = function(name, data) {
      private$name <- name
      private$data <- data
    },
    # Accessor for the stored name.
    GetName = function() {
      private$name
    },
    # Accessor for the stored data.
    GetData = function() {
      private$data
    }
  ),
  private = list(
    name = NULL,
    data = NULL,
    # Declared but not read or written by any method of this class.
    daughters = logical(0)
  )
)

taxonomy <- Taxonomy$new(name = "WEF", data = data)

# A node in a skills tree: a name plus a list of daughter Skill objects.
#
# The constructor used to create "Bob" and "Janet" daughters unconditionally.
# Each of those in turn created its own daughters, so Skill$new() recursed
# without end (and the second assignment overwrote the first anyway).
# Daughters are now passed in by the caller and default to none.
Skill <- R6Class(
  "Skill",
  public = list(
    # name:      label for this skill.
    # daughters: list of child Skill objects (defaults to an empty list).
    initialize = function(name, daughters = list()) {
      stopifnot(is.list(daughters))
      private$name <- name
      private$daughters <- daughters
    },
    # Accessor for the stored name.
    GetName = function() {
      private$name
    },
    # Print the structure of the daughters list (debugging aid kept from the
    # original) and return the list.
    GetDaughters = function() {
      str(private$daughters)
      private$daughters
    }
  ),
  private = list(
    name = NULL,
    daughters = list()
  )
)

# Demo: one parent with the two daughters the old constructor tried to create.
tax1 <- Skill$new(
  "Alexia",
  daughters = list(Skill$new("Bob"), Skill$new("Janet"))
)
tax1$GetName()
z <- tax1$GetDaughters()
# markrayner/skillr documentation built on March 31, 2022, 9:07 p.m.