# scratch/save_ONET_taxonomy.R

# A macro to save the O*NET taxonomy as a data.tree within the package
# https://www.onetcenter.org/database.html#individual-files
# https://www.onetcenter.org/dictionary/26.2/excel/content_model_reference.html
library(janitor)
library(tidyverse)
library(readxl)
library(data.tree)

# Read the Content Model Reference workbook, tidy up the column names and
# keep only the element codes and the element names
data <- read_excel(path = "scratch/ONET/Content Model Reference.xlsx") %>%
  clean_names() %>%
  select(element_id, element_name)


# Plot how often each label occurs: repeated labels could scupper the
# algorithm
ggplot(data = data, mapping = aes(x = element_name)) +
  geom_bar() +
  coord_flip()

# Collect every label that is used by more than one element
duplicates <- data %>%
  count(element_name) %>%
  filter(n > 1)

# Where a label is duplicated, append the element ID to make it unique
# (all IDs are unique). The join adds a count column n that is NA for labels
# that are not duplicated; it is dropped again by the final select()
data <- data %>%
  left_join(duplicates, by = "element_name") %>%
  mutate(
    element_name = ifelse(
      is.na(n),
      element_name,
      paste0(element_name, " (", element_id, ")")
    )
  ) %>%
  select(element_id, element_name)

# Clean up
remove(duplicates)

# Split each element_id into its components,
# e.g. "5.D.1" becomes "5" "D" "1"
id_parts <- strsplit(data$element_id, split = "\\.")

# Drop the last component and stick the rest back together to get the
# mother code, e.g. "5" "D" "1" becomes "5.D"; a top-level code such as "5"
# becomes "".
# vapply() replaces a 1:length() loop, which would run over c(1, 0) and fail
# on an empty table; it also guarantees exactly one string per element.
parent_ids <- vapply(
  id_parts,
  function(parts) paste(head(parts, -1), collapse = "."),
  character(1)
)

# Make a mother data frame to join the children to via mother_id
# (built before mother_id is added to data, so it holds only ID and name)
mothers <- data %>%
  rename(
    mother_id = element_id,
    mother_name = element_name
  )

# Add a row for the ultimate mother
mothers <- rbind(
  data.frame(mother_id = "0", mother_name = "O*NET"),
  mothers
)

# If there is no mother, the mother_id is 0 (the ultimate mother)
parent_ids[parent_ids == ""] <- "0"

# Add a mother_id column to the original data frame
data$mother_id <- parent_ids

# Clean up objects we no longer need
remove(id_parts, parent_ids)

# Create an edge list in the format data.tree expects: start from every
# mother, attach the daughters whose mother_id points at her, and keep only
# the mother and child names. Mothers without daughters get an NA child name
# from the left join and are removed by drop_na()
edges <- mothers %>%
  left_join(data, by = "mother_id") %>%
  select(mother_name, child_name = element_name) %>%
  drop_na()

# Build the data.tree from the mother/child edge list
onet.skills <- FromDataFrameNetwork(edges)

# Print the finished tree for a visual check
print(onet.skills)

# Save the data.tree so it can be accessed via library(skillr)
save(onet.skills, file = "data/onet.skills.RData", version = 2)

# Clean up everything this script created
remove(onet.skills, data, edges, mothers)
# markrayner/skillr documentation built on March 31, 2022, 9:07 p.m.