DSjobtracker

# Chunk defaults for the whole vignette: hide messages and warnings, prefix
# printed output with "#>", and collapse source and output into one block.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  comment = "#>",
  collapse = TRUE
)

# Attach the package and load the tidy dataset used by the examples below.
library(DSjobtracker)
data("DStidy")

Getting started with DSjobtracker

The package contains two datasets

  1. `DSraw` : Raw dataset with `r nrow(DSraw)` rows and `r ncol(DSraw)` columns
  2. `DStidy` : Cleaned tidy dataset with `r nrow(DStidy)` rows and `r ncol(DStidy)` columns

Both datasets contain information about data science job vacancies. The data were collected over the span of a month by searching for specific terms (recorded in `Search_Term`) and then following the search results to gather the details manually.

Usage

  1. Install the package from GitHub
# install devtools if not already installed
# install.packages("devtools")
devtools::install_github("thiyangt/DSjobtracker")
  1. Load the package
library(DSjobtracker)
  1. Load the dataset into your environment
data("DStidy")

Overview of columns

# Print a compact overview of DStidy: one line per column showing its type
# and first few values.
tibble::glimpse(DStidy)

More information on the meanings of the column names can be accessed through the help

# Open the help page that documents the DStidy dataset.
?DStidy

Examples

Barplot of top twenty skills required for data science jobs

library(tidyr)
library(magrittr)
library(dplyr)
library(ggplot2)
library(wordcloud2)
library(viridis)
library(forcats)

# Use the minimal ggplot2 theme for every plot that follows.
theme_set(theme_minimal())

# Total of every skill indicator column (R through Bahasa_Malaysia), sorted
# from the smallest total to the largest.
# NOTE(review): presumably each indicator is a 0/1 factor, so Total is the
# number of vacancies asking for that skill -- confirm against ?DStidy.
skills_long <- DStidy %>%
  select(R:Bahasa_Malaysia) %>%
  pivot_longer(
    everything(),
    names_to = "Name",
    values_to = "Value"
  ) %>%
  # Index the numeric level labels by the factor's integer codes to recover
  # the number each factor value spells.
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Name) %>%
  summarise(Total = sum(Value)) %>%
  arrange(Total)

# Horizontal bar chart of the twenty skills with the highest totals.
skills_long %>%
  # Freeze the ascending-by-Total row order as the factor levels so the bars
  # are drawn in that order regardless of later row reordering.
  mutate(Name = factor(Name, levels = .$Name)) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of silently using the last column; ties are kept, as
  # top_n() did.
  slice_max(Total, n = 20) %>%
  ggplot(aes(x = Name, y = Total)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  geom_label(aes(label = Total),
    nudge_y = -10, size = 3.25,
    label.padding = unit(0.125, "lines")
  ) +
  coord_flip() +
  labs(
    title = "Top twenty skills required for data science jobs",
    x = "Skill Required", y = "No of job vacancies"
  )

Wordcloud of software skills

# Indicator columns that are excluded when looking at software skills only.
# Each name must match a DStidy column name exactly, so spellings are kept
# as they appear in the data. "BigData" was listed twice in the original
# vector; the redundant entry is dropped (membership tests are unaffected).
not_software_columns <- c(
  "Presentation_Skills", "Data_visualization",
  "Spreadsheets", "BigData",
  "Communication",
  "Data_warehouse", "cloud_storage",
  "Google_Cloud", "Machine_Learning",
  "Computer_vision", "Deep_Learning", "RDBMS",
  "web_design_and_development_tools", "AI",
  "Natural_Language_Processing(NLP)",
  "graphics_and_design_skills", "Data_marketing",
  "SEO", "Content_Management",
  "Data_Pipelines", "MPP_Platforms", "agile_execution",
  "Data_management", "Data_mining", "Data_science",
  "Web_Analytic_tools", "IOT",
  "Numerical_Analysis", "Finance_Knowledge", "Economic",
  "Investment_Knowledge", "Problem_Solving",
  "Korean_language", "Team_Handling",
  "Debtor_reconcilation", "Payroll_management",
  "Bayesian", "Optimization", "Bahasa_Malaysia"
)
# All skill indicator columns of DStidy (R through Bahasa_Malaysia).
indicators <- DStidy %>%
  select(R:Bahasa_Malaysia)

# Keep only the indicator columns that are not listed in
# not_software_columns, preserving their original order.
software_indicators <-
  indicators[!(colnames(indicators) %in% not_software_columns)]

# Total of every remaining indicator column, sorted ascending by total.
software_indicators_long <- software_indicators %>%
  pivot_longer(
    everything(),
    names_to = "Name",
    values_to = "Value"
  ) %>%
  # Index the numeric level labels by the factor's integer codes to recover
  # the number each factor value spells.
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Name) %>%
  summarise(Total = sum(Value)) %>%
  arrange(Total)

# Word cloud of software skills, sized by the log of each skill's total.
# Skills with a total of zero are dropped first: log(0) is -Inf, which is not
# a usable word frequency (the radar plot data applies the same Total > 0
# filter before taking logs).
wordcloud_data <- software_indicators_long %>%
  filter(Total > 0) %>%
  transmute(word = Name, freq = log(Total))

wordcloud2(wordcloud_data,
  size = 0.35,
  # Equal min and max rotation fixes the angle of every rotated word.
  minRotation = pi / 2, maxRotation = pi / 2,
  # One viridis colour per word that is actually drawn.
  color = viridis(nrow(wordcloud_data)),
  fontFamily = "Montserrat"
)

The log of the counts was used so that the differences between skills are easier to see.

Required experience and minimum education qualification

# Number of vacancies for every combination of experience category and
# education category, ignoring rows with a missing education category.
count_data <- DStidy %>%
  filter(!is.na(Edu_Category)) %>%
  select(Experience_Category, Edu_Category) %>%
  count(Experience_Category, Edu_Category)

# Largest count over all combinations; quoted in the plot annotation.
max_vacancies <- max(count_data[["n"]])

# Dot plot of vacancy counts by experience category (reversed factor order),
# coloured by education category and sized by count, with a hand-placed arrow
# and text annotation.
# NOTE(review): the arrow coordinates and the annotation wording are
# hard-coded; they presumably point at the group that attains max_vacancies
# -- confirm against the data.
ggplot(
  count_data,
  aes(
    x = fct_rev(Experience_Category), y = n,
    color = Edu_Category, size = n
  )
) +
  geom_point() +
  # Curved arrow from (x = 4.4, y = 62.5) to (x = 4, y = 58), in pre-flip
  # coordinates.
  geom_curve(
    data = tibble(x1 = 4.4, y1 = 62.5, x2 = 4, y2 = 58),
    mapping = aes(x = x1, y = y1, xend = x2, yend = y2),
    curvature = -0.3, color = "gray20", size = 0.4,
    arrow = arrow(length = unit(0.07, "inch"))
  ) +
  annotate(
    "text",
    x = 4.65, y = 60, size = 2.8,
    label = paste(
      "With less than 2 years of experience\n and a BSc degree \n you can still apply for",
      max_vacancies,
      "job vacancies"
    )
  ) +
  coord_flip() +
  ylim(0, 100) +
  scale_color_brewer(type = "qual", palette = "Set1") +
  # Size only reinforces the count, so it gets no legend of its own.
  scale_size(guide = "none") +
  guides(color = guide_legend(nrow = 2, title.position = "top")) +
  labs(
    x = "Years of Experience needed",
    y = "Number of job vacancies",
    color = "Minimum education qualification"
  ) +
  theme(legend.position = "bottom")

Software Skills needed for each Job Category

# Data for the radar plot: total (and log total) of every skill indicator
# within each job category, excluding the "Unimportant" category and any
# category/skill pair whose total is zero.
job_skill_data <- DStidy %>%
  select(R:Bahasa_Malaysia, Job_Category) %>%
  filter(Job_Category != "Unimportant") %>%
  pivot_longer(
    R:Bahasa_Malaysia,
    names_to = "Name",
    values_to = "Value"
  ) %>%
  # Index the numeric level labels by the factor's integer codes to recover
  # the number each factor value spells.
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Job_Category, Name) %>%
  summarise(Total = sum(Value)) %>%
  # summarise() leaves the result grouped by Job_Category; drop that grouping
  # before the row-wise steps below.
  ungroup() %>%
  filter(Total > 0) %>%
  mutate(logTotal = log(Total))

# Names of the software skills that appear in exactly three job categories.
# NOTE(review): 3 is presumably the number of job categories left after
# dropping "Unimportant", i.e. skills present in every category -- confirm.
common_skills <- job_skill_data %>%
  count(Name) %>%
  filter(n == 3, !(Name %in% not_software_columns)) %>%
  pull(Name)

# Give every common skill a numeric x position for the polar plot.
# `levels = common_skills` ties position i to common_skills[i], which is the
# label scale_x_continuous() puts on break i. The previous
# `labels = common_skills` numbered the skills in sort() order instead, which
# only matches when sort() and count() happen to order the names identically.
plot_data <- job_skill_data %>%
  filter(Name %in% common_skills) %>%
  mutate(Name = as.numeric(factor(Name, levels = common_skills)))

# One polar ("radar") panel per job category: a faint filled area, a point,
# and a spoke from the centre for every common skill, all on the log scale.
plot_data %>%
  ggplot(
    aes(
      x = Name, y = logTotal,
      fill = Job_Category, color = Job_Category
    )
  ) +
  geom_area(size = 0, position = position_dodge(width = 0.9), alpha = 0.1) +
  geom_point(size = 0.5) +
  # Spokes run from y = 0 up to the skill's log total and fade with it.
  geom_segment(
    aes(xend = Name, yend = logTotal, alpha = logTotal),
    y = 0, size = 1.25
  ) +
  # seq_along() stays empty for an empty skill list, unlike 1:length().
  scale_x_continuous(
    labels = common_skills,
    breaks = seq_along(common_skills)
  ) +
  theme(
    axis.text.y = element_blank(),
    legend.position = "none"
  ) +
  labs(x = NULL, y = NULL) +
  scale_fill_brewer(palette = "Set1", type = "qual") +
  coord_polar() +
  facet_wrap(~Job_Category, ncol = 1)


Try the DSjobtracker package in your browser

Any scripts or data that you put into this service are public.

DSjobtracker documentation built on Dec. 15, 2020, 5:31 p.m.