## ---- load-pkgs2
library(kableExtra)
library(janitor)
library(tsibble)
library(brolgar)
library(patchwork)
## ---- age-table
age_table <- full_demographics %>%
group_by(age_1979) %>%
count(age_1979)
age_table %>%
mutate(n = scales::comma(n)) %>%
kable(
caption = "The frequency table of the age at the start of the survey (?CHECK) in the full NSLY79 data",
col.names = c("Age", "Number of individuals"),
booktabs = TRUE,
linesep = "",
align = "r") %>%
kable_styling()
## ---- gender-race-table
#' @param x a tabyl with frequency table in "core" attribute
freq_formatting <- function(x) {
attr(x, "core") %>%
adorn_totals(c("row", "col")) %>%
mutate(across(where(is.numeric), ~scales::comma(.x, 1)))
}
sex_race_table <- full_demographics %>%
tabyl(sex, race) %>%
adorn_totals(c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_ns(., position = "front", ns = freq_formatting(.)) %>%
mutate(sex = str_to_title(sex))
kable(sex_race_table,
caption = "The contingency table for sex and race for the full NLSY79 data.",
col.names = c("Sex", "Hispanic", "Black", "Non-Black, Non-Hispanic", "Total"),
booktabs = TRUE,
linesep = "",
align = "lrrrr") %>%
kable_classic() %>%
row_spec(2, hline_after = TRUE) %>%
add_header_above(c(" " = 1, "Race" = 3, " " = 1))
## ---- summarytable
kable(as.array(summary(wages_before$mean_hourly_wage)),
caption = "Summary statistics of the hourly wages from the high school dropout subcohort of NLSY79 data",
col.names = c("Statistics", "Hourly wage ($)"),
booktabs = TRUE,
linesep = "") %>%
kable_styling()
## ---- sample-plot
wages_before_tsibble <- as_tsibble(x = wages_before,
key = id,
index = year,
regular = FALSE)
set.seed(20210225)
ggplot(wages_before_tsibble,
aes(x = year,
y = mean_hourly_wage,
group = id)) +
geom_line(alpha = 0.7) +
facet_sample(n_per_facet = 1, n_facets = 20) +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
theme_bw() +
#theme(axis.text.x = element_text(angle = 10, size = 12)) +
ylab("Hourly wage ($)")
## ---- feature-plot
wages_high <- filter(wages_before, mean_hourly_wage > 500) %>%
as_tibble() %>%
head(n = 6)
wages_high2 <- wages_before %>%
filter(id %in% wages_high$id)
spag <- wages_before %>%
ggplot(aes(x = year,
y = mean_hourly_wage,
group = id)) +
geom_line(alpha = 0.5) +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
#labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
labs(tag = "(A)") +
theme_bw() +
ylab("Hourly wage ($)") #+
#theme(plot.title = element_text(size = 10))
wages_three_feat <- wages_before_tsibble %>%
features(mean_hourly_wage,
feat_three_num
)
wages_feat_long <- wages_three_feat %>%
pivot_longer(c(min, med, max),
names_to = "feature", values_to = "value") %>%
mutate(feature = factor(feature, levels = c("min", "med", "max")))
feature <- ggplot(wages_feat_long) +
geom_density(aes(x = value, colour = feature, fill = feature), alpha = 0.3) +
labs(tag = "(B)") +
theme_bw() +
theme(#plot.title = element_text(size = 10),
#axis.text.x = element_text(angle = 10, size = 6),
legend.position = "none")
feature_bp <- ggplot(wages_feat_long,
aes(y=value, x = feature,
fill = feature, color = feature)) +
geom_boxplot() +
theme_bw() +
labs(tag = "(B)") +
ylab("Hourly wage ($)") +
xlab("Feature") +
theme(legend.position = "none")
plot_high <- ggplot(filter(wages_high2, id == 39)) +
geom_line(aes(x = year,
y = mean_hourly_wage)) +
geom_point(aes(x = year,
y = mean_hourly_wage),
size = 0.5,
alpha = 0.5) +
annotate("text", x = 2010, y = 1000, label = "id: 39") +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
#labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
theme_bw() +
labs(tag = "(C)", y = "Hourly wage ($)")
#spag + feature + feature_bp + plot_high + plot_layout(nrow = 1, guides = "collect") &
# theme(legend.position = "bottom")
spag + feature_bp + plot_high +
plot_layout(nrow = 1, guides = "collect") #&
#theme(legend.position = "bottom")
## ---- compare-data
set.seed(31251587)
wages_cleaned <- readRDS(here::here("paper/results/wages_after.rds"))
sample_id <- sample(unique(wages_cleaned$id), 20)
sample <- subset(wages_cleaned, id %in% sample_id)
wages_compare <- sample %>%
select(id, year, mean_hourly_wage, wages_rlm) %>%
rename(mean_hourly_wage_rlm = wages_rlm) %>%
pivot_longer(c(-id, -year), names_to = "type", values_to = "wages")
## ---- compare-plot
ggplot(wages_compare) +
geom_line(aes(x = year,
y = wages,
colour = type,
linetype = type),
alpha = 1, size=1.3) +
facet_wrap(~id, scales = "free_y") +
theme_bw() +
theme(#axis.text.x = element_text(angle = 10, size = 5),
legend.position = "bottom") +
scale_linetype_manual("",
#values = c("mean_hourly_wage","mean_hourly_wage_rlm"),
values = c("11", "solid"),
labels = c("Before", "After")) +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
#xlab("Year") +
ylab("Hourly wage ($)") +
# scale_colour_brewer("", palette = "Dark2", direction = -1,
scale_colour_grey("", labels = c("Before", "After"))
# scale_color_hue(labels = c("Before", "After"))
## ---- fixed-feature-plot
spag2 <- wages_cleaned %>%
ggplot(aes(x = year,
y = wages_rlm,
group = id)) +
geom_line(alpha = 0.1) +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
theme_bw() +
labs(tag = "(A)") +
theme(plot.title = element_text(size = 10)) +
ylab("Hourly wage ($)")
wages_hs2020_rlm <- as_tsibble(x = wages_cleaned,
key = id,
index = year,
regular = FALSE)
wages_three_feat_rlm <- wages_hs2020_rlm %>%
features(wages_rlm,
feat_three_num
)
wages_feat_long_rlm <- wages_three_feat_rlm %>%
pivot_longer(c(min, med, max), names_to = "feature", values_to = "value") %>%
mutate(feature = factor(feature, levels = c("min", "med", "max")))
feature2 <- ggplot(wages_feat_long_rlm) +
geom_density(aes(x = value, colour = feature, fill = feature), alpha = 0.3) +
labs(tag = "(B)") +
theme_bw() +
theme(plot.title = element_text(size = 10)) +
xlab("value (log10)")
feature2_bp <- ggplot(wages_feat_long_rlm,
aes(y=value, x = feature,
fill = feature, color = feature)) +
geom_boxplot() +
theme_bw() +
labs(x = "Feature",
y = "Hourly wage ($)",
tag = "(B)") +
theme(legend.position = "none")
plot_high_after <- ggplot(filter(wages_cleaned, id == 39)) +
geom_line(aes(x = year,
y = wages_rlm)) +
geom_point(aes(x = year,
y = wages_rlm),
size = 0.5,
alpha = 0.5) +
annotate("text", x = 1985, y = 26, label = "id: 39") +
scale_x_continuous("Year",
breaks = seq(1980, 2020, 10),
labels = c("80", "90", "00", "10", "20"),
minor_breaks = seq(1980, 2020, 5)) +
theme_bw() +
labs(tag = "(C)", y = "Hourly wage ($)")
spag2 + feature2_bp + plot_high_after
## ---- save-data
# select out the old value of mean hourly wage and change it with the wages_rlm value
wages_clean <- wages_cleaned %>%
select(-mean_hourly_wage) %>%
rename(mean_hourly_wage = wages_rlm)
# rename and select the wages in tidy
wages <- wages_clean %>%
select(id, year, mean_hourly_wage, age_1979, sex, race, grade, hgc, hgc_i, hgc_1979, ged = dip_or_ged,
number_of_jobs, total_hours, stwork, yr_wforce, exp, is_wm, is_pred) %>%
mutate(id = as.factor(id),
hgc = as.factor(hgc),
year = as.integer(year),
age_1979 = as.integer(age_1979),
ged = as.factor(ged),
number_of_jobs = as.integer(number_of_jobs)) %>%
rename(wage = mean_hourly_wage,
njobs = number_of_jobs,
hours = total_hours) %>%
as_tsibble(key = id,
index = year,
regular = FALSE)
# load the wages data to the package
usethis::use_data(wages, overwrite = TRUE)
# Create a data set for demographic variables
demog_nlsy79 <- full_demographics %>%
select(id,
age_1979,
sex,
race,
hgc,
hgc_i,
dip_or_ged) %>%
mutate(id = as.factor(id),
age_1979 = as.integer(age_1979),
hgc = as.factor(hgc),
ged = as.factor(dip_or_ged))
# Create a data set for the high school dropouts cohort
wages_hs_do <- wages %>%
filter(hgc_i < 12 | (hgc_i >= 12 & ged == 2)) %>%
filter(age_1979 <= 17,
sex == "m") %>%
as_tsibble(key = id,
index = year,
regular = FALSE)
# investigate the dropouts IDs with the IDs in the original data
sw <- brolgar:: wages
`%!in%` <- Negate(`%in%`)
not_in_sw <- sw %>% filter(id %!in% wages_hs_do$id) %>% distinct(id, .keep_all = TRUE)
join <- left_join(not_in_sw, full_demographics, by = "id")
# some IDs are more than 17 y.o
join_more_17 <- join %>% filter(age_1979 > 17)
#this IDs are not eligible according to original data definition, bu we decide to include them later on
# the following should be eligible by age rule but somehow not included
join_less_17 <- join %>% filter(age_1979 <= 17)
# reasons:
# some ID are not eligible as their hgc_i >= 12 and GED is coded to 1
join_not_elig <- join_less_17 %>% filter(hgc_i >= 12 & dip_or_ged == 1)
# the information of dip_or_ged is missing in some IDs, decide to include them
join_missing_ged <- join_less_17 %>% filter(is.na(dip_or_ged))
## the ged is coded to 3, we decide to include them in the data
join_ged_both <- join_less_17 %>% filter(dip_or_ged == 3)
## the IDs do not exist in the complete wages data
join_not_exist <- join_less_17 %>% filter(id %!in% wages$id)
# based on those decisions, we refilter the data
not_elig <- join %>% filter(hgc_i >= 12 & dip_or_ged == 1)
join_eligible <- join %>% filter(id %!in% not_elig$id)
# filter the wages data
also_do <- wages %>% filter(id %in% join_eligible$id)
# bind the initial dropouts data and the eligible data that are not captured in the first filtering
wages_hs_do <- rbind(wages_hs_do, also_do)
# load the dropouts' wages data to the package
usethis::use_data(wages_hs_do, overwrite = TRUE)
### ---- nrow
a <- nrow(categories_qnames)
b <- nrow(full_demographics)
d <- eligible_wages %>%
group_by(id) %>%
count(id) %>%
nrow()
n_hgc <- wages %>%
as_tibble() %>%
group_by(id) %>%
count(id) %>%
nrow()
n_obs_hgc <- nrow(wages)
n_obs_hgc_pred <- filter(wages, is_pred == TRUE) %>%
nrow()
n_do <- wages_hs_do %>%
as_tibble() %>%
group_by(id) %>%
count(id) %>%
nrow()
n_obs_do <- nrow(wages_hs_do)
n_obs_do_pred <- filter(wages_hs_do, is_pred == TRUE) %>%
nrow()
### ---- flow-chart
include_graphics(here::here("paper/figures/flowchart.png"))
# grViz("digraph flowchart {
# node [fontname = Helvetica, shape = rectangle]
# tab1 [label = '@@1']
# tab2 [label = '@@2']
# tab3 [label = '@@3']
# tab4 [label = '@@4']
# tab5 [label = '@@5']
# tab6 [label = '@@6']
# tab7 [label = '@@7']
#
# tab1 -> tab2;
# tab1 -> tab3;
# tab1 -> tab4 -> tab5 -> tab6;
# tab5 -> tab7;
# tab6 -> tab7;
# tab2 -> tab7
# }
#
# [1]: paste0('NLSY79, n = ', a)
# [2]: paste0('Extract demographic variable, n = ', b)
# [3]: paste0('Exclude ', a - d, ' IDs whose hourly rate is missing')
# [4]: paste0('Eligible ID, n = ', d)
# [5]: paste0('Cohort whose hgc is up to 12th grade and \\n participated at least 5 years in the survey, \\n n = ', n_hgc, ', n_obs = ', n_obs_hgc, ' \\n (', n_obs_hgc_pred, ' observations are predicted value)')
# [6]: paste0('High school dropouts cohort, \\n n = ', n_do, ', n_obs = ', n_obs_do, ' \\n (', n_obs_do_pred, ' observations are predicted value)')
# [7]: 'yowie Package'
# ",
# height = 700)
### ---- flow-chart-blind
include_graphics(here::here("paper/figures/flowchart_blind.png"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.