In rfaridi/hiesBD: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(tidyverse)
library(labelled)
library(here)

let's load some library first

Reproduce table 2.1

Let's bring in household data

load("./data/HIES_2016/cover_hies_2016.rda")

Let's import household members data

load(here("./data/HIES_2016/hh_sec_1a.rda"))
members <- hh_sec_1a

Check no of strata

hhold %>% 
  summarize(no.of.strata=n_distinct(stratum16))

here i thought it will be 20 according to the report, but seems its different, only 16. Must be just rural and urban

Now check the no. of substratum

hhold %>% 
  summarize(no.of.strata=n_distinct(stratum))

Now it matches with the no. of substratum mentioned in the report

Check no. of psu

hhold %>% 
  summarize(no.of.psu=n_distinct(psu))

It matches with the Report

Let's check out other bunch of variables

what is that ruc?

var_label(hhold$ruc)
hhold %>% 
  summarize(n_distinct(ruc))

So it seems 3 different values

hhold %>% 
    count(ruc)

Check out urbrural

hhold %>% 
    count(urbrural)

First check whether there is unique household id

hhold %>% 
  summarize(n_distinct(hhid))

In 2016, there seems to be unique hhid

members %>% 
  summarize(n_distinct(hhid))

Find household size

members <- members %>% 
              group_by(hhid) %>% 
               mutate(hsize=n())

Now we will select only household head and few other variables

members_tr <- members %>%
            rename(rel_hh=s1aq02,
                   sex_hh=s1aq01) %>% 
            filter(rel_hh==1) %>% 
             select(hhid, sex_hh, hsize, psu, stratum, stratum16, ruc, urbrural,hhwgt) %>% 
            ungroup()

Find number male and female headed households

members_tr %>%
  #group_by(sex.hh) %>% 
  summarise(n())

Alternative method to find grouping count

members_tr %>% 
  count(sex_hh)

Let's check out unweighted distribution of household size

Let's first find the overall mean household size

members_tr %>% 
  summarise(national=mean(hsize))

First by rural urban category

members_tr %>% 
    group_by(urbrural) %>% 
    summarize(mean_size=mean(hsize))

Then by sex of household head

members_tr %>% 
    group_by(sex.hh) %>% 
    summarize(mean_size=mean(hsize))

Now the two way table as in the Report (Table number 2.1)

members_tr %>% 
    group_by(sex_hh, urbrural) %>% 
    summarize(mean_size=mean(hsize))

Let's save members_tr object

save(members_tr, file=here("./data/HIES_2016/members_tr.rda"))

If we match with the report in the table, we will find that it is not exactly matching. This is because, so far we have not considered the survey design into our analysis.

Let's do that by introducing the srvyr package

library(survey)
library(srvyr)

Now let's insert the survey design

members_svy <- members_tr %>%  
             as_survey(id=psu,strata=stratum,weights=hhwgt)
class(members_svy)
summary(members)

members_svy %>% 
   summarise(national=survey_mean(hsize)) %>%  # note the use of survey mean
   mutate(national=round(national,2)) %>% 
   pull(national)

Rural urban category wise household size

members_svy %>% 
    group_by(urbrural) %>% 
    summarize(mean_hsize=survey_mean(hsize)) %>% 
    ungroup() %>% 
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)

Sex of household head wise household size

members_svy %>% 
    group_by(sex_hh) %>% 
    summarize(mean_hsize=survey_mean(hsize)) %>%          
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)

Now the final cross tabulation between sex of household head and rural urban category

members_svy %>% 
    group_by(sex_hh, urbrural) %>% 
    summarize(mean_hsize=survey_mean(hsize)) %>% 
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)

Now let's save members

save(members_svy, file = here("./data/HIES_2016/members_svy.rda"))

We find that now the above numbers exactly matches Table 2.1 in the report.

Table 2.2

members_svy %>%
    mutate(hsize=as_factor(hsize),
       urbrural=as_factor(urbrural)) %>% 
    group_by(hsize,urbrural) %>% 
    survey_tally() %>% 
    select(-n_se) %>% 
    pivot_wider(names_from=urbrural,values_from=n) %>% 
    mutate(National=Rural+Urban) %>% 
    relocate(National,.after=hsize) %>% 
    mutate(across(!hsize, .fns=~ round(.x/sum(.x,na.rm=T)*100,1),.names="perc_{.col}")) # we can avoid creating new variables by not including `names` argument

Table 2.3

load(here("./data/members.rda"))

Health

First read the data

health <- read_dta("../HIES2016-20200905T093607Z-001/HIES2016/HH_SEC_3A.dta")

health %>% 
    count(s3aq01)

health %>% 
    count(s3aq02)

Smoking

load(here("./data/HIES_2016/smoker_2016.rda"))

smokers_svy <- smoker_2016 %>%  
                  as_survey(id=psu,
                            strata=stratum,
                            weights=hhwgt)
class(smokers_svy)
summary(smokers_svy)

smokers_svy %>% 
   summarise(national=survey_mean(smoker)) %>%  # note the use of survey mean
   mutate(national=round(national,2)) %>% 
   pull(national)

Rural urban category wise household size

smokers_svy %>% 
    group_by(urbrural) %>% 
    summarize(smoker_perc=survey_mean(smoker)) %>% 
    ungroup() %>% 
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)

smoker_2016 %>% 
    group_by(urbrural) %>% 
    summarize(smoker_perc=mean(smoker))

Sex of household head wise household size

members_svy %>% 
    group_by(sex_hh) %>% 
    summarize(mean_hsize=survey_mean(hsize)) %>%          
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)

Now the final cross tabulation between sex of household head and rural urban category

members_svy %>% 
    group_by(sex_hh, urbrural) %>% 
    summarize(mean_hsize=survey_mean(hsize)) %>% 
    mutate(mean_hsize=round(mean_hsize,2)) %>% 
    select(-mean_hsize_se)