knitr::opts_chunk$set(
  echo = TRUE, 
  eval = TRUE,
  message = FALSE,
  warning = FALSE
)

はじめに

PIAACやPISAといった複雑な調査データでは特殊な方法で標準誤差を推定する必要がある。このノートでは、PIAACのデータを例に標準誤差の推定方法を紹介する。^1

以下、

  1. 標準誤差の推定式を概観
  2. 平均値を例に、srvyrパッケージによる推定と手動での推定を比較
  3. kamaken::gcom_jackを利用した因果効果の推定

を行う。

前準備

パッケージ読み込み

library(tidyverse)
library(haven)
library(survey)
library(srvyr)
library(kamaken)

kable <- partial(knitr::kable, digits = 3)

データ読み込み

data <- read_sav('https://webfs.oecd.org/piaac/puf-data/SPSS/prgjpnp1.sav')

df <- 
  data |> 
  # 変数を絞る
  select(
    country = CNTRYID, 
    id = SEQID,
    age = AGE_R, 
    gender = GENDER_R, 
    region = REG_TL2, 
    edu = B_Q01a, 
    medu = J_Q06b, 
    fedu = J_Q07b, 
    numbooks = J_Q08,
    sampling_weight = SPFWT0, 
    # 読解力、数的思考力、ITスキルのスコア
    matches('^PV'), 
    # Replicate weights
    matches(str_c('SPFWT', 1:80)),
    VEMETHOD
  ) |> 
  # 変数名を小文字に変換
  rename_with(str_to_lower) |> 
  as_factor() |> 
  mutate(
    age = as.character(age) |> parse_double(),
    # 年齢をカテゴリ化
    agegroup = cut(
      age,
      breaks = c(15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65),
      labels = c('16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65')
    ),
    # 大学卒業ダミー
    univ = case_match(
      edu,
      'No formal qualification or below ISCED 1' ~ 0,
      'ISCED 2' ~ 0,
      'ISCED 3C shorter than 2 years' ~ 0,
      'ISCED 3C 2 years or more' ~ 0,
      'ISCED 3A-B' ~ 0,
      'ISCED 3 (without distinction A-B-C, 2y+)' ~ 0,
      'ISCED 4 (without distinction A-B-C)' ~ 0,
      'ISCED 5B' ~ 0,
      'ISCED 5A, bachelor degree' ~ 1,
      'ISCED 5A, master degree' ~ 1,
      'ISCED 6' ~ 1,
      'Foreign qualification' ~ NA,
      NA ~ NA
    )
  )

標準誤差の推定式

$$ \mathrm{SE}\theta = \sqrt{h\sum{r=1}^{R} (\hat{\theta}_{(r)} - \hat{\theta})^2} $$

平均値の推定

の二つを推定する

packageによる推定

df_design <- 
  df |> 
  # 調査デザインの設定
  as_survey_rep(
    weights = sampling_weight, 
    repweights = matches('spfwt'), 
    type = 'JK2',
    mse = TRUE
  )

result_literacy <- 
  df_design |> 
  # 読解力の各PVごとに平均値と標準誤差を計算
  summarise(
    across(matches('pvlit'), \(x) survey_mean(x, na.rm = TRUE))
  ) 

# 結果の整理
package <- 
  result_literacy |>
  rename_with(\(x) str_replace(x, '(\\d)$', '\\1_estimate')) |>
  # 縦持ちに変換
  pivot_longer(
    cols = matches('pvlit'),
    names_to = c('literacy', '.value'),
    names_pattern = '(pvlit\\d{1,2})_(.+)',
  ) 

kable(package)

手作業による推定

point_estimate <-
  df |> 
  select(sampling_weight, matches('(pvlit|spfwt)')) |> 
  summarise(
    across(
      matches('pvlit'), 
      \(x) weighted.mean(x, w = sampling_weight, na.rm = TRUE), 
      .names = '{.col}_estimate'
    )
  ) |> 
  pivot_longer(
    cols = matches('pvlit'),
    names_to = c('literacy', '.value'),
    names_pattern = '(pvlit\\d{1,2})_(.+)',
  )

# jackknife法に用いるウェイト(80個)
jackweight <- select(df, matches('spfwt'))

jack_estimate <-
  # 各ウェイトを用いて、それぞれのPVの平均値を計算
  map(
    jackweight, \(weight) 
    df |> 
      summarise(
        across(
          matches('pvlit'),
          \(x) weighted.mean(x, w = weight, na.rm = TRUE),
          .names = '{.col}_jack'
        )
      )
  ) |> 
  bind_rows(.id = 'replicate') |> 
  pivot_longer(
    cols = matches('pvlit'),
    names_to = c('literacy', '.value'),
    names_pattern = '(pvlit\\d{1,2})_(.+)',
  )

handmade <- 
  # 点推定値にジャックナイフウェイトを用いた推定値を結合
  left_join(
    point_estimate, 
    jack_estimate, 
    by = join_by(literacy)
  ) |> 
  # 標準誤差の計算
  summarise(
    estimate = mean(estimate),
    se = sqrt(sum((estimate - jack)^2)),
    .by = literacy
  )

kable(handmade)

結果の比較

left_join(
  package, 
  handmade, 
  by = join_by(literacy),
  suffix = c('_package', '_handmade')
) |> 
  select(literacy, estimate_package, estimate_handmade, se_package, se_handmade) |>
  mutate(
    diff_estimate = estimate_package - estimate_handmade,
    diff_se = se_package - se_handmade
  ) |> 
  kable()

PVの統合

$$ \hat{\theta} = \frac{1}{M} \sum_{m=1}^{M} \hat{\theta}_{(m)} $$

$$ \bar{U} = \frac{1}{M} \sum_{m=1}^{M} \hat{U}_{(m)} $$

$$ B = \frac{1}{M-1} \sum_{m=1}^{M} (\hat{\theta}_{(m)} - \hat{\theta})^2 $$

$$ \hat{SE} = \sqrt{\bar{U} + \left(1 + \frac{1}{M}\right)B} $$

handmade |> 
  pool_rubin(std.error = se) |> 
  kable()

より複雑なケース

因果効果の推定

推定結果

# 並列化
future::plan('multisession')

df |> 
  filter(age >= 26) |> 
  gcomp_jack(
    .outcome = pvlit1,
    .treatment = univ,
    .formula_rhs = ~ 1 | univ^gender^agegroup,
    .estimand = 'ATE',
    .weights = sampling_weight,
    .repweights = matches('spfwt'),
    .type = 'JK2',
    .by = c(gender, agegroup)
  ) |> 
  kable()

10個のPVに対して一括で推定

# 読解力PVの変数名を取得
pvs <- df |> select(matches('pvlit')) |> names()

result <- 
  map(
    pvs,
    \(x) gcomp_jack(
      .data = df |> filter(age >= 26),
      # mapで回すときは`!!`または`{{`を使う
      .outcome = !!x,
      # .outcome = {{x}},
      .treatment = univ,
      .formula_rhs = ~ 1 | univ^gender^agegroup,
      .estimand = 'ATE',
      .weights = sampling_weight,
      .repweights = matches('spfwt'),
      .type = 'JK2',
      .by = gender
    )) |> 
  bind_rows(.id = 'pv')

result

結果の統合

result |> 
  pool_rubin(term = c(estimand, gender)) |> 
  kable()

Jakubowski, Maciej & Artur Pokropek, 2019, "piaactools: A program for data analysis with PIAAC data," The Stata Journal, 19(1): 112-128. https://doi.org/10.1177/1536867X19830909

[^2]: Buuren, Stef van. 2018. Flexible Imputation of Missing Data. 2nd ed. New York: Chapman & Hall/CRC. https://stefvanbuuren.name/fimd/

[^3]: Hernán, Miguel A. & James M. Robins, 2024, Causal Inference: What If, Boca Raton: Chapman & Hall/CRC. https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/



Sickle-Sword/kamaken documentation built on Feb. 2, 2025, 2:56 a.m.