knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

Introduction

This packages standardizes the output for the R packages wru, predictrace, and includes a python script and virtual environment setup for the python library ethnicolr.

Codes

library(rRace); library(reticulate); library(tidyverse); library(ggthemes)

We test the package with a build-in table "name_table" which includes some random names.

tab_names <- name_table
head(tab_names)

Prediction based on predictrace

tab_prr <- race_prr(tab_names)
head(tab_prr)

Prediction based on wru

First we have to download census data

download_census(
  .key = "YOUR KEY",
  .geo = "county",
  .use_age = TRUE,
  .use_gen = TRUE,
  .dir = "_debug_data/census/",
  .workers = 10,
  .retry = 10,
  .progress = TRUE,
  .states = unique(tab_names$state)
)
tab_wru <- race_wru(
  .tab = tab_names,
  .use_geo = TRUE,
  .use_age = TRUE,
  .use_gen = TRUE,
  .census_dir = "_debug_data/census/",
  .census_geo = "county"
)

head(tab_wru)

Prediction based on ethnicolr

Python Setup

ToDO: Explain the Setup

.py_path <- filter(conda_list(), name == "env-ethnicolr")[[2]]
Sys.setenv(RETICULATE_PYTHON = .py_path)
reticulate::py_discover_config()

Load Function

The ethnicolr function is stored as a system file, you can easily load it with the following command.

.path_py <- system.file("extdata/script_ethnicolr.py", package = "rRace")
reticulate::source_python(.path_py)

You can safely ignore the error about tensorflow (if present). It only tells you that you don't have a GPU setup on your machine.

As a reference the complete function (not necessary to execute it)

import os 
import pandas as pd 
import ethnicolr

def race_eth(tab, methods = ["CEL", "FLF", "FLL"]):

    # CEL: CENSUS DATA, prediction on LAST NAME ------------------------------------------
    if "CEL" in methods:
        df1 = ethnicolr.pred_census_ln(df=tab, namecol="last_name")
        cols = {'api': 'pasian', 'black': 'pblack', 'hispanic': 'phispa', 'white': 'pwhite'}
        df1.rename(columns=cols, inplace=True)
        df1["method"]="CEL-1-0-0-0-0-N"
    else:
        df1=pd.DataFrame()

    # FLF: FLORIDA DATA, prediction on FULL NAME -----------------------------------------
    if "FLF" in methods:
        df2 = ethnicolr.pred_fl_reg_name(df=tab, lname_col="last_name", fname_col="first_name")
        cols={'asian': 'pasian', 'nh_black': 'pblack', 'hispanic': 'phispa', 'nh_white': 'pwhite'}
        df2.rename(columns=cols, inplace=True)
        df2["method"]="FLF-1-1-0-0-0-N"
    else:
        df2=pd.DataFrame()

    # FLL: FLORIDA DATA, prediction on LAST NAME -----------------------------------------
    if "FLL" in methods:
        df3 = ethnicolr.pred_fl_reg_ln(df=tab, namecol="last_name")
        cols={'asian': 'pasian', 'nh_black': 'pblack', 'hispanic': 'phispa', 'nh_white': 'pwhite'}
        df3.rename(columns=cols, inplace=True)
        df3["method"]="FLL-1-1-0-0-0-N"
    else:
        df3=pd.DataFrame()

    # combine dataframes to single dataframe ----------------------------------------------------
    df_out=pd.concat([df1, df2, df3], ignore_index=True)
    df_out["pother"]=0

    # select and reorder columns ----------------------------------------------------------------
    cols_old=list(tab.columns.values)
    cols_old.remove("__name")
    cols_new=["method", "pasian", "pblack", "phispa", "pwhite", "pother", "race"]
    df_out=df_out[cols_old+cols_new]

    # recode race values ------------------------------------------------------------------------
    race_old=["hispanic", "nh_white", "nh_black"]
    race_new=["hispa", "white", "black"]
    df_out["race"]=df_out["race"].replace(race_old, race_new)

    # return output -----------------------------------------------------------------------------
    return df_out

Again, you can safely ignore the error about tensorflow (if present).

Prediction (Ethnicolr)

tab_eth <- as_tibble(race_eth(tab_names))
head(tab_eth)

Combine Predictions

tab_race <- race_predict(tab_prr, tab_wru, tab_eth)
head(tab_race)

Output

Race based on Highest Prob.

tab_race %>%
  filter(rank_hp == 1) %>%
  count(race) %>%
  ggplot(aes(race, n, fill = race)) + 
  geom_col() + 
  scale_fill_brewer(palette = "PuBu") +
  ggthemes::geom_rangeframe(color = "black") +
  ggthemes::theme_tufte()

Race based on Guess Diff.

tab_race %>%
  filter(rank_gd == 1) %>%
  count(race) %>%
  ggplot(aes(race, n, fill = race)) + 
  geom_col() + 
  scale_fill_brewer(palette = "PuBu") +
  ggthemes::geom_rangeframe(color = "black") +
  ggthemes::theme_tufte()


MatthiasUckert/Rrace documentation built on Sept. 8, 2023, 3:26 p.m.