Reproducible Research

Make sure you've followed the instructions under "Setting up your API key" in the ipumsr API vignette (vignette("ipums-api", package = "ipumsr")) before running this template.

{.tabset}

Delete this section before sharing

This template is for two types of IPUMS users:

  1. The user creating a new analysis that they'd like to share. We'll call this user the analyst.
  2. The user with whom the analyst shares their analysis, and who wants to run, and perhaps modify, that analysis themselves. We'll call this user the collaborator.

This template uses the IPUMS API to help the analyst and collaborator work with the same dataset. It helps the analyst by including code to download their IPUMS data extract and save the extract definition in a shareable format, and it helps the collaborator by including code to create and download a new extract matching that shared definition.

If you're reading this, you are probably the analyst, because we recommend that the analyst deletes this section before sharing their analysis.

If you are the analyst, follow these steps to make your analysis shareable:

  1. Submit the extract you want to analyze using the online extract system or API functions.
  2. Fill in the parameters in the first code chunk below.
  3. Click the RStudio Knit button or use rmarkdown::render() to run the template. Repeat this step until your extract is ready.
  4. Once your extract is downloaded and the HTML report is created, update the file paths in the "Define File Paths" section below by copying the code generated at the bottom of this section in the HTML report.
  5. Delete this section and proceed with your analysis.
# Key parameters ----
#
# NOTE: if any value below is changed after the template has been run, delete
#   all files in `data_dir` first so that the next run starts fresh.

# IPUMS data collection of your extract; run `ipums_data_collections()` for a
#   list of supported collections
collection <- "usa"

# Extract number; leave as `NULL` to use your most recent extract
extract_num <- NULL

# Descriptive label for your extract; used to rename your data files
descriptive_name <- "my_ipums_extract"

# Folder in which the data, codebook, and .json files are saved
data_dir <- "data"

This next code chunk pulls down your extract definition and saves it to a JSON file in data_dir. If the extract is ready, it downloads the data and codebook files to data_dir and renames them according to descriptive_name. If the extract is not ready, the code stops with a message telling you that the extract is not ready yet and that you should run this template again later.

# Load ipumsr
suppressPackageStartupMessages(
  library(ipumsr)
)

# Create data_dir if it doesn't exist. `recursive = TRUE` also creates any
#   missing parent folders, so a nested path such as "data/ipums" works too.
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

# Define file paths. The extract definition (.json), data (.dat.gz), and
#   codebook (.xml) files all share the `descriptive_name` stem in data_dir.
json_path <- file.path(data_dir, paste0(descriptive_name, ".json"))
renamed_data_path <- file.path(data_dir, paste0(descriptive_name, ".dat.gz"))
renamed_ddi_path <- file.path(data_dir, paste0(descriptive_name, ".xml"))
gitignore_path <- file.path(data_dir, ".gitignore")

# Look up the designated extract: the one numbered `extract_num`, or the most
#   recent extract when `extract_num` is NULL
extract_definition <- if (is.null(extract_num)) {
  get_last_extract_info(collection)
} else {
  get_extract_info(c(collection, extract_num))
}

# Do we already have the data and/or the JSON?
#   Each file is checked once; the flags are single TRUE/FALSE values, so they
#   are combined with the scalar operator `&&`.
has_json <- file.exists(json_path)
has_data <- file.exists(renamed_data_path)
no_json <- !has_json
no_data_yes_json <- !has_data && has_json
yes_data_yes_json <- has_data && has_json

# If no JSON file, create it from the extract definition. The data are then
#   treated as not yet downloaded, so the download step below runs.
if (no_json) {
  save_extract_as_json(extract_definition, file = json_path)
  no_data_yes_json <- TRUE
}

# If we don't yet have the data, check whether the extract is ready
if (no_data_yes_json) {
  extract_is_ready <- is_extract_ready(extract_definition)
  # A "completed" extract that is not ready to download is treated as stale:
  #   per the message below, its files are no longer on IPUMS servers.
  extract_is_stale <- !extract_is_ready &&
    extract_definition$status == "completed"
  if (extract_is_stale) {
    stop(
      paste0(
        "The data files for ", collection, " extract number ",
        extract_definition$number, " have been removed from IPUMS servers. ",
        "Resubmit this extract by running `submit_extract(get_extract_info(\"",
        collection, ":", extract_definition$number, "\"))` and update the ",
        "`extract_num` parameter before re-running the template."
      ),
      call. = FALSE
    )
  }

  # If extract is ready, download files and rename according to `descriptive_name`
  if (extract_is_ready) {
    # The value returned by download_extract() is used as the codebook (.xml)
    #   path; the data file path is derived from it by swapping the extension.
    ddi_file <- download_extract(extract_definition, download_dir = data_dir)
    data_file <- gsub("\\.xml$", ".dat.gz", ddi_file)
    ddi_file_successfully_renamed <- file.rename(ddi_file, renamed_ddi_path)
    data_file_successfully_renamed <- file.rename(data_file, renamed_data_path)
    if (!ddi_file_successfully_renamed || !data_file_successfully_renamed) {
      stop(
        "Problem renaming DDI and/or data file; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.", call. = FALSE
      )
    }
    # Add the data and codebook files to .gitignore. Existing lines are kept
    #   exactly as they are, and only entries not already listed are appended,
    #   so repeated downloads do not pile up duplicate entries.
    files_to_gitignore <- c(
      basename(renamed_data_path),
      basename(renamed_ddi_path)
    )
    if (file.exists(gitignore_path)) {
      # warn = FALSE: a missing final newline in .gitignore is not a problem
      existing_gitignore_lines <- readLines(gitignore_path, warn = FALSE)
      files_to_gitignore <- c(
        existing_gitignore_lines,
        setdiff(files_to_gitignore, existing_gitignore_lines)
      )
    }
    writeLines(files_to_gitignore, con = gitignore_path)
    yes_data_yes_json <- TRUE
  } else { # If extract isn't ready, stop execution
    stop(
      "NOT AN ERROR: ", collection, " extract number ",
      extract_definition$number, " is not yet ready to download. Try ",
      "re-running again later.", call. = FALSE
    )
  }
}

# Once the data are downloaded, print the file-path code to copy into the
#   "Define File Paths" chunk, followed by a reminder to delete this section.
#   The text is wrapped in a fenced block.
if (yes_data_yes_json) {
  path_code <- paste0(
    "extract_definition_path <- \"", json_path, "\"\n",
    "data_path <- \"", renamed_data_path, "\"\n",
    "ddi_path <- \"", renamed_ddi_path, "\"\n\n"
  )
  cat(
    "```\n",
    "Data, codebook, and .json extract definition files have been saved to ",
    "folder \"", data_dir, "\".\n\nNext, copy the code below into the ",
    "\"Define File Paths\" code chunk, overwriting the existing code:\n\n",
    path_code,
    "Finally, delete all text and code in the section \"Delete this section ",
    "before sharing\"\n",
    "```",
    sep = ""
  )
}

Load Packages

suppressPackageStartupMessages({
  library(ipumsr)
  # library() additional packages as necessary
})

Define File Paths

# Paths to the extract definition, data, and codebook files. The data and
#   codebook paths are derived from the extract definition path by swapping
#   its .json extension.
extract_definition_path <- json_path
data_path <- sub("\\.json$", ".dat.gz", extract_definition_path)
ddi_path <- sub("\\.json$", ".xml", extract_definition_path)

Load your IPUMS Data

This analysis of IPUMS data is designed to be shared, and thus does not assume that you have already downloaded the data used in the analysis. The code below checks whether the data are already downloaded, and if they aren't, it submits a new IPUMS extract request according to the specifications in the included extract definition JSON file.

# The "waiting_for_extract" flag file sits in the same folder as the extract
#   definition file
data_dir <- dirname(extract_definition_path)
waiting_for_extract_path <- file.path(data_dir, "waiting_for_extract.txt")

# Stop right away if the JSON extract definition cannot be found
json_file_exists <- file.exists(extract_definition_path)
if (!json_file_exists) {
  stop(
    paste0(
      "File '", extract_definition_path, "' not found; make sure that ",
      "`extract_definition_path` is the path to the .json extract definition ",
      "file."
    ),
    call. = FALSE
  )
}

# Are the data downloaded, or are we waiting for an extract?
#   The data file is checked once and the complementary flag derived from it.
data_downloaded <- file.exists(data_path)
data_not_downloaded <- !data_downloaded
waiting_for_extract <- file.exists(waiting_for_extract_path)

# Ensure that IPUMS_API_KEY environment variable is defined. This is only
#   enforced when the data are not downloaded yet. Both flags are single
#   TRUE/FALSE values, hence the scalar `&&`.
ipums_api_key_undefined <- !nzchar(Sys.getenv("IPUMS_API_KEY"))
if (data_not_downloaded && ipums_api_key_undefined) {
  stop(
    "Environment variable 'IPUMS_API_KEY' is undefined. Make sure you've ",
    "followed the instructions under 'Setting up your API key' in the ",
    "ipumsr API vignette (`vignette(\"ipums-api\", package = \"ipumsr\")`) ",
    "before running this script.", call. = FALSE
  )
}

# If not yet waiting for extract, create and submit a new extract and create
#   the "waiting_for_extract" flag file. The condition combines two single
#   TRUE/FALSE values, hence the scalar `&&`.
if (data_not_downloaded && !waiting_for_extract) {
  extract_definition <- define_extract_from_json(extract_definition_path)
  submitted_extract <- submit_extract(extract_definition)
  # Store the submitted extract's ID as "collection:number" in the flag file
  #   so that it can be read back when checking the extract's status
  writeLines(
    paste0(submitted_extract$collection, ":", submitted_extract$number),
    con = waiting_for_extract_path
  )
  waiting_for_extract <- TRUE
}

# If waiting for an extract, read extract ID from flag file and check the
#   status. The condition combines two single TRUE/FALSE values, hence `&&`.
if (data_not_downloaded && waiting_for_extract) {
  extract_id <- readLines(waiting_for_extract_path)
  extract_info <- get_extract_info(extract_id)
  extract_is_ready <- is_extract_ready(extract_info)
  # A "completed" extract that is not ready to download is treated as stale:
  #   per the message below, its files are no longer on IPUMS servers.
  extract_is_stale <- !extract_is_ready && extract_info$status == "completed"
  if (extract_is_stale) {
    stop(
      paste0(
        "The data files for ", extract_info$collection, " extract number ",
        extract_info$number, " have been removed from IPUMS servers. ",
        "Please delete the file '", waiting_for_extract_path, "' and re-run ",
        "the template."
      ),
      call. = FALSE
    )
  }

  # If the extract is ready, download files and rename to match the JSON file,
  #   then delete the waiting_for_extract flag file
  if (extract_is_ready) {
    # The value returned by download_extract() is used as the codebook (.xml)
    #   path; the data file path is derived from it by swapping the extension.
    orig_ddi_path <- download_extract(extract_info, download_dir = data_dir)
    orig_data_path <- gsub("\\.xml$", ".dat.gz", orig_ddi_path)
    ddi_file_successfully_renamed <- file.rename(orig_ddi_path, ddi_path)
    data_file_successfully_renamed <- file.rename(orig_data_path, data_path)
    if (!ddi_file_successfully_renamed || !data_file_successfully_renamed) {
      stop(
        "Problem renaming DDI and/or data file; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.", call. = FALSE
      )
    }
    data_downloaded <- TRUE
    # The flag file is no longer needed once the data are in place
    waiting_file_successfully_removed <- file.remove(waiting_for_extract_path)
    if (!waiting_file_successfully_removed) {
      stop(
        "Unable to remove 'waiting_for_extract.txt'; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.", call. = FALSE
      )
    }
  } else { # If extract is not ready, stop execution
    stop(
      "NOT AN ERROR: ", extract_info$collection, " extract number ",
      extract_info$number, " is not yet ready to download. Try ",
      "re-running again later.", call. = FALSE
    )
  }
}

# Read the codebook first, then use it to load the data file
ddi <- read_ipums_ddi(ddi_path)
data <- read_ipums_micro(ddi, data_file = data_path)

Analysis Awaits {.active}

# Display the loaded IPUMS data; replace this with your own analysis code
data


Try the ipumsr package in your browser

Any scripts or data that you put into this service are public.

ipumsr documentation built on Sept. 12, 2024, 7:38 a.m.