This template includes all the code needed to construct a topic model of gender in science reports using the stm package, all the way from converting PDF files to text to producing output to help interpret the model.

Analysts must set the file paths and parameters listed in the section "Set file paths and parameters" below, but then this document can be knitted with the "Knit" button in RStudio and the entire analysis will be run.

However, the full analysis does take a while to run, so you may want to split this code up into multiple files and run them in sequence in case you run into any bugs along the way.

Also, note that this script creates folders and files in the working directory, so make sure your working directory is set to a good place to store the intermediate and results files for this analysis.

Attach required packages

Install these with install.packages() (or remotes::install_github() in the case of gensci.stm) if they are not already installed.

library(gensci.stm)
library(stringr)
library(purrr)
library(dplyr)
library(texanaaid)

Set file paths and parameters

Path to Dropbox

Set the path to Dropbox on your computer (assuming reports are stored in Dropbox).

# Root of the local Dropbox folder; left blank in this template and must be
# filled in, because the report directories below are built from it.
path_to_dropbox <- ""

Paths to PDF reports

Set the paths to the directories containing the full reports and containing the reports broken up into sections.

# Directory containing the full report PDFs
full_reports_dir <- file.path(
  path_to_dropbox,
  "Data",
  "All_Reports_Our_Sample_OCR_May_2016"
)
# Directory containing the PDFs of the reports broken up into sections
report_sections_dir <- file.path(
  path_to_dropbox,
  "Data",
  "Sorted_report_pdfs"
)

Paths to Adobe-converted text reports

Set the path to the directory containing the reports converted to text by Adobe Acrobat, which we discovered produces better quality conversions for some reports, and list the names of reports for which to use the Adobe conversion.

# Directory containing the full reports converted to text by Adobe Acrobat
adobe_full_reports_dir <- file.path(
  path_to_dropbox,
  "Data",
  "All_Reports_Our_Sample_Converted_to_Text",
  "Converted With Acrobat"
)
# Directory containing the text versions of the report sections
adobe_report_sections_dir <- file.path(
  path_to_dropbox,
  "Data",
  "Sorted_report_txts"
)
# Reports for which the Adobe conversion should replace the default extraction
adobe_reports <- c(
  "NSF_2004",
  "UN_2000",
  "UN_2003",
  "US_2000",
  "US_2004",
  "EC_1999a",
  "EC_2001",
  "NRC_1987",
  "NRC_2010"
)

Paths at which to save intermediate outputs

Set the names of the folders which will be created in your working directory to store copies of the PDF report files and the files containing the text extracted from the PDFs.

# Working-directory folders for the copied PDFs and for the text extracted
# from them
corpus_pdf_dir <- "Latest Sample PDFs"
corpus_txt_dir <- "Latest Sample TXT"

Set the name of the folder to be created in your working directory in which to store data on common n-grams.

# Working-directory folder for the n-gram data used to find boilerplate text
ngram_dir <- "Latest Sample Ngrams"

Set the names of the folders to be created in your working directory in which to store the report content after removing repetitive "boilerplate" text, in which to store the actual repetitive text removed (for reference), and in which to store algorithmically-identified header and footer text removed from the reports.

# Cleaned report text (boilerplate removed)
boilerplate_removed_dir <- "Latest Sample Boilerplate Removed"
# The boilerplate text that was removed, kept for reference
boilerplate_dir <- "Boilerplate Removed from Latest Sample"
# The header and footer text that was removed, kept for reference
hf_dir <- "Headers and Footers Removed from Latest Sample"

Set the names of the folders to be created in your working directory in which to store the report content after removing named entities, and in which to store the named entities removed (for reference).

# Report text with named entities removed
nes_removed_dir <- "Latest Sample NEs Removed"
# The named entities that were removed, kept for reference
nes_dir <- "NEs Removed from Latest Sample"

Set the name of the file, to be saved in your working directory, that will contain all necessary direct inputs for the STM topic model.

# File (in the working directory) holding the direct inputs for the STM model;
# written by create_stm_input() and read by run_stm() and
# make_pdf_summary_report() below.
stm_input_file <- "input_for_stm_nes_removed.Rdata"

Analysis parameters

Do you want to analyze the full reports, or just particular sections of the reports? Options for this parameter are "full", "intro", "executive-summary", "foreword-preface", and "conclusion". Note that for option "foreword-preface", only the preface will be included for reports that have both a foreword and a preface.

# One of "full", "intro", "executive-summary", "foreword-preface", "conclusion"
report_section <- "full"

Should UN reports be excluded?

# TRUE removes the UN reports from the corpus and leaves "UN" out of the
# geographies passed to the summary report
exclude_un <- TRUE

Should the reports be split into 3,000-word sections, with each section treated as its own document? This option will be ignored if report_section is not set to "full".

# Only honored when report_section is "full"; it is forced to FALSE otherwise
# before the STM input is created.
split_documents_into_chunks <- TRUE

Set the number of topics "k", and the name of the folder to be created in your working directory in which to store the model results.

# Number of topics to fit
k <- 10
# Results folder name embeds the topic count, e.g. "...-10-topics"
results_dir <- paste("gender-in-science-topic-model", k, "topics", sep = "-")

Set the title of the PDF summary report file, to be saved in results_dir.

# Title reads "Summary Report, <what was analyzed>, <k> Topics", where the
# middle part is "Full Reports" or the title-cased section name + " Sections".
pdf_summary_title <- paste0(
  "Summary Report, ",
  if (report_section == "full") {
    "Full Reports"
  } else {
    paste0(str_to_title(report_section), " Sections")
  },
  ", ",
  k,
  " Topics"
)

Code that runs the analysis

Copy PDF files into subfolder of working directory

# Copy the PDFs to analyze into `corpus_pdf_dir`.
#
# * report_section == "full": every file in `full_reports_dir` is copied,
#   except that when `report_sections_dir` holds a "<name>_no_ref.pdf" version
#   of a report, that version is copied instead (under the original file name).
# * otherwise: the files in `report_sections_dir` whose names match the
#   section's suffix pattern are copied, with the suffix replaced by ".pdf".

# File-name suffix pattern for each supported report section. The patterns
# accept both .pdf and .txt because they are reused for the text-file
# directory further down.
report_section_patterns <- c(
  "intro" = "_intro\\.(pdf|txt)$",
  "executive-summary" = "_exec\\.(pdf|txt)$",
  "foreword-preface" = "_(fore|pref)\\.(pdf|txt)$",
  "conclusion" = "_concl\\.(pdf|txt)$"
)

# Validate the parameter before creating directories or copying anything
if (!report_section %in% c("full", names(report_section_patterns))) {
  stop(
    "Set the value of `report_section` to one of the following values: ",
    paste0('"', c("full", names(report_section_patterns)), '"',
           collapse = ", ")
  )
}

make_sure_dir(corpus_pdf_dir)
if (report_section == "full") {
  report_files <- list.files(full_reports_dir)
  no_ref_files <- str_replace(report_files, "\\.pdf$", "_no_ref.pdf")
  # The references-removed versions live alongside the report sections
  has_refs_removed_version <-
    no_ref_files %in% list.files(report_sections_dir)

  file.copy(
    from = file.path(full_reports_dir, report_files[!has_refs_removed_version]),
    to = file.path(corpus_pdf_dir, report_files[!has_refs_removed_version])
  )
  # Copy the references-removed version under the report's original file name
  file.copy(
    from = file.path(report_sections_dir,
                     no_ref_files[has_refs_removed_version]),
    to = file.path(corpus_pdf_dir, report_files[has_refs_removed_version])
  )
} else {
  focal_file_pattern <- report_section_patterns[[report_section]]
  focal_files <- list.files(report_sections_dir, pattern = focal_file_pattern)
  copy_results <- file.copy(
    from = file.path(report_sections_dir, focal_files),
    to = file.path(
      corpus_pdf_dir,
      # Strip the section suffix so the copy is named after the report
      str_replace(focal_files, focal_file_pattern, ".pdf")
    )
  )
  if (!all(copy_results)) {
    stop("Problem copying PDF files; make sure all paths are specified ",
         "correctly and that files do not already exist in destination")
  }
}

if (exclude_un) {
  # Remove the UN reports: they are the rows of `gensci_meta` whose US.EU.UN
  # value is "INT". They are also dropped from the Adobe substitution list.
  UN_meta <- filter(gensci_meta, US.EU.UN == "INT")
  adobe_reports <- setdiff(adobe_reports, UN_meta$New.Names.for.Pdfs)

  # Names of the UN reports expected in the corpus folder. For a section
  # analysis, only reports with a non-missing entry in that section's metadata
  # column are listed.
  UN_files <- switch(
    report_section,
    "full" = pull(UN_meta, New.Names.for.Pdfs),
    "intro" = pull(
      filter(UN_meta, !is.na(Introduction)),
      New.Names.for.Pdfs
    ),
    "executive-summary" = pull(
      filter(UN_meta, !is.na(Executive.Summary)),
      New.Names.for.Pdfs
    ),
    "foreword-preface" = {
      # A report counts if it has either a foreword or a preface
      foreword_files <- pull(
        filter(UN_meta, !is.na(Foreword)),
        New.Names.for.Pdfs
      )
      preface_files <- pull(
        filter(UN_meta, !is.na(Preface)),
        New.Names.for.Pdfs
      )
      unique(c(foreword_files, preface_files))
    },
    "conclusion" = pull(
      filter(UN_meta, !is.na(Conclusion)),
      New.Names.for.Pdfs
    )
  )

  # Delete the copied UN PDFs from the corpus folder
  un_pdf_paths <- file.path(corpus_pdf_dir, paste0(UN_files, ".pdf"))
  remove_results <- file.remove(un_pdf_paths)
}

Extract text from PDFs

# Convert every PDF in the corpus folder to text, saving .txt files only
extract_text_from_pdfs(
  corpus_pdf_dir,
  corpus_txt_dir,
  save_Rdata = FALSE,
  save_txt_files = TRUE,
  language = "en"
)

Substitute Adobe conversion for some reports

# Replace the extracted text of the reports listed in `adobe_reports` with the
# Adobe Acrobat conversion. Each replacement scheme is a character vector whose
# names are report names and whose values are the replacement files' names
# without the ".txt" extension.
if (report_section == "full") {
  # If we have a version with the reference section removed, use it
  has_refs_removed_version <-
    paste0(adobe_reports, "_no_ref.txt") %in%
    list.files(adobe_report_sections_dir)
  reports_with_no_ref <- adobe_reports[has_refs_removed_version]
  # sprintf() (unlike paste0()) yields a zero-length result for zero-length
  # input, so an empty selection stays empty
  adobe_no_ref_files <- set_names(
    sprintf("%s_no_ref", reports_with_no_ref),
    reports_with_no_ref
  )
  if (length(adobe_no_ref_files) > 0) {
    substitute_documents(
      corpus_txt_dir,
      adobe_report_sections_dir,
      replacement_scheme = adobe_no_ref_files
    )
  }

  # Otherwise use the full Adobe-converted report; these text files live in
  # `adobe_full_reports_dir` (`full_reports_dir` holds the PDFs)
  adobe_files <- set_names(adobe_reports[!has_refs_removed_version])
  if (length(adobe_files) > 0) {
    substitute_documents(
      corpus_txt_dir,
      adobe_full_reports_dir,
      replacement_scheme = adobe_files
    )
  }
} else {
  if (!report_section %in% names(report_section_patterns)) {
    stop(
      "Set the value of `report_section` to one of the following values: ",
      paste0('"', c("full", names(report_section_patterns)), '"',
             collapse = ", ")
    )
  }
  focal_file_pattern <- report_section_patterns[[report_section]]
  # List the directory once instead of once per report
  section_txt_files <- list.files(adobe_report_sections_dir)
  # One regex per Adobe report: "<report name><section suffix pattern>"
  report_patterns <- sprintf("%s%s", adobe_reports, focal_file_pattern)
  has_focal_section <- map_lgl(
    report_patterns,
    ~ any(str_detect(section_txt_files, .x))
  )
  # map_chr() errors if a report matches more than one file, which surfaces
  # ambiguous matches instead of silently picking one
  focal_files <- map_chr(
    report_patterns[has_focal_section],
    ~ str_subset(section_txt_files, .x)
  )
  replacement_scheme <- set_names(
    str_remove(focal_files, "\\.txt$"),
    adobe_reports[has_focal_section]
  )
  if (length(replacement_scheme) > 0) {
    substitution_results <- substitute_documents(
      corpus_txt_dir,
      adobe_report_sections_dir,
      replacement_scheme = replacement_scheme
    )
    if (!all(substitution_results)) {
      stop("Problem substituting documents")
    }
  }
}

Get ngrams to identify repetitive "boilerplate" text

# Create the n-gram folder, then collect n-gram data from the extracted text;
# remove_boilerplate() below reads from this folder.
make_sure_dir(ngram_dir)
get_ngrams(corpus_txt_dir, ngram_dir)

Remove repetitive "boilerplate" text

# Make sure the three output folders exist before writing to them
walk(
  c(boilerplate_removed_dir, boilerplate_dir, hf_dir),
  make_sure_dir
)
remove_boilerplate(
  corpus_txt_dir,
  ngram_dir,
  boilerplate_removed_dir,
  boilerplate_dir,
  hf_dir
)

Remove state and country names

# The same folder is passed as both arguments, so the cleaned files presumably
# overwrite the boilerplate-removed files in place -- TODO confirm against
# remove_state_and_country_names().
remove_state_and_country_names(boilerplate_removed_dir, boilerplate_removed_dir)

Remove named entities

# Create the output folders, then strip named entities from the cleaned text.
# The stripped text goes to `nes_removed_dir`; the third folder is for the
# removed entities themselves (kept for reference).
make_sure_dir(nes_removed_dir)
make_sure_dir(nes_dir)
remove_named_entities(boilerplate_removed_dir, nes_removed_dir, nes_dir)

Create input objects for STM analysis

# Splitting into chunks only applies to full reports; force it off otherwise
split_documents_into_chunks <-
  if (report_section == "full") split_documents_into_chunks else FALSE

# Build the STM inputs from the named-entity-free text and save them to
# `stm_input_file`
create_stm_input(
  nes_removed_dir,
  stm_input_file,
  split_docs = split_documents_into_chunks
)

Run STM analysis

# Create the results folder, then fit the k-topic model from the saved inputs
make_sure_dir(results_dir)
run_stm(
  stm_input_file,
  results_dir,
  k,
  split_docs = split_documents_into_chunks
)

Produce summary PDF

# Write the PDF summary; "UN" is listed as a geography only when the UN
# reports were kept in the corpus.
make_pdf_summary_report(
  stm_input_file,
  results_dir,
  pdf_summary_title,
  geogs = if (exclude_un) c("EU", "US") else c("EU", "US", "UN")
)


dtburk/gensci.stm documentation built on Nov. 13, 2019, 12:33 a.m.