This template includes all the code needed to construct a topic model of gender in science reports using the stm package, all the way from converting PDF files to text to producing output to help interpret the model.
Analysts must set the file paths and parameters listed in the section "Set file paths and parameters" below, but then this document can be knitted with the "Knit" button in RStudio and the entire analysis will be run.
However, the full analysis does take a while to run, so you may want to split this code up into multiple files and run them in sequence in case you run into any bugs along the way.
Also, note that this script creates folders and files in the working directory, so make sure your working directory is set to a good place to store the intermediate and results files for this analysis.
Install these with `install.packages()` (or `remotes::install_github()` in the case of gensci.stm) if they are not already installed.
library(gensci.stm) library(stringr) library(purrr) library(dplyr) library(texanaaid)
Set the path to Dropbox on your computer (assuming reports are stored in Dropbox).
# Root of the Dropbox folder. Every data directory path below is built from
# this value with file.path(), so it must be filled in before running.
path_to_dropbox <- ""
Set the paths to the directories containing the full reports and containing the reports broken up into sections.
# Directory holding the complete report PDFs.
full_reports_dir <- file.path(
  path_to_dropbox,
  "Data",
  "All_Reports_Our_Sample_OCR_May_2016"
)

# Directory holding the reports broken up into sections.
report_sections_dir <- file.path(
  path_to_dropbox,
  "Data",
  "Sorted_report_pdfs"
)
Set the path to the directory containing the reports converted to text by Adobe Acrobat, which we discovered produces better quality conversions for some reports, and list the names of reports for which to use the Adobe conversion.
# Directory holding the full reports as converted to text by Adobe Acrobat.
adobe_full_reports_dir <- file.path(
  path_to_dropbox,
  "Data",
  "All_Reports_Our_Sample_Converted_to_Text",
  "Converted With Acrobat"
)

# Directory holding the Acrobat-converted text of the individual sections.
adobe_report_sections_dir <- file.path(
  path_to_dropbox,
  "Data",
  "Sorted_report_txts"
)

# Reports for which the Acrobat conversion replaces the default extraction.
adobe_reports <- c(
  "NSF_2004",
  "UN_2000", "UN_2003",
  "US_2000", "US_2004",
  "EC_1999a", "EC_2001",
  "NRC_1987", "NRC_2010"
)
Set the names of the folders which will be created in your working directory to store copies of the PDF report files and the files containing the text extracted from the PDFs.
# Working-directory folder that receives copies of the PDF report files.
corpus_pdf_dir <- "Latest Sample PDFs"

# Working-directory folder that receives the text extracted from those PDFs.
corpus_txt_dir <- "Latest Sample TXT"
Set the name of the folder to be created in your working directory in which to store data on common n-grams.
# Folder for n-gram data; created with make_sure_dir() and passed to
# get_ngrams() and remove_boilerplate() later in this script.
ngram_dir <- "Latest Sample Ngrams"
Set the names of the folders to be created in your working directory in which to store the report content after removing repetitive "boilerplate" text, in which to store the actual repetitive text removed (for reference), and in which to store algorithmically-identified header and footer text removed from the reports.
# Report content after the repetitive "boilerplate" text has been removed.
boilerplate_removed_dir <- "Latest Sample Boilerplate Removed"

# The repetitive text that was removed, kept for reference.
boilerplate_dir <- "Boilerplate Removed from Latest Sample"

# Header and footer text removed from the reports.
hf_dir <- "Headers and Footers Removed from Latest Sample"
Set the names of the folders to be created in your working directory in which to store the report content after removing named entities, and in which to store the named entities removed (for reference).
# Report content after named entities have been removed.
nes_removed_dir <- "Latest Sample NEs Removed"

# The named entities that were removed, kept for reference.
nes_dir <- "NEs Removed from Latest Sample"
Set the name of the file, to be saved in your working directory, that will contain all necessary direct inputs for the STM topic model.
# File holding the topic model inputs; passed to create_stm_input(),
# run_stm(), and make_pdf_summary_report() later in this script.
stm_input_file <- "input_for_stm_nes_removed.Rdata"
Do you want to analyze the full reports, or just particular sections of the reports? Options for this parameter are "full", "intro", "executive-summary", "foreword-preface", and "conclusion". Note that for option "foreword-preface", only the preface will be included for reports that have both a foreword and a preface.
# Which part of each report to analyze. Later code compares this against
# "full" and against the names of `report_section_patterns`, and stops with
# an error for any other value.
report_section <- "full"
Should UN reports be excluded?
# When TRUE, the reports whose metadata has US.EU.UN == "INT" are removed from
# the PDF corpus and from `adobe_reports`, and "UN" is left out of the
# geographies passed to make_pdf_summary_report().
exclude_un <- TRUE
Should the reports be split into 3,000 word sections, with each section treated as its own document? This option will be ignored if `report_section` is not set to "full".
# Passed as `split_docs` to create_stm_input() and run_stm(). It is reset to
# FALSE before those calls whenever `report_section` is not "full".
split_documents_into_chunks <- TRUE
Set the number of topics "k", and the name of the folder to be created in your working directory in which to store the model results.
# Number of topics for the model.
k <- 10

# Results folder, named after the number of topics.
results_dir <- paste("gender-in-science-topic-model", k, "topics", sep = "-")
Set the title of the PDF summary report file, to be saved in `results_dir`.
# Title of the PDF summary report: names either the full reports or the
# selected section (in title case), followed by the number of topics.
if (report_section == "full") {
  pdf_summary_title <- paste0("Summary Report, Full Reports, ", k, " Topics")
} else {
  pdf_summary_title <- paste0(
    "Summary Report, ",
    str_to_title(report_section),
    " Sections, ",
    k,
    " Topics"
  )
}
# Assemble the PDF corpus: copy the requested report files into
# `corpus_pdf_dir`, then (optionally) delete the UN reports from it.

# Regular expressions that pick out each section's file by its name suffix.
report_section_patterns <- c(
  "intro" = "_intro\\.(pdf|txt)$",
  "executive-summary" = "_exec\\.(pdf|txt)$",
  "foreword-preface" = "_(fore|pref)\\.(pdf|txt)$",
  "conclusion" = "_concl\\.(pdf|txt)$"
)

make_sure_dir(corpus_pdf_dir)

if (report_section == "full") {
  report_files <- list.files(full_reports_dir)
  no_ref_files <- str_replace(report_files, "\\.pdf$", "_no_ref.pdf")

  # Versions with the reference section removed live in `report_sections_dir`.
  # (This previously referred to `sorted_reports_dir`, which is never defined
  # in this script and made the "full" branch fail.)
  has_refs_removed_version <- no_ref_files %in% list.files(report_sections_dir)

  # Reports without a references-removed version: copy the full report as is.
  file.copy(
    from = file.path(full_reports_dir, report_files[!has_refs_removed_version]),
    to = file.path(corpus_pdf_dir, report_files[!has_refs_removed_version])
  )

  # Reports with a references-removed version: copy that version instead, but
  # store it under the plain report file name.
  file.copy(
    from = file.path(
      report_sections_dir,
      no_ref_files[has_refs_removed_version]
    ),
    to = file.path(corpus_pdf_dir, report_files[has_refs_removed_version])
  )
} else {
  if (!report_section %in% names(report_section_patterns)) {
    stop(
      "Set the value of `report_section` to one of the following values: ",
      paste0('"', c("full", names(report_section_patterns)), '"',
             collapse = ", ")
    )
  }

  focal_file_pattern <- report_section_patterns[report_section]
  focal_files <- list.files(report_sections_dir, pattern = focal_file_pattern)

  # Section files are stored under the plain report name (suffix stripped).
  # NOTE(review): for "foreword-preface", a report with both a "_fore" and a
  # "_pref" file maps to the same destination; file.copy() does not overwrite
  # by default, so the second copy returns FALSE and triggers the stop()
  # below. Confirm against the data whether any report has both files.
  copy_results <- file.copy(
    from = file.path(report_sections_dir, focal_files),
    to = file.path(
      corpus_pdf_dir,
      str_replace(focal_files, focal_file_pattern, ".pdf")
    )
  )
  if (!all(copy_results)) {
    stop("Problem copying PDF files; make sure all paths are specified ",
         "correctly and that files do not already exist in destination")
  }
}

if (exclude_un) {
  # Remove the UN reports.
  # `gensci_meta` is not defined in this script; presumably it is a metadata
  # table supplied by one of the attached packages -- TODO confirm.
  UN_meta <- gensci_meta %>%
    filter(US.EU.UN == "INT")

  # UN reports must not be substituted back in from the Adobe conversions.
  adobe_reports <- setdiff(adobe_reports, UN_meta$New.Names.for.Pdfs)

  # Only UN reports that have the requested section have a file in the corpus.
  if (report_section == "full") {
    UN_files <- UN_meta %>%
      pull(New.Names.for.Pdfs)
  } else if (report_section == "intro") {
    UN_files <- UN_meta %>%
      filter(!is.na(Introduction)) %>%
      pull(New.Names.for.Pdfs)
  } else if (report_section == "executive-summary") {
    UN_files <- UN_meta %>%
      filter(!is.na(Executive.Summary)) %>%
      pull(New.Names.for.Pdfs)
  } else if (report_section == "foreword-preface") {
    foreword_files <- UN_meta %>%
      filter(!is.na(Foreword)) %>%
      pull(New.Names.for.Pdfs)
    preface_files <- UN_meta %>%
      filter(!is.na(Preface)) %>%
      pull(New.Names.for.Pdfs)
    UN_files <- unique(c(foreword_files, preface_files))
  } else if (report_section == "conclusion") {
    UN_files <- UN_meta %>%
      filter(!is.na(Conclusion)) %>%
      pull(New.Names.for.Pdfs)
  }

  remove_results <- file.remove(
    file.path(corpus_pdf_dir, paste0(UN_files, ".pdf"))
  )
}
# Convert every PDF in the corpus folder to text, writing the results as
# .txt files (no .Rdata copy) into the text corpus folder.
extract_text_from_pdfs(
  corpus_pdf_dir,
  corpus_txt_dir,
  save_Rdata = FALSE,
  save_txt_files = TRUE,
  language = "en"
)
# Replace the extracted text of the reports listed in `adobe_reports` with
# the text produced by Adobe Acrobat.
if (report_section == "full") {
  # If we have a version with the reference section removed, use it
  has_refs_removed_version <- paste0(adobe_reports, "_no_ref.txt") %in%
    list.files(adobe_report_sections_dir)

  # Named character vector: names are report names, values are the names of
  # the replacement documents. set_names() is used rather than sapply() so
  # that an empty selection stays a character vector instead of `list()`.
  adobe_no_ref_files <- set_names(
    paste0(adobe_reports[has_refs_removed_version], "_no_ref"),
    adobe_reports[has_refs_removed_version]
  )
  substitute_documents(
    corpus_txt_dir,
    adobe_report_sections_dir,
    replacement_scheme = adobe_no_ref_files
  )

  # Otherwise use the full report. Two fixes here: the report names come from
  # `adobe_reports` (`adobe` was never defined), and the Acrobat text lives in
  # `adobe_full_reports_dir`; `full_reports_dir` holds the PDFs.
  adobe_files <- set_names(adobe_reports[!has_refs_removed_version])
  substitute_documents(
    corpus_txt_dir,
    adobe_full_reports_dir,
    replacement_scheme = adobe_files
  )
} else {
  if (!report_section %in% names(report_section_patterns)) {
    stop(
      "Set the value of `report_section` to one of the following values: ",
      paste0('"', c("full", names(report_section_patterns)), '"',
             collapse = ", ")
    )
  }

  focal_file_pattern <- report_section_patterns[report_section]

  # List the Acrobat section files once instead of once per report.
  adobe_section_files <- list.files(adobe_report_sections_dir)

  # Which of the Adobe reports have a file for the requested section?
  has_focal_section <- map_lgl(
    paste0(adobe_reports, focal_file_pattern),
    ~ any(str_detect(adobe_section_files, .x))
  )

  # NOTE(review): map_chr() requires exactly one match per report, so for
  # "foreword-preface" a report with both a "_fore" and a "_pref" file would
  # raise an error here. Confirm against the data.
  focal_files <- map_chr(
    paste0(adobe_reports[has_focal_section], focal_file_pattern),
    ~ str_subset(adobe_section_files, .x)
  )

  # Names are report names; values are section file names without ".txt".
  replacement_scheme <- set_names(
    str_remove(focal_files, "\\.txt$"),
    adobe_reports[has_focal_section]
  )
  substitution_results <- substitute_documents(
    corpus_txt_dir,
    adobe_report_sections_dir,
    replacement_scheme = replacement_scheme
  )
  if (!all(substitution_results)) {
    stop("Problem substituting documents")
  }
}
# Make sure the n-gram folder exists, then run get_ngrams() on the text
# corpus with that folder as its second argument.
make_sure_dir(ngram_dir)
get_ngrams(
  corpus_txt_dir,
  ngram_dir
)
# Make sure the three boilerplate-related folders exist before they are
# passed to remove_boilerplate().
for (dir in c(boilerplate_removed_dir, boilerplate_dir, hf_dir)) {
  make_sure_dir(dir)
}

# Run boilerplate removal on the text corpus, passing the n-gram folder and
# the three folders created above.
remove_boilerplate(
  corpus_txt_dir,
  ngram_dir,
  boilerplate_removed_dir,
  boilerplate_dir,
  hf_dir
)
# The same directory is passed as both arguments, so this step presumably
# rewrites the boilerplate-removed files in place -- TODO confirm.
remove_state_and_country_names(boilerplate_removed_dir, boilerplate_removed_dir)
# Make sure both named-entity folders exist, then run named-entity removal
# on the boilerplate-removed documents.
for (ne_folder in c(nes_removed_dir, nes_dir)) {
  make_sure_dir(ne_folder)
}
remove_named_entities(
  boilerplate_removed_dir,
  nes_removed_dir,
  nes_dir
)
# Splitting into chunks only applies to full reports; force it off otherwise.
split_documents_into_chunks <- if (report_section == "full") {
  split_documents_into_chunks
} else {
  FALSE
}

# Build the topic model input file from the named-entity-removed documents.
create_stm_input(
  nes_removed_dir,
  stm_input_file,
  split_docs = split_documents_into_chunks
)
# Make sure the results folder exists, then fit the k-topic model from the
# saved input file.
make_sure_dir(results_dir)
run_stm(
  stm_input_file,
  results_dir,
  k,
  split_docs = split_documents_into_chunks
)
# Geographies covered by the summary; "UN" is included only when the UN
# reports were kept in the corpus.
summary_geogs <- c("EU", "US")
if (!exclude_un) {
  summary_geogs <- c(summary_geogs, "UN")
}

# Produce the PDF summary report for the fitted model.
make_pdf_summary_report(
  stm_input_file,
  results_dir,
  pdf_summary_title,
  geogs = summary_geogs
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.