# PAPERS.R
#
# This script exports a table of working paper attributes.
#
# Ben Davies
# March 2022
# Load packages
library(bldr)
library(data.table)
library(dplyr)
library(readr)
library(rvest)
library(stringr)
library(tidyr)
library(xml2)
# Import helper functions
source('data-raw/helpers.R')
# Import raw metadata
#
# Both tables are read with quote handling switched off (quote = ''), so
# quotation marks in the files are kept as ordinary characters, and with the
# strings declared as Latin-1 encoded.
papers_raw = fread(
  'data-raw/metadata/working_papers.tab',
  quote = '',
  encoding = 'Latin-1'
)
outlets_raw = fread(
  'data-raw/metadata/working_papers_published.tab',
  quote = '',
  encoding = 'Latin-1'
)
# Set boundary issue date (an ISO yyyy-mm-dd string)
max_issue_date = '2021-12-31'
# Define function for removing known parenthetical notes
#
# Takes a character vector of titles and returns it with every parenthetical
# note matched by the patterns below deleted.  Matching ignores case and the
# patterns are applied in the order listed; the trailing comment on each
# pattern lists the paper numbers it was written for.  Only the matched note
# is removed, so whitespace that surrounded it is left in place.
#
# NOTE(review): '\\(rev.*\\)' and '\\(see also.*\\)' are greedy and run to the
# last ')' in the title -- confirm no title has a second, unrelated
# parenthetical after such a note.
remove_parenthetical_notes = function(x) {
  note_patterns = c(
    '\\(expanded version\\)', # 8387, 12024
    '\\(long version\\)', # 14662, 15660, 15881
    '\\(part.*\\)$', # 3018, 3344
    '\\(rev.*\\)', # 337, 409, 414, 508, 584, 660, 942
    '\\(see also.*\\)', # 3131, 3132
    '\\(series.*ble\\)' # 8467
  )
  # TRUE is spelled out: the shorthand T is an ordinary variable, and
  # reassigning it would silently switch off case-insensitive matching
  for (pattern in note_patterns) {
    x = gsub(pattern, '', x, ignore.case = TRUE, perl = TRUE)
  }
  x
}
# Define function for removing HTML tags, replacing non-ASCII characters
# with ASCII equivalents, forcing spaces after colons, and squishing whitespace
#
# Takes a character vector and returns the cleaned character vector.  The
# steps run in the order given above.
clean_text = function(x) {
  # Unwrap paired tags, keeping the text they enclose.  The inner match is
  # lazy so that each pair is unwrapped on its own; a greedy inner match runs
  # from the first opening tag to the last closing tag and leaves every tag in
  # between in place (e.g. '<i>A</i> and <b>B</b>' -> 'A</i> and <b>B').
  # The substitution is repeated until no pair is left so that nested pairs
  # are unwrapped as well; every pass shortens the text, so the loop ends.
  tag_pair = '<.*?>(.*?)</.*?>'
  while (any(grepl(tag_pair, x, perl = TRUE))) {
    x = gsub(tag_pair, '\\1', x, perl = TRUE)
  }
  # Character-for-character substitutions, applied in the order listed
  ascii_map = list(
    c('à', 'a'),
    c('á', 'a'),
    c('é', 'e'),
    c('í', 'i'),
    c('ï', 'i'),
    c('ñ', 'n'),
    c('ô', 'o'),
    c('`', '\''),
    c('‘', '\''),
    c('’', '\''),
    c('“', '"'),
    c('”', '"'),
    c('‐', '-'),
    c('–', '--'),
    c('—', '---')
  )
  for (pair in ascii_map) {
    x = gsub(pair[1], pair[2], x, perl = TRUE)
  }
  # Insert a space between a colon and a directly following letter or digit
  x = gsub(':([A-Za-z0-9])', ': \\1', x, perl = TRUE)
  # Trim the ends and collapse internal runs of whitespace to a single space
  stringr::str_squish(x)
}
# Define function for fixing title-specific errors
#
# Takes a character vector of titles and returns it after applying the
# (pattern, replacement) rules below with gsub(perl = TRUE), one rule after
# another in the order listed.  The trailing comment on each rule lists the
# paper(s) the rule was written for, but every rule is applied to every
# element of x, so the order and exact spelling of the rules matter.
fix_title = function(x) {
  # One rule per row: pattern first, replacement second
  fixes = matrix(c(
    ' -A', ' - A', # 13
    'Useof', 'Use of', # 66
    'THe', 'The', # 86
    'nS', 'n S', # 138
    'fL', 'f L', # 158
    'Rosetak', 'Rosepack', # 165
    'ome, Cau', 'ome Cau', # 167
    'lA', 'l A', # 171
    'mC', 'm C', # 179
    'nW', 'n W', # 305
    'dE', 'd E', # 336, 4809
    'lT', 'l T', # 337, 570
    'Investigationof', 'Investigation of', # 344
    'Regimeswith', 'Regimes with', # 500
    'Systsems', 'Systems', # 574
    'n:A', 'n: A', # 579, 9799
    'e3', 'es', # 646
    'gC', 'g C', # 677
    'ldA', 'ld A', # 732
    'Markats', 'Markets', # 843
    'Discrepion', 'Discretion', # 889
    'eE', 'e E', # 949, 1850, 3440
    'yF', 'y F', # 993
    'eC', 'e C', # 1063
    'Macro- econ', 'Macroecon', # 1162
    'Commentand', 'Comment and', # 1200
    'Japaneseand', 'Japanese and', # 1264
    ',P', ', P', # 1106
    'Determinationand', 'Determination and', # 1112
    'Mult-Div', 'Multi-Div', # 1213
    'tI', 't I', # 1348
    'sD', 's D', # 1348
    'Fctry Assoc[.] w/Gains in Effcny[?]: Evdnc[.]', 'Factory Associated with Gains in Efficiency? Evidence', # 1386
    'Mnfctr[.]', 'Manufacturing', # 1386
    'onT', 'on T', # 1396
    'nge- ', 'nge', # 1407, 5429
    'fI', 'f I', # 1425, 1655
    'eI', 'e I', # 1460, 3332
    'fU', 'f U', # 1463
    'tR', 't R', # 1480, 1508
    'eF', 'e F', # 1484, 4492
    'nC', 'n C', # 1513
    'Developedand', 'Developed and', # 1528
    'rM', 'r M', # 1559, 6753
    'Compenstation', 'Compensation', # 1578
    'Quardractic', 'Quadratic', # 1581
    'fF', 'f F', # 1586
    'nV', 'n V', # 1623
    'tM', 't M', # 1642
    'eP', 'e P', # 1650, 4596, 5235
    'Comparisonof', 'Comparison of', # 1735
    'tyC', 'ty C', # 1736
    'nE', 'n E', # 1763
    'eM', 'e M', # 1783, 1903
    'Competative', 'Competitive', # 1897
    'RUn', 'Run', # 1931
    'Reasse ssment', 'Reassessment', # 1949
    'lP', 'l P', # 1969
    'dL', 'd L', # 1981
    'lD', 'l D', # 2012, 2682
    'Exchage', 'Exchange', # 2017
    'forthe', 'for the', # 2069
    'rE', 'r E', # 2102
    'eR', 'e R', # 2162, 7385
    'dO', 'd O', # 2173, 7224
    'sU', 's U', # 2215, 6263
    'Choiceby', 'Choice by', # 2292
    'inthe', 'in the', # 2337
    'dT', 'd T', # 2375, 2624
    'Fina([a-z]+)al', 'Financial', # 2376, 3116, 9286
    'Macroeocnomics', 'Macroeconomics', # 2473
    'rP', 'r P', # 2543
    'mE', 'm E', # 2583
    'sure- ', 'sure', # 2950
    'Commited', 'Committed', # 3005
    'Produc- tivity', 'Productivity', # 3026
    'Exhange', 'Exchange', # 3067
    'TAx', 'Tax', # 3074
    'Endogeous', 'Endogenous', # 3085
    'Reinve stment', 'Reinvestment', # 3093
    'Ebolution', 'Evolution', # 3104
    'folio- ', 'folio-', # 3114
    'OBs', 'Obs', # 3132
    'BAr', 'Bar', # 3188
    'Comparive', 'Comparative', # 3194
    'Correlationswith', 'Correlations with', # 3249
    'Inve([a-z]+)ment', 'Investment', # 3249, 10337
    'acor', 'acro', # 3344
    'Competiton', 'Competition', # 3344
    'Genral', 'General', # 3356
    'vs.Living', 'vs. Living', # 3559
    'lB', 'l B', # 3560
    'dC', 'd C', # 3583
    'Com-mon', 'Common', # 3637
    'Thrift, [.][.][.]$', 'Thrift, Public Debt, Capital Taxation and Policy Towards Human Capital Formation', # 3637
    'Endo-genous', 'Endogenous', # 3671
    'Cross- Sec', 'Cross-Sec', # 3806
    'ti- cal', 'tical', # 3811
    'Mid- Cen', 'Mid-Cen', # 3817
    'Engogenous', 'Endogenous', # 4026
    'Elderly-', 'Elderly -', # 4182
    'Studyof', 'Study of', # 4211
    'sF', 's F', # 4270
    'Infaliton', 'Inflation', # 4319
    'Effciency', 'Efficiency', # 4381
    'Persoanl', 'Personal', # 4391
    'yH', 'y H', # 4460
    'e1', 'e 1', # 4496
    'Firm- Level', 'Firm-Level', # 4540
    'Convicton', 'Conviction', # 4551
    '"Household.*Bag,"', 'Household Demand for Garbage and Recycling Collection with the Start of a Price per Bag', # 4670
    'usS', 'us S', # 4731
    'ationa ', 'ations ', # 4754, 10565
    'hC', 'h C', # 4786
    'dF', 'd F', # 4800
    'Quantative', 'Quantitative', # 4819
    'sT', 's T', # 4870
    'fromthe', 'from the', # w5058, h0072, h0085
    'gI', 'g I', # 5060
    'hG', 'h G', # 5170
    'Imigration', 'Immigration', # 5185
    '1890- ', '1890-', # 5314
    'yP', 'y P', # 5362
    'eS', 'e S', # 5364, 6732, 24938
    'Means- Tested', 'Means-Tested', # 5372
    'der- City', 'der-City', # 5425
    'yM', 'y M', # 5474
    'DoB', 'Do B', # 5695
    'Flowof', 'Flow of', # 5712
    'Emp irical', 'Empirical', # w5771
    'Endgenous', 'Endogenous', # 5851
    'Acccoun', 'Accounts Data', # 5884
    'Masschusetts', 'Massachusetts', # 5957
    ',I', ', I', # 6223
    'lyis', 'lysis', # 6310
    'SYs', 'Sys', # 6436
    'Complimentarity', 'Complementarity', # 6600
    'QuUality', 'Quality', # 6753
    'Endogeneously', 'Endogenously', # 7060
    'Equilibriumin', 'Equilibrium in', # 7026
    'Hunderd', 'Hundred', # 7195
    'lC', 'l C', # 7386
    'nA', 'n A', # 7493
    'n: b', 'n\" b', # 7493
    'Comparision', 'Comparison', # 7599
    'SHe', 'She', # 8060
    'Commerical', 'Commercial', # 8060
    ',a', ', a', # 9614
    'Inequiality', 'Inequality', # 9830
    'Nonaddative', 'Nonadditive', # 9895
    'Commision', 'Commission', # 10030
    'Complemetarity', 'Complementarity', # 10350
    's1', 's', # 10447
    'Self_Control', 'Self Control', # 10819
    'mis ', 'mic ', # 11470
    'Collpase', 'Collapse', # 11153
    'N 1$', 'N+1', # 11713
    'contracs', 'contracts', # 11804
    'jA', 'ja', # 12393
    '\\%u2019', '\'', # 12396
    ',2004', ', 2004', # 12607
    ',1980', ', 1980', # 15274
    'ter\\?Cyc', 'ter-Cyc', # 18062
    'China -US', 'China-US', # 19898
    'A-az', 'iaz', # 21350
    '^\\?', '', # 21802
    'E\\?ect', 'Effect', # 22950
    'dor\\?Uni', 'dor-Uni', # 23018
    'CO\\? Em', 'CO2 Em', # 24293, 26086, 26537
    '^q\\?$', 'q5', # 24709
    'Emit CO\\?', 'Emit CO2', # 25063
    ',W', ', W', # 25311
    'Di\\?er', 'Differ', # 25380, 26375
    'Pro\\?t', 'Profit', # 26027
    '\\?E', '? E', # 26268
    'Employess', 'Employees', # 26272
    ' o\\? ', ' off ', # 26624
    'i\\?I', 'i-I', # 27175
    'DEg', 'Deg', # 27239
    'ChinaÂ', 'China', # 27502
    ' R\\? ', ' R ', # 27632
    'R\\?1', 'R<=1', # 28093
    'Ce MENT', 'CeMENT', # 28727
    'CO\\? Fert', 'CO2 Fert', # 29320
    'of-the- ', 'of-the-', # h0073
    'Rela-tion', 'Relation', # h0079
    'tothe', 'to the', # h0080
    'nalExp', 'nal Exp', # t0005
    'Structral', 'Structural', # t0031
    'recent development$', 'recent developments', # t0034
    'tioWhe', 'tio Whe', # t0069
    'ashDixits:"(.*)[.]$', 'ash Dixit\'s "\\1"', # t0077
    'denti- ', 'denti', # t0106
    'JTPA Exp$', 'JTPA Experiment', # t0149
    'Cleaningthe', 'Cleaning the', # t0160
    'ri-zz', 'riz', # t0168
    'Quasi- ', 'Quasi-', # t0170
    'Norwegian [.][.][.]$', 'Norwegian Vocational Rehabilitation Programs', # t0262
    'and Application in', 'and Applications in' # t0281
  ), ncol = 2, byrow = TRUE)
  # Apply the rules sequentially; later rules see the output of earlier ones
  for (i in seq_len(nrow(fixes))) {
    x = gsub(fixes[i, 1], fixes[i, 2], x, perl = TRUE)
  }
  x
}
# Define function for matching patterns using Perl regular expressions
#
# x is the regular expression and y is the character vector to search (note
# the order: pattern first, text second).  Matching ignores case.  Returns a
# logical vector with one element per element of y; missing elements of y
# give FALSE.
check_text = function(x, y) {
  # TRUE is spelled out: the shorthand T is an ordinary variable, and
  # reassigning it would silently change both matching options
  grepl(x, y, ignore.case = TRUE, perl = TRUE)
}
# Extract outlets
#
# Builds one row per paper with an outlet code:
#   1 = at least one record flagged as a top-five journal
#   2 = otherwise, at least one record of type 'journal'
#   3 = otherwise (only 'book' records remain after the type filter)
#
# Papers whose record type is overridden to 'journal'
# (with_prefix comes from the sourced helpers; presumably it prepends the
# series letter to each number -- confirm in data-raw/helpers.R)
journal_false_negatives = c(
  with_prefix(c(1826, 2095, 2297, 2313, 3478, 3609, 3912, 7126, 8243, 9439,
                12220, 13192, 13931, 14570, 14572, 16641, 16795, 16887, 17169, 18745, 19054, 19284,
                20341, 20397, 21465, 22908, 26707, 27290, 27362, 27364, 27547, 28454, 28801), 'w')
)
# Papers that match a top-five pattern but must not be counted as top five
top5_false_positives = c(
  with_prefix(c(2107, 5777, 5805, 6762, 16247), 'w')
)
outlets = outlets_raw %>%
  as_tibble() %>%
  filter(grepl('^(h|t|w)[0-9]', paper)) %>%
  mutate(
    type = replace(type, paper %in% journal_false_negatives, 'journal'),
    # The lookbehind rejects 'Latin American Economic Review' and the
    # lookahead rejects text that mentions 'Insights' later on.
    # %in% is used instead of == for the exact abbreviation tests because ==
    # returns NA for a missing abbreviation; that NA would propagate into
    # top5 and hide a genuine top-five record of the same paper.
    AER = check_text('(?<!Latin )American Economic Review(?!.*?Insights)', published_text) | grepl('aecrev', journal_abbrev) | journal_abbrev %in% 'AER',
    # Papers and Proceedings issues are not counted as AER
    AER = AER & !check_text('Papers and Proceed', published_text),
    ECTA = check_text('Econometrica', published_text) | grepl('emetrp', journal_abbrev),
    JPE = check_text('(?<!(Scottish|European) )Journal of Political Economy', published_text) | grepl('jpolec', journal_abbrev),
    QJE = check_text('Quarterly Journal of Economics', published_text) | grepl('qjecon', journal_abbrev) | journal_abbrev %in% 'QJE',
    RES = check_text('Review of Economic Studies', published_text) | grepl('restud', journal_abbrev),
    top5 = (AER | ECTA | JPE | QJE | RES) & !paper %in% top5_false_positives
  ) %>%
  filter(type %in% c('book', 'journal')) %>%
  group_by(paper) %>%
  # The first matching condition wins, so the codes are ordered by priority
  summarise(outlet = case_when(any(top5) ~ 1,
                               any(type == 'journal') ~ 2,
                               TRUE ~ 3)) %>%
  ungroup()
# Collate working paper information
#
# Builds the exported table with columns paper, year, month, title and outlet.
#
# Papers removed from the final table
excluded_papers = c(
  with_prefix(c(125), 'h'),
  with_prefix(c(306, 331, 332, 335, 336, 337, 338, 345), 't'),
  with_prefix(c(156, 623, 2432, 7044, 7255, 7436, 7565, 8649, 9101, 9694, 13410, 13800, 21929), 'w')
)
papers = papers_raw %>%
  as_tibble() %>%
  # Keep identifiers that start with h, t or w followed by a digit
  filter(grepl('^(h|t|w)[0-9]', paper)) %>%
  # Keep papers issued on or before the boundary date
  filter(issue_date <= max_issue_date) %>%
  # Year and month are read from the yyyy-mm-dd positions of the issue date
  mutate(year = as.integer(substr(issue_date, 1, 4)),
         month = as.integer(substr(issue_date, 6, 7))) %>%
  select(paper, year, month, title) %>%
  # Join on the paper identifier explicitly rather than on whatever columns
  # the two tables happen to share; papers without an outlet record get NA
  left_join(outlets, by = 'paper') %>%
  mutate(title = remove_parenthetical_notes(title),
         title = clean_text(title),
         title = fix_title(title),
         # Strip quotation marks that enclose a whole title, except for the
         # three listed papers, whose titles are kept as they are
         title = if_else(paper %in% c('w9635', 'w9793', 'w12110'), title, sub('^"(.*)"$', '\\1', title))) %>%
  filter(paper != 'w0000') %>%
  filter(!paper %in% excluded_papers) %>%
  sort_by_paper()
# Export data
#
# The table is written twice: as a CSV file and as an .rda file saved in
# workspace format version 2 with bzip2 compression.
write_csv(papers, 'data-raw/papers.csv')
save(
  papers,
  file = 'data/papers.rda',
  version = 2,
  compress = 'bzip2'
)
# Save session info (save_session_info comes from the sourced helpers)
save_session_info('data-raw/papers.log')
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.