library(tidyverse)
library(historicalnetworks)
# yf_corpus <- build_corpus(keywords = "yellow fever",
# date_range = "1700 TO 1899")
#
# # saving this output is a good idea, given the runtime
# save(list = "yf_corpus", file = "data/yf_corpus.rda")
corpus_keywords1 <- c("yellow fever", "fever", "disease", "medicine", "health", "military hygiene")
corpus_keywords2 <- c("vomito", "fiebre", "enfermidad", "medicina", "salud", "higiene",
"fièvre", "maladie", "médecine", "santé", "hygiène",
"west indies", "caribbean", "mexico", "new spain")
corpus_keywords3 <- c("caribe", "méxico",
"caraïbes", "mexique", "saint domingue")
all_corpus1 <- purrr::map_df(corpus_keywords1, function(keyword) {
corpus_part <- tryCatch({build_corpus(keywords = keyword,
date_range = "1650 TO 1900",
max_results = 50000)},
error = function(e) {
tibble(id = "error",
author = keyword)
})
return(corpus_part)
})
all_corpus2 <- purrr::map_df(corpus_keywords2, function(keyword) {
corpus_part <- tryCatch({build_corpus(keywords = keyword,
date_range = "1650 TO 1900",
max_results = 50000)},
error = function(e) {
tibble(id = "error",
author = keyword)
})
return(corpus_part)
})
all_corpus3 <- purrr::map_df(corpus_keywords3, function(keyword) {
corpus_part <- tryCatch({build_corpus(keywords = keyword,
date_range = "1650 TO 1900",
max_results = 50000)},
error = function(e) {
tibble(id = "error",
author = keyword)
})
return(corpus_part)
})
keywords_to_redo <- bind_rows(all_corpus1, all_corpus2, all_corpus3) %>%
filter(id == "error") %>%
pull(author)
found_items <- purrr::map(keywords_to_redo, function(keyword) {
internetarchive::ia_search(c(text = keyword, date = date_range),
num_results = max_results)
}) %>%
unlist() %>%
unique() %>%
sort() %>%
internetarchive::ia_get_items()
ia_metadata2 <- function(items) {
metadata_to_data_frame <- function(i) {
m <- unlist(i$metadata)
if (is.character(unlist(i$metadata$identifier))) {
data_frame(id = unlist(i$metadata$identifier), field = names(m),
value = unname(m))
} else {
data_frame(id = NA_character_, field = NA_character_,
value = NA_character_)
}
}
dfs <- lapply(items, metadata_to_data_frame)
bind_rows(dfs) %>% filter(!is.na(id))
}
metadata <- found_items %>%
ia_metadata2() %>%
tidyr::spread(key = field, value = value) %>%
select(id, title, date, creator, creator1, creator2, volume, publisher) %>%
mutate(author = ifelse(is.na(creator), creator1, creator),
city = ifelse(str_detect(publisher, ":"),
str_extract(publisher, "^.*?(?=:)") %>%
str_replace("(^[^,]*).*", "\\1"),
str_replace(publisher, "(^[^,]*).*", "\\1")) %>%
str_replace("^A\\s+", "") %>%
str_replace("^Printed in\\s+", "") %>%
str_replace_all("[\\[\\]]", "") %>%
str_replace("-", " ") %>%
str_replace("\\?\\s*$", "") %>%
ifelse(test = str_detect(., "&|and"), NA_character_, .) %>%
str_trim()) %>%
select(id, author, date, title, city, publisher, creator2, volume)
all_corpus4 <- metadata %>%
left_join(found_items %>%
ia_files2() %>%
filter(str_detect(file, "djvu\\.txt$")) %>%
group_by(id) %>%
internetarchive::ia_download(dir = download_dir, overwrite = FALSE)) %>%
filter(!is.na(local_file)) %>%
select(-file, -type)
all_corpus_raw <- bind_rows(all_corpus1, all_corpus2, all_corpus3, all_corpus4) %>%
filter(!id=="error")
save(list = "all_corpus_raw", file = "data/all_corpus_raw.rda")
load("data/all_corpus_raw.rda")
all_files <- list.files("data-raw/corpus")
all_corpus_raw <- all_corpus_raw %>% filter(local_file %in% paste0("data-raw/corpus/", all_files))
yf_keywords <- c("yellow fever", "fiebre amarillo", "fièvre jaune", "febre amarella",
"black vomit", "vomito negro", "vomito prieto",
"maladie de siam")
yf_corpus_all <- all_corpus_raw %>%
omit_duplicates() %>%
subset_corpus(yf_keywords) %>%
mutate(local_file = str_replace(local_file, "corpus", "yf_corpus"))
dir.create("data-raw/yf_corpus")
file.copy(str_replace(yf_corpus_all$local_file, "yf_corpus", "corpus"), yf_corpus_all$local_file)
save(list = "yf_corpus_all", file = "data/yf_corpus_all.rda")
load("data/yf_corpus_all.rda")
yf_corpus <- yf_corpus %>%
omit_duplicates() %>%
filter(!(str_detect(title, "Magazine|Journal|Gazette|(Scientific American)")))
race_pattern <- "blacks|negroes|slave|africa|négres|esclave|africain|senegal|negros|esclavo"
yf_corpus_race <- yf_corpus %>%
mutate(local_file = str_replace(local_file, "/corpus/", "/yf_corpus/")) %>%
subset_corpus(race_pattern)
lining_race_1g <- yf_corpus_race %>%
find_citing(data_frame(cited_author = "Lining", cited_year = 1756), near = race_pattern)
classified_lining_race_1g <- classify_citing(lining_race_1g) %>%
filter(classification > 0) %>%
mutate(author = recode(author,
"Royal College of Physicians of London. n 80046799" = "De Maria, Alfonso"))
lining_race_2g <- yf_corpus_race %>%
find_citing(classified_lining_race_1g, near = race_pattern)
classified_lining_race_2g <- classify_citing(lining_race_2g) %>%
filter(classification > 0) %>%
mutate(author = recode(author,
"Royal College of Physicians of London. n 80046799" = "De Maria, Alfonso"))
cross_cites <- bind_rows(classified_lining_race_1g,
classified_lining_race_2g) %>%
find_citing(classified_lining_race_2g, near = race_pattern)
classified_cross_cites <- classify_citing(cross_cites) %>%
filter(classification > 0)
all_cites <- bind_rows(classified_lining_race_1g,
classified_lining_race_2g,
classified_cross_cites) %>%
distinct() %>%
mutate(citing_author = if_else(str_detect(author, ","),
str_replace(author, "(^[^,]*).*", "\\1"),
author),
citing_author = recode(citing_author,
"Great Britain. General Board of Health" = "GB GBH",
"Jones" = "Jones and Allen"),
citing = paste(citing_author, date),
cited = recode(cited,
"Jones 1794" = "Jones and Allen 1794")) %>%
select(cited, citing, date, classification)
lining_plot <- citation_network_plot(all_cites)
####
elevation_pattern <- "(elevat(ion|ed))|altitude|oaks?|hills?|mountains?|robles?|montañas?|colina|((high|low)\\s+(lands?|grounds?))|(lieux\\s+(éléves|bas))|(tierras?\\s+(altas?|bajas?))"
yf_corpus_elevation <- yf_corpus_all %>%
subset_corpus(elevation_pattern)
yf_corpus_elevation <- yf_corpus_elevation %>%
mutate(year = str_extract(date, "\\d{4}"))
yf_corpus_elevation2 <- yf_corpus_elevation %>%
subset_corpus(match_near(match_vector(yf_keywords), elevation_pattern))
early_elevation0 <- bind_rows(tibble(cited_author = "Duhamel", cited_year = 1759),
tibble(cited_author = "Lind", cited_year = 1768), # b21365246 (1788 ed); 1768 ed has quote at 198-199
tibble(cited_author = "Monro", cited_year = 1764), # accdisease00monr (cites Duhamel)
tibble(cited_author = "Rollo", cited_year = 1783), # b21947430 (cites Monro and Lind (2d edition))
tibble(cited_author = "Hunter", cited_year = 1788), # * observationsondi00hunt
tibble(cited_author = "Jackson", cited_year = 1791), # * atreatiseonfeve00jackgoog (disagrees, at 87)
tibble(cited_author = "Chisholm", cited_year = 1799), # * 2546006R.nlm.nih.gov; b21299018_0001
tibble(cited_author = "Humboldt", cited_year = 1811), # politicalessayo00blacgoog
tibble(cited_author = "Bally", cited_year = 1814), # b28740476 (pp324-326, 334-336; cites Humboldt and Chisholm as additional examples)
tibble(cited_author = "Dickinson", cited_year = 1819), # * b28741365 at 59 (cites Humboldt)
tibble(cited_author = "Tulloch", cited_year = 1835))
# * In all_corpus_raw, but not yf_corpus_elevation2
early_elevation <- all_corpus_raw %>%
distinct() %>%
filter(id == "b21365246" | # Lind 1788 [1768]
id == "accdisease00monr" | # Monro 1764
id == "b21947430" | # Rollo 1783
id == "observationsondi00hunt" | # Hunter 1788
id == "atreatiseonfeve00jackgoog" | # Jackson 1791 at 72, 189
id == "b21299018_0001" | # Chisholm 1801 at 277-278; not in earlier versions of Chisholm
id == "politicalessayo09humbgoog" | # Humboldt 1811
id == "b28740476" | # Bally 1814
id == "b28268428") %>% # Dickinson 1819
bind_rows(tibble(id = NA, author = "Duhamel", date = "1759"),
tibble(id = NA, author = "Du Hamel", date = "1759"),
tibble(id = NA, author = "Lind", date = "1768")) %>%
mutate(local_file = str_replace(local_file, "corpus", "yf_corpus"))
early_elevation2 <- all_corpus_raw %>%
distinct() %>%
filter(id == "b21304415" | # Bancroft 1811
id == "observationsondi00blan" | # Blane 1785
id == "2548030R.nlm.nih.gov" | # Currie 1794 at 71
id == "2541029R.nlm.nih.gov" | # Alibert 1807 at 175
id == "presentstateofsp02walt" | # Walton 1810 at 101 "Elevated and airy situations are the most healthy . . . [vomito] is observable in Vera Cruz, from example, and arises from the situation and moist vicinity of that city ; hence it ceases when we get into the interior."
id == "histoiredesmalad01poup" | # Poupée-Desportes 1770 at 66
id == "2551017R.nlm.nih.gov" | # Davidge 1814 at 72
id == "b21297940" | # Pym 1815 at 201
id == "b22372167" | # Doughty 1816 at 73
id == "in.ernet.dli.2015.31007" | # Fergusson 1817 at 130
id == "2574034R.nlm.nih.gov" | # Hosack 1820 at 949
# id == "b21353815_0" | # Dalmas 1820 (2d ed., totally rewritten) at 65 "Enfin, lorsqu'en 1802 l'armée du général Leclerc s'empara de Saint-Domingue, j'appris du général Thouvenot que les troupes stationées sur le plateau de Plaisance conservérent leur viguer et leur santé, quoique la fièvre jaune y fut plusieurs fois importée" du Cap
id == "2577008R.nlm.nih.gov" | # Whitney 1833, _The Family Physician_(!), at 22: "These precautions should be strictly attended to by soldiers while marching, and their marches should be short, and their quarters on high airy grounds, or if on low marshy grounds it should be drained by ditching."
id == "cihm_43284" | # Martin 1834 (History of the British Colonies, Vol. 2) at 179-180 (re Jamaica)
id == "atwelvemonthsre00maddgoog" | # Madden 1835 at 337-338, quotes British Army surgeon Dr. M'Grath (long at Jamaica): "cases of yellow fever are always to be found in Jamaica except in the mountainous districts where at a certain elevation it is wholly unknown." Never at over 3000 ft
# id == "cihm_38226" | # Martin 1834b at 6
id == "63540640RX1.nlm.nih.gov" | # Tweedie 1840 at 337
id == "b29339819" | # Curtis 1839 at 104-105: "Residents in the East or West Indies should be very careful in the choice of the situation of their dwellings: the more elevated, and the further removed from all kinds of water, the better."
id == "b21454309" | # Alison 1843 at 420 (textbook?) : "the worst fevers of the tropical climates are nearly confined to a moderate elevation above the level of the sea"
id == "narrativeatouri02henrgoog" | # Tudor 1834 at 161-162
id == "b21359076" # Thomas 1808 at 57
) %>% bind_rows(tibble(id = NA, author = "Tulloch", date = "1835"),
tibble(id = NA, author = "Dalmas", date = "1805")) # [Dalmas 1805 at 41 not in archive.org except reprint in mmoiresurlanon00lefo] "J'ai appris du général Thouvenot , chef de l'état-major, que les troupes qui occupoieat le plateau élevé de Plaisance ont toujours joui d'une bonne santé, malgré que la maladie y ait élé plusieurs fois apportée par des personnes qui l'avoient prise au Cap, ou ailleurs"
cross_ee <- early_elevation %>%
filter(!is.na(local_file)) %>%
find_citing(early_elevation, near = elevation_pattern)
classified_cross_ee <- classify_citing(cross_ee) %>%
filter(classification > 0)
cites_early_elevation <- yf_corpus_elevation2 %>%
filter(!is.na(local_file)) %>%
find_citing(early_elevation, near = elevation_pattern) %>%
arrange(year, author, cited) %>%
filter(!(str_detect(title, "Magazine|[jJ]ournal|Gazette|(Scientific American)")))
# done <- read_csv("classify_cites_early_elevation.csv") %>%
# left_join(yf_corpus_elevation2, by = "id")
#
# cites_early_elevation <- cites_early_elevation %>%
# anti_join(done, by = "author")
cites_early_elevation_pre1846 <- cites_early_elevation %>% filter(year < 1846)
classified_early_elevation_pre1846_1g <- classify_citing(cites_early_elevation_pre1846)
classified_early_elevation_1g2 <- read_csv("classify_cites_early_elevation_pre1846.csv", col_types = "ccicc") %>%
filter(classification > 0) %>%
left_join(yf_corpus_elevation2, by = c("id"))
cites_ee2 <- yf_corpus_elevation2 %>%
filter(year < 1846) %>%
filter(!is.na(local_file)) %>%
find_citing(early_elevation2, near = match_near(elevation_pattern, match_vector(yf_keywords))) %>%
arrange(year, author, cited) %>%
filter(!(str_detect(title, "Magazine|[jJ]ournal|Gazette|(Scientific American)")))
classified_cites_ee2 <- classify_citing(cites_ee2) %>%
filter(classification > 0)
classified_cross_ee <- read_csv("classify_cross_ee.csv", col_types = "ccicc") %>%
filter(classification > 0) %>%
left_join(yf_corpus_elevation, by = c("id"))
classified_early_elevation_1g2 <- read_csv("classify_cites_early_elevation_pre1846.csv", col_types = "ccicc") %>%
filter(classification > 0) %>%
left_join(yf_corpus_elevation2, by = c("id"))
classified_cites_ee2 <- read_csv("classify_cites_ee2.csv", col_types = "ccicc") %>%
filter(classification > 0) %>%
left_join(yf_corpus_elevation2, by = c("id"))
classified_all <- bind_rows(classified_cross_ee,
classified_early_elevation_1g2,
classified_cites_ee2) %>%
filter(classification == 1)
elevation_cites0 <- classified_all %>%
mutate(author = recode(author,
"Royal College of Physicians of London" = "Dickinson, Nodes",
"Royal College of Physicians of Edinburgh" = if_else(id == "b21914461", "Johnson", "Arnold"),
"University of Glasgow. Library" = "Hutchison",
"Not Available" = "Duncan",
"Académie des sciences (France) n 50055092" = "Portal"),
citing_author = if_else(str_detect(author, ","),
str_replace(author, "(^[^,]*).*", "\\1"),
word(author, -1)),
citing = paste(citing_author, date),
city = recode(city,
"C. S. Van Winkle" = "New York",
"Longman" = "London"),
city = if_else(is.na(city), "London", city),
classification = if_else((str_detect(city, "London|Glasgow|Edinburgh") | str_detect(author, "Mill")), 1,
if_else((str_detect(city, "Paris|Bruxelles|(Saint Pierre)") | str_detect(author, "Humboldt|Duhamel|Poupée-Desportes")), 3,
if_else(str_detect(city, "Philadelphia|(New York)|Boston|Lexington|Charlottesville|Charleston|Cahawba"), 2,
if_else(city == "Erlangen", 4, NA_real_)))),
stage = if_else(year < 1793, 1,
if_else(year < 1829, 2,
if_else(year < 1839, 3, 4)))) %>%
filter(!str_detect(author, "Rush"))
elevation_cites <- elevation_cites0 %>%
select(cited, citing, date, classification, stage) %>%
distinct()
elevation_cited_only <- data_frame(citing = elevation_cites$cited[!elevation_cites$cited %in% elevation_cites$citing]) %>%
mutate(year = str_extract(citing, "\\d{4}") %>%
as.numeric(),
classification = if_else(str_detect(citing, "Humboldt|Duhamel|Poupée-Desportes|Dalmas|Moreau"), 3, 1)) %>%
distinct()
# elevation_2g <- yf_corpus_race %>%
# find_citing(classified_elevation_1g, near = elevation_pattern)
#
# classified_elevation_2g <- classify_citing(elevation_2g) %>%
# filter(classification > 0)
#
# cross_cites <- bind_rows(classified_elevation_1g,
# classified_elevation_2g) %>%
# find_citing(classified_lining_race_2g, near = race_pattern)
#
# classified_cross_cites <- classify_citing(cross_cites) %>%
# filter(classification > 0)
# cited_works %>% filter(!only_citing) %>% left_join(yf_corpus_all %>% mutate(citing_author = if_else(str_detect(author, ","),
# str_replace(author, "(^[^,]*).*", "\\1"),
# word(author, -1)),
# citing = paste(citing_author, date)), by = "citing") %>% View() #write_csv("cited_works.csv")
citation_network_plot(elevation_cites, elevation_cited_only,
label_only_cited = TRUE, custom_plot = TRUE) +
geom_edges(color = "grey85",
arrow = arrow(length = unit(5, "pt")),
curvature = 0.03) +
geom_nodes(size = 4, shape = 21,
aes(color = as.factor(cite_network$classification),
fill = as.factor(cite_network$classification))) +
geom_nodetext(aes(label = vertex.names)) +
theme(legend.background = element_blank(), legend.key = element_blank(),
panel.background = element_blank(), panel.border = element_blank(),
strip.background = element_blank(), plot.background = element_blank(),
axis.line = element_blank(), panel.grid = element_blank()) +
scale_fill_manual(values = c("1" = "white", "2" = "red", "3" = "blue", "4" = "grey75")) +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "red", "4" = "grey75")) +
theme(legend.position="none")
ggsave("vignettes/elevation_plot.jpg", width = 11, height = 7)
ggsave("vignettes/elevation_plot_short.jpg", width = 11, height = 4.5)
citation_network_plot(elevation_cites, elevation_cited_only,
label_only_cited = TRUE, custom_plot = TRUE,
by = "stage") +
geom_edges(color = "grey85",
arrow = arrow(length = unit(5, "pt")),
curvature = 0.03) +
geom_nodes(size = 4, shape = 21,
aes(color = as.factor(cite_network$classification),
fill = as.factor(cite_network$classification))) +
geom_nodetext(aes(label = vertex.names)) +
theme(legend.background = element_blank(), legend.key = element_blank(),
panel.background = element_blank(), panel.border = element_blank(),
strip.background = element_blank(), plot.background = element_blank(),
axis.line = element_blank(), panel.grid = element_blank()) +
scale_fill_manual(values = c("1" = "white", "2" = "red", "3" = "blue", "4" = "grey75")) +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "red", "4" = "grey75")) +
theme(legend.position="none")
# ggplot(cite_network,
# aes(x, y, xend = xend, yend = yend)) +
# coord_flip() +
# theme(
# axis.title.y = element_blank(),
# axis.text.y = element_blank(),
# axis.ticks.y = element_blank(),
# axis.title.x = element_blank()) +
# scale_y_continuous(breaks = pretty(layout$y, n = 5)) +
# expand_limits(y = c(min(pretty(layout$y, n = 5)), max(pretty(layout$y, n = 5)))) +
# geom_edges(color = "grey85",
# arrow = arrow(length = unit(5, "pt")),
# curvature = 0.03) +
# geom_nodes(size = 4, shape = 21,
# aes(color = as.factor(cite_network$classification),
# fill = as.factor(cite_network$classification))) +
# geom_nodetext(aes(label = vertex.names)) +
# theme(legend.background = element_blank(), legend.key = element_blank(),
# panel.background = element_blank(), panel.border = element_blank(),
# strip.background = element_blank(), plot.background = element_blank(),
# axis.line = element_blank(), panel.grid = element_blank()) +
# scale_fill_manual(values = c("1" = "white", "2" = "red", "3" = "blue", "4" = "grey75")) +
# scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "red", "4" = "grey75")) +
# theme(legend.position="none") + facet_wrap(~ stage)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.