# Chunk option applied to the whole document: echo = TRUE.
knitr::opts_chunk$set(echo = TRUE)

# Packages attached for the rest of the script, one per line.
library(tidyverse)
library(XML)
library(ggmap)
library(easyPubMed)
library(lubridate)
# Search PubMed for laser (capture) microdissection transcriptomics papers,
# download the matching records as XML, keep a copy of the raw XML on disk,
# and flatten the records into a data frame.
my_query <- paste(
  "((laser capture microdissection) OR (laser microdissection))",
  "AND ((microarray) OR (transcriptome) OR (RNA-seq))"
)

# Sys.getenv() returns "" when the variable is unset. Pass NULL in that case so
# that an empty key is never forwarded to the API.
pubmed_key <- Sys.getenv("PUBMED_API_KEY")
if (!nzchar(pubmed_key)) {
  pubmed_key <- NULL
}
my_ids <- get_pubmed_ids(my_query, api_key = pubmed_key)

# NOTE(review): retmax = 1500 caps the number of records fetched; confirm the
# query does not return more hits than that.
my_xmls <- fetch_pubmed_data(my_ids, format = "xml", retmax = 1500)
xmlx_list <- articles_to_list(my_xmls)

# Keep the raw XML on disk; it is parsed again later to get article types.
writeLines(my_xmls, "pubmed_lcm.xml")

# Flatten every article into rows of a single data frame.
# NOTE(review): max_chars = 0 presumably skips the abstract text here --
# confirm against the easyPubMed documentation.
df <- map_dfr(xmlx_list, article_to_df, max_chars = 0)
Cool, that took a while. There's some extra information here that I don't necessarily need, but I'll save the whole thing for now.
# Write the complete, unfiltered table to disk.
saveRDS(df, file = "lcm_info.rds")
For now, I only care about the address of the first author.
# Keep only the first row of each article, which is taken to be the first
# author (rows are assumed to be in author order). Articles are identified by
# PMID rather than by title: distinct articles can share a title, or have a
# missing one, and grouping on title would merge them into a single row.
df_1st <- df %>%
  distinct(pmid, .keep_all = TRUE)
Also, I want to exclude reviews, which means I need to get article types.
# Read the publication types of every article from the saved XML and flag the
# articles that list "Review" among them.
xmls <- xmlParse("pubmed_lcm.xml")
types <- lapply(
  xpathApply(xmls, "//PublicationTypeList"),
  function(type_list) xmlSApply(type_list, xmlValue)
)
is_review <- map_lgl(types, ~ "Review" %in% .x)

# is_review is matched to df_1st purely by position. Fail loudly if the two
# are not the same length instead of letting the logical index recycle.
stopifnot(length(is_review) == nrow(df_1st))
df_1st <- df_1st[!is_review, ]

# Combine the year/month/day columns into a single Date column.
df_1st <- df_1st %>%
  unite(col = "date_published", year, month, day, sep = "-") %>%
  mutate(date_published = ymd(date_published))
Just learnt this from Stack Overflow.
# Geocode every first-author address, then extract the country,
# state/province and city from the first geocoding result.

# Drop rows without an address before geocoding, so that no lookups are made
# for missing values.
df_1st <- df_1st %>%
  filter(!is.na(address)) %>%
  mutate(geodata = purrr::map(address, ~ geocode(.x, output = "all")))

# Pull the address components of the first result. Rows for which that path
# does not exist yield NULL and are removed.
df_1st <- df_1st %>%
  mutate(
    address_components = purrr::map(
      geodata,
      list("results", 1, "address_components")
    ),
    to_rm = map_lgl(address_components, is.null)
  ) %>%
  filter(!to_rm) %>%
  select(-to_rm)

# Return the long name of the first component of the given type, or NA when
# the address has no component of that type.
component_name <- function(components, type) {
  hits <- components$long_name[components$types == type]
  if (length(hits) == 0) {
    NA_character_
  } else {
    hits[[1]]
  }
}

# Component types tried, in order, when looking for the city name.
city_types <- c(
  "postal_town",
  "locality",
  "administrative_area_level_2",
  "administrative_area_level_1"
)

df_1st <- df_1st %>%
  mutate(
    # One row per (component, type) pair, so components can be looked up by
    # type with a plain comparison.
    address_components = purrr::map(
      address_components,
      ~ as_tibble(transpose(.x)) %>%
        unnest(c("long_name", "short_name")) %>%
        unnest("types") %>%
        unnest("types")
    ),
    country = map_chr(address_components, component_name, type = "country"),
    `state/province` = map_chr(
      address_components,
      component_name,
      type = "administrative_area_level_1"
    ),
    # First available of the candidate types; NA when none is present.
    city = map_chr(address_components, function(components) {
      candidates <- map_chr(city_types, ~ component_name(components, .x))
      candidates[!is.na(candidates)][1]
    })
  )
This sounds really dumb, but I'm going to geocode the cities again.
# Strip trailing administrative suffixes (presumably romanized Chinese
# "Sheng"/"Shi", plus "City") from the place names. The alternatives are
# grouped so that both the leading whitespace and the end-of-string anchor
# apply to every alternative; ungrouped, "X Shi" became "X " with a trailing
# space and " Sheng"/" City" were removed anywhere in the string.
df_1st <- df_1st %>%
  mutate(
    `state/province` = str_remove(`state/province`, "\\s(Sheng|Shi)$"),
    city = str_remove(city, "\\s(City|Shi)$")
  )
df_1st <- ungroup(df_1st)

# Build one "city, state/province, country" query string per distinct city.
# na.rm = TRUE leaves missing parts out of the string, instead of pasting the
# text "NA" in and stripping it out again with a pattern that could also hit
# a legitimate place name.
cities <- df_1st %>%
  select(city, `state/province`, country) %>%
  distinct() %>%
  filter(!is.na(city)) %>%
  unite(
    col = "city2",
    city, `state/province`, country,
    sep = ", ",
    remove = FALSE,
    na.rm = TRUE
  )

# Geocode the cities and attach the coordinates as a point geometry (EPSG
# 4326).
# NOTE(review): st_as_sf() is expected to reject missing coordinates, so a
# failed lookup would stop the script here -- confirm this is intended.
cities_gc <- geocode(cities$city2)
cities_gc <- cbind(cities, cities_gc)
cities_gc <- sf::st_as_sf(
  cities_gc,
  coords = c("lon", "lat"),
  crs = sf::st_crs(4326)
) %>%
  select(city, city2, `state/province`, country, geometry)

saveRDS(cities_gc, "lcm_city_gc.rds")
# Re-extract the articles, this time without author rows and with the abstract
# text (up to 1e6 characters), keeping the columns from pmid to journal.
abstracts <- map_dfr(
  xmlx_list,
  article_to_df,
  getAuthors = FALSE,
  max_chars = 1e6
) %>%
  select(pmid:journal)

# is_review is matched to abstracts purely by position. Fail loudly if the two
# are not the same length instead of letting the logical index recycle.
stopifnot(length(is_review) == nrow(abstracts))
abstracts <- abstracts[!is_review, ]

# Build the publication date and attach the first-author location columns.
# The join key is spelled out so it cannot silently change if the two tables
# ever gain another column name in common.
abstracts <- abstracts %>%
  unite(col = "date_published", year, month, day, sep = "-") %>%
  mutate(date_published = ymd(date_published)) %>%
  inner_join(
    df_1st[, c("pmid", "address", "country", "state/province", "city")],
    by = "pmid"
  )
Also, remove empty abstracts.
# Finish the abstracts table in a single pipeline: drop rows whose abstract is
# the empty string, add a numeric version of the publication date, and add a
# "city, country" label while keeping the original city and country columns.
abstracts <- abstracts %>%
  filter(abstract != "") %>%
  mutate(date_num = as.numeric(date_published)) %>%
  unite(col = "city2", city, country, sep = ", ", remove = FALSE)

# Write both processed tables to disk.
saveRDS(df_1st, file = "pubmed_lcm_1st_author_no_review.rds")
saveRDS(abstracts, file = "abstracts.rds")
To do: Before I submit the paper, I should write functions to update the LCM abstracts without having to rerun all the API queries from scratch.
# Export both objects as package datasets.
usethis::use_data(abstracts, cities_gc)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.