knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(XML)
library(ggmap)
library(easyPubMed)
library(lubridate)
my_query <- "((laser capture microdissection) OR (laser microdissection)) AND ((microarray) OR (transcriptome) OR (RNA-seq))"
my_ids <- get_pubmed_ids(my_query, api_key = Sys.getenv("PUBMED_API_KEY"))
my_xmls <- fetch_pubmed_data(my_ids, format = "xml", retmax = 1500)
xmlx_list <- articles_to_list(my_xmls)
con <- file("pubmed_lcm.xml")
writeLines(my_xmls, con = con)
close(con)
df <- map_dfr(xmlx_list, article_to_df, max_chars = 0)

Cool, that took a while. There's some extra information here that I don't necessarily need, but I'll save the whole thing for now.

saveRDS(df, "lcm_info.rds")

For now, I only care about the address of the first author.

df_1st <- df %>% 
  group_by(title) %>% 
  mutate(author_ord = seq_along(title)) %>% 
  filter(author_ord == 1) %>% 
  select(-author_ord)

Also, I want to exclude reviews, which means I need to get article types.

xmls <- xmlParse("pubmed_lcm.xml")
types <- xpathApply(xmls, "//PublicationTypeList")
types <- lapply(types, function(x) xmlSApply(x, xmlValue))
is_review <- map_lgl(types, ~ any(.x == "Review"))
df_1st <- df_1st[!is_review, ]
df_1st <- df_1st %>% 
  unite(col = "date_published", year, month, day, sep = "-") %>% 
  mutate(date_published = ymd(date_published))

Just learnt this from Stack Overflow.

df_1st <- df_1st %>% 
  mutate(geodata = purrr::map(address, ~ {if (is.na(.x)) NA else geocode(.x, output = "all")}))
# Should have done this earlier
df_1st <- df_1st %>% 
  filter(!is.na(address))
df_1st <- df_1st %>% 
  mutate(address_components = purrr::map(geodata, list("results", 1, "address_components")),
         to_rm = map_lgl(address_components, is.null)) %>% 
  filter(!to_rm) %>% 
  select(-to_rm)
df_1st <- df_1st %>% 
  mutate(address_components = purrr::map(address_components, 
                                  ~ as_tibble(transpose(.x)) %>% 
                                    unnest(c("long_name", "short_name")) %>% 
                                    unnest("types") %>% 
                                    unnest("types")),
         country = map_chr(address_components, ~ .x$long_name[.x$types == "country"]),
         `state/province` = map_chr(address_components, 
                         ~ {out <- .x$long_name[.x$types == "administrative_area_level_1"]
                         if (length(out) == 0) NA_character_ else out
                         }))
df_1st <- df_1st %>% 
  mutate(city = map_chr(address_components, 
                        ~ {out <- .x$long_name[.x$types == "postal_town"]
                        if (length(out) == 0)
                          out <- .x$long_name[.x$types == "locality"]
                        if (length(out) == 0) 
                          out <- .x$long_name[.x$types == "administrative_area_level_2"]
                        if (length(out) == 0)
                          out <- .x$long_name[.x$types == "administrative_area_level_1"]
                        if (length(out) == 0) out <- NA_character_
                        out
                        }))

Sounds like really dumb, but I'm going to geocode the cities again.

df_1st <- df_1st %>% 
  mutate(`state/province` = str_remove(`state/province`, "\\s(Sheng)|(Shi)$"),
         city = str_remove(city, "\\s(City)|(Shi)$"))
df_1st <- ungroup(df_1st)
cities <- df_1st %>%
  select(city, `state/province`, country) %>%
  distinct() %>%
  filter(!is.na(city)) %>%
  unite(col = "city2", city, `state/province`, country, remove = FALSE, sep = ", ") %>%
  mutate(city2 = str_remove(city2, "NA, "))
cities_gc <- geocode(cities$city2)
cities_gc <- cbind(cities, cities_gc)
cities_gc <- sf::st_as_sf(cities_gc, coords = c("lon", "lat"), crs = sf::st_crs(4326)) %>%
  select(city, city2, `state/province`, country, geometry)
saveRDS(cities_gc, "lcm_city_gc.rds")
abstracts <- map_dfr(xmlx_list, article_to_df, getAuthors = FALSE, max_chars = 1e6)
abstracts <- abstracts %>% 
  select(pmid:journal)
abstracts <- abstracts[!is_review,]
abstracts <- abstracts %>% 
  unite(col = "date_published", year, month, day, sep = "-") %>% 
  mutate(date_published = ymd(date_published)) %>% 
  inner_join(df_1st[, c("pmid", "address", "country", "state/province", "city")])

Also remove empty abstracts

abstracts <- abstracts %>% 
  filter(abstract != "")
abstracts <- abstracts %>% 
  mutate(date_num = as.numeric(date_published))
abstracts <- abstracts %>% 
  unite(col = "city2", city, country, sep = ", ", remove = FALSE) 
saveRDS(df_1st, "pubmed_lcm_1st_author_no_review.rds")
saveRDS(abstracts, "abstracts.rds")

To do: Before I submit the paper, I should write functions to update the LCM abstracts without having to start over all the API queries.

usethis::use_data(abstracts)
usethis::use_data(cities_gc)


pachterlab/museumst documentation built on April 20, 2024, 11:26 p.m.