# inst/data-raw/process/PMID23920110_Kavishwar-2013/process.R

library(rvest)
library(tidyr)
library(dplyr)
library(stringr)

# import and extraction
# Fetch the PMC page for Table 1 and parse every HTML table found on it.
tab1 <- html_table(
  read_html("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3840740/table/table1-0022155413502792/")
)

# Take the second parsed table, keep only the case identifier and the staining
# column (renamed to ID / Expr), and split the staining text into a target and
# a value on a colon optionally followed by a space.
tab1 <- as_tibble(tab1[[2]]) %>%
  select(ID = "CaseID", Expr = "Staining for IC2, Insulin and Glucagon") %>%
  separate(Expr, into = c("target", "value"), sep = ": ?")

# reshaping
# Go from one row per (case, target) entry to one row per case with one column
# per target. Intermediates live inside local() so only `tab1` is reassigned.
tab1 <- local({
  # Entries whose value is missing carry no staining information.
  scored <- dplyr::filter(tab1, !is.na(value))
  # Shorten every target label to its first three characters.
  scored <- mutate(scored, target = substr(target, 1, 3))
  # Spread the targets into columns, then discard the "Isl" column.
  wide <- pivot_wider(scored, names_from = target, values_from = value)
  select(wide, -c("Isl"))
})

# recoding
# Turn each staining grade in columns 2 to 4 into a numeric score equal to the
# number of "+" characters in the entry. "None" contains no "+", so it scores 0
# without needing a special case. str_count() is vectorised and returns NA for
# a missing cell, whereas the previous per-element `if (i == "None")` stopped
# with an error as soon as a case lacked one of the stains.
# NOTE(review): mutate_at() is kept instead of across() so the script keeps
# working with dplyr releases older than 1.0 - confirm the minimum version.
tab1 <- tab1 %>%
  mutate_at(2:4, function(x) stringr::str_count(x, "\\+"))

# We'll leave the Ins score for 6110 as 1 since that seems the most reasonable encoding
# Write the scores as a tab-separated file without row names or quoting.
write.table(
  tab1,
  "PMID23920110_1_Kavishwar-2013.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.