# inst/data-raw/process/PMID24498006_Ye-2014/process.R

library(rvest)
library(dplyr)
library(tidyr)


# Read the HTML tables found at the pone-0086985-t001 URL, take the second
# parsed table, and keep rows 2-7 and 10-14 of its first two columns,
# labelled Key and ID.
tab1 <- html_table(
  read_html("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3909047/table/pone-0086985-t001/")
)[[2]]
tab1 <- setNames(tab1[c(2:7, 10:14), 1:2], c("Key", "ID"))

# Same approach for the pone-0086985-t002 URL: second parsed table,
# rows 2-12, columns 1 and 3-5.
tab2 <- html_table(
  read_html("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3909047/table/pone-0086985-t002/")
)[[2]]
tab2 <- setNames(
  tab2[2:12, c(1, 3:5)],
  c("ID", "MMc", "InsPos.MMc", "Anue.Polyploidy.prct")
)
# Replace the scraped first column with the IDs taken from tab1; both
# selections contain 11 rows, so the values line up by position.
tab2$ID <- tab1$ID

# Split the two "x/y" columns at the slash; judging by the target column
# names, the left part is a count (.n) and the right part a percentage (.prct).
tab2 <- as_tibble(tab2)
tab2 <- separate(tab2, col = "MMc", into = c("MMC.n", "MMC.prct"), sep = "/")
tab2 <- separate(
  tab2,
  col = "InsPos.MMc",
  into = c("InsPos.MMC.n", "InsPos.MMC.prct"),
  sep = "/"
)

# The exported dataset keeps only the percentage columns; the counts are
# dropped. Every retained column (ID included) has any "%" removed and is
# then coerced to numeric.
dataset <- mutate_all(
  select(tab2, ID, MMC.prct, InsPos.MMC.prct, Anue.Polyploidy.prct),
  function(cell) as.numeric(gsub("%", "", cell))
)

# Values transcribed by hand from the Supplemental Table
# (pone.0086985.s003.docx), paired with six IDs picked from tab1 by position.
# NOTE(review): positions c(1:3, 9:11) of tab1$ID are assumed to correspond to
# the three T1D cases and three controls listed below -- confirm against the
# source tables.
CD45.prct <- tibble(
  ID = as.numeric(tab1$ID[c(1:3, 9:11)]),
  CD45.prct = as.numeric(gsub("%", "", c(
    "20.6%", # T1D Case 1
    "2.9%",  # T1D Case 2
    "1.08%", # presumably T1D Case 3 (was labelled "Case 2" twice) -- TODO confirm
    "2.06%", # Control 1
    "1.01%", # Control 2
    "0.20%"  # Control 3
  )))
)

# Merge the CD45 percentages into the dataset by ID. The join result must be
# assigned back to `dataset`; otherwise the CD45.prct column never reaches the
# exported file. IDs without a CD45 value get NA in that column.
dataset <- dataset %>% full_join(CD45.prct, by = "ID")

# Export as a tab-separated file without row names or quoting.
write.table(
  dataset,
  file = "PMID24498006_1_Ye-2014.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.