# inst/data-raw/process/PMID25687234_Reddy-2015/process.R

library(rvest)
library(data.table)

# Scrape Table 3 of the Reddy 2015 article (PMID25687234) from the publisher's
# site. html_table() returns a list of tables; the first one is kept.
tab3_url <- "https://link.springer.com/article/10.1007%2Fs00125-015-3519-6/tables/3"
tab3 <- html_table(read_html(tab3_url))[[1]]

# Column names applied positionally to the scraped table.
tab3_cols <- c(
  "ID", "duration", "InsPos",
  "CD45.cells.periislet", "CD45.cells.periislet.med",
  "CD45.cells.intraislet", "CD45.cells.intraislet.med",
  "CD45.cells.islet", "CD45.cells.islet.med",
  "total.islets", "insulitis.islets"
)
# Fail early with a clear message if the web table layout has changed, rather
# than mislabelling columns.
if (ncol(tab3) != length(tab3_cols)) {
  stop(
    "Expected ", length(tab3_cols), " columns in scraped table, found ",
    ncol(tab3),
    call. = FALSE
  )
}
names(tab3) <- tab3_cols

# Drop the second column ("duration"); it is not carried forward.
tab3 <- tab3[, -2]
tab3 <- as.data.table(tab3)

# These columns hold text of the form "mean ± SD"; split each into a numeric
# mean column (same name) and a numeric "<name>_SD" column.
mean_cols <- c("CD45.cells.periislet", "CD45.cells.intraislet", "CD45.cells.islet")
sd_cols <- paste0(mean_cols, "_SD")

# Numeric SD part of "mean ± SD" strings. Cells without a " ± " separator get
# NA: stripping the prefix would leave them unchanged, so the mean would
# otherwise be silently reported as the SD.
extract_sd <- function(x) {
  sd_part <- as.numeric(gsub(".* ± ", "", x))
  sd_part[!grepl(" ± ", x)] <- NA_real_
  sd_part
}

# Numeric mean part of "mean ± SD" strings (everything before " ± ").
extract_mean <- function(x) {
  as.numeric(gsub(" ± .*", "", x))
}

# The SD columns must be derived first: the next statement overwrites the
# original text columns with the numeric means.
tab3[, (sd_cols) := lapply(.SD, extract_sd), .SDcols = mean_cols]
tab3[, (mean_cols) := lapply(.SD, extract_mean), .SDcols = mean_cols]

# Reshape to one row per case ID, with every measurement split into separate
# columns for Ins+ and Ins- islets
# (refer to entries starting at case 6209 in web table)
tab3[, InsPos := ifelse(InsPos == "+", "InsPos", "InsNeg")]
# Every column after the first two (ID, InsPos) is a value to spread.
measure_cols <- tail(names(tab3), -2L)
tab3 <- dcast(tab3, ID ~ InsPos, value.var = measure_cols)

# dcast names the new columns "<measure>_<stratum>"; follow conventions by
# flipping them to "<stratum>.<measure>", e.g. "InsPos.total.islets".
name_parts <- strsplit(names(tab3)[-1], "_Ins")
measure <- vapply(name_parts, `[`, character(1), 1L)
stratum <- vapply(name_parts, `[`, character(1), 2L)
setnames(tab3, c("ID", paste0("Ins", stratum, ".", measure)))

# The table caption mentions the "percentage of islets with insulitis in
# insulin-positive and -negative islets", but the web table does not show
# these values explicitly, so they are calculated here.

# Percentage of `part` in `whole`, rounded to 2 decimal places.
insulitis_pct <- function(part, whole) {
  round((part / whole) * 100, digits = 2)
}

tab3[, InsPos.insulitis.prct := insulitis_pct(InsPos.insulitis.islets, InsPos.total.islets)]
tab3[, InsNeg.insulitis.prct := insulitis_pct(InsNeg.insulitis.islets, InsNeg.total.islets)]

# Also insulitis in total islets (not stratified): add the Ins- and Ins+ counts
# per case, treating a missing stratum as contributing nothing.
tab3[, `:=`(
  total.insulitis = rowSums(
    cbind(InsNeg.insulitis.islets, InsPos.insulitis.islets),
    na.rm = TRUE
  ),
  total.islets = rowSums(
    cbind(InsNeg.total.islets, InsPos.total.islets),
    na.rm = TRUE
  )
)]
tab3[, insulitis.prct := insulitis_pct(total.insulitis, total.islets)]

# We also derive, for each case, the peri-islet and intra-islet CD45 values
# combined across the Ins- and Ins+ strata.
#
# This must be a row-wise sum: calling sum(a, b) inside `:=` without `by`
# collapses both whole columns into one grand total and recycles that single
# number into every row. rowSums(.SD, na.rm = TRUE) mirrors how total.insulitis
# and total.islets are built, so a case with only one stratum keeps that
# stratum's value.
# NOTE(review): as with those totals, a case missing both strata yields 0
# rather than NA -- confirm this is the desired convention.
tab3[, "CD45.cells.periislet" := rowSums(.SD, na.rm = TRUE),
     .SDcols = c("InsNeg.CD45.cells.periislet", "InsPos.CD45.cells.periislet")]
tab3[, "CD45.cells.intraislet" := rowSums(.SD, na.rm = TRUE),
     .SDcols = c("InsNeg.CD45.cells.intraislet", "InsPos.CD45.cells.intraislet")]

# Write the processed table as tab-separated text, without row names or
# quoting, into the current working directory.
write.table(
  tab3,
  file = "PMID25687234_1_Reddy-2015.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.