library(pdftools)
library(dplyr) # req version >= 0.8.3
library(tidyr) # req version >= 1.0.2.9000
# Read the text of the supplementary PDF (pdf_text gives one string per page)
pdf <- pdf_text("jem_20111187_sm.pdf")
# Ignore pg1. Split the remaining pages into chunks at every newline that is
# followed by a 4-digit number starting with 6; the lookahead keeps that
# number at the start of the next chunk.
x <- unlist(lapply(
  pdf[-1],
  function(page) strsplit(page, "\\n\\s*(?=6[0-9]{3})", perl = TRUE)
))
# Check the initial parsing against PDF file
substr(x, 1, 4)
# Data selected for import: pancreas-region, insulin expression, islet MHC-I, islet CD8
# Other data (e.g. in age, gender, comments, HLA typing) are part of nPOD Core and do not need to be extracted
# Keep only the chunks that really begin with a 6xxx number
x <- x[grep("^6[0-9]{3}", x)]
# Only the first line of each chunk is used; vapply guarantees exactly one
# string per chunk
rows <- vapply(
  x,
  function(chunk) strsplit(chunk, "\\n")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)
# Split each first line into whitespace-delimited tokens (list of character vectors)
rows <- strsplit(rows, "\\s+")
# id data is the first index
id <- lapply(rows, `[`, 1)
# region data is approximately between the 6th and 10th index
region <- lapply(
  rows,
  function(tok) grep("Head|Body|Tail|Unknown", tok[6:10], value = TRUE)
)
# expression scores are the tokens starting with a minus/plus sign;
# character encoding of PDF a bit garbled, so the garbled glyphs are matched
# and then mapped back to "-" and "+"
expr <- lapply(rows, function(tok) tok[grep("^(⫺|⫹|\\+)", tok)])
expr <- lapply(expr, function(s) gsub("⫺", "-", s))
expr <- lapply(expr, function(s) gsub("⫹", "+", s))
# on manual examination, realize that one row yields erroneous extracted entry, which we fix
# (find such rows with: which(lengths(expr) != 3))
# NOTE: the index 55 is tied to this specific PDF; re-check it if the input changes
expr[[55]] <- expr[[55]][2:4]
# One row per case: ID, region string, and the three expression strings
dataset <- cbind(unlist(id), unlist(region), do.call(rbind, expr))
colnames(dataset) <- c("ID", "Region", "Insulin", "MHCI", "CD8")
dataset <- as_tibble(dataset)
# Split the slash-delimited region and score fields into up to three parts
# each, turn the score symbols into numbers, then spread the scores into one
# column per (marker, region) combination.
dataset <- dataset %>%
  mutate(Region = tolower(Region)) %>%
  separate(Region, into = c("r1", "r2", "r3"), sep = "/", fill = "right") %>%
  separate(Insulin, into = paste0("Ins_", 1:3), sep = "/", fill = "right") %>%
  separate(MHCI, into = paste0("MHCI_", 1:3), sep = "/", fill = "right") %>%
  separate(CD8, into = paste0("CD8_", 1:3), sep = "/", fill = "right") %>%
  # every column after ID, r1, r2, r3 holds a score symbol
  mutate_at(
    vars(-c(1:4)),
    function(score) recode(score, `++` = 2, `+` = 1, `-` = 0)
  ) %>%
  # the k-th score of each marker belongs to the k-th listed region
  pivot_wider(
    names_from = r1,
    names_sep = ".",
    values_from = c(Ins_1, MHCI_1, CD8_1)
  ) %>%
  pivot_wider(
    names_from = r2,
    names_sep = ".",
    values_from = c(Ins_2, MHCI_2, CD8_2)
  ) %>%
  pivot_wider(
    names_from = r3,
    names_sep = ".",
    values_from = c(Ins_3, MHCI_3, CD8_3)
  ) %>%
  # columns created for a missing 2nd/3rd region carry an "NA" suffix
  select(-ends_with("NA"))
# Collapse, for each marker and region, the columns coming from the 1st/2nd/3rd
# list position into a single column. The column names are looked up on
# `dataset` as it is before this pipeline starts; NAs are dropped so only the
# position that actually held a value remains.
dataset <- dataset %>%
  unite("Ins.head", grep("Ins.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("Ins.body", grep("Ins.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("Ins.tail", grep("Ins.*tail", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.head", grep("MHCI.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.body", grep("MHCI.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.tail", grep("MHCI.*tail", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.head", grep("CD8.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.body", grep("CD8.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.tail", grep("CD8.*tail", names(dataset), value = TRUE), na.rm = TRUE)
# Columns for the "unknown" region are renamed to the generic higher-level
# term "pancreas" (suffix ".panc"); afterwards every column is converted to
# integer.
dataset <- rename_all(
  dataset,
  function(nm) gsub("_[1-3].unknown", ".panc", nm)
)
dataset <- mutate_all(dataset, as.integer)
# Unfortunately, it is still necessary to do a manual review/comparison with the PDF to check for errors,
# of which there are a few because of the poor formatting of extracted data
dataset$Ins.tail[dataset$ID %in% c(6047, 6090, 6044, 6101, 6080, 6108, 6109, 6082)] <- 2
# Case 6031: set the three whole-pancreas scores. A list supplies one value per
# column, which is valid for both data.frame and tibble sub-assignment (a
# length-3 atomic vector is rejected for a single row by recent tibble
# versions); %in% also avoids an NA row subscript should any ID be NA.
dataset[dataset$ID %in% 6031, c("Ins.panc", "MHCI.panc", "CD8.panc")] <- list(1, 0, 0)
# For nearly all cases, expression is consistent across different regions (one exception is 6065 for Ins expression)
# so we generate an overall score using the mode
# Most frequent non-NA value of `x`. Ties go to the value that appears first
# in `x`; if `x` has no non-NA values the result is NA.
modev <- function(x) {
  candidates <- unique(x[!is.na(x)])
  # NA entries of x do not match any candidate and are ignored by tabulate()
  counts <- tabulate(match(x, candidates))
  candidates[which.max(counts)]
}
# Overall score per case and marker: the mode across head, tail, body and
# whole-pancreas columns. Grouping is by ID so modev() sees one case at a time;
# the grouping is dropped again so it does not leak into later use of `dataset`.
dataset <- dataset %>%
  group_by(ID) %>%
  mutate(
    Ins = modev(c(Ins.head, Ins.tail, Ins.body, Ins.panc)),
    MHCI = modev(c(MHCI.head, MHCI.tail, MHCI.body, MHCI.panc)),
    CD8 = modev(c(CD8.head, CD8.tail, CD8.body, CD8.panc))
  ) %>%
  ungroup()
# Tab-separated output without quoting or row names
write.table(
  dataset,
  "PMID22213807_1_Coppieters-2012.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# (Residual text from the hosting web page, not part of the script:)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.