nPOD: What the Package Does (One Line, Title Case)

library(pdftools)
library(dplyr) # req version >= 0.8.3
library(tidyr) # req version >= 1.0.2.9000

# Extract the text of the supplementary PDF (indexed by page below)
pdf <- pdf_text("jem_20111187_sm.pdf")

# Ignore pg1, then cut the remaining pages into chunks wherever a newline
# (plus any leading whitespace) is directly followed by a 6xxx number.
# The lookahead leaves that number at the start of the next chunk.
# strsplit() is already vectorised over its input, so no lapply() is needed.
x <- unlist(strsplit(pdf[-1], "\\n\\s*(?=6[0-9]{3})", perl = TRUE))
# Check the initial parsing against PDF file
substr(x, 1, 4)

# Data selected for import: pancreas region, insulin expression, islet MHC-I, islet CD8
# Other data (e.g. age, gender, comments, HLA typing) are part of nPOD Core and do not need to be extracted

# Keep only the chunks that start with a 6xxx ID
x <- x[grep("^6[0-9]{3}", x)]
# Only the first line of each chunk is used; split it into whitespace-separated tokens
rows <- lapply(x, function(chunk) strsplit(chunk, "\\n")[[1]][1])
rows <- lapply(rows, function(line) strsplit(line, "\\s+")[[1]])
# id data is the first token
id <- lapply(rows, `[`, 1)
# region data is approximately between the 6th and 10th tokens
# NOTE(review): nothing checks that each row yields exactly one region token;
# a row with zero or several matches would misalign unlist(region) with id.
region <- lapply(rows, function(tokens) {
  grep("Head|Body|Tail|Unknown", tokens[6:10], value = TRUE)
})
# Expression tokens start with a plus/minus glyph; the character encoding of
# the PDF is a bit garbled, so the glyphs are normalised to ASCII "-" and "+"
expr <- lapply(rows, function(tokens) tokens[grep("^(⫺|⫹|\\+)", tokens)])
expr <- lapply(expr, function(scores) gsub("⫺", "-", scores))
expr <- lapply(expr, function(scores) gsub("⫹", "+", scores))

# On manual examination (which(lengths(expr) != 3)), one row yields an
# erroneous extracted entry, which we fix by keeping its elements 2 to 4
expr[[55]] <- expr[[55]][2:4]

# Every row must now carry exactly three expression tokens. Fail loudly here
# rather than letting rbind()/cbind() silently recycle values if the PDF
# text extraction ever produces something different.
stopifnot(all(lengths(expr) == 3L))

# Assemble a character matrix with one row per parsed PDF row:
# ID, region string, then the three expression columns
dataset <- cbind(unlist(id), unlist(region), do.call(rbind, expr))
colnames(dataset) <- c("ID", "Region", "Insulin", "MHCI", "CD8")

# Convert to a tibble, lower-case the region, and split every "/"-separated
# field into up to three slots (missing slots are filled with NA on the right).
# Finally recode the expression symbols in all columns after the first four
# (ID, r1, r2, r3) to numbers: "++" -> 2, "+" -> 1, "-" -> 0.
dataset <- as_tibble(dataset) %>%
  mutate(Region = tolower(Region)) %>%
  separate(Region, into = c("r1", "r2", "r3"), sep = "/", fill = "right") %>%
  separate(Insulin, into = paste0("Ins_", 1:3), sep = "/", fill = "right") %>%
  separate(MHCI, into = paste0("MHCI_", 1:3), sep = "/", fill = "right") %>%
  separate(CD8, into = paste0("CD8_", 1:3), sep = "/", fill = "right") %>%
  mutate_at(
    vars(-c(1:4)),
    function(score) recode(score, `++` = 2, `+` = 1, `-` = 0)
  )

# Spread each region slot (r1, r2, r3) in turn, so the matching score columns
# become "<score column>.<region>" (names_sep = "."), e.g. Ins_1.head
dataset <- pivot_wider(
  dataset,
  names_from = r1,
  values_from = c(Ins_1, MHCI_1, CD8_1),
  names_sep = "."
)
dataset <- pivot_wider(
  dataset,
  names_from = r2,
  values_from = c(Ins_2, MHCI_2, CD8_2),
  names_sep = "."
)
dataset <- pivot_wider(
  dataset,
  names_from = r3,
  values_from = c(Ins_3, MHCI_3, CD8_3),
  names_sep = "."
)
# Drop the columns whose names end in "NA" (presumably produced by slots
# that had no region after the right-fill in separate())
dataset <- select(dataset, -ends_with("NA"))

# Collapse the per-slot columns (e.g. Ins_1.head, Ins_2.head, Ins_3.head) into
# one column per marker and region, dropping NAs before uniting.
# names(dataset) is evaluated on the tibble as it was before this pipeline
# started; that is fine because each unite() only consumes its own columns.
dataset <- dataset %>%
  unite("Ins.head", grep("Ins.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("Ins.body", grep("Ins.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("Ins.tail", grep("Ins.*tail", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.head", grep("MHCI.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.body", grep("MHCI.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("MHCI.tail", grep("MHCI.*tail", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.head", grep("CD8.*head", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.body", grep("CD8.*body", names(dataset), value = TRUE), na.rm = TRUE) %>%
  unite("CD8.tail", grep("CD8.*tail", names(dataset), value = TRUE), na.rm = TRUE)


# Rename "unknown" part to generic higher-level part term "pancreas" and coerce to numeric
dataset <- dataset %>%
  # The dot is escaped so it matches only the literal "." separator that
  # pivot_wider() inserted (e.g. "Ins_1.unknown" -> "Ins.panc")
  rename_all(function(nm) gsub("_[1-3]\\.unknown", ".panc", nm)) %>%
  # Every column, including ID, is converted to integer
  mutate_all(as.integer)

# Unfortunately, it is still necessary to do a manual review/comparison with the PDF to check for errors,
# of which there are a few because of the poor formatting of extracted data
dataset$Ins.tail[dataset$ID %in% c(6047, 6090, 6044, 6101, 6080, 6108, 6109, 6082)] <- 2L
# Correct case 6031 one column at a time with scalar values. Assigning a
# length-3 vector across three columns of a single row relies on data.frame
# recycling that newer tibble versions are expected to reject. %in% is used
# instead of == so an NA ID can never produce an NA row index.
dataset$Ins.panc[dataset$ID %in% 6031] <- 1L
dataset$MHCI.panc[dataset$ID %in% 6031] <- 0L
dataset$CD8.panc[dataset$ID %in% 6031] <- 0L

# For nearly all cases, expression is consistent across different regions (one exception is 6065 for Ins expression)
# so we generate an overall score using the mode
# Most frequent non-missing value of `x`.
#   x: an atomic vector, possibly containing NA.
# Returns a length-one vector of the same type as `x`. Ties go to the value
# that appears first; if `x` has no non-missing values the result is NA.
modev <- function(x) {
  observed <- x[!is.na(x)]
  candidates <- unique(observed)
  # Count how often each candidate occurs among the observed values
  counts <- tabulate(match(observed, candidates))
  candidates[which.max(counts)]
}

# Add one overall score per marker for each ID: the mode (via modev) of the
# head, tail, body and whole-pancreas values
dataset <- dataset %>%
  group_by(ID) %>%
  mutate(
    Ins = modev(c(Ins.head, Ins.tail, Ins.body, Ins.panc)),
    MHCI = modev(c(MHCI.head, MHCI.tail, MHCI.body, MHCI.panc)),
    CD8 = modev(c(CD8.head, CD8.tail, CD8.body, CD8.panc))
  ) %>%
  # Drop the grouping so it does not leak into any later use of `dataset`
  ungroup()

# Write the result as a tab-separated file without quoting or row names
write.table(dataset, "PMID22213807_1_Coppieters-2012.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

avucoh/nPOD
What the Package Does (One Line, Title Case)

inst/data-raw/process/PMID22213807_Coppieters-2012/process.R
In avucoh/nPOD: What the Package Does (One Line, Title Case)

R Package Documentation

Browse R Packages

We want your feedback!

avucoh/nPOD What the Package Does (One Line, Title Case)

inst/data-raw/process/PMID22213807_Coppieters-2012/process.R In avucoh/nPOD: What the Package Does (One Line, Title Case)

R Package Documentation

Browse R Packages

We want your feedback!

avucoh/nPOD
What the Package Does (One Line, Title Case)

inst/data-raw/process/PMID22213807_Coppieters-2012/process.R
In avucoh/nPOD: What the Package Does (One Line, Title Case)