# Global knitr chunk options: echo the code of every chunk, disable caching.
knitr::opts_chunk$set(echo = TRUE, cache = FALSE)

Installation

# Install the package through BiocManager.
# NOTE(review): 'seandavi/MungeCuratedMGS' is presumably a GitHub repository
# reference -- confirm.
BiocManager::install('seandavi/MungeCuratedMGS')

Data entities

library(MungeCuratedMGS)

Authentication

Run only once to authenticate, before running vignette (not evaluated in vignette):

library(googlesheets)
# Authenticate once and cache the resulting token on disk so that later runs
# can re-use it without authenticating again.
token <- gs_auth()
saveRDS(token, file = "/tmp/googlesheets_token.rds")

Then run with the vignette, each time:

library(googlesheets)
# Authenticate from the token file written by the one-time setup above,
# with both messages and verbose output switched off.
suppressMessages(
  gs_auth(token = "/tmp/googlesheets_token.rds", verbose = FALSE)
)

Ontologies

EFO

# Retrieve the EFO table via get_efo() and preview its first rows.
efo <- get_efo()
head(efo)

UBERON

# Retrieve the UBERON table via get_uberon() and preview its first rows.
uberon <- get_uberon()
head(uberon)

Ontologies can be combined like so:

library(dplyr)
# Stack the UBERON and EFO tables row-wise into a single table.
full_ontologies <- dplyr::bind_rows(uberon, efo)

NCBI Taxonomy

Names and Taxon IDs

# Locate the taxonomy dump files, extract the taxon names from them, and
# preview the first ten entries.
taxon_dir <- get_taxon_dump_files()
taxon_id_names <- get_taxon_names(taxon_dir)
head(taxon_id_names, 10)

Obtain the set of cMD features that map to an NCBI TaxID

library(curatedMetagenomicData)
# List the datasets matching "*metaphlan*" without fetching them (dryrun),
# and drop every dataset whose name matches "WenC".
ds <- curatedMetagenomicData("*metaphlan*", dryrun = TRUE)
ds <- grep("WenC", ds, value = TRUE, invert = TRUE)
# Fetch the selected datasets and collect the union of their row names.
esets <- curatedMetagenomicData(ds, dryrun = FALSE)
feats <- lapply(esets, rownames)
allfeats <- Reduce(union, feats)
cat(allfeats, file = "../inst/extdata/cMD_allfeats.txt", sep = "\n")
# mustWork = TRUE: without it system.file() returns "" for a missing file,
# and scan("") would then read from the console instead of failing.
feat.file <- system.file("extdata/cMD_allfeats.txt",
                         package = "MungeCuratedMGS", mustWork = TRUE)
allfeats <- scan(feat.file, what = "character")
# Look up each feature in the metaphlan -> NCBI mapping; features without a
# mapping come back as NA and are dropped.
m2n <- metaphlan2ncbi(metaphlan.version = "2.0")
m2n.cmd <- m2n[allfeats]
m2n.cmd <- m2n.cmd[!is.na(m2n.cmd)]
# Two-column table: feature name and the value it maps to.
dat <- cbind(names(m2n.cmd), unname(m2n.cmd))
colnames(dat) <- c("metaphlan", "ncbi")
write.csv(dat, file = "../inst/extdata/cMD_metaphlan2ncbi.csv",
            quote = FALSE, row.names = FALSE)

Data export

Write all curation tables as CSV files

# Load the curation sheet and show its dimensions.
sheet <- curation_sheet()
dim(sheet)
# Derive the study, experiment and signature tables from the sheet.
study <- create_study_table(sheet)
experiment <- create_experiment_table(sheet)
signature <- create_signature_table(sheet)

Create keys and write to file:

# Columns that are left out of the tables handed to create_keys().
freeform.cols <- c("antibiotics exclusion", "Group 1 definition",
                   "source", "description")

exp.cols <- setdiff(colnames(experiment), freeform.cols)
exp.keys <- create_keys("Experiment ", experiment[,exp.cols])
stud.cols <- setdiff(colnames(study), freeform.cols)
stud.keys <- create_keys("Study ", study[,stud.cols])

# reset signature counter (in signature table):
# signatures are numbered 1, 2, ... within each run of identical experiment keys
sig.keys <- paste("Signature", sequence(rle(exp.keys)$lengths))

# reset experiment counter (in signature table):
# within each study, runs of identical experiment keys are numbered 1, 2, ...
# ave() puts every counter back at the row it was computed for, so the result
# stays aligned with the table rows regardless of the order of the study keys.
exp.sig.keys <- ave(
  seq_along(exp.keys), stud.keys,
  FUN = function(rows) {
    runs <- rle(exp.keys[rows])$lengths
    rep(seq_along(runs), runs)
  }
)

signature$Experiment <- paste("Experiment", exp.sig.keys)
signature$Study <- stud.keys
experiment$Study <- stud.keys

signature$`Page Name` <- sig.keys
experiment$`Page Name` <- signature$Experiment
study$`Page Name` <- stud.keys

# keep one row per (study, experiment) pair and one row per distinct study
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
experiment <- experiment[!duplicated(stud.exp),]
study <- unique(study)

# check that the study table lines up, element by element, with the studies
# referenced by the experiment table; the explicit length check keeps `==`
# from silently recycling the shorter vector
exp.studs <- unique(experiment[["Study"]])
stopifnot(length(study[["Page Name"]]) == length(exp.studs),
          all(study[["Page Name"]] == exp.studs))

# check whether there are orphaned experiments or signatures:
# the (study, experiment) pairs of the experiment table must line up with the
# distinct pairs referenced by the signature table
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
stud.exp.sig <- paste(signature[["Study"]], signature[["Experiment"]])
sig.exps <- unique(stud.exp.sig)
stopifnot(length(stud.exp) == length(sig.exps),
          all(stud.exp == sig.exps))

# check whether there are studies that have no experiments
stopifnot(all(study[["Page Name"]] %in% unique(experiment[["Study"]])))

# check whether there are experiments that have no signatures
stopifnot(all(stud.exp %in% unique(stud.exp.sig)))

# export the three tables to the working directory, without row names
write.csv(study, file = "studies.csv", row.names = FALSE)
write.csv(experiment, file = "experiments.csv", row.names = FALSE)
write.csv(signature, file = "signatures.csv", row.names = FALSE)

Double-check via joins:

# Rename the "Page Name" key column of each table to the name used for joining.
colnames(experiment)[colnames(experiment) == "Page Name"] <- "Experiment"
colnames(study)[colnames(study) == "Page Name"] <- "Study"
# Join signatures onto experiments, then the result onto studies.
sig.exp <- plyr::join(experiment, signature, by = c("Study", "Experiment"))
ses <- plyr::join(study, sig.exp, by = "Study")
# Columns shared by the joined table and the original sheet.
icols <- intersect(colnames(ses), colnames(sheet))
# Does column `n` hold the same values, row by row, in the joined table `ses`
# and in `sheet`? Both tables are looked up in the enclosing environment.
# Two NAs in the same row count as equal, an NA next to a value counts as a
# mismatch, and columns of different length are never consistent.
# Always returns a single TRUE or FALSE (never NA).
is.consistent <- function(n) {
  joined <- ses[[n]]
  original <- sheet[[n]]
  if (length(joined) != length(original)) {
    return(FALSE)
  }
  both.na <- is.na(joined) & is.na(original)
  # FALSE & NA is FALSE, so rows with a missing value never yield NA here
  both.equal <- !is.na(joined) & !is.na(original) & joined == original
  all(both.na | both.equal)
}
# Run the consistency check on every shared column (one logical per column)
# and print the result.
cons <- vapply(icols, is.consistent, logical(1))
cons
# PMIDs that appear in more than one row of the study table ...
dupl <- study$PMID[duplicated(study$PMID)]
# ... and the study rows carrying those PMIDs, shown ordered by PMID.
dupl.studs <- study[study$PMID %in% dupl,]
dupl.studs[order(dupl.studs$PMID),]

Write the signatures file again, alternating metaphlan and NCBI columns:

# Positions of the two kinds of taxon columns.
metaphlan.idx <- grep("Metaphlan", colnames(signature))
ncbi.idx <- grep("NCBI", colnames(signature))
# Columns that are neither taxon columns nor key columns go first.
nonsigcols <- grep("Metaphlan|NCBI|Study|Page", colnames(signature),
                   invert = TRUE)
# Stacking the two index vectors as rows and flattening column by column
# yields: first Metaphlan, first NCBI, second Metaphlan, second NCBI, ...
sigcols <- as.integer(rbind(metaphlan.idx, ncbi.idx))
# Key columns go last.
keys <- grep("Study|Page", colnames(signature))
signature2 <- signature[, c(nonsigcols, sigcols, keys)]
write.csv(signature2, file="signatures_alternatingNCBI.csv", row.names = FALSE)

Single PMIDs split into multiple studies

# Count the study rows per PMID and keep the PMIDs that occur more than once.
pmid.counts <- table(study$PMID)
pmidsplits <- names(pmid.counts)[pmid.counts > 1]
# Select the studies with those PMIDs, order them by PMID, print and export.
splits <- study[study$PMID %in% pmidsplits, ]
splits <- splits[order(splits$PMID), ]
splits
write.csv(splits, "splitstudies.csv", row.names = FALSE)
# NOTE(review): these inputs are read from a user-specific Downloads folder;
# the chunk only runs on a machine where those files exist.
signature_ike <- readr::read_csv("~/Downloads/Signature_ike.csv")
study_ike <- readr::read_csv("~/Downloads/Studies_ike.csv")

# Column names of `x` that do not occur in `y`, leaving out every name that
# matches "metaphlan" or "ncbi" (case-insensitive).
novel.cols <- function(x, y) {
  only.x <- colnames(x)[!colnames(x) %in% colnames(y)]
  grep(pattern = "metaphlan|ncbi", x = only.x,
       invert = TRUE, ignore.case = TRUE, value = TRUE)
}

# Column differences in both directions, for signatures and for studies.
novel.cols(signature_ike, signature)
novel.cols(signature, signature_ike)
novel.cols(study_ike, study)
novel.cols(study, study_ike)

Test import-export consistency

Get bulk export files from bugsigdb.org:

# Read the three export tables and restrict them to the studies at hand:
# studies by PMID, experiments and signatures by study page name.
studs <- readr::read_csv("https://tinyurl.com/ycg8fs9x")
studs <- subset(studs, PMID %in% study$PMID)
exps <- readr::read_csv("https://tinyurl.com/yb2fmpa3")
exps <- subset(exps, Study %in% studs$`Study page name`)
sigs <- readr::read_csv("https://tinyurl.com/yakgsowm")
sigs <- subset(sigs, Study %in% studs$`Study page name`)

# One "<study> <experiment> <signature>" string per row of each signature
# table. The columns are pasted directly; apply() would first coerce the
# table to a matrix.
ses.str1 <- do.call(
  paste,
  unname(as.list(sigs[, c("Study", "Experiment", "Signature page name")]))
)
ses.str2 <- do.call(
  paste,
  unname(as.list(signature[, c("Study", "Experiment", "Page Name")]))
)

# Keep only the rows whose composite string occurs in both tables.
ind <- ses.str1 %in% ses.str2
sigs <- sigs[ind,]
ses.str1 <- ses.str1[ind]
signature <- signature[ses.str2 %in% ses.str1,]

Check consistency:

# Pass the locally generated tables (first three arguments) together with the
# downloaded export tables (last three arguments) to testImportExport().
# NOTE(review): what exactly is compared is defined by testImportExport() --
# confirm there.
testImportExport(study, experiment, signature,
                 studs, exps, sigs)


seandavi/MungeCuratedMGS documentation built on July 17, 2021, 7:17 p.m.