In seandavi/MungeCuratedMGS: Data munging for Curated Microbiome Signatures

# Global knitr chunk options for this vignette: echo the source of every
# chunk (echo = TRUE) and turn chunk caching off (cache = FALSE).
knitr::opts_chunk$set(echo = TRUE, cache = FALSE)

Installation

# Install the package from its GitHub repository (owner/repo) via BiocManager.
# Requires the BiocManager package to be installed already.
BiocManager::install('seandavi/MungeCuratedMGS')

Data entities

library(MungeCuratedMGS)

Authentication

Run only once to authenticate, before running vignette (not evaluated in vignette):

# One-time authentication: obtain a Google Sheets token and persist it to
# disk so that later runs can load it instead of authenticating again.
library(googlesheets)
token <- gs_auth()
saveRDS(token, file = "/tmp/googlesheets_token.rds")

Then run with the vignette, each time:

# Authenticate with the token file written by the one-time step above the
# vignette; messages emitted during authentication are suppressed.
library(googlesheets)
suppressMessages(gs_auth(token = "/tmp/googlesheets_token.rds", verbose = FALSE))

Ontologies

EFO

# Fetch EFO through the package helper and preview the first rows.
efo <- get_efo()
head(efo)

UBERON

# Fetch UBERON through the package helper and preview the first rows.
uberon <- get_uberon()
head(uberon)

Ontologies can be combined like so:

# Stack both ontology tables row-wise into a single table.
library(dplyr)
full_ontologies <- dplyr::bind_rows(uberon, efo)

NCBI Taxonomy

Names and Taxon IDs

# Locate the taxonomy dump files, derive the taxon names from them, and
# show the first ten entries.
taxon_dir <- get_taxon_dump_files()
taxon_id_names <- get_taxon_names(taxon_dir)
head(taxon_id_names, n = 10)

Obtain the set of cMD features that map to an NCBI TaxID

# Collect the union of feature (row) names over all matching cMD datasets
# and write them, one per line, into the package's extdata directory.
library(curatedMetagenomicData)
# NOTE(review): dryrun = TRUE presumably returns only the matching dataset
# names without fetching data - confirm against the cMD documentation.
ds <- curatedMetagenomicData("*metaphlan*", dryrun = TRUE)
# Drop every dataset whose name contains "WenC".
ds <- grep("WenC", ds, value = TRUE, invert = TRUE)
esets <- curatedMetagenomicData(ds, dryrun = FALSE)
# Row names of each dataset, merged into one de-duplicated vector.
feats <- lapply(esets, rownames)
allfeats <- Reduce(union, feats)
# The path is relative to the current working directory.
cat(allfeats, file = "../inst/extdata/cMD_allfeats.txt", sep = "\n")

# Read back the feature list shipped with the installed package.
# scan() splits its input on whitespace by default.
feat.file <- system.file("extdata/cMD_allfeats.txt", package = "MungeCuratedMGS")
allfeats <- scan(feat.file, what = "character")

# Look up every feature in the MetaPhlAn-to-NCBI mapping and keep only the
# features for which the lookup did not yield NA.
m2n <- metaphlan2ncbi(metaphlan.version = "2.0")
m2n.cmd <- m2n[allfeats]
m2n.cmd <- m2n.cmd[!is.na(m2n.cmd)]

# Two-column table: feature name next to its mapped value.
dat <- cbind(metaphlan = names(m2n.cmd), ncbi = unname(m2n.cmd))
write.csv(dat, file = "../inst/extdata/cMD_metaphlan2ncbi.csv",
          quote = FALSE, row.names = FALSE)

Data export

Write all curation tables as csv files

# Load the curation sheet via the package helper and report its dimensions.
# NOTE(review): presumably this pulls the Google Sheet authenticated above -
# confirm in curation_sheet().
sheet <- curation_sheet()
dim(sheet)

# Derive the three entity tables from the sheet. The code below indexes all
# three with the same per-row keys, so it relies on each table having one
# row per sheet row at this point - TODO confirm in the create_*_table helpers.
study <- create_study_table(sheet)
experiment <- create_experiment_table(sheet)
signature <- create_signature_table(sheet)

Create keys and write to file:

# Columns left out when computing experiment and study keys (see the
# setdiff() calls below).
freeform.cols <- c("antibiotics exclusion", "Group 1 definition", "source", "description")

# Running signature number over all rows. This value is never used: sig.keys
# is reassigned below, after the per-experiment counter has been computed.
sig.keys <- paste("Signature", seq_len(nrow(signature)))
# Keys are computed from all columns except the free-form ones.
# NOTE(review): create_keys() is defined elsewhere; the code below assumes it
# returns one key per row and that study keys look like "Study <integer>" -
# confirm.
exp.cols <- setdiff(colnames(experiment), freeform.cols)
exp.keys <- create_keys("Experiment ", experiment[,exp.cols])
stud.cols <- setdiff(colnames(study), freeform.cols)
stud.keys <- create_keys("Study ", study[,stud.cols])

# reset signature counter (in signature table)
# rle() yields the length of every run of consecutive identical experiment
# keys; seq_len() turns each run length k into 1..k, so the signature number
# restarts at 1 whenever the experiment key changes between adjacent rows.
sig.rle <- rle(exp.keys)$lengths
sig.rle <- lapply(sig.rle, seq_len)
sig.keys <- paste("Signature", unlist(sig.rle))

# reset experiment counter (in signature table)
# Number the experiments 1, 2, ... within each study: group the experiment
# keys by the integer study number parsed from the study key, take the runs
# of consecutive identical experiment keys inside each group, and repeat the
# run index once per row of that run.
exp.spl <- split(exp.keys, as.integer(sub("Study ", "", stud.keys)))
exp.rle <- lapply(exp.spl, function(e) rle(e)$lengths)
exp.sig.keys <- lapply(exp.rle, function(e) rep(seq_along(e), e))
# split() orders its groups by ascending study number, which need not be the
# row order of the tables. unsplit() writes every group's counters back to
# the rows they were taken from, so the result lines up row by row even when
# the rows are not sorted by study number. For sorted rows this equals
# concatenating the groups.
exp.sig.keys <- unsplit(exp.sig.keys, as.integer(sub("Study ", "", stud.keys)))

# Attach parent keys: each signature row gets its experiment and study key,
# each experiment row gets its study key.
signature[["Experiment"]] <- paste("Experiment", exp.sig.keys)
signature[["Study"]] <- stud.keys
experiment[["Study"]] <- stud.keys

# Every table gets its own key in the "Page Name" column.
signature[["Page Name"]] <- sig.keys
experiment[["Page Name"]] <- signature[["Experiment"]]
study[["Page Name"]] <- stud.keys

# Keep one row per (study, experiment) pair and one row per distinct study.
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
experiment <- experiment[!duplicated(stud.exp), ]
study <- unique(study)

# check that the study table and the studies referenced by the experiment
# table agree element by element. The lengths are compared first because
# `==` silently recycles the shorter vector, which could let a mismatch pass.
stopifnot(
  length(study[["Page Name"]]) == length(unique(experiment[["Study"]])),
  all(study[["Page Name"]] == unique(experiment[["Study"]]))
)

# check whether there are orphaned experiments or signatures: the
# (study, experiment) pairs of the experiment table must agree element by
# element with the distinct pairs referenced by the signature table
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
stud.exp.sig <- paste(signature[["Study"]], signature[["Experiment"]])
stopifnot(
  length(stud.exp) == length(unique(stud.exp.sig)),
  all(stud.exp == unique(stud.exp.sig))
)

# check whether there are studies that have no experiments
stopifnot(all(study[["Page Name"]] %in% unique(experiment[["Study"]])))

# check whether there are experiments that have no signatures
stopifnot(all(stud.exp %in% unique(stud.exp.sig)))

# write the three tables to the current working directory
write.csv(study, file = "studies.csv", row.names = FALSE)
write.csv(experiment, file = "experiments.csv", row.names = FALSE)
write.csv(signature, file = "signatures.csv", row.names = FALSE)

Double-check via joins:

# Rename the key columns so the three tables can be joined on shared names.
ind <- colnames(experiment) == "Page Name"
colnames(experiment)[ind] <- "Experiment"
ind <- colnames(study) == "Page Name"
colnames(study)[ind] <- "Study"

# Rebuild one wide table: study -> experiment -> signature.
sig.exp <- plyr::join(experiment, signature, by = c("Study", "Experiment"))
ses <- plyr::join(study, sig.exp, by = "Study")

# Compare every column that the rebuilt table shares with the original sheet.
icols <- intersect(colnames(ses), colnames(sheet))

# Is column `n` identical in the rebuilt table and in the sheet?
# Returns a single TRUE or FALSE. Two missing values at the same position
# count as equal, and a value missing on one side only counts as a mismatch;
# a plain all(x == y) would return NA as soon as the column contains one NA.
# Columns of different length are reported as inconsistent rather than being
# compared with recycling.
is.consistent <- function(n) {
  joined <- ses[[n]]
  curated <- sheet[[n]]
  if (length(joined) != length(curated)) {
    return(FALSE)
  }
  both.na <- is.na(joined) & is.na(curated)
  both.set <- !is.na(joined) & !is.na(curated)
  all(both.na | (both.set & joined == curated))
}
cons <- vapply(icols, is.consistent, logical(1))
cons

# List all study rows whose PMID occurs more than once, ordered by PMID.
dupl <- study[["PMID"]][duplicated(study[["PMID"]])]
dupl.studs <- study[study[["PMID"]] %in% dupl, ]
dupl.studs[order(dupl.studs[["PMID"]]), ]

Write the signatures file again, alternating metaphlan and NCBI columns:

# Column positions by role: everything that is neither a taxon column nor a
# key, the Metaphlan columns, the NCBI columns, and the key columns.
nonsigcols <- grep("Metaphlan|NCBI|Study|Page", colnames(signature), invert = TRUE)
metaphlan.cols <- grep("Metaphlan", colnames(signature))
ncbi.cols <- grep("NCBI", colnames(signature))
# Stacking the two index vectors as rows and flattening column by column
# interleaves them: Metaphlan 1, NCBI 1, Metaphlan 2, NCBI 2, ...
sigcols <- as.integer(rbind(metaphlan.cols, ncbi.cols))
keys <- grep("Study|Page", colnames(signature))
signature2 <- signature[, c(nonsigcols, sigcols, keys)]
write.csv(signature2, file = "signatures_alternatingNCBI.csv", row.names = FALSE)

Single PMIDs split into multiple studies

# PMIDs that occur in more than one study row.
pmidsplits <- table(study$PMID)
pmidsplits <- names(pmidsplits)[pmidsplits > 1]
# The affected study rows, ordered by PMID; print them and write them out.
splits <- study[study$PMID %in% pmidsplits, ]
splits <- splits[order(splits$PMID), ]
splits
write.csv(splits, "splitstudies.csv", row.names = FALSE)

signature_ike <- readr::read_csv("~/Downloads/Signature_ike.csv")
study_ike <- readr::read_csv("~/Downloads/Studies_ike.csv")

# Column names present in `x` but absent from `y`, ignoring every column
# whose name mentions metaphlan or ncbi (case-insensitive).
cols_only_in <- function(x, y) {
  only.x <- colnames(x)[!colnames(x) %in% colnames(y)]
  grep(pattern = "metaphlan|ncbi", x = only.x, invert = TRUE,
       ignore.case = TRUE, value = TRUE)
}

# Signature table: differences in both directions.
cols_only_in(signature_ike, signature)

cols_only_in(signature, signature_ike)

# Study table: differences in both directions.
cols_only_in(study_ike, study)

cols_only_in(study, study_ike)

Test import-export consistency

Get bulk export files from bugsigdb.org:

# Download the three bulk export tables and restrict them to the studies
# (by PMID) and study pages that are part of the current curation tables.
studs <- readr::read_csv("https://tinyurl.com/ycg8fs9x")
studs <- subset(studs, PMID %in% study$PMID)
exps <- readr::read_csv("https://tinyurl.com/yb2fmpa3")
exps <- subset(exps, Study %in% studs$`Study page name`)
sigs <- readr::read_csv("https://tinyurl.com/yakgsowm")
sigs <- subset(sigs, Study %in% studs$`Study page name`)

# One "study experiment signature" string per row identifies a signature in
# either table. The columns are pasted directly: apply() over the rows of a
# data frame first coerces it with as.matrix(), which pads non-character
# columns through format() and could make otherwise equal keys differ.
ses.str1 <- paste(sigs[["Study"]], sigs[["Experiment"]],
                  sigs[["Signature page name"]])
ses.str2 <- paste(signature[["Study"]], signature[["Experiment"]],
                  signature[["Page Name"]])

# Keep only the signatures that are present in both tables.
ind <- ses.str1 %in% ses.str2
sigs <- sigs[ind, ]
ses.str1 <- ses.str1[ind]
signature <- signature[ses.str2 %in% ses.str1, ]

Check consistency:

# Compare the locally built tables (first three arguments) with the
# corresponding bulk export tables (last three arguments).
# NOTE(review): testImportExport() is defined elsewhere; what exactly it
# compares and how it reports differences is not visible here - confirm.
testImportExport(study, experiment, signature,
                 studs, exps, sigs)

seandavi/MungeCuratedMGS documentation built on July 17, 2021, 7:17 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

seandavi/MungeCuratedMGS
Data munging for Curated Microbiome Signatures

In seandavi/MungeCuratedMGS: Data munging for Curated Microbiome Signatures

Installation

Data entities

Authentication

Ontologies

EFO

UBERON

NCBI Taxonomy

Names and Taxon IDs

Obtain the set of cMD features that map to an NCBI TaxID

Data export

Write all curation tables as csv files

Test import-export consistency

R Package Documentation

Browse R Packages

We want your feedback!

seandavi/MungeCuratedMGS Data munging for Curated Microbiome Signatures

In seandavi/MungeCuratedMGS: Data munging for Curated Microbiome Signatures

Installation

Data entities

Authentication

Ontologies

EFO

UBERON

NCBI Taxonomy

Names and Taxon IDs

Obtain the set of cMD features that map to an NCBI TaxID

Data export

Write all curation tables as csv files

Test import-export consistency

R Package Documentation

Browse R Packages

We want your feedback!

seandavi/MungeCuratedMGS
Data munging for Curated Microbiome Signatures