# Global knitr chunk options: show the code of each chunk, never cache results.
knitr::opts_chunk$set(
  echo = TRUE,
  cache = FALSE
)

# Install the `seandavi/MungeCuratedMGS` repository through BiocManager,
# then attach the package.
BiocManager::install("seandavi/MungeCuratedMGS")
library(MungeCuratedMGS)
Run only once to authenticate, before running vignette (not evaluated in vignette):
# Authenticate once and store the resulting token on disk so that later
# runs can reuse it from the same path.
library(googlesheets)
token <- gs_auth()
saveRDS(token, file = "/tmp/googlesheets_token.rds")
Then run with the vignette, each time:
# Re-authenticate from the token file saved earlier, without console output.
library(googlesheets)
suppressMessages(
  gs_auth(token = "/tmp/googlesheets_token.rds", verbose = FALSE)
)
# Retrieve the EFO terms via the package helper and preview the first rows.
efo <- get_efo()
head(efo)

# Same for UBERON.
uberon <- get_uberon()
head(uberon)
Ontologies can be combined like so:
# Stack both ontology tables row-wise (UBERON rows first, then EFO rows).
library(dplyr)
full_ontologies <- dplyr::bind_rows(uberon, efo)
# Obtain the taxon dump files, derive the taxon names from them, and
# preview the first ten entries.
taxon_dir <- get_taxon_dump_files()
taxon_id_names <- get_taxon_names(taxon_dir)
head(taxon_id_names, 10)
library(curatedMetagenomicData)

# Query the dataset names matching "*metaphlan*"
# (dryrun = TRUE: presumably lists names without fetching data -- confirm).
ds <- curatedMetagenomicData("*metaphlan*", dryrun = TRUE)

# Drop every dataset whose name matches "WenC".
ds <- grep("WenC", ds, value = TRUE, invert = TRUE)

# Fetch the remaining datasets and take the union of their row names.
esets <- curatedMetagenomicData(ds, dryrun = FALSE)
feats <- lapply(esets, rownames)
allfeats <- Reduce(union, feats)

# Write one feature name per line.
cat(allfeats, file = "../inst/extdata/cMD_allfeats.txt", sep = "\n")
# Locate the feature list shipped with the installed package.
# mustWork = TRUE: without it a missing file yields "", and scan("") would
# silently read from stdin instead of failing.
feat.file <- system.file(
  "extdata/cMD_allfeats.txt",
  package = "MungeCuratedMGS",
  mustWork = TRUE
)
allfeats <- scan(feat.file, what = "character")

# Look up each feature in the MetaPhlAn -> NCBI mapping and drop features
# without a match.
m2n <- metaphlan2ncbi(metaphlan.version = "2.0")
m2n.cmd <- m2n[allfeats]
m2n.cmd <- m2n.cmd[!is.na(m2n.cmd)]

# Two-column table (name, mapped value) written as unquoted CSV.
dat <- cbind(names(m2n.cmd), unname(m2n.cmd))
colnames(dat) <- c("metaphlan", "ncbi")
write.csv(
  dat,
  file = "../inst/extdata/cMD_metaphlan2ncbi.csv",
  quote = FALSE,
  row.names = FALSE
)
# Load the curation sheet and report its dimensions.
sheet <- curation_sheet()
dim(sheet)

# Derive the study, experiment and signature tables from the sheet.
study <- create_study_table(sheet)
experiment <- create_experiment_table(sheet)
signature <- create_signature_table(sheet)
Create keys and write to file:
# Free-text columns are excluded when building keys.
freeform.cols <- c(
  "antibiotics exclusion", "Group 1 definition", "source", "description"
)

# Keys for experiments and studies, built from the non-free-text columns.
# NOTE(review): create_keys() is a package helper; the code below relies on
# it returning one key per row of the form "<prefix><integer>" -- confirm.
exp.cols <- setdiff(colnames(experiment), freeform.cols)
exp.keys <- create_keys("Experiment ", experiment[, exp.cols])
stud.cols <- setdiff(colnames(study), freeform.cols)
stud.keys <- create_keys("Study ", study[, stud.cols])

# reset signature counter (in signature table):
# number signatures 1..k within each run of identical experiment keys
sig.rle <- rle(exp.keys)$lengths
sig.rle <- lapply(sig.rle, seq_len)
sig.keys <- paste("Signature", unlist(sig.rle))

# reset experiment counter (in signature table):
# within each study, number the runs of identical experiment keys 1..m
# NOTE(review): split() orders groups by study number, so the flattened
# result lines up with the rows only if stud.keys is already in
# non-decreasing order -- confirm.
exp.spl <- split(exp.keys, as.integer(sub("Study ", "", stud.keys)))
exp.rle <- lapply(exp.spl, function(e) rle(e)$lengths)
exp.sig.keys <- lapply(exp.rle, function(e) rep(seq_along(e), e))
exp.sig.keys <- unname(unlist(exp.sig.keys))

# Attach the keys to the three tables.
signature$Experiment <- paste("Experiment", exp.sig.keys)
signature$Study <- stud.keys
experiment$Study <- stud.keys
signature$`Page Name` <- sig.keys
experiment$`Page Name` <- signature$Experiment
study$`Page Name` <- stud.keys

# Keep one row per (study, experiment) pair and one row per distinct study.
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
experiment <- experiment[!duplicated(stud.exp), ]
study <- unique(study)

# check that the study pages match the studies referenced by experiments
stopifnot(all(study[["Page Name"]] == unique(experiment[["Study"]])))

# check that the experiments match the (study, experiment) pairs
# referenced by signatures
stud.exp <- paste(experiment[["Study"]], experiment[["Page Name"]])
stud.exp.sig <- paste(signature[["Study"]], signature[["Experiment"]])
stopifnot(all(stud.exp == unique(stud.exp.sig)))

# check whether there are studies that have no experiments
stopifnot(all(study[["Page Name"]] %in% unique(experiment[["Study"]])))

# check whether there are experiments that have no signatures
stopifnot(all(stud.exp %in% unique(stud.exp.sig)))

write.csv(study, file = "studies.csv", row.names = FALSE)
write.csv(experiment, file = "experiments.csv", row.names = FALSE)
write.csv(signature, file = "signatures.csv", row.names = FALSE)
Double-check via joins:
# Rename the "Page Name" column so the three tables share join keys.
ind <- colnames(experiment) == "Page Name"
colnames(experiment)[ind] <- "Experiment"
ind <- colnames(study) == "Page Name"
colnames(study)[ind] <- "Study"

# Re-assemble study -> experiment -> signature via joins.
sig.exp <- plyr::join(experiment, signature, by = c("Study", "Experiment"))
ses <- plyr::join(study, sig.exp, by = "Study")

# For every column shared with the original sheet, test whether the joined
# table reproduces the sheet values element by element.
icols <- intersect(colnames(ses), colnames(sheet))
is.consistent <- function(n) all(ses[, n] == sheet[, n])
cons <- vapply(icols, is.consistent, logical(1))
cons
# Show all study rows whose PMID occurs more than once, ordered by PMID.
dupl <- study$PMID[duplicated(study$PMID)]
dupl.studs <- study[study$PMID %in% dupl, ]
dupl.studs[order(dupl.studs$PMID), ]
Write the signatures file again, alternating metaphlan and NCBI columns:
sig.colnames <- colnames(signature)

# Columns that are neither taxon columns nor key columns.
nonsigcols <- grep("Metaphlan|NCBI|Study|Page", sig.colnames, invert = TRUE)

# Interleave the column indices: Metaphlan 1, NCBI 1, Metaphlan 2, NCBI 2, ...
# (cbind() pairs them up, t() + as.integer() reads the pairs row by row).
metaphlan.cols <- grep("Metaphlan", sig.colnames)
ncbi.cols <- grep("NCBI", sig.colnames)
sigcols <- as.integer(t(cbind(metaphlan.cols, ncbi.cols)))

# Key columns go last.
keys <- grep("Study|Page", sig.colnames)

signature2 <- signature[, c(nonsigcols, sigcols, keys)]
write.csv(
  signature2,
  file = "signatures_alternatingNCBI.csv",
  row.names = FALSE
)
Single PMIDs split into multiple studies
# PMIDs that occur in more than one study row.
pmidsplits <- table(study$PMID)
pmidsplits <- pmidsplits[pmidsplits > 1]
pmidsplits <- names(pmidsplits)

# The corresponding study rows, ordered by PMID; the surrounding parentheses
# print the result as well as assigning it.
splits <- study[study$PMID %in% pmidsplits, ]
(splits <- splits[order(splits$PMID), ])
write.csv(splits, "splitstudies.csv", row.names = FALSE)
signature_ike <- readr::read_csv("~/Downloads/Signature_ike.csv")
study_ike <- readr::read_csv("~/Downloads/Studies_ike.csv")

# Column names of `x` that are absent from `y`, ignoring any column whose
# name contains "metaphlan" or "ncbi" (case-insensitive).
unmatched_colnames <- function(x, y) {
  only.x <- colnames(x)[!colnames(x) %in% colnames(y)]
  grep(
    "metaphlan|ncbi", only.x,
    invert = TRUE, ignore.case = TRUE, value = TRUE
  )
}

# Signature tables, both directions.
unmatched_colnames(signature_ike, signature)
unmatched_colnames(signature, signature_ike)

# Study tables, both directions.
unmatched_colnames(study_ike, study)
unmatched_colnames(study, study_ike)
Get bulk export files from bugsigdb.org:
# Studies export, restricted to PMIDs present in the local study table.
studs <- readr::read_csv("https://tinyurl.com/ycg8fs9x")
studs <- subset(studs, PMID %in% study$PMID)

# Experiments and signatures exports, restricted to the retained studies.
exps <- readr::read_csv("https://tinyurl.com/yb2fmpa3")
exps <- subset(exps, Study %in% studs$`Study page name`)
sigs <- readr::read_csv("https://tinyurl.com/yakgsowm")
sigs <- subset(sigs, Study %in% studs$`Study page name`)

# One "<study> <experiment> <signature>" string per row of each table.
# paste() is vectorised over the columns, so no row-wise apply() (and no
# coercion of the data frame to a matrix) is needed.
ses.str1 <- paste(
  sigs[["Study"]], sigs[["Experiment"]], sigs[["Signature page name"]]
)
ses.str2 <- paste(
  signature[["Study"]], signature[["Experiment"]], signature[["Page Name"]]
)

# Keep only the signatures that occur in both the export and the local table.
ind <- ses.str1 %in% ses.str2
sigs <- sigs[ind, ]
ses.str1 <- ses.str1[ind]
signature <- signature[ses.str2 %in% ses.str1, ]
Check consistency:
# Compare the locally built tables (first three arguments) against the
# tables read from the bulk export (last three arguments).
testImportExport(
  study, experiment, signature,
  studs, exps, sigs
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.