# Attach the curation package used throughout this vignette.
library(curatedMetagenomicDataCuration)
# Global chunk options: echo the source of every chunk and never reuse
# cached chunk results.
knitr::opts_chunk$set(cache = FALSE, echo = TRUE)

This vignette was built on `r Sys.time()` using curatedMetagenomicDataCuration version `r sessionInfo()$otherPkgs$curatedMetagenomicDataCuration$Version`.

Define base GitHub URLs for viewing / editing in place:

# Repository root on GitHub; the view/edit prefixes below both point at the
# `inst` directory of the master branch.
baseurl <- "https://github.com/waldronlab/curatedMetagenomicDataCuration"
# Prefix for browsing a file on GitHub.
viewurl0 <- sprintf("%s/tree/master/inst", baseurl)
# Prefix for opening a file in GitHub's in-browser editor.
editurl0 <- sprintf("%s/edit/master/inst", baseurl)

If `firstonly` is set to `TRUE`, only the results for the first dataset are shown. This can be used as a temporary measure to shorten the output when troubleshooting the syntax-checking system itself.

# Switch for the report below: when TRUE, only the first dataset is rendered.
firstonly <- FALSE

library(readr)

# Explicit column types for the template, so readr does not guess them.
template_spec <- cols(
  col.name = col_character(),
  uniqueness = col_character(),
  requiredness = col_character(),
  multiplevalues = col_logical(),
  allowedvalues = col_character(),
  description = col_character()
)
# Location of the template CSV shipped inside the installed package.
template_path <- system.file(
  "extdata/template.csv",
  package = "curatedMetagenomicDataCuration"
)
template <- read_csv(template_path, col_types = template_spec)
# Build, for every row of `template`, a character vector of Markdown bullet
# lines of the form "* *<column name>*: <value>", one line per template column.
# The resulting list is named by the template's `col.name` values so that a
# definition can be looked up by variable name.
#
# seq_len() is used instead of 1:nrow() so that a template with zero rows
# yields an empty list rather than iterating over c(1, 0).
templatelist <- lapply(seq_len(nrow(template)), function(i) {
  # Transposing the one-row table gives a one-column matrix whose row names
  # are the template's column names.
  fields <- t(template[i, ])
  paste0("* *", rownames(fields), "*: ", fields[, 1])
})
names(templatelist) <- template$col.name
# Locate every curated "*_metadata.tsv" file shipped with the installed
# package.
#   allfiles     - full paths to the metadata files
#   allfiles.rel - the same paths relative to the package root, with a leading
#                  "/" (e.g. "/curated/<dir>/<file>"), ready to be appended to
#                  a URL prefix
curated_dir <- system.file("curated", package = "curatedMetagenomicDataCuration")
allfiles <- dir(
  curated_dir,
  recursive = TRUE,
  # The dot is escaped so it matches a literal "." rather than any character.
  pattern = "_metadata\\.tsv$",
  full.names = TRUE
)
# Strip the package root by length instead of splitting on the package name:
# splitting breaks when the install path contains the package name more than
# once. substring() also keeps the result a character vector when no files
# were found.
allfiles.rel <- substring(allfiles, nchar(dirname(curated_dir)) + 1L)
# Read each metadata file as a tab-separated table and run checkCuration() on
# it. The results are collected in a list keyed by the file's base name.
checks <- setNames(
  lapply(allfiles, function(path) {
    # Column names are kept exactly as written in the file
    # (check.names = FALSE), and only the literal string "NA" is read as
    # missing.
    metadata <- read.delim(
      path,
      sep = "\t",
      stringsAsFactors = FALSE,
      na.strings = c("NA"),
      check.names = FALSE
    )
    checkCuration(metadata)
  }),
  basename(allfiles)
)
# Write one Markdown section per metadata file: a heading with view/edit
# links, followed either by "All checks OK." or by the missing columns, the
# undefined column names and the per-variable entry errors found in `checks`.
ivec <- seq_along(allfiles)
if (firstonly) {
  # head() instead of a literal 1, so that an empty file list stays empty
  # rather than indexing a dataset that does not exist.
  ivec <- utils::head(ivec, 1L)
}

# A check result with all three components NULL is reported as clean.
no_problems <- list(missingcols = NULL, invalidcols = NULL, values = NULL)

for (i in ivec) {
  datasetname <- names(checks)[i]
  check <- checks[[i]]
  viewurl <- paste0(viewurl0, allfiles.rel[i])
  editurl <- paste0(editurl0, allfiles.rel[i])
  cat("# ", datasetname, "\n")
  cat("\n")
  cat("[View](", viewurl, ") or [edit](", editurl, ") this file directly on github.com. \n")
  cat("\n")
  if (identical(check, no_problems)) {
    cat("All checks OK. \n")
    cat("\n")
    next
  }
  if (!is.null(check$missingcols)) {
    cat("## Required columns that are missing \n")
    cat(check$missingcols, "\n")
    cat("\n")
  }
  if (!is.null(check$invalidcols)) {
    cat("## Column name errors \n")
    for (j in seq_along(check$invalidcols)) {
      cat(paste0("* \"", check$invalidcols[j], "\"", " is not defined in the template. \n"))
      cat("\n")
    }
  }
  cat("\n")
  if (!is.null(check$values)) {
    cat("## Entry errors \n")
    cat("\n")
    for (j in seq_along(check$values)) {
      # Offending entries are flagged with "!!!"; skip columns without any.
      if (!any(grepl("!!!", check$values[, j]))) next
      cat("\n")
      problemvariable <- colnames(check$values)[j]
      cat("### ", problemvariable, "\n")
      cat("\n")
      cat("**Template definition** \n")
      cat("\n")
      # Print every template field except the first (the variable name, which
      # is already the heading). The range follows the definition's length
      # instead of assuming exactly six template columns.
      definition <- templatelist[[problemvariable]]
      for (k in seq_along(definition)[-1L]) {
        cat(definition[k], "  \n ")
      }
      cat("\n")
      cat("**Errors** \n")
      cat("\n")
      output <- paste0(check$values$sampleID, "   :   ", check$values[, j])
      # k is the row position in the metadata table; the "!!!" markers are
      # shown as double quotes around the offending value.
      for (k in which(grepl("!!!", output))) {
        cat(k, ". ", gsub("!!!", "\"", output[k]), " \n ")
      }
    }
  }
  cat("\n")
}


waldronlab/curatedMetagenomicDataCuration documentation built on June 9, 2025, 3:56 p.m.