R/megaptera2Rmarkdown.R

Defines functions megaptera2Rmarkdown

Documented in megaptera2Rmarkdown

## This code is part of the megaptera package
## © C. Heibl 2016 (last update 2019-10-30)

#' @title RMarkdown Report
#' @description Creates a status report for a megaptera Project using RMarkdown.
#' @param x An object of class \code{\link{megapteraProj}}.
#' @param file A character vector giving a file name.
#' @param nmax An integer giving the maximun number of taxa for printed lists.
#' @return None, \code{megaptera2Rmarkdown} is called for its side effect.
#' @seealso \code{\link{checkSpecLocus}}, \code{\link{checkSpecies}}
#' @importFrom utils packageDescription
#' @export

megaptera2Rmarkdown <- function(x, file, nmax = 100){
  
  if (missing(file)) file <- "megaptera.Rmd"
  if (length(grep("^report/", file)) == 0){
    file <- paste("report", file, sep = "/")
  }
  
  ## Write CSS file
  ## --------------
  megapteraCSS(file = paste0(gsub("(.*/).*$", "\\1", file),
                             "megaptera.css"))
  
  ## Prepare some strings for output
  ## -------------------------------
  tip_rank <- ifelse(x@taxon@tip.rank == "genus", "genera", "species")
  Tip_rank <- ifelse(x@taxon@tip.rank == "genus", "Genera", "Species")
  
  z <- c("---", 
         paste0('title: "', 
                paste("megaptera", packageDescription("megaptera")$Version),
                '"'),
         paste("date:", Sys.time()),
         "output:", 
         "  html_document:",
         "    toc: true",
         "    css: megaptera.css",
         "---")
  
  save(x, file = "report/megapteraProj.rda")
  
  z <- c(z, 
         "```{r, echo=FALSE, message=FALSE, warning=FALSE}",
         "library(megaptera)",
         "load('megapteraProj.rda')", 
         "```")
  
  ## SETTINGS: TAXONOMY
  ## ------------------
  ingroup_speclist <- all(unlist(sapply(x@taxon@ingroup, is.Linnean)))
  ingroup <- head(x@taxon@ingroup, 3)
  if (ingroup_speclist) ingroup <- paste0("*", ingroup, "*")
  ingroup <- paste(ingroup, collapse = ", ")
  n <- length(x@taxon@ingroup)
  if (n > 3){
    ingroup <- paste0(ingroup, paste0(", ... [", n, " taxa in total]"))
  }
  outgroup_speclist <- all(unlist(sapply(x@taxon@outgroup, is.Linnean)))
  outgroup <- head(x@taxon@outgroup, 3)
  if (outgroup_speclist) outgroup <- paste0("*", outgroup, "*")
  outgroup <- paste(outgroup, collapse = ", ")
  n <- length(x@taxon@outgroup)
  if (n > 3){
    outgroup <- paste0(outgroup, paste0(", ... [", n, " taxa in total]"))
  }
  
  z <- c(z, 
         "# Settings",
         "## Taxonomic settings",
         paste("- **Kingdom**:", x@taxon@kingdom),
         paste("- **Ingroup**:", ingroup),
         paste("- **extend ingroup**:", ifelse(x@taxon@extend.ingroup, "yes", "no")),
         paste("- **Outgroup**:", outgroup),
         paste("- **extend outgroup**:", ifelse(x@taxon@extend.outgroup, "yes", "no")),
         paste("- **Hybrids**:", ifelse(x@taxon@exclude.hybrids, "excluded", "included")),
         paste("- **Tip rank**:", x@taxon@tip.rank),
         paste("- **Reference rank** :", x@taxon@reference.rank),
         paste("- **User-defined guide tree** :", ifelse(inherits(x@taxon, "taxonGuidetree"), "yes", "no")),
         "")
  
  ## SETTINGS: DATABASE
  ## ------------------
  z <- c(z, 
         "## PostgreSQL connection parameters",
         paste("- **host**:", x@db@host),
         paste("- **port**:", x@db@port),
         paste("- **dbname**:", x@db@dbname),
         paste("- **user**:", x@db@user),
         paste("- **password**:", x@db@password),
         "")
  
  ## SETTINGS: PIPELINE
  ## ------------------
  z <- c(z, 
         "## Pipeline settings",
         paste("- **Execution**:", ifelse(x@params@parallel, "parallel", "serial")),
         paste("- **Number of CPUs**:", x@params@cpus),
         paste("- **Type of cluster**:", x@params@cluster.type),
         "")
  
  ## 2: STATUS locus-wise
  ## --------------------
  conn <- dbconnect(x)
  loci <- dbGetQuery(conn, "SELECT DISTINCT locus FROM sequence")
  loci <- loci$locus[!is.na(loci$locus)]
  dbDisconnect(conn)
  
  if (!length(loci)){
    if ("taxonomy" %in% dbTableNames(x, "all")){
      z <- c(z, "# Status of the pipeline", 
             "Taxonomy table has been created (by running stepA)", "",
             "No sequences have been downloaded yet (stepB has to be called next)", "")
    }
  } else {
    
    ## create status table when stepB has been called
    ## at least once
    STATUS <- dbProgress(x, locus.wise = FALSE)
    names(STATUS) <- c("Locus", LETTERS[2:8])
    ss <- STATUS
    ss[ss == "pending"] <- 1
    ss[ss == "success"] <- 2
    ss[ss == "failure"] <- 3
    ss[ss == "error"] <- 4
    ss <- ss[order(ss[, 2], ss[, 3], ss[, 4], ss[, 5], ss[, 6], ss[, 7], ss[, 8]), ]
    ss$Locus <- gsub("^_", "", ss$Locus)
    
    ss[ss == 1] <- '<div class="pending"></div>'
    ss[ss == 2] <- '<div class="success"></div>'
    ss[ss == 3] <- '<div class="failure"></div>'
    ss[ss == 4] <- '<div class="error"></div>'
    ss <- htmlTable(ss)
    z <- c(z, 
           "# Status of the pipeline",
           "<table style=\"border-collapse: collapse;\">",
           "<tr>",
           "<td style=\"border:2px solid #fff;\"><div class=\"pending\"></div></td>",
           "<td style=\"border:2px solid #fff;\">pending, </td>",
           "<td style=\"border:2px solid #fff;\"><div class=\"success\"></div></td>",
           "<td style=\"border:2px solid #fff;\">success,</td>",
           "<td style=\"border:2px solid #fff;\"><div class=\"failure\"></div></td>",
           "<td style=\"border:2px solid #fff;\">failure,</td>",
           "<td style=\"border:2px solid #fff;\"><div class=\"error\"></div></td>",
           "<td style=\"border:2px solid #fff;\">error.</td></tr>",
           "</table><br>",
           ss, "")
  }
  
  
  ## 3: TAXONOMY
  ## ------------------
  tax <- dbReadTaxonomy(x)
  
  ## Formatting of species lists
  ## ---------------------------
  frmt <- function(z){
    z <- paste0("*", z, "*")
    if (length(z) > 1){
      z <- paste(z[1], paste0("(", paste(z[-1], collapse = ", "), ")"))
    }
    paste("-", z)
  }
  
  ## ingroup
  ## -------
  if (ingroup_speclist){
    received  <- lapply(x@taxon@ingroup, intersect, tax$taxon)
    received <- sapply(received, length) > 0
    ingroup_received <- x@taxon@ingroup[received]
    n_ingroup_received <- length(ingroup_received)
    ingroup_missing <- x@taxon@ingroup[!received]
    n_ingroup_missing <- length(ingroup_missing)
    # ingroup_false <- setdiff(ingroup_received, x@taxon@ingroup)
    
    ingroup_coverage <- c(
      "Query of the NCBI Taxonomy Database gave these results:", "",
      paste0("**", n_ingroup_received, "** ingroup species (of ",
             length(x@taxon@ingroup), " = ",
             round(n_ingroup_received / length(x@taxon@ingroup) * 100, 2),
             "%) could be retrieved"))
    if (n_ingroup_received <= nmax){
      ingroup_coverage <- c(ingroup_coverage, "", sapply(ingroup_received, frmt))
    }
    if (n_ingroup_missing){
      ingroup_coverage <- c(ingroup_coverage, "",
                        paste0("**", n_ingroup_missing, "** ingroup species (",
                               round(n_ingroup_missing / length(x@taxon@ingroup) * 100, 2),
                               "%) were missing"))
      if (n_ingroup_missing <= nmax) {
        ingroup_coverage <- c(ingroup_coverage, "", sapply(ingroup_missing, frmt))
      }
      save(ingroup_missing, file = "report/ingroup-missing-taxonomy.rda")
    }
    # if (length(ingroup_false)){
    #   ingroup_coverage <- c(ingroup_coverage, "",
    #                     paste0("- **WARNING**: *", ingroup_false, "* falsely as ingroup classified"))
    # }
  } else {
    ranks <- data.frame(sg = c("species", "genus", "family", "order"),
                        pl = c("species", "genera", "families", "orders"),
                        stringsAsFactors = FALSE)
    ingroup <- taxdumpSubset(tax, x@taxon@ingroup, root = "mrca")
    ingroup_tab <- table(ingroup$rank)
    ig_ranks <- ranks[ranks$sg %in% names(ingroup_tab), ]
    ingroup_tab <- ingroup_tab[ig_ranks$sg]
    names(ingroup_tab)[ingroup_tab > 1] <- ig_ranks$pl[ingroup_tab > 1]
    ingroup_coverage <- c(paste0("NCBI Taxonomy was searched for **", x@taxon@tip.rank, 
                                  "** of **", x@taxon@ingroup, "**."), "",
                           paste0("The search retrieved ", 
                                  paste(paste(ingroup_tab, names(ingroup_tab)), collapse = " in ")), ".")
    if (ingroup_tab["species"] <= nmax) ingroup_coverage <- c(ingroup_coverage, "", 
                                                                paste0("- *", sort(ingroup$taxon[ingroup$rank == x@taxon@tip.rank]), "*"))
  }
  ingroup_coverage <- c("## Ingroup", ingroup_coverage)
  
  ## outgroup
  ## -------
  if (outgroup_speclist){
    received  <- lapply(x@taxon@outgroup, intersect, tax$taxon)
    received <- sapply(received, length) > 0
    outgroup_received <- x@taxon@outgroup[received]
    n_outgroup_received <- length(outgroup_received)
    outgroup_missing <- x@taxon@outgroup[!received]
    n_outgroup_missing <- length(outgroup_missing)
    # outgroup_false <- setdiff(outgroup_received, x@taxon@outgroup)
    
    outgroup_coverage <- c(
      "Query of the NCBI Taxonomy Database gave these results:", "",
      paste0("**", n_outgroup_received, "** outgroup species (of ",
             length(x@taxon@outgroup), " = ",
             round(n_outgroup_received / length(x@taxon@outgroup) * 100, 2),
             "%) could be retrieved"))
    if (n_outgroup_received <= nmax){
      outgroup_coverage <- c(outgroup_coverage, "", sapply(outgroup_received, frmt))
    }
    if (n_outgroup_missing){
      outgroup_coverage <- c(outgroup_coverage, "",
                            paste0("**", n_outgroup_missing, "** outgroup species (",
                                   round(n_outgroup_missing / length(x@taxon@outgroup) * 100, 2),
                                   "%) were missing"))
      if (n_outgroup_missing <= nmax) {
        outgroup_coverage <- c(outgroup_coverage, "", sapply(outgroup_missing, frmt))
      }
      
    }
    # if (length(outgroup_false)){
    #   outgroup_coverage <- c(outgroup_coverage, "",
    #                     paste0("- **WARNING**: *", outgroup_false, "* falsely as outgroup classified"))
    # }
  } else {
    ranks <- data.frame(sg = c("species", "genus", "family", "order"),
                        pl = c("species", "genera", "families", "orders"),
                        stringsAsFactors = FALSE)
    outgroup <- taxdumpSubset(tax, x@taxon@outgroup, root = "mrca")
    outgroup_tab <- table(outgroup$rank)
    og_ranks <- ranks[ranks$sg %in% names(outgroup_tab), ]
    outgroup_tab <- outgroup_tab[og_ranks$sg]
    names(outgroup_tab)[outgroup_tab > 1] <- og_ranks$pl[outgroup_tab > 1]
    outgroup_coverage <- c(paste0("NCBI Taxonomy was searched for **", x@taxon@tip.rank, 
                                 "** of **", x@taxon@outgroup, "**."), "",
                          paste0("The search retrieved ", 
                                paste(paste(outgroup_tab, names(outgroup_tab)), collapse = " in "), "."))
    if (outgroup_tab["species"] <= nmax) outgroup_coverage <- c(outgroup_coverage, "", 
                                                                paste0("- *", sort(outgroup$taxon[outgroup$rank == x@taxon@tip.rank]), "*"))
  }
  outgroup_coverage <- c("## Outgroup", outgroup_coverage)
  
  ## guide tree
  ## ----------
  if (inherits(x@taxon, 'taxonGuidetree')){
    guide_tree <- c("## User-defined guide tree",
                    "```{r, echo=FALSE, message=FALSE}",
                    "plot(ladderize(x@taxon@guide.tree), type = 'phylo', no.margin = TRUE)",
                    "```", "")
  } else {
    guide_tree <- ""
  }
  z <- c(z, "# Taxonomy", ingroup_coverage, "", outgroup_coverage, "", guide_tree)
  
  
  
  ## stop here if no sequences have been downloaded so far
  ## -----------------------------------------------------
  if (!length(loci)){
    write(z, file = file)
    return()  
  }
  
  ## RAW SEQUENCES
  ## -------------
  ll <- dbReadLocus(x)
  loci_genbank <- ll[grep("gb", names(ll))]
  loci_genbank[loci_genbank > 1] <- 1
  loci_genbank <- rowSums(loci_genbank)
  
  taxa_with_seq <- names(loci_genbank)[loci_genbank > 0]
  taxa_without_seq <- sort(setdiff(
    union(sapply(x@taxon@ingroup, head, 1),
        sapply(x@taxon@outgroup, head, 1)
    ), taxa_with_seq
  ))
  n_taxa_with_seq <-  length(taxa_with_seq)
  n_taxa_without_seq <-  length(taxa_without_seq)
  n_queried <- length(x@taxon@ingroup) + length(x@taxon@outgroup)
  
  if (n_taxa_without_seq){
    write(taxa_without_seq, "report/species-without-sequences.txt")
    taxaNotFound <- c(
      paste("##", Tip_rank ,"without any sequences"),
      paste0(n_taxa_without_seq, " ", tip_rank, " (of ", nrow(ll), " with available taxonomy = ",
             round(100 * n_taxa_without_seq/nrow(ll), 2), "% and of ",
             n_queried, " queried = ", round(100 * n_taxa_without_seq/n_queried, 2), 
             "%) have no sequences."))
    if (n_taxa_without_seq <= nmax){ 
      taxaNotFound <- c(taxaNotFound, "", paste0("- *", taxa_without_seq, "*"))
    } 
  } else {
    taxaNotFound <- paste("## All", tip_rank ,"have at least one sequences")
    # stop("implement me!")
  }
  z <- c(z, "# DNA sequence retrieval",
         taxaNotFound, "",
         paste("## Number of", tip_rank, "per locus"),
         "```{r, echo=FALSE, message=FALSE}",
         "gb <- checkSpecLocus(x, 'retrieved')",
         "```", "")
  
  
  ## SELECTED SEQUENCES
  ## ------------------
  # if (all(STATUS[, "F"] %in% c("pending", "failure", "error"))){
  #   write(z, file = file)
  #   return()  
  # }
  loci_selected <- ll[, grep("sel_", names(ll)), drop = FALSE]
  loci_selected[loci_selected != "0"] <- 1
  loci_selected <- apply(loci_selected, c(1, 2), as.numeric) ## coerce to numeric!
  loci_selected <- rowSums(loci_selected)
  taxa_not_selected <- names(loci_selected)[loci_selected == 0]
  taxa_not_selected <- setdiff(taxa_not_selected, taxa_without_seq)
  n_taxa_not_selected <- length(taxa_not_selected)
  if (n_taxa_not_selected){
    taxaNotSelected <- c(
      paste("##", Tip_rank, "not selected for alignment"),
      paste0(n_taxa_not_selected, " ", tip_rank, " (of ", nrow(ll), " available = ", 
             round(100* n_taxa_not_selected/nrow(ll), 2),
             "% and of ", n_taxa_with_seq, " retrieved = ",
             round(100* n_taxa_not_selected/n_taxa_with_seq, 2), "%) have not been selected."))
    if (n_taxa_not_selected <= nmax){
      taxaNotSelected <- c(taxaNotSelected, "", paste0("- *", taxa_not_selected, "*"))
    }
  } else {
    taxaNotSelected <- NULL
  }
  tab <- checkSpecLocus(x, "sel", plot = FALSE)
  tab <- cbind(rownames(tab$specPerMarker), tab$specPerMarker)
  colnames(tab) <- c("Locus", "Number of sequences", 
                     "Number of private sequences")
  tab <- htmlTable(tab) 
  
  z <- c(z, 
         "# Sequence selection and alignment",
         taxaNotSelected, " ",
         paste("## Number of selected", tip_rank, "per locus"),
         "```{r, echo=FALSE, message=FALSE}",
         "sel <- checkSpecLocus(x, 'sel')",
         "```", "", tab)
  
  if (all(STATUS[, "G"] %in% c("pending", "failure", "error"))){
    write(z, file = file)
    return()  
  }
  
  if (all(STATUS[, "H"] %in% c("pending", "failure", "error"))){
    write(z, file = file)
    return()  
  }
  
  ## SATURATION + BLOCKS
  ## ------------------
  msa <- dbSummaryMSA(x)
  msa[, -1] <- apply(msa[, -1, drop = FALSE], 2, 
                     function(y) round(as.numeric(y), digits = 4))
  msa <- cbind(Locus = rownames(msa), msa)
  tab <- htmlTable(msa)
  z <- c(z, 
         "# Evaluation of alignments",
         "## Saturation",
         tab, 
         "```{r, echo=FALSE, message=FALSE}",
         "b <- checkBlocks(x)",
         "```", "")
  
  ## Write files
  ## -----------
  write(z, file = file)
  
}
heibl/megaptera documentation built on Jan. 17, 2021, 3:34 a.m.