R/validate.input.R

Defines functions validate.data.annot validate.data validate.baseline validate.annot validate.numeric validate.geneid.flag validate.flag validate.col.types validate.stats.cols validate.stats validate.pval.range validate.signif validate.pvals validate.FC validate.genes.present validate.geneset validate.comp.names validate.single.table.isnotlist validate.pvalflag validate.pathways.db

Documented in validate.annot validate.baseline validate.col.types validate.comp.names validate.data validate.data.annot validate.FC validate.flag validate.geneset validate.genes.present validate.numeric validate.pathways.db validate.pvalflag validate.pval.range validate.pvals validate.single.table.isnotlist validate.stats validate.stats.cols

#' @title Validate Pathways DB
#' @description To ensure selected db name is correct.
#' @param pathway.db The databse to be used for encrichment analysis. Can be one of the following, "rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020"
#' @param customized.pathways the customized pathways in the format of two column dataframe (column name as "gs_name" and "entrez_gene") to be used in analysis
#' @importFrom stringr str_c
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.pathways.db <- function(pathway.db, customized.pathways){
  valid.db <- c("rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020")

  if(is.null(pathway.db) & is.null(customized.pathways)){
    stop(paste0("\nPlease specify either the customized pathway as dataframe by using 'customized.pathways' parameter or use one of the follwing db with pathway.db parameter: ",
                str_c(valid.db, collapse = " , " )))
  }else if(!is.null(pathway.db) & is.null(customized.pathways)){
    if (!pathway.db %in% valid.db){
      stop(paste0("\nThe DB name you have entered is cannot be used please select one of the follwing",
                  str_c(valid.db, collapse = " , " )))
    }

    if (pathway.db == "rWikiPathways_aug_2020"){
      message(paste0("\n\n Using rWikiPathways from August 2020, please use",
                 " pathway.db = rWikiPathways for latest version \n\n" ))
    }

    message(paste0("\n Currently using the ", pathway.db ," database for enrichment \n\n"))

  }else if(is.null(pathway.db) & !is.null(customized.pathways)){
    message("\n\nNo pathway database specified, will use customized pathway data for analysis.\n\n")
    if(!all(colnames(customized.pathways) == c("gs_name","entrez_gene"))){
      stop(paste0("\nPlease check the input customized.pathways with with input has two columns with gene set name and entrez gene name,\n",
                  "column names as 'gs_name' for geneset and 'entrez_gene' for gene in that geneset.\n\n"))
    }
  }else if(!is.null(pathway.db) & !is.null(customized.pathways)){
   message("\nBoth database and customized pathway information are provided, the customized pathway information will be appended to the database selected.\n\n")
  }
}


#' @title Validate pval flag
#' @description To ensure p value flags are the same accross datasets.
#' @param data A list of summary statistics table (data.frame) from limma or DEseq2, where rownames are gene id.
#' @param value P value flag.
#' @importFrom purrr map
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.pvalflag <- function(data, value) {
  if(!inherits(data, 'list')) {
    if(!(value %in% colnames(data))) {
      stop(paste0('\n \n The supplied p-value flag was not found in dataset. If a signle data frame was passed make sure that p.value.flag is one your columns. If a list was passed make sure that p.value.flag is present in all data frames. \n'))
    }
  }else if(inherits(data, 'list')) {
    map(data, validate.pvalflag, value=value)
  }

}


#' @title Validate Single Table is not list
#' @description Makes sure the summary table being input is of the right class and format.
#' @param data summary statistics table (data.frame) from limma or DEseq2, where rownames are gene id.
#' @importFrom purrr map
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.single.table.isnotlist <- function(data) {

  if (inherits(data,"list") & length(data) == 1){
    stop(paste0("There is only 1 summary table being passed in as a list, please convert to data frame"))
  }

  if (inherits(data,"list")){
    map(data,validate.single.table.isnotlist)
  }
}






#' @title Validate Comp Names
#' @description This function ensures that when a list of data frames are used as input the
#'        the number of comp names are the same as the number of data frames.
#' @param comp.names a character vector contain the comparison names corresponding to the same order to the \code{dat.list}. default = NULL.
#' @param data summary statistics table (data.frame) from limma or DEseq2, where rownames are gene id.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.comp.names <- function(comp.names, data) {
  if (!inherits(data,"list")){
    if (inherits(comp.names, "list") | inherits(comp.names, "vector")){
      stop(paste0("If input is a data frame please input comp.names as character or NULL"))
    }
  }
  else {
    if(length(data) != length(comp.names)){
      stop(paste0("Please make sure the provided summary statistics",
                " list dat.list has the same length as comp.names"))
    }

    if(length(unique(comp.names)) != length(comp.names)){
        stop(paste0("Number of unique comp names is not equal to the number of comp names provided"))

    }
  }
}




#' @title Validate Geneset
#' @description This function ensures that the input geneset to check.cutoff
#'        is formatted properly and in a usable form.
#' @param data summary statistics table or a list contain multiple summary statistics tables from limma or DEseq2, where each row is a gene.
#' @param geneset a summary statistic table contain the genes want to be highlighted, the gene name format (in row names)
#' needs to be consistent to the main summary statistics table). For example, this summary statistics
#'  table coulb be the output summary statistics table from Disease vs Healthy comparison (Only contain
#'  the subsetted significant genes want to be highlighted).
#' @param highlight.1 genes want to be highlighted, in the format of a vector consists of gene names. The gene name format
#'  needs to be consistent to the main summary statistics table.
#' @param highlight.2 genes want to be highlighted, in the format of a vector consists of gene names. The gene name format
#'  needs to be consistent to the main summary statistics table.

#'
#' @importFrom purrr set_names map2 map
#'
#' @return A character value indicating if the geneset was passed as a
#'         dataframe (`df`) or two vectors (`vec`), if a list is input
#'         the number of returned values equal the length of the list
#'
#'
#' @details The function ensures that only a dataframe or vectors are supplied,
#'         that at least one or the other is supplied, and that their formatting
#'         is correct if supplied. It also checks if any of the genes overlap
#'         with the genes in the datanames.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.geneset <- function(data,
                             geneset,
                             highlight.1,
                             highlight.2) {


        #added recursively check list
        if(inherits(data, "list")){

          message("Checking gene sets for listof data frames")

          map(data,validate.geneset, geneset=geneset, highlight.1 = highlight.1, highlight.2 = highlight.2)


        }else{


          if (class(highlight.1) == "list"){
            highlight.1 = unlist(highlight.1, use.names=FALSE)
          }

          if (class(highlight.2) == "list"){
            highlight.2 = unlist(highlight.2, use.names=FALSE)
          }

          if(is.null(geneset) & is.null(highlight.1) & is.null(highlight.2)) {
                  geneset.type <- "No.highlight"
                  return(geneset.type)
          }
          if(!is.null(geneset) & (!is.null(highlight.1) | !is.null(highlight.2))) {
                  stop(paste0("Please supply only ONE of: geneset parameter OR",
                              " highlight.1 and/or highlight.2 parameter."))
          }

          if(is.data.frame(geneset)) {
                  geneset.genes <- row.names(geneset)
                  if(is.null(geneset.genes)) {
                          stop(paste0("Input gene set must have rows labeled",
                                      " with gene names."))
                  }
                geneset.type <- "df"

          }

          else if(is.character(geneset)) {
                geneset.genes <- geneset

          }

          else if (is.null(geneset)) {

                if(is.null(highlight.1) | is.null(highlight.2)) {
                        stop(paste0("You must supply either a geneset parameter",
                                    " or a highlight.1 AND a highlight.2",
                                    " parameter."))
                }
                if(!is.character(highlight.1) | !is.character(highlight.2)) {
                        stop(paste0("Invalid data type supplied as gene sets to",
                                    " highlight.1 and highlight.2. Please",
                                    " supply two character vectors."))
                }

                geneset.genes <- c(highlight.1, highlight.2)
                geneset.type <- "vec"

          }
          else {
                stop(paste0("Invalid data type supplied as gene set. Please",
                            " supply a character vector or dataframe."))
          }

        validate.genes.present(rownames(data), geneset.genes)
        return(geneset.type)

        }
}


#' @title Validate genes present
#' @description Checks how many of the gene id's in the dataset are there in the geneset.
#' @param data.genes The gene id's.
#' @param geneset a summary statistic table contain the genes want to be highlighted, the gene name format (in row names)
#' needs to be consistent to the main summary statistics table). For example, this summary statistics
#'  table coulb be the output summary statistics table from Disease vs Healthy comparison (Only contain
#'  the subsetted significant genes want to be highlighted).
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.genes.present <- function(data.genes, geneset) {

        missing.genes <- geneset[!(geneset %in% data.genes)]
        if(length(missing.genes) == length(geneset)) {
                stop(paste0("None of the genes in the gene set exist in dataset."))
        }
        if(length(missing.genes) > 0) {
                warning(paste0("\n\n",length(missing.genes)," genes from the gene set are missing",
                                " in the dataset and will be",
                                " ignored: \n\n"))
        }
}

#' @title Validate Foldchange
#' @description This function ensures the fold change minimum, maximum, and step
#'         are valid.
#' @inheritParams plot_cutoff
#'
#' @details Specifically it checks that the FCmax is greater than the FCmin,
#'         that at least 1 FCstep can fit within the FCmax and FCmin, that
#'         FCmax and FCmin values are non-negative, and that FCstep is positive.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.FC <- function(FCmin,
                        FCmax,
                        FCstep) {
        if(FCmax < FCmin) {
                stop("Choose an FCmax greater than FCmin.")
        }
        if(FCmax < 0 | FCmin < 0) {
                stop("FCmin and FCmax must be non-negative.")
        }
        if(FCstep <= 0) {
                stop("FCstep must be positive.")
        }
        if(abs(FCstep) >= abs(FCmax - FCmin)) {
                stop("FCstep does not fit within the FC bounds.")
        }
}

#' @title Validate Pvalues
#' @description This function ensures the fold change minimum, maximum, and step
#'         are valid.
#' @inheritParams plot_cutoff
#'
#' @details Specifically it checks that the pvalues are between 0-1, and that
#'         at least 1 `p.step` fits within the `p.min` and `p.max` bounds and
#'         is positive.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.pvals <- function(p.min,
                           p.max,
                           p.step) {
        validate.pval.range(p.min, "p.min")
        validate.pval.range(p.max, "p.max")

        if(p.step <= 0 | p.step > (p.max-p.min)) {
                stop("Invalid p.step value. p.step values must be positive and",
                     " less than p-value range to plot.")
        }
}

validate.signif <- function(signif.vals) {
        for(p.val in signif.vals) {
                validate.pval.range(p.val, "signif.val")
        }
}

#' @title Validate P-value Range
#'
#' @description Error-handling for invalid p-value.
#'
#' @param pval The pvalue
#' @param name The name of the value to include in the error.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.pval.range <- function(pval,
                                name) {
        if(pval < 0 | pval > 1) {
                stop(paste0("Invalid ", name, " value entered;",
                            " p-values must be between 0 and 1."))
        }
}

#' @title Validate Summary Statistics File
#'
#' @description Check for required column names and types.
#'
#' @param datin the summary statistics file.
#' @param name summary statistics file position indicator
#' @param ... pass on variables
#'
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.stats <- function(datin,
                           name = 1,
                           ...) {

        validate.stats.cols(datin, name = name, list(...))
        validate.col.types(datin, name = name, list(...))
}

#' @title Check Summary Statistics Required Columns
#' @description Required columns are `FCflag` and `FDRflag`
#' @param req.cols required column names of `FCflag` and `FDRflag` pass on from validate.stats
#' @inheritParams validate.stats
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.stats.cols <- function(datin,
                                name = 1,
                                req.cols) {

        existing.cols <- colnames(datin)

        missing.cols <- req.cols[!(req.cols %in% existing.cols)]

        if(length(missing.cols) > 0) {
                stop(paste0("The following required columns are missing from",
                            " summary statistics file ", name, ": ",
                            paste0(missing.cols, collapse = ",")))
        }
}

#' @title Check Summary Statistics Required Column Types
#' @description `FCflag` and `FDRflag` must be numeric.
#' @param flags `FCflag` or `FDRflag` to be checked
#' @inheritParams validate.stats
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.col.types <- function(datin,
                               name = 1,
                               flags) {
        for(flag in flags) {
                validate.numeric(datin, flag, name = name)
        }
}


#' @title Validate Flag Value Is Valid
#' @description Enures that the `value` is one of `Options` and throws an error
#'         otherwise.
#' @param value The user-input value for the parameter
#' @param Options A vector of valid values for `value`
#' @param name The name of the parameter to be displayed in the error
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.flag <- function(value, name, Options) {
        if(!(value %in% Options)) {
                stop(paste0("Invalid ", name, " value \"", value, "\". ",
                            name, " values must be one of [\"",
                            str_c(Options, collapse = "\", \""), "\"]."))
        }

}

validate.geneid.flag <- function(value, name) {
        Options <-c("ACCNUM", "ALIAS", "ENSEMBL", "ENSEMBLPROT, ENSEMBLTRANS",
                    "ENTREZID", "ENZYME", "EVIDENCE", "EVIDENCEALL", "GENENAME",
                    "GO", "GOALL", "IPI", "MAP", "OMIM, ONTOLOGY",
                    "ONTOLOGYALL", "PATH", "PFAM", "PMID", "PROSITE", "REFSEQ",
                    "SYMBOL", "UCSCKG", "UNIGENE", "UNIPROT")
        validate.flag(value, name, Options)
}

#' @title Validate Numeric Column
#' @description Ensures that a column in a dataframe which must be numeric is
#'         numeric and throws an error otherwise.
#'
#' @param datin The data in question.
#' @param col The column to validate as numeric.
#' @param name the position of dataset
#'
#' @details This specifically checks if any of the values in the column can
#'         be coerced as numeric.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.numeric <- function(datin,
                             col,
                             name = 1) {
        vals <- unlist(datin[,col])
        non.na.vals <- vals[!is.na(vals)]

        vals.num <- as.numeric(vals)
        if(all(is.na(vals.num))) {
                stop(paste0("Non-numeric data detected in ", col, "column of",
                            " dataset ", name))
        }
}

#' @title Validate Annotation Table
#' @description Ensure that an annotation has all of the required columns.
#'
#' @param data The input count data.
#' @param annot The annotation dataframe.
#' @param annot.flags The vector of annotation flags passed by the user.
#' @param sample.id Sample id label to check if in annot.
#' @param fill The fill value indicated by the user,"count" or "CPM".
#' @param baseline.flag The baseline.flag passed by the user.
#' @param baseline.val The baseline value passed by the user.
#'
#' @details The function will check the following:
#'         \itemize{
#'                 \item The `annot.flags` values are columns in `annot`
#'                 \item If `fill` = "cfb": validate the `baseline.flag` and
#'                         `baseline.val` parameters.
#'                 \item `sample.id` is a column in `annot`.
#'         }
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.annot <- function(data, annot, annot.flags, sample.id,
                           fill = "CPM",
                           baseline.flag = NULL,
                           baseline.val = NULL) {
        cols <- colnames(annot)
        if(!is.null(sample.id) && !(sample.id %in% cols)) {
                stop(paste0("Required column [", sample.id,
                            "] is missing from your annotation input data.")
                )
        }
        missing.flags <- annot.flags[!(annot.flags %in% cols)]
        if(length(missing.flags) > 0) {
                stop(paste0("Annotation data missing flag column(s) [",
                            str_c(missing.flags, collapse = ", "),
                            "].")
                )
        }

        if(!all(colnames(data) == annot[,sample.id])) {
          stop("Please make sure the column names of count data matched \'sample.id\' in sample annotation file, they need to be in the same length and order.")
        }

        if(fill == "CFB") {
                validate.baseline(annot, baseline.val, baseline.flag)
        }
}

#' @title Validate Baseline Values
#' @description Ensures that user-input `baseline.val` and `baseline.flag`
#'         parameters are valid with respect to the `annot` dataframe.
#' @inheritParams validate.annot
#'
#' @details Specifically, validates that `baseline.flag` value(s) are columns
#'         in `annot`, and that `baseline.val` value(s) occur at least once in
#'         their respective `baseline.flag` columns.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.baseline <- function(annot, baseline.val, baseline.flag) {
        cols <- colnames(annot)

        missing.baseline <- baseline.flag[!(baseline.flag %in% cols)]
        if(length(missing.baseline) > 0) {
                stop(paste0("Baseline flag column(s) [",
                            str_c(missing.baseline, collapse = ", "),
                            "] are missing from you sample annotation",
                            " file.")
                )
        }

        if(length(baseline.val) != length(baseline.flag)) {
                stop(paste0("Unequal lengths for baseline.flag and",
                            " baseline.val input. Each flag must have a",
                            " corresponding baseline value.")
                )
        }

        for(i in seq_along(baseline.flag)) {
                flag <- baseline.flag[[i]]
                val <- baseline.val[[i]]
                data <- annot[[flag]]
                if(all(data != val)) {
                        stop(paste0("Baseline value ", val, " for flag ",
                                    "\"", flag, "\" is not one of the",
                                    " values in column \"", flag, "\"",
                                    " of your annotation data.")
                        )
                }
        }
}

#' @title Validate Data Input
#' @description Ensures that the data input has the required formatting.
#'
#' @param data The wide-format dataframe with input data.
#'
#' @details Specifically, checks if `data` has rownaems and that all other
#'        columns can be coerced to numeric.
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.data <- function(data) {
        if(is.null(rownames(data))) {
                stop("Data must have rownames labeling genes.")
        }
        for(data.col in colnames(data)) {
                validate.numeric(data, data.col)
        }
}

#' @title Validate Data in the Context of Annotation
#' @description Ensures that the annotation file matches the data file with
#'         respect to sample IDs. Throws warnings if there are discrepencies.
#' @param data input data
#' @param annot annotation file
#' @param sample.id sample id in the input
#' @references Xingpeng Li, Tatiana Gelaf Romer & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
validate.data.annot <- function(data, annot, sample.id) {

        annot.samples <- annot[[sample.id]]
        data.samples <- colnames(data)

        extra.samples <- data.samples[!(data.samples %in% annot.samples)]
        if(length(extra.samples) > 0) {
                warning(paste0("The following samples in your data input data",
                               " are missing from your annotation input: [",
                               str_c(extra.samples, collapse = ", "),
                               "]. These samples will be omitted from analysis.")
                )
        }

        missing.samples <- annot.samples[!(annot.samples %in% data.samples)]
        if(length(missing.samples) > 0) {
                warning(paste0("The following samples in your annotation input",
                               " are missing from your data input data: [",
                               str_c(missing.samples, collapse = ", "),
                               "]. These samples will be omitted from analysis.")
                        )
        }
}

Try the RVA package in your browser

Any scripts or data that you put into this service are public.

RVA documentation built on Nov. 2, 2021, 1:06 a.m.