R/functions.R

Defines functions stepwise_regression wrap_de differential_expression build_formula cqn convert_geneids simple_filter filter_genes collapse_duplicate_hgnc_symbol check_mismatch get_biomart gtf_stats biomart_stats parse_counts biomart_obj clean_covariates coerce_continuous coerce_factors complete_path get_data

Documented in biomart_obj biomart_stats build_formula check_mismatch clean_covariates coerce_continuous coerce_factors collapse_duplicate_hgnc_symbol complete_path convert_geneids cqn differential_expression filter_genes get_biomart get_data gtf_stats parse_counts simple_filter stepwise_regression wrap_de

#'Detect file type and download data
#'
#'This function takes synIds and version number to download any
#'rectangular file type from Synapse.
#'
#'@param synid A character vector with a Synapse Id.
#'@param version Optional. A numeric vector with the Synapse file
#'version number.
#'@export
#'@return A tibble.
#'@examples
#'\dontrun{
#'file <- get_data(synID = "syn1234", version = 7)
#'
#'}
get_data <- function(synid, version = NULL) {
  # login to Synapse
  synapser::synLogin()

  df <- tibble::as_tibble(
    data.table::fread(
      synapser::synGet(
        synid,
        version = as.numeric(version)
        )$path,
      header = TRUE
      )
    )
  df
}
#' Create complete URL to Synapse entity
#'
#' This function creates the url reference to entities in Synapse.
#' @inheritParams get_data
#' @export
complete_path <- function(synid, version = NULL) {
  if (is.null(version)) {
    glue::glue("https://www.synapse.org/#!Synapse:{synid}")
  } else {
    glue::glue("https://www.synapse.org/#!Synapse:{synid}.{version}")
  }
}
#'Coerce objects to type factors.
#'
#'@param md  A data frame with sample identifiers in a column and relevant experimental covariates.
#'@param factors A vector of factor variables.
coerce_factors <- function(md, factors) {
  md[, factors] <- lapply(md[, factors, drop = FALSE], factor)
  md
}
#'Coerce objects to type numeric.
#'
#'@inheritParams coerce_factors
#'@param continuous A vector of continuous variables.
#'@importFrom magrittr %>%
coerce_continuous <- function(md, continuous) {
  subset <- md[,continuous]
  test_coercion <- lapply(subset, function(x) class(utils::type.convert(x)))
  if (all(test_coercion %in% c("integer", "numeric"))) {
    md[, continuous] <- lapply(md[, continuous, drop = FALSE], function(x) as.numeric(x))
    md
  } else {
    mismatched <- continuous[which(!test_coercion %in% c("integer", "numeric"))]
    stop(glue::glue("Variable {mismatched} can not be coerced to numeric."))
  }
}
#'Create covariate matrix from tidy metadata data frame.
#'
#'This function takes a tidy format. Coerces vectors to correct type. Only include
#'covariates that have 2 or more levels. Sample identifiers are stored as rownames.
#'
#'@inheritParams coerce_factors
#'@inheritParams coerce_continuous
#'@param sample_identifier The name of the column with the sample identifiers that map to the gene counts data frame.
#'
#'@export
#'@return A data frame with coerced variables.
#'@examples
#'data <- tibble::tribble(
#'  ~individualID, ~diagnosis, ~RIN,
#'  "ind5436", "control", 7.7,
#'  "ind234", "disease", 7.1
#'  )
#' clean_covariates(data, factors = c("individualID", "diagnosis"),
#' continuous = c("RIN"),
#' sample_identifier = c("individualID"))
clean_covariates <- function(md, factors, continuous, sample_identifier) {
  if (missing(factors) | missing(continuous)) {
    stop("Factor and continuous variables are required.")
  } else if (length(intersect(factors, continuous)) != 0) {
    stop("Variables are present in both the continuous and factor arguments. Variables must be designated
         numeric or factor, not both.")
  } else if (any(!(factors %in% colnames(md))) | any(!(all(continuous %in% colnames(md))))) {
    stop("Variables provided are not present in the metadata.")
  } else {
    md <- coerce_factors(md, factors)
    md <- coerce_continuous(md, continuous)
    md <- tibble::column_to_rownames(md, var = sample_identifier)
    md
  }
}
#'Get available Ensembl dataset
#'
#'Helper function to search relative Ensembl datasets by partial organism names.
#'
#'@param organism A character vector of the organism name. This argument takes partial strings. For example,"hsa" will match "hsapiens_gene_ensembl".
#'@param host An optional character vector specifying the release version. This specification is highly recommended for a reproducible workflow. (see \code{"biomaRt::listEnsemblArchives()"})
#'@export
biomart_obj <- function(organism, host) {
  message("Connecting to BioMart ...")
  ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host = host)
  ds <- biomaRt::listDatasets(ensembl)[, "dataset"]
  ds <- grep(paste0("^", organism), ds, value = TRUE)
  if (length(ds) == 0) {
    stop(paste("Mart not found for:", organism))
  } else if (length(ds) > 1) {
    message("Found several marts")
    sapply(ds, function(d) message(paste(which(ds == d), d, sep = ": ")))
    n <- readline(paste0("Choose mart (1-", length(ds), ") : "))
    ds <- ds[as.integer(n)]
  }
  ensembl <- biomaRt::useDataset(ds, mart = ensembl)
  ensembl
}
#' Subset counts data frame
#'
#' The biomaRt query requires only gene identifiers to be passed as input. Some alignment
#' output combines additional metadata with the counts file. This function will remove
#' extraneous rows and is especially meant to address output from the STAR aligner.
#' @inheritParams get_biomart
parse_counts <- function(count_df){
  if (any(grepl("N_", row.names(count_df)))) {
    ind <- which(grepl("N_", row.names(count_df)))
    df_parsed <- count_df[-ind,]
    df_parsed
  } else {
    count_df
  }
}
#'Get Gene Length and GC: biomart
#'
#'Get gene length and GC content of `gene` from data frame (`df`) containing
#'gene IDs in a column labeled the value of `column_id`. `df` must have a column
#'labeled sequence where the sequence info for exons/or transcripts are and
#'feature start and stop coordinates in columns labled the value of `start` and
#'`stop`. This will return a vector of `gene`, length, and GC content as a
#'percentage.
#'
#'@param gene a gene name corresponding to a value in `df[,column_id]`
#'@param column_id the column name containg gene feature IDs
#'@param df the dataframe of exon/transcript annotations
#'@param start column name of feature start coordinate
#'@param end column name of feature end coordinate
#'@export
biomart_stats <- function(gene, column_id, df, start, end) {
  df <- df[ df[,column_id] == gene,]
  # Split the sequence into a list object
  seq <- strsplit(df$sequence, '')

  # Name the vector of bases
  for (i in 1:length(seq)) {
    names(seq[[i]]) <- df[i,start]:df[i,end]
  }

  # Unlist and remove overlapping bases
  bases <- unlist(seq)
  bases <- bases[!duplicated(names(bases))]

  # gene length:
  length <- length(bases)

  # Calculate base stats and GC content
  base_freq <- BSgenome::alphabetFrequency(Biostrings::DNAString(
    paste0(bases,collapse = '')
  ))

  gc <- sum(base_freq[c('G','C')])/sum(base_freq[c('A','G','C','T')])*100

  return(c('ID'= gene,
           'gene_length'= length,
           'percentage_gene_gc_content'=gc)
  )
}
#'Get Gene Length and GC: GTF/Fasta
#'
#'Get gene GC content of `gene` from data frame (`df`) containing
#'gene IDs in a column labeled the value of `column_id`. `df` must have a column
#'labeled sequence where the sequence info for exons/or transcripts are and
#'feature start and stop coordinates in columns labled the value of `start` and
#'`stop`. This will return a vector of `gene`, length, and GC content as a
#'percentage.
#'
#'@param i A character of a gene id.
#'@param data a dataframe of exons transcrtipts or genes, imported from a GTF file.
#'one line per-feature, data$V1 corresponds to the featur chormosome, data$V4 to
#' start coordinate, data$V5 to stop coordinate, and data$V7 to the sequence strand.
#'@param genome a list of DNAStringSet objects named by their contig (chromosome)
#'name.
#'@export
gtf_stats <- function( i, data, genome){
  gene_model <- data[data$gene_id == i,]

  gr <- GenomicRanges::reduce(
    GenomicRanges::GRanges(
      paste0(
        names(genome),':',
        gene_model$V4, '-',gene_model$V5, ':',
        gene_model$V7
      )))

  # calculate the length
  len <- sum(GenomicRanges::width(gr))

  ## Get Exon Sequence
  seq <- BSgenome::getSeq(genome, gr)

  ## Get Base Frequency
  alphafreq <- BSgenome::alphabetFrequency(seq)
  totalfreq <- colSums(alphafreq)

  ## get G/C content as a fraction
  gc <- (sum(totalfreq[c('G','C')]) /
           sum(totalfreq[c('A','G','C','T')])) * 100
  return(c(i, gc, len))
}
#'Get Ensembl biomaRt object
#'
#'Get GC content, gene Ids, gene symbols, gene biotypes, gene lengths
#'and other metadata from Ensembl BioMart. Object returned contains gene Ids
#'as rownames.
#'
#'@param count_df A counts data frame with sample identifiers as column names
#'and gene Ids are rownames.
#'@inheritParams get_data
#'@param filters A character vector listing biomaRt query filters.
#'(For a list of filters see \code{"biomaRt::listFilters()"})
#'@param host An optional character vector specifying the release version.
#'This specification is highly recommended for a reproducible workflow.
#'(see \code{"biomaRt::listEnsemblArchives()"})
#'@param organism A character vector of the organism name. This argument
#'takes partial strings. For example,"hsa" will match "hsapiens_gene_ensembl".
#'@param custom Defaults to FALSE If TRUE, the GC and Gene Length, and gene
#' name are calculated from user specified FASTA and GTF File.
#'@param gtfID Defaults to NULL. A character vector with a Synapse ID
#'corresponding to a gtf formatted gene annotation file.
#'@param gtfVersion Optional. A numeric vector with the gene GTF Synapse file
#' version number.
#'@param fastaID Defaults to NULL. A character vector with a Synapse ID
#'corresponding to a FASTA formatted genome annotation file.
#'@param fastaVersion Optional. A numeric vector with the genome FASTA Synapse
#'file version number.
#'@param cores An integer of cores to specify in the parallel backend (eg. 4).
#'@param isexon Defaults to FALSE. If TRUE, the GC and Gene Length parameters will
#'only consider exotic regions and omit intronic regions.
#'@importFrom rlang .data
#'@export
get_biomart <- function(count_df, synid, version, host, filters, organism,
                        custom = FALSE, gtfID = NULL, gtfVersion = NULL, fastaID = NULL,
                        fastaVersion = NULL, cores = NULL, isexon = FALSE) {
  if (is.null(synid)) {
    if(is.null(cores)){
      cores = parallel::detectCores()-1
    }

    # Check for extraneous rows
    count_df <- parse_counts(count_df)

    # Parse gene IDs to use in query
    gene_ids <- convert_geneids(count_df)

    # Create Biomart Object
    if (is.null(custom) | isFALSE(custom)) {
      # use biomaRt to construct the biomart_object

      # Get available datset from Ensembl
      ensembl <- biomart_obj(organism, host)

      message(
        paste0(
          "Downloading sequence",
          ifelse(length(gene_ids) > 1, "s", ""),
          " ..."
        )
      )

      if (length(gene_ids) > 100) {
        message("This may take a few minutes ...")
      }
      if (isTRUE(isexon)) {
        # calculate length and GC only for exonic regions
        # set feature, start, and stop equal to exon values
        feature <- "ensembl_exon_id"
        start <- "exon_chrom_start"
        end <- "exon_chrom_end"
        typ <- "gene_exon"
      } else {
        # calculate length and GC only for entire transcript
        # set feature, start, and stop equal to transcript values
        feature <- "ensembl_transcript_id"
        start <- "transcript_start"
        end <- "transcript_end"
        typ <- "transcript_exon_intron"
      }

      # pull feature level coordinates by either exon or transcript coordinates
      #"hgnc_symbol", "gene_biotype",
      attrs <- c(filters, feature, "chromosome_name", start, end)
      coords <- biomaRt::getBM(
        filters = filters,
        attributes = attrs,
        values = gene_ids,
        mart = ensembl,
        useCache = FALSE
      )

      #Pull the feature Sequences from Biomart
      seqs <- biomaRt::getSequence(
        id = coords[,feature],
        type = feature,
        seqType = typ,
        mart = ensembl,
        useCache = FALSE
      )

      # Insert the sequences into coords
      row.names(seqs) <- seqs[,feature]
      coords$sequence <- NA
      coords$sequence <- seqs[ coords[,feature], ][,typ]

      coords[,start] <- as.numeric(coords[,start])
      coords[,end] <- as.numeric(coords[,end])

      # Biomart gene column name
      gene_value <- names(coords)[1]

      # Calculate GC content and Gene Length
      cl <- snow::makeCluster(cores, outfile="log.log")
      len_gc <- data.frame()

      for (chr in names(table(coords$chromosome_name))) {

        df <- coords[coords$chromosome_name == chr,]
        genes <- as.list(gene_ids[gene_ids %in%
                            coords[coords$chromosome_name == chr,]$ensembl_gene_id
        ])

        message(paste0("Starting Chromosome: ", chr))
        foo <- as.data.frame(do.call(rbind, parallel::parLapply(
          cl = cl,
          genes,
          biomart_stats,
          column_id = gene_value,
          df = df,
          start = start,
          end = end
        )))
        len_gc <- as.data.frame(rbind(len_gc,foo))
      }

      colnames(len_gc)[1] <- gene_value
      snow::stopCluster(cl)
      rm(cl)

      # Pull gene info
      attrs <- c(filters, "hgnc_symbol", "gene_biotype", "chromosome_name")
      gene_info <- biomaRt::getBM(
        filters = filters,
        attributes = attrs,
        values = gene_ids,
        mart = ensembl,
        useCache = FALSE
      )

      # Join Data:
      biomart_results <- gene_info %>%
        dplyr::full_join(len_gc, by = gene_value)
      biomart_results <- biomart_results[, c(gene_value, 'hgnc_symbol',
                                             'percentage_gene_gc_content', 'gene_biotype',
                                             'chromosome_name', 'gene_length'
      )
      ]
      biomart_results$percentage_gene_gc_content <- as.numeric(biomart_results$percentage_gene_gc_content)
      biomart_results$gene_length <- as.numeric(biomart_results$gene_length)
    } else {
      # use custom specified GTF and FASTA from synapse

      # Load GTF and FASTA from synapse
      synapser::synLogin()
      gtf <- synapser::synGet(gtfID, version = gtfVersion)
      genome <- synapser::synGet(fastaID, version = fastaVersion)

      biom <- GenomicTools.fileHandler::importGTF(file=gtf$path,
                                                  level="gene",
                                                  features=c("gene_id",
                                                             "gene_name",
                                                             "gene_type"))
      biom <- biom[ , c("V1", "gene_id", "gene_name", "gene_type")]

      # set feature level
      if (isTRUE(isexon)) {
        # calculate length and GC only for exonic regions
        lvl <- "exon"
      }else{
        # calculate length and GC only for entire transcript
        lvl <- "transcript"
      }
      # pull features from gtf file
      feature_gtf <- GenomicTools.fileHandler::importGTF(
        file=gtf$path,
        level=lvl
      )

      #Calc GC content
      dna <- Biostrings::readDNAStringSet(genome$path)

      # assemble chromosomes chromosome
      chroms <- list()
      for(contig in names(table(biom$V1))){
        chroms[[contig]] <- dna[grep(contig,names(dna),value=T)][1]
      }

      # Translation for fasta chrom name alterations
      region_names <- rep(NA, length(names(chroms)))
      names(region_names) <- names(chroms)
      for(nam in names(region_names)){
        region_names[nam] <- names(chroms[[nam]])
      }

      # Gene named vector with of chromosome designations
      #gene_chr <- biom$V1
      #names(gene_chr) <- biom$gene_id
      gene_chr <- biom$gene_id
      #genome <- chroms[gene_chr[i]]

      cl <- snow::makeCluster(cores, outfile = "log.log")
      stats <- matrix(NA,0,3)
      for (element in names(table(biom$V1))) {
        calcs <- do.call(rbind, parallel::parLapply(
          cl = cl,
          as.list(biom[biom$V1 == element, ]$gene_id),
          gtf_stats,
          data = as.data.frame(feature_gtf)[feature_gtf$V1 == element,],
          genome = chroms[[element]]
        ))
        stats <- rbind(stats,calcs)
      }
      snow::stopCluster(cl)
      rm(cl)

      stats <- as.data.frame(stats)
      colnames(stats) <- c(filters, "percentage_gene_gc_content",
                           "gene_length")
      colnames(biom) <- c("chromosome_name", filters, "hgnc_symbol",
                           "gene_biotype")
      biomart_results <- biom %>%
        dplyr::full_join(stats, by = "ensembl_gene_id")

      #Remove PAR_Y
      biomart_results <- biomart_results[
        !(grepl('_PAR_Y', biomart_results$ensembl_gene_id)),
      ]

      # Clean chr from chromosomes
      biomart_results$chromosome_name <- gsub(
        'chr', '', biomart_results$chromosome_name
      )

      # Clean ENSGs
      biomart_results <- as.data.frame(biomart_results)
      biomart_results[,filters] <- do.call(
        rbind,
        strsplit(biomart_results[,filters], '[.]'))[,1]
      biomart_results <- biomart_results[ ,c(filters, 'hgnc_symbol',
                                             'percentage_gene_gc_content',
                                             'gene_biotype',
                                             'chromosome_name', 'gene_length')]
      biomart_results <- as.data.frame(biomart_results)
      biomart_results$percentage_gene_gc_content <- as.numeric(biomart_results$percentage_gene_gc_content)
      biomart_results$gene_length <- as.integer(biomart_results$gene_length)
    }
    # Finish cleaning bioMart Object
    # Duplicate Ensembl Ids are collapsed into a single entry
    biomart_results <- collapse_duplicate_hgnc_symbol(biomart_results)

    # Biomart IDs as rownames
    biomart_results <- tibble::column_to_rownames(
      biomart_results, var = filters
    )
  }else{

    # Download biomart object from syndID specified in config.yml
    biomart_results <- get_data(synid, version)

    # Biomart IDs as rownames
    biomart_results <- tibble::column_to_rownames(
      biomart_results, var = filters
    )

    # Gene metadata required for count CQN
    required_variables <- c("gene_length", "percentage_gene_gc_content")
    if (!all(required_variables %in% colnames(biomart_results))) {
      vars <- glue::glue_collapse(
        setdiff(required_variables, colnames(biomart_results)),
        sep = ", ",
        last = " and "
      )
      message(glue::glue("Warning: {vars} missing from biomart object.
                         This information is required for Conditional
                         Quantile Normalization"))
    }
  }
  return(biomart_results)
}
#'Check for consistent identifiers
#'
#'Subsequent steps assume unique identifiers are present within both the counts
#'matrix and the metadata. This function checks whether samples in the counts
#'matrix match samples in the metadata, and vice versa.versa.
#'@inheritParams get_biomart
#'@param md A data frame with sample identifiers as row names and relevant experimental covariates.
#'@export
check_mismatch <- function(md, count_df) {
  # Isolate the SampleID's from the metadata and counts matrix
  md_sampleid <- rownames(md)
  cnt_sampleid <- colnames(count_df)

  # Ensures each sample in the metadata has a corresponding column in the counts matrix and vice versa
  if ((length(setdiff(md_sampleid, cnt_sampleid)) > 0) || (length(setdiff(cnt_sampleid, md_sampleid)) > 0)){
    stop("All sample identifiers in the counts matrix and metadata file must match")
  }
}

#'Duplicate HGNC
#'
#'Count normalization requires Ensembl Ids to be unique. In rare cases, there are more
#'than one HGNC symbol per gene Id. This function collapses the duplicate entries into
#'a single entry by appending the HGNC symbols in a comma separated list.
#'@param biomart_results Output of \code{"sageseqr::get_biomart()"}. Gene Ids are
#'stored as rownames.
#'@importFrom rlang .data
#'
#'@export
collapse_duplicate_hgnc_symbol <- function(biomart_results){
  biomart_results %>%
    dplyr::group_by(.data$ensembl_gene_id) %>%
    dplyr::mutate(hgnc_symbol = paste(.data$hgnc_symbol, collapse = ", ")) %>%
    unique()
}
#' Filter genes
#'
#' Filter genes with low expression. This function is more permissive by setting conditions
#' that corresponds to metadata variables. The gene matrix is split by condition and the
#' counts per million (CPM) for a given condition is computed by \code{"sageseqr::simple_filter()"}.
#'
#' @inheritParams coerce_factors
#' @inheritParams get_biomart
#' @inheritParams simple_filter
#' @param conditions Optional. Conditions to bin gene counts that correspond to
#' variables in `md`.
#' @param clean_metadata A data frame with sample identifiers as rownames and variables as
#' factors or numeric as determined by \code{"sageseqr::clean_covariates()"}.
#' @importFrom magrittr %>%
#' @export
filter_genes <- function(clean_metadata, count_df,
                         cpm_threshold, conditions_threshold,
                         conditions = NULL) {

  if (class(conditions) == "list") {
    conditions <- unique(conditions[[1]])
  } else {
    conditions <- unique(conditions)
  }

  # Check for extraneous rows
  count_df <- parse_counts(count_df)

  if (!is.null(conditions)) {
    if (!any(conditions %in% colnames(clean_metadata))) {
      stop("Conditions are missing from the metadata.")
    }

    split_data <- split(clean_metadata,
                        f = as.list(clean_metadata[, conditions, drop = F]),
                        drop = T)

    map_genes <- purrr::map(split_data, function(x) {
      simple_filter(count_df[, rownames(x), drop = F],
                    cpm_threshold,
                    conditions_threshold)
    })

    genes_to_analyze <- unlist(map_genes) %>%
      unique() %>%
      sort()
  } else {
    genes_to_analyze <- simple_filter(
      count_df,
      cpm_threshold,
      conditions_threshold
    )
  }

  processed_counts <- count_df[genes_to_analyze,]

  # Convert transcript Ids to gene Ids in counts with convert_geneids()
  rownames(processed_counts) <- convert_geneids(processed_counts)
  processed_counts
}
#' Filter genes with low expression
#'
#' `simple_filter` converts a gene counts matrix into counts per million (CPM) and identifies
#' the genes that meet the minimum CPM threshold in a percentage of samples. The minimum CPM
#' threshold and percent threshold is user defined. The function returns a list of genes.
#'
#' @inheritParams get_biomart
#' @param cpm_threshold The minimum number of CPM allowed.
#' @param conditions_threshold Percentage of samples that should contain the minimum CPM.
#' @examples
#'\dontrun{
#'gene_list <- simple_filter(count_df = counts, cpm_threshold = 1, condition_threshold = 0.5)
#'}
simple_filter <- function(count_df, cpm_threshold, conditions_threshold) {
  cpm  <- edgeR::cpm(count_df)
  cpm[is.nan(cpm)] <- 0
  fraction <- rowMeans(cpm >= cpm_threshold)
  keep <- fraction >= conditions_threshold
  genes <- keep[keep]
  genes <- names(genes)
  genes
}
#'Get gene Ids
#'
#'@inheritParams get_biomart
#'@importFrom rlang .data
#'
#'@export
convert_geneids <- function(count_df) {
  if (any(grepl("\\.", rownames(count_df)))) {
    geneids <- tibble::tibble(ids = rownames(count_df)) %>%
      tidyr::separate(.data$ids, c("ensembl_gene_id", "position"), sep = "\\.")
    geneids$ensembl_gene_id
  } else {
    rownames(count_df)
  }
}
#' Conditional Quantile Normalization (CQN)
#'
#' Normalize counts by CQN. By providing a biomart object, the systematic effect of GC content
#' is removed and gene length (in bp) variation is accounted for. Genes with missing GC content
#' or gene lengths will be removed from the counts matrix.
#'
#'@param filtered_counts A counts data frame with genes removed that have low expression.
#'@inheritParams collapse_duplicate_hgnc_symbol
#'@export
cqn <- function(filtered_counts, biomart_results) {

  required_variables <- c("gene_length", "percentage_gene_gc_content")

  if (!all(required_variables %in% colnames(biomart_results))) {
    message(glue::glue("Error:{setdiff(required_variables,
                       colnames(biomart_results))} missing from
                       biomart object. This information is required
                       for Conditional Quantile Normalization"))
  } else {

    genes_to_analyze <- intersect(rownames(filtered_counts), rownames(biomart_results))

    to_normalize <- subset(filtered_counts, rownames(filtered_counts) %in% genes_to_analyze)
    gc_length <- subset(biomart_results, rownames(biomart_results) %in% genes_to_analyze)

    normalized_counts <- suppressWarnings(cqn::cqn(to_normalize,
                                  x = gc_length[, "percentage_gene_gc_content"],
                                  lengths = gc_length[, "gene_length"],
                                  lengthMethod = "smooth",
                                  verbose = FALSE
    )
    )

    normalized_counts$E <- normalized_counts$y + normalized_counts$offset

    return(normalized_counts)
  }
}
#' Dropped Genes Finder
#'
#' Finds gene IDs in the filtered counts which were removed in cqn normalization
#' most likely as a function if no GC or length estimates being present in the
#' user specified biomart object.
#' @param filtered_counts The target containing counts after low gene
#' expression has been removed. Defaults to target name constrained by
#'  \code{"targets::tar_make()"}.
#' @param cqn_counts A counts data frame normalized by CQN.
#'
#' @export
dropped_genes <- function (filtered_counts, cqn_counts) {
  missing <- row.names(filtered_counts)[
    !(row.names(filtered_counts) %in% row.names(cqn_counts))
  ]
  if (length(missing) == 0) {
    dropped <- NULL
  }else{
    dropped <- missing
  }
  return(dropped)
}
#'@importFrom quantreg rq
#'@export
quantreg::rq
#'@importFrom mclust Mclust
#'@export
mclust::Mclust
#'@importFrom mclust mclustBIC
#'@export
mclust::mclustBIC
#' Formula Builder
#'
#' If a linear mixed model is used, all categorical variables must be
#' modeled as a random effect. This function identifies the variable class
#' to scale numeric variables and model categorical variables as a random
#' effect. Additionally, variables are scaled to account for
#' multiple variables that might have an order of magnitude difference.
#'
#' @param model_variables Optional. Vector of variables to include in the linear (mixed) model.
#' If not supplied, the model will include all variables in \code{md}.
#' @param primary_variable Vector of variables that will be collapsed into a single
#' fixed effect interaction term.
#' @param random_effect A vector of variables to consider as random effects instead
#' of fixed effects.
#' @param add_model Optional. User Speciffied variables to add to the null model
#' apriori to model generation. (Default = NULL)
#' @param is_num Is there a numerical covariate to use as an interaction with the primary variable(s). default= NULL
#' @param num_var A numerical metadata column to use in an inaction with the primary variable(s). default= NULL
#' @param exclude_variables Vector of variables to exclude from testing.
#' @inheritParams coerce_factors
#' @export
build_formula <- function(md, primary_variable, model_variables = NULL,
                          is_num = NULL, num_var = NULL, random_effect = NULL,
                          exclude_variables = NULL, add_model = NULL) {


  if (!(all(purrr::map_lgl(md, function(x) inherits(x, c("numeric", "factor")))))) {
    stop("Use sageseqr::clean_covariates() to coerce variables into factor and numeric types.")
  }

  if (!is.null(model_variables)) {
    if(!is.null(num_var) | !isFALSE(num_var)){
      vars <- c(model_variables, add_model, primary_variable,num_var)
      vars <- vars[!duplicated(vars)]
      md <- dplyr::select(md, dplyr::all_of(vars))
    }else{
      #md <- dplyr::select(md, dplyr::all_of(c(model_variables,
      #                                        num_var, primary_variable)))
      md <- dplyr::select(md, dplyr::all_of(c(model_variables,
                                              add_model,
                                              primary_variable)))
    }
  }

  if (!is.null(exclude_variables)) {
    if (exclude_variables %in% colnames(md)) {
      stop("exclude_variables and model_variables are the same.")
    }
    md <- dplyr::select(md, -dplyr::all_of(exclude_variables))
  }

  # Update metadata to reflect variable subset

  # Variables of factor or numeric class are required
  if(!is.null(num_var) | !isFALSE(num_var)){
    col_type <- dplyr::select(md, -dplyr::all_of(c(primary_variable))) %>%
      dplyr::summarise_all(class) %>%
      tidyr::pivot_longer(tidyr::everything(), names_to = "variable", values_to = "class")
  }else{
    col_type <- dplyr::select(md, -dplyr::all_of(c(vars))) %>%
      dplyr::summarise_all(class) %>%
      tidyr::pivot_longer(tidyr::everything(), names_to = "variable", values_to = "class")
  }

  col_type[ col_type$variable %in% random_effect, ]$class <- 'random'
  # Categorical or factor variables are modeled as a random effect by (1|variable)
  # Numeric variables are scaled to account for when the spread of data values differs
  # by an order of magnitude
  form <- purrr::map(1:length(col_type$class), function(i){
    switch(col_type$class[i],
           "factor" = glue::glue( col_type$variable[i] ),
           "random" = glue::glue("(1|", col_type$variable[i], ")"),
           "numeric" = glue::glue("scale(", col_type$variable[i], ")")
    )
  })
  names(form) <- col_type$variable

  # List with multiple values can cause glue_collapse to fail. This step is conservative
  # as it unlists all lists at this step.
  if (class(primary_variable) == "list") {
    primary_variable <- primary_variable[[1]]
  }

  # A new categorical is created to model an interaction between two variables
  #interaction_term <- glue::glue_collapse({primary_variable}, "_")
  if(isTRUE(is_num)){
    # A new categorical is created to model an interaction between two variables
    interaction_term <- paste0(glue::glue_collapse({primary_variable}, "_"),'.',num_var)

    metadata = md %>%
      tidyr::unite(!!interaction_term, dplyr::all_of(primary_variable), sep = "_")

    metadata[ , paste0(num_var,'.',as.character(names(table(metadata[interaction_term] ))))] <- 0

    for( i in paste0(num_var,'.',as.character(names(table(metadata[interaction_term] )))) ){
      metadata[metadata[,interaction_term] == gsub(paste0( num_var, '.'),'',i) ,i] <-
        metadata[metadata[,interaction_term] == gsub(paste0( num_var, '.'),'',i) ,num_var]
    }

    metadata[,interaction_term] <- as.factor(metadata[,interaction_term])
    de_conditions <- paste0( 'scale(', num_var, '.', levels(metadata[,interaction_term]), ')')

    metadata <- dplyr::select(metadata, -dplyr::all_of(c(interaction_term,num_var)))

    #Remove numeric from form
    remo_var <- NULL
    for( i in 1:length(form)){
      if(grepl(num_var, as.character(form[[i]]))) {
        #message(form[[i]])
        remo_var <- i
      }
    }
    if(!is.null(remo_var)){
      form <- form[-remo_var]
    }

    #Make Formula objects:
    formula = formula(
      glue::glue("~ ",
                 glue::glue_collapse(
                   de_conditions,
                   sep = "+"),
                 "+",
                 glue::glue_collapse(form, sep = "+")
      ))

    formula_non_intercept = formula(
      glue::glue("~ 0+",
                 glue::glue_collapse(
                   de_conditions,
                   sep = "+"),
                 "+",
                 glue::glue_collapse(form, sep = "+")
      ))
    if(!is.null(add_model)){
      formula_base_model = formula(glue::glue("~ {interaction_term}+",
                                              glue::glue_collapse(
                                                form[add_model],
                                                sep = "+"
                                              )
                                            ))
    }else{
      formula_base_model = formula(glue::glue("~ ",
                                              glue::glue_collapse(
                                                c(de_conditions,form[add_model]),
                                                sep = "+")
                                            ))
    }

  }else{
    # A new categorical is created to model an interaction between two variables
    interaction_term <- glue::glue_collapse({primary_variable}, "_")

    metadata = md %>%
      tidyr::unite(!!interaction_term, dplyr::all_of(primary_variable), sep = "_")

    metadata[,interaction_term] <- as.factor(metadata[,interaction_term])
    de_conditions <- levels(metadata[,interaction_term])

    #Make Formula objects:
    formula = formula(
      glue::glue("~ {interaction_term}+", glue::glue_collapse(form, sep = "+")
      ))

    formula_non_intercept = formula(
      glue::glue("~ 0+{interaction_term}+", glue::glue_collapse(form, sep = "+")
      ))

    if(!is.null(add_model)){
      formula_base_model = formula(glue::glue("~ {interaction_term}+",
                                              glue::glue_collapse(form[add_model], sep = "+")))
    }else{
      formula_base_model = formula(glue::glue("~ {interaction_term}"))
    }
  }

  object <- list(metadata = metadata,
                 formula = formula,
                 formula_non_intercept = formula_non_intercept,
                 formula_base_model = formula_base_model,
                 primary_variable = interaction_term,
                 variables = unlist(form[!(names(form) %in% add_model)]),
                 de_conditions = de_conditions
  )

  object
}
#' Differential Expression with Dream
#'
#' Differential expression testing is performed with \code{"variancePartition::dream()"} to increase power
#' and decrease false positives for repeated measure designs. The `primary_variable` is modeled as a fixed
#' effect. If you wish to test an interaction term, `primary_variable` can take multiple variable names.
#'
#' @param cqn_counts A counts data frame normalized by CQN.
#' @param p_value_threshold Numeric. P-values are adjusted by Benjamini and
#' Hochberg (BH) false discovery rate (FDR). Significant genes are those with an
#' adjusted p-value greater than this threshold.
#' @param fold_change_threshold Numeric. Significant genes are those with a
#' fold-change greater than this threshold.
#' @param cores An integer of cores to specify in the parallel backend (eg. 4).
#' @param is_num Is there a numerical covariate to use as an interaction with the primary variable(s). default= NULL
#' @param num_var A numerical metadata column to use in an inaction with the primary variable(s). default= NULL
#' @param cont Optional. A list specifying contrasts of the primary variable(s)
#' to consider for differential sequencing results if using factor(s) as your
#' primary variable. If not specified all combinations will be tested. If specified
#' this will speed up the pipeline. Specify the contrast with the factor values
#' involved in the contrast separated by a hyphen. (eg for diagnosis, \code{"primary_variable = c(\"AD-CT\")"}
#' where AD is the value in diagnosis column for cases and CT is the value for controls.
#' For multi-level contrasts, eg. \code{"primary_variable = c(\"diagnosis\", \"Sex\")"} would have contrasts
#' specified as; \code{cont= c(\"ZZ_F-CT_F\", \"ZZ_M-CT_M\")} to look at cases vs controls in
#' females and cases vs controls in males independently. While the order before or
#' after the hyphen doesn't matter, the order of values  before/after the underscore does matter.
#' The value order must be the same as the \code{"primary:} specification.
#' eg. \code{"primary_variable = c(\"diagnosis\",\"sex\")} must be \code{\"CT_M-ZZ_M\"} while
#' \code{"primary_variable = c(\"sex\",\"diagnosis\")"} must be \code{\"M_CT-M_ZZ\"} (default= NULL)
#' @inheritParams cqn
#' @inheritParams coerce_factors
#' @inheritParams build_formula
#' @inheritParams cqn
#' @export
#' @return A named list with \code{"variancePartition::voomWithDreamWeights()"}
#'  normalized counts, contrasts from \code{"variancePartition::getContrasts()"},
#'  linear mixed model fits from \code{"variancePartition::dream()"}, differential
#'  expression results from \code{"limma::topTable()"} and gene feature-specific
#'  metadata, the response variable, the model formula fit to compute differential
#'  expression results.
#'  The list names are: \code{"list(voom_object, contrasts_to_plot, fits, differential_expression,
#'  primary_variable, formula)"}
differential_expression <- function(filtered_counts, cqn_counts, md,
                                    primary_variable, biomart_results,
                                    p_value_threshold, fold_change_threshold,
                                    random_effect = NULL, model_variables = NULL,
                                    exclude_variables = NULL, cores = NULL,
                                    is_num = NULL, num_var = NULL,
                                    cont = NULL) {
  # force order of samples in metadata to match order of samples in counts.
  # Required by variancePartition
  if(is.null(cores)){
    cores = parallel::detectCores()-1
  }
  md <- md[match(colnames(filtered_counts),rownames(md)),]

  metadata_input <- build_formula(md, primary_variable, model_variables,
                                  random_effect = random_effect, is_num = is_num,
                                  num_var = num_var)

  gene_expression <- edgeR::DGEList(filtered_counts)
  gene_expression <- edgeR::calcNormFactors(gene_expression)
  voom_gene_expression <- variancePartition::voomWithDreamWeights(counts = gene_expression,
                                                                  formula = metadata_input$formula,
                                                                  data = metadata_input$metadata,
                                                                  BPPARAM = BiocParallel::SnowParam(cores)
  )

  voom_gene_expression$E <- cqn_counts

  de_conditions <- metadata_input$de_conditions
  if(isTRUE(is_num)){
    de_conditions <- gsub("scale\\(",'',de_conditions)
    if(all( ')' == stringr::str_sub(de_conditions, - 1, - 1))) {
      de_conditions <- gsub("\\)",'',de_conditions)
    }
    conditions_for_contrast <- purrr::map_chr(metadata_input$de_conditions,
                                              function(x) glue::glue(x)
    )
    setup_coefficients <- gtools::combinations(n = length(conditions_for_contrast),
                                               r = 2,
                                               v = conditions_for_contrast
    )

    if(!is.null(cont)){
      conts <- sapply(cont, function(i) { strsplit(i,'-')} )
      new_setup_coefficients <- NULL
      for( i in conts){
        new_setup_coefficients <- rbind(
          new_setup_coefficients,
          setup_coefficients[
            ( grepl(i[1],setup_coefficients[,1]) | grepl(i[1],setup_coefficients[,2])) &
              ( grepl(i[2],setup_coefficients[,1]) | grepl(i[2],setup_coefficients[,2])),
          ])
      }
      setup_coefficients <- new_setup_coefficients
    }

  }else{
    conditions_for_contrast <- purrr::map_chr(de_conditions,
                                              function(x) glue::glue({metadata_input$primary_variable}, x)
    )
    setup_coefficients <- gtools::combinations(n = length(conditions_for_contrast),
                                               r = 2,
                                               v = conditions_for_contrast
    )

    if(!is.null(cont)){
      conts <- sapply(cont, function(i) { strsplit(i,'-')} )
      new_setup_coefficients <- NULL
      for( i in conts){
        new_setup_coefficients <- rbind(
          new_setup_coefficients,
          setup_coefficients[
            ( grepl(i[1],setup_coefficients[,1]) | grepl(i[1],setup_coefficients[,2])) &
              ( grepl(i[2],setup_coefficients[,1]) | grepl(i[2],setup_coefficients[,2])),
          ])
      }
      setup_coefficients <- new_setup_coefficients
    }
  }

  contrasts <- lapply(seq_len(nrow(setup_coefficients)),
                      function(i) variancePartition::getContrast(exprObj = voom_gene_expression,
                                                                 formula = metadata_input$formula_non_intercept,
                                                                 data = metadata_input$metadata,
                                                                 coefficient = as.vector(setup_coefficients[i,])
                      )
  )

  contrasts <- as.data.frame(contrasts)

  df <- as.data.frame(setup_coefficients)

  names(contrasts) <- stringr::str_glue_data(df, "{V1} - {V2}")

  fit_contrasts = variancePartition::dream(exprObj = voom_gene_expression,
                                           formula = metadata_input$formula_non_intercept,
                                           data = metadata_input$metadata,
                                           L = contrasts,
                                           BPPARAM = BiocParallel::SnowParam(cores)
  )

  if (is.null(lme4::findbars(stats::as.formula(metadata_input$formula_non_intercept)))) {
    message("Applying eBayes Directly")
    fit_contrasts <- limma::eBayes(fit_contrasts)

    de <- lapply(names(contrasts), function(i, fit) {
      genes <- limma::topTable(fit, coef = i, number = Inf,
                               sort.by = "logFC", confint = TRUE)
      genes <- tibble::rownames_to_column(genes, var = "ensembl_gene_id")
    }, fit_contrasts)

  }else {
    CIs <- limma::eBayes(fit_contrasts)

    de <- lapply(names(contrasts), function(i, fit, ci) {
      genes <- limma::topTable(fit, coef = i, number = Inf,
                               sort.by = "logFC")
      genes <- tibble::rownames_to_column(genes, var = "ensembl_gene_id") #%>%
      #dplyr::select(-CI.L, CI.R)

      cis <- limma::topTable(ci, coef = i, number = Inf,
                             sort.by = "logFC", confint = TRUE)
      cis <- tibble::rownames_to_column(cis, var = "ensembl_gene_id")

      genes <- cis %>%
        dplyr::select(ensembl_gene_id,CI.L, CI.R) %>%
        dplyr::left_join(.,genes, by='ensembl_gene_id') %>%
        dplyr::select(., ensembl_gene_id, logFC, CI.L, CI.R, AveExpr, t, P.Value, adj.P.Val)

    }, fit_contrasts, CIs)
  }

  if(isTRUE(is_num)){
    c_names <- names(contrasts)
    c_names <- gsub("scale\\(",'',c_names)
    if(all( ')' == stringr::str_sub(c_names, - 1, - 1))) {
      c_names <- gsub("\\)",'',c_names)
    }

    replace_val <- strsplit(metadata_input$primary_variable,'[.]')[[1]][2]
    rep_val <-  gsub('[.]','*',metadata_input$primary_variable)
    c_names <- gsub(replace_val,rep_val,c_names)


    names(de) <- c_names

    de <- data.table::rbindlist(de, idcol = "Comparison") %>%
      dplyr::mutate(Comparison = .data$Comparison,
                    Direction = .data$logFC/abs(.data$logFC),
                    Direction = ifelse(.data$Direction == -1,"down", .data$Direction),
                    Direction = ifelse(.data$Direction == 1, "up", .data$Direction),
                    Direction = ifelse(.data$`adj.P.Val` > p_value_threshold | abs(.data$logFC) < log2(fold_change_threshold),
                                       "none",
                                       .data$Direction)
      )

  }else{
    names(de) <- names(contrasts)

    de <- data.table::rbindlist(de, idcol = "Comparison") %>%
      dplyr::mutate(Comparison = gsub(metadata_input$primary_variable, "", .data$Comparison),
                    Direction = .data$logFC/abs(.data$logFC),
                    Direction = ifelse(.data$Direction == -1,"down", .data$Direction),
                    Direction = ifelse(.data$Direction == 1, "up", .data$Direction),
                    Direction = ifelse(.data$`adj.P.Val` > p_value_threshold | abs(.data$logFC) < log2(fold_change_threshold),
                                       "none",
                                       .data$Direction)
      )
  }

  # Join metadata from biomart to differential expression results
  biomart_results <- tibble::rownames_to_column(biomart_results, var = "ensembl_gene_id")

  de <- dplyr::left_join(de, biomart_results, by = c("ensembl_gene_id"))

  object <- list(voom_object = voom_gene_expression,
                 contrasts_to_plot = contrasts,
                 fits = fit_contrasts,
                 differential_expression = de,
                 primary_variable = metadata_input$primary_variable,
                 formula = metadata_input$formula_non_intercept)

  object
}
#' Wrapper for Differential Expression Analysis
#'
#' This function will pass multiple conditions to test to \code{"sagseqr::differential_expression()"}.
#' @param conditions A named list of conditions to test as `primary`
#' in \code{"sagseqr::differential_expression()"}.
#' @param dropped a vector of gene names to drop from filtered counts, as they
#' were not cqn normalized
#' @inheritParams differential_expression
#' @inheritParams build_formula
#' @export
wrap_de <- function(conditions, filtered_counts, cqn_counts, md, dropped,
                    biomart_results, p_value_threshold, fold_change_threshold,
                    model_variables = names(md), random_effect = NULL, exclude_variables = NULL,
                    cores = NULL, cont = NULL, is_num = NULL, num_var = NULL) {

  if (!is.null(dropped)) {
    filtered_counts <- filtered_counts[
      !(row.names(filtered_counts) %in% dropped),
    ]
  }
  purrr::map(
    conditions,
    function(x) differential_expression(
      filtered_counts,
      cqn_counts,
      md,
      primary_variable = x$primary,
      is_num = x$is_numeric_int,
      num_var = x$numeric,
      biomart_results,
      p_value_threshold,
      fold_change_threshold,
      model_variables,
      random_effect = random_effect,
      cores = cores,
      cont = x$contrasts
      )
    )
}
#' Stepwise Regression
#'
#' This function performs multivariate forward stepwise regression evaluated by multivariate Bayesian Information
#' Critera (BIC) by wrapping \code{"mvIC::mvForwardStepwise()"}.
#'
#' @inheritParams differential_expression
#' @inheritParams build_formula
#' @param skip Defaults to NULL. If TRUE, this step will be skipped in the
#' targets plan.
#' @return Table with BIC criteria for exclusion or inclusion of variables in
#' the model, linear (mixed) model formula and vector of variables to include.
#' @export
stepwise_regression <- function(md, primary_variable, cqn_counts,
                                model_variables = names(md), skip = NULL,
                                random_effect = NULL, add_model = NULL) {
  # skip stepwise generation if skip = TRUE
  if(isTRUE(skip)) {
    return("Skipping stepwise regression model generation...")
  } else {
  metadata_input <- build_formula(md, primary_variable, model_variables, add_model = add_model, random_effect = random_effect)
  model <- mvIC::mvForwardStepwise(exprObj = cqn_counts$E,
                                   baseFormula = metadata_input$formula_base_model,
                                   data = metadata_input$metadata,
                                   variables = array(metadata_input$variables)
  )

  to_visualize <- model$trace %>%
    dplyr::select(.data$iter, .data$variable, .data$delta,
                  .data$score, .data$isBest, .data$isAdded, .data$m) %>%
    dplyr::rename(iteration = .data$iter,
                  `best (tested against baseline)` = .data$isBest,
                  `added to model` = .data$isAdded,
                  `(effective) number of parameters estimated` = .data$m
                  )

  # return vector of variables that map to the metadata columns. This requires
  # some cleaning of extraneous characters. Do not include primary variable(s)

  to_include <- model$trace$variable[model$trace$isAdded == "yes" & model$trace$iter != 0]
  to_include <- gsub("\\)|\\(1\\||scale\\(", "", to_include)

  output <- list(
    to_visualize = to_visualize,
    formula = model$formula,
    variables_in_model = to_include
  )

  return(output)
  }
}
#' Summarize Biotypes
#'
#' Computes the fraction of genes of a particular biotype. The number of genes
#' must be above 100 to be summarized.
#'
#' @inheritParams collapse_duplicate_hgnc_symbol
#' @inheritParams cqn
#' @export
summarize_biotypes <- function(filtered_counts, biomart_results) {
  biomart_results[rownames(filtered_counts),] %>%
    dplyr::group_by(.data$gene_biotype) %>%
    dplyr::summarise(fraction = dplyr::n()) %>%
    dplyr::filter(.data$fraction > 100) %>%
    dplyr::mutate(fraction = .data$fraction/dim(filtered_counts)[1])
}
#' Prepare output
#'
#' Store data in temporary files to prepare for Synapse upload.
#' @param target The object to be stored.
#' @param data_name An identifier to embed in the file name.
#' @param rowname Optional. If applicable, the name of the variable to store
#' rownames.
#' @export
prepare_results <- function(target, data_name, rowname = NULL) {
  if (!is.null(rowname)) {
    target <- tibble::rownames_to_column(target, rowname)
  }

  # the file name will contain the primary name of the target
  tmp <- fs::file_temp(data_name, ext = ".tsv")

  utils::write.table(
    target,
    file = tmp,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
    )
  tmp
}
#' Store output to Synapse
#'
#' Store transformed data, markdown and html report to a Folder in Synapse.
#'
#' @param parent_id A Synapse Id that corresponds to a project or
#' folder to store output.
#' @param cqn_counts The target containing Conditional Quantile Normalized
#'  (CQN) counts. Defaults to target name constrained by
#'  \code{"targets::tar_make"}.
#' @param clean_md The target containing the metadata data frame.
#' Defaults to target name constrained by \code{"targets::tar_make"}.
#' @param filtered_counts The target containing counts after low gene
#' expression has been removed. Defaults to target name constrained by
#'  \code{"targets::tar_make()"}.
#' @param biomart_results The target containing gene annotations from
#' biomart. Defaults to target name constrained by
#' \code{"targets::tar_make"}.
#' @param de_results The target containing differential expression gene
#' lists. Defaults to target name constrained in the _targets.R file.
#' @param residualized_counts The target containing counts adjusted for batch
#' effects. Defaults to target name constrained in the _targets.R file.
#' @param report The target containing the rendered html document. Defaults
#' to target name constrained in the _targets.R file.
#' @param rownames A list of variables to store rownames ordered by `metadata`,
#' `filtered_counts`, `biomart_results`, `cqn_counts`. If not applicable,
#' set as NULL.
#' @param syn_names A list of human-readable names for the Synapse entities
#' ordered
#' by `metadata`, `filtered_counts`, `biomart_results`, `cqn_counts`.
#' @param inputs A character vector of Synapse Ids to create provenance between
#' output files and input files.
#' @param activity_provenance A phrase to describe the data transformation for
#' provenance.
#' @param data_names A list of identifiers to embed in the file name ordered
#' by `clean_md`, `filtered_counts`, `biomart_results`, `cqn_counts`,
#' `de_results`.
#' @param config_file Optional. Path to configuration file.
#' @param report_name Name of output markdown file.
#' @export
store_results <- function(clean_md = clean_md,
                          filtered_counts = filtered_counts,
                          biomart_results = biomart_results,
                          cqn_counts = cqn_counts$E,
                          de_results = de,
                          residualized_counts = residualized_counts,
                          report = report,
                          syn_names, data_names,
                          parent_id, inputs, activity_provenance,
                          rownames = NULL, config_file = NULL,
                          report_name = NULL) {

  # include sageseqr package version in Synapse provenance
  ver <- utils::packageVersion("sageseqr")
  description <- glue::glue(
    "analyzed with sageseqr {ver}"
  )

  # nest targets targets in a list. Every time a new target is to-be stored, it
  # must be added as an argument to this function and then added to this list.
  targets <- list(
    clean_md,
    filtered_counts,
    biomart_results,
    as.data.frame(cqn_counts)
  )

  # parse differential expression gene list
  de_results <- purrr::map(de_results, function(x) x$differential_expression)

  # parse residualized counts
  parse_residual_matrix <- purrr::map(residualized_counts, function(x) x$output)

  # append differential expression data frames already nested in list
  targets <- append(targets, parse_residual_matrix)
  targets <- append(targets, de_results)

  # append null rownames for differential expression object
  rownames <- append(rownames, rep(list(NULL), length(residualized_counts) + length(de_results)))

  mash <- list(
    target = targets,
    rowname = rownames,
    data_name = data_names
  )

  file_location <- purrr::pmap(
    mash,
    prepare_results
    )

  mash <- list(
    parent = parent_id,
    syn_names = syn_names,
    paths = file_location
    )

  file_to_upload <- purrr::pmap(
    mash,
    function(paths, parent, syn_names) synapser::File(
      path = paths,
      parent = parent,
      name = syn_names
      )
  )

  # nest input Synapse Ids to apply the same provenance to all files
  inputs <- list(inputs)

  mash <- list(
    files = file_to_upload,
    inputs = inputs,
    activity_provenance = activity_provenance,
    activity_description = description
    )

  # login to Synapse
  synapser::synLogin()

  for_provenance <- purrr::pmap(
    mash,
    function(
      files, inputs, activity_provenance, activity_description
      ) synapser::synStore(
        obj = files,
        used = inputs,
        activityName = activity_provenance,
        activityDescription = activity_description,
        forceVersion = FALSE
      )
  )

  if (!is.null(config_file)) {
    file <- synapser::File(
      path = config_file,
      parent = parent_id,
      name = "Configuration file"
    )

    config_provenance <- synapser::synStore(
      obj = file,
      activityName = activity_provenance
      )

    for_provenance <- append(for_provenance, config_provenance)
  }

  if (!is.null(report_name)) {
    path <- glue::glue("{getwd()}/{report_name}.html")

    used_ids <- unlist(
      purrr::map(
        for_provenance,
        ~.x$get("id")
        )
    )

    file <- synapser::File(
      path = path,
      parent = parent_id
    )

    markdown_provenance <- synapser::synStore(
      obj = file,
      used = used_ids,
      activityName = activity_provenance,
      activityDescription = description
    )
  }
  message(glue::glue("Files uploaded to {parent_id}"))
}
#' Provenance helper
#'
#' Collapse Synapse Ids and version.
#' @param metadata_id Synapse ID to clean metadata file with sample identifiers
#' in a column and variables of interest as column names. There cannot be any
#' missing values.
#' @param counts_id Synapse ID to counts data frame with identifiers to the
#' metadata as column names and gene ids in a column.
#' @param metadata_version Optionally, include Synapse file version number. If
#' omitted, current version will be downloaded.
#' @param counts_version Optionally, include Synapse file version number.
#' @param biomart_id Synapse ID to biomart object.
#' @param biomart_version Optionally, include Synapse file version number.
#' @export
provenance_helper <- function(metadata_id,  counts_id, metadata_version = NULL,
                              counts_version = NULL, biomart_id = NULL,
                              biomart_version = NULL) {

  if (is.null(metadata_version)) {
    ids <- metadata_id
  } else {
    ids <- glue::glue("{metadata_id}.{metadata_version}")
  }

  if (is.null(counts_version)) {
    ids <- c(ids, counts_id)
  } else {
    ids <- c(ids, glue::glue("{counts_id}.{counts_version}"))
  }

  if (is.null(biomart_version) & is.null(biomart_id)) {
    ids
  } else if (is.null(biomart_version) & !is.null(biomart_id)) {
    ids <- c(ids, biomart_id)
  } else {
    ids <- c(ids, glue::glue("{biomart_id}.{biomart_version}"))
  }
  ids
}
#' Initialize differential expression analysis workflow
#'
#' The `sageseqr` package provides a `targets` workflow to string together the
#' data processing and computational steps of RNA-seq differential expression
#' analysis. This funciton copies a markdown document and _targets.R file to your
#' working directory. The _targets.R file in your working directory is required
#' for the workflow to run.
#'
#' @export
start_de <- function() {
  # copy sageseqr-report.Rmd markdown to working directory
  if (!file.exists("sageseqr-report.Rmd")) {
    fs::file_copy(system.file("sageseqr-report.Rmd", package = "sageseqr"),
                  new_path = getwd())
  }

  # copy _targets.R file to working directory
  if (!file.exists("_targets.R")) {
    fs::file_copy(system.file("_targets.R", package = "sageseqr"),
                  new_path = getwd())
  }
}
#' Compute residualized counts matrix
#'
#' Residuals of the best fit linear regression model are computed for each
#'  observation. Batch effects are adjusted for in the returned counts matrix
#'  while preserving the effect of the predictor variable.
#'
#' Counts are normalized prior to linear modeling to compute residuals. A
#' precision weight is assigned to each gene feature to estimate the
#' mean-variance relationship. Counts normalized by conditional quantile
#'  normalization (CQN) are used in place of log2 normalized counts.
#'
#' @inheritParams cqn
#' @inheritParams differential_expression
#' @inheritParams filter_genes
#' @param dropped a vector of gene names to drop from filtered counts, as they
#' were not cqn normalized
#' @export
compute_residuals <- function(clean_metadata, filtered_counts, dropped,
                              cqn_counts = cqn_counts$E, primary_variable,
                              random_effect = NULL, model_variables = NULL,
                              is_num = NULL, num_var = NULL, cores = NULL)  {

  if (!is.null(dropped)) {
    filtered_counts <- filtered_counts[
      !(row.names(filtered_counts) %in% dropped),
    ]
  }

  # set the number of cores if not specified in the config
  if(is.null(cores)){
    cores = parallel::detectCores()-1
  }
  # force order of samples in metadata to match order of samples in counts.
  # Required by variancePartition
  clean_metadata <- clean_metadata[match(colnames(filtered_counts),rownames(clean_metadata)),]

  #is_num <- primary_variable$is_numeric_int
  #num_var <- primary_variable$numeric
  #primary_variable <- primary_variable$primary

  metadata_input <- build_formula(clean_metadata, primary_variable, model_variables,
                                  random_effect = random_effect, is_num = is_num, num_var = num_var)

  # Estimate voom weights with DREAM
  gene_expression <- edgeR::DGEList(filtered_counts)
  gene_expression <- edgeR::calcNormFactors(gene_expression)
  voom <- variancePartition::voomWithDreamWeights(
    counts = gene_expression,
    formula = metadata_input$formula,
    data = metadata_input$metadata,
    BPPARAM = BiocParallel::SnowParam(cores)
  )

  # fit linear model using weights and best model
  voom$E <- cqn_counts
  adjusted_fit <- variancePartition::dream(
    exprObj = voom,
    formula = metadata_input$formula,
    data = metadata_input$metadata,
    computeResiduals = TRUE,
    BPPARAM = BiocParallel::SnowParam(cores)
  )

  # compute residual matrix
  if(is.null(lme4::findbars(
    stats::as.formula( metadata_input$formula_non_intercept) ))){
    residual_gene_expression <- adjusted_fit$residuals
  }else{
    residual_gene_expression <- stats::residuals(adjusted_fit)
  }

  # calculate weighted residuals and add back signal from predictor
  if(isTRUE(is_num)) {
    variables_to_add_back <- metadata_input$de_conditions
  }else{
    variables_to_add_back <- grep(
      metadata_input$primary_variable,
      colnames(adjusted_fit$design),
      value = TRUE
    )
  }
  output <- residual_gene_expression +
    adjusted_fit$coefficients[
      ,variables_to_add_back
      ] %*% t(
        adjusted_fit$design[
          ,variables_to_add_back]
        )

  # save gene features
  output <- tibble::rownames_to_column(as.data.frame(output), var = "feature")

  return(
    list(
      output = output,
      signal = variables_to_add_back,
      adjusted_fit = adjusted_fit,
      formula = metadata_input$formula
      )
  )
}
kelshmo/sageseqr documentation built on June 13, 2024, 10:48 p.m.