R/DGE_Tools.R

Defines functions eBayes_autoprop Txt2DF tsmsg

Documented in eBayes_autoprop

# DGE_Tools.R
# Author: JRT 07Feb2017
#
# A set of functions to facilitate differential gene expression analysis.

#Need separate data objects for Gene and Transcript level because they have
#different row counts
#
#The GeneData list of lists defines: the datafiles to load into
#a DGEobj, the standard name they will be given in the SummarizedExperiment
#object and the type of data contained in that file (rowRanges, colData, Assay or metadata)
#
#Where:
# rowRanges is genes or transcripts annotation
# colData is samples annotation
# Assay is nrow(rowdata) rows by nrow(colData) columns (e.g. counts, FPKM, zFPKM, TPM, etc.)
# The pre-defined GeneData and TranscriptData lists are configured for the Omicsoft pipeline default filenames.
# You can change the filenames to support other datasources.  But do not change the name or type values.
#
# GeneData (list of lists)
#
# Defines the data files to import into a SummarizedExperiment.
# Each file is described by a list with defines the fieldname, the type of data
# and the text file containing the data.  The predefined GeneData item
# is preconfigured for datafiles from the Omicsoft pipeline.  Modify GeneData
# file values to use data from a different pipeline.
#
# "GeneData"
# .GeneData = list(
#   Annotation = list(name = "Annotation",
#                     file = "RNA-Seq.Count.Annotation.txt",
#                     type = "rowRanges"),
#   Design = list(name = "Design",
#                 file = "RNA-Seq.Design.txt",
#                 type = "colData"),
#   Counts = list(name = "Counts",
#                 file = "RNA-Seq.Count.Table.txt",
#                 type = "Assay"),
#   Level = list(name = "Level",
#                level = "Gene",
#                type = "metadata"),
#   # FPKM = list(name = "FPKM",
#   #             file = "RNA-Seq.FPKM.Table.txt",
#   #             type = "Assay"),
#   # TPM = list(name = "TPM",
#   #             file = "Genes.TPM.Table.txt",
#   #             type = "Assay"),
#   QC.Metrics = list(name = "QC.Metrics",
#             file = "RNA-Seq.QCMetrics.Table.txt",
#             type = "metadata")
# )
# uncomment this block to save changes to this datastructure to be built into the package
# x = getwd()
# setwd ("~/R/lib/pkgsrc/DGE.Tools2/")
# save(GeneData, file="./data/GeneData.rda")
# setwd(x)

# TranscriptData (list of lists)
#
# Defines the data files to import into a SummarizedExperiment.
# Each file is described by a list with defines the fieldname, the type of data
# and the text file containing the data.  The predefined TranscriptData item
# is preconfigured for datafiles from the Omicsoft pipeline.  Modify TranscriptData
# file values to use data from a different pipeline.
#
# .TranscriptData = list(
#   Annotation = list(name = "Annotation",
#                     file = "RNA-Seq.Transcript_Count.Annotation.txt",
#                     type = "rowRanges"),
#   Design = list(name = "Design",
#                 file = "RNA-Seq.Design.txt",
#                 type = "colData"),
#   Counts = list(name = "Counts",
#                 file = "RNA-Seq.Transcript_Count.Table.txt",
#                 type = "Assay"),
#   Level = list(name = "Level",
#                level = "Transcript",
#                type = "metadata"),
#   # FPKM = list(name = "FPKM",
#   #             file = "RNA-Seq.Transcript_FPKM.Table.txt",
#   #             type = "Assay"),
#   # TPM = list(name = "TPM",
#   #             file = "Transcripts.TPM.Table.txt",
#   #             type = "Assay"),
#   QC.Metrics = list(name = "QC.Metrics",
#             file = "RNA-Seq.QCMetrics.Table.txt",
#             type = "metadata")
# )
# x = getwd()
# setwd ("~/R/lib/pkgsrc/DGE.Tools2/")
# save(TranscriptData, file="./data/TranscriptData.rda")
# setwd(x)

### Utility FUNCTIONS ###

### Function tsmsg ###
# a timestamped message
#' @export
tsmsg <- function(...) {
  # Works like message() but prepends a timestamp
  message(date(), ": ", ...)
}

#df2GR is obsolete.  Can now use:  gr <- as(geneInfoDF, "GRanges")
# Function df2GR
# @import magrittr IRanges GenomicRanges
# df2GR <- function(df, seqnames=c("seqnames", "chr", "chromosome"),
#                   start="start", end="end", strand="strand",
#                   start.offset=1, end.offset=start.offset) {
#   #Convert a Annotation DF to a genomic Ranges object
#   #Optional parameters for seqnames, start, end, strand anticipate possible you might have for these
#   #fields in your annotation file.  Only need to modify these if your annotation uses differenc colnames
#   #for these fields
#   #
#     #These lines return the colnames used in your datafile.
#     seqnames.col <- match(seqnames, tolower(colnames(df))) %>% na.omit %>% .[1]
#     start.col <- match(start, tolower(colnames(df))) %>% na.omit %>% .[1]
#     end.col <- match(end, tolower(colnames(df))) %>% na.omit %>% .[1]
#     strand.col <- match(strand, tolower(colnames(df))) %>% na.omit %>% .[1]
#     other.cols <- setdiff(seq_along(colnames(df)), c(seqnames.col, start.col, end.col, strand.col))
#
#   	#make sure start and end are numeric; if not, remove commas and convert to numeric
#   	if (is.character(df[[start.col]])) {
#   	  df[[start.col]] = gsub(",", "", df[[start.col]]) %>% as.numeric
#   	}
#     if (is.character(df[[end.col]])) {
#       df[[end.col]] = gsub(",", "", df[[end.col]]) %>% as.numeric
#     }
#
#     MyRanges = IRanges::IRanges(start=df[[start.col]] - start.offset + 1,
#                                 end=df[[end.col]]) - end.offset + 1
#
#     gr <- GenomicRanges::GRanges(seqnames=df[[seqnames.col]],
#                                  ranges=MyRanges,
#                                  strand=df[[strand.col]])
#
#     GenomicRanges::mcols(gr) <- df[other.cols]
#     names(gr) <- rownames(df)
#     return(gr)
# }

### Function Txt2DF ###
Txt2DF <- function(filename) {
  #configured to read Omicsoft .txt files correctly capturing GeneIDs as rownames
  if (file.exists(filename)) {
    df = read.table (filename, sep="\t", stringsAsFactors = FALSE,
                     header=TRUE, row.names = 1, comment.char="",
                     quote="", na.strings=c("NA", "."),
                     check.names=TRUE)
    return (df)
  } else {
    warning (paste ("Warning: File = ", filename, "not found."))
    return (-1)
  }
}


#' Function  eBayes_autoprop
#'
#' If you use the following function in place of eBayes for single contrasts,
#' you can actually use the B-statistic from the results of topTable.
#' The B-statistic is the log-odds of differential expression, so a zero
#' means a 50-50 chance of DE, positive means more likely DE than not,
#' and negative means more likely non-DE. (Note that this statistic uses
#' the natural logarithm, not base 10 or base 2 logs.) However, this
#' statistic is only accurate if you supply a prior estimate of the
#' fraction of genes that are DE. The default proportion is 1%,
#' which should give conservative values as long as at least 1% of
#' genes are DE.
#'
#' The function below uses limma propTrueNull to determine this prior, which
#' means that the B-statistics should be reliable as long as propTrueNull
#' returns a reasonable-looking value, which you can verify by looking at
#' the p-value distribution.
#'
#' @author Ryan Thompson, \email{rct@@thompsonclan.org}
#' @keywords eBayes, DGE, limma
#'
#' @param ... See ?eBayes
#' @param prop.method defaults to "lfdr"
#'
#' @return A fit is returned
#'
#' @examples
#'      MyFit = eBayes_autoprop (MyFit)
#'
#' @importFrom  limma propTrueNull
#'
#' @export
eBayes_autoprop <- function(..., prop.method="lfdr") {
  eb <- eBayes(...)
  ptn <- propTrueNull(eb$p.value, method=prop.method)
  eBayes(..., proportion=1-ptn)
}
jrthompson54/DGE.Tools2 documentation built on May 12, 2021, 8:47 p.m.