R/rnaseq_functions.R

Defines functions getDEGwithLFC getDEG fixExcel neck makeNumeric rowScale nameChange makeColData

Documented in fixExcel getDEG getDEGwithLFC makeColData makeNumeric nameChange neck rowScale

#' Create column metadata
#'
#' Creates column metadata table (colData) from a gene expression matrix.
#'
#' @param expr Gene expression matrix, with column descriptors separated by underscores or periods.
#' @param sample.id The position(s) in the parsed sample names that contain the sample IDs.
#' @param cond The position(s) in the parsed sample names containing the conditions.
#'
#' @return Matrix of column metadata
#'
#' @examples
#' makeColData(expr, 3, c(2,3))
#'
#' @export
makeColData <- function(expr, sample.id, cond) {
  labls <- colnames(expr)
  labls <- strsplit(labls, "[._]")
  id    <- sapply(labls, FUN = function(x) x[sample.id])
  if(length(cond) > 1) {
    condition <- sapply(labls, FUN = function(x) x[cond])
    condition <- as.data.frame(t(condition))
    condition <- unite(condition, "condition", everything(), sep = ".")[,1]
    condition <- ifelse(sapply(condition, function(x) grepl(pattern = "NA", x)),
                        gsub(pattern = ".NA", replacement = "", condition),
                        condition)
  } else {
    condition <- sapply(labls, FUN = function(x) x[cond])
  }
  colData <- as.matrix(cbind(id, condition))
  row.names(colData) <- NULL
  return(colData)
}

#' Change gene names
#'
#' Changes names of genes in expression matrix from one nomenclature to another using org.Hs.eg.db.
#' Genes with no names in the target nomenclature will be removed from the matrix.
#' If multiple identifiers map to the same target name, the median expression level for the identifier will be used.
#'
#' @param expr Gene expression matrix, with genes as rows and samples as columns
#' @param from The current nomenclature (ENSEMBL, ENTREZID, SYMBOL) of the gene expression matrix.
#' @param to The desired nomenclature of the gene expression matrix.
#' @param verbose logical: print summary after changing names (default = FALSE)
#'
#' @return Gene expression matrix with renamed rows
#'
#' @examples
#' nameChange(expr, "ENSEMBL", "ENTREZID")
#'
#' @export
nameChange <- function(expr, from = c("ENSEMBL", "ENTREZID", "SYMBOL"), to = c("SYMBOL", "ENTREZID", "ENSEMBL"), verbose = FALSE, df = FALSE) {
  if(class(expr) == "matrix"){
    expr <- data.frame(expr)
  }

  names.from <- rownames(expr)
  names.to   <- mapIds(org.Hs.eg.db, names.from, to, from)
  ?mapIds
  if(isTRUE(verbose)) {
    nomatch <- length(rownames(expr[is.na(names.to)]))
    print(paste(nomatch, from, "identifiers lack", to, "identifiers:", sep = " "))
    print(rownames(expr[is.na(names.to), ]))
  }

  expr <- expr[!is.na(names.to), ]
  names.to <- names.to[!is.na(names.to)]
  expr <- dplyr::mutate(expr, tmp = names.to)
  expr <- aggregate(. ~ tmp, data = expr, median)
  rownames(expr) <- expr$tmp
  expr <- dplyr::select(expr, -tmp)

  if(isTRUE(df)) {
    return(expr)
  } else {
    expr <- as.matrix(expr)
    return(expr)
  }
}

#' Scale rows
#'
#' This function applies the base scale() function to the rows, rather than columns, of a data frame.
#'
#' @param expr Gene expression matrix
#' @param center logical: mean center rows (default = TRUE)
#' @param scale logical: scale rows by dividing by standard deviations (default = TRUE)
#'
#' @return Gene expression matrix with scaled rows
#'
#' @examples
#' rowScale(expr)
#'
#' @export
rowScale <- function(expr, center = TRUE, scale = TRUE) {
  expr <- makeNumeric(expr)
  expr <- t(expr)
  expr <- scale(expr, center = center, scale = scale)
  expr <- t(expr)
  return(expr)
}

#' Make Numeric
#'
#' This function coerces all columns of a data frame or matrix to numeric.
#'
#' @param mat Matrix to be coerced to numeric
#' @param forceDF logical: coerce output to data frame (default = FALSE)
#' @param forceMatrix logical: coerce output to matrix (default = FALSE)
#'
#' @return Gene expression matrix with scaled rows
#'
#' @examples
#' makeNumeric(expr)
#'
#' @export
makeNumeric <- function(mat, force.df = FALSE, force.matrix = FALSE) {
  rnms <- rownames(mat)
  mat <- apply(mat, 2, as.numeric)
  if(isTRUE(force.df)){
    mat <- as.data.frame(mat)
  } else if(isTRUE(force.matrix)){
    mat <- as.matrix(mat)
  }
  rownames(mat) <- rnms
  return(mat)
}

#' Show first rows and columns of data frame or matrix
#'
#' This function is similar to head, but shows the first n rows and columns, rather
#' than just the first n rows.
#'
#' @param x data frame or matrix to be viewed
#' @param rows number of rows to display; default is 10
#' @param cols number of columns to display; default is 10
#'
#' @return Subset of data frame or matrix
#'
#' @examples
#' neck(expr)
#'
#' @export
neck <- function(x, rows = 10, cols = 10) {
  if(dim(x)[1] < rows){
    rows <- dim(x)[1]
  }
  if(dim(x)[2] < cols){
    cols <- dim(x)[2]
  }
  x <- x[1:rows, 1:cols]
  return(x)
}

#' Fix gene names changed to dates by MS Excel
#'
#' Checks row names of gene expression matrix for dates. If present, date names will be changed to the
#' appropriate gene symbol.
#'
#' @param x gene expression matrix
#' @param type class of x ("mat" or "vct"; default is "mat")
#' @param col optional; column of matrix containing gene symbols (by default, rownames are taken to be gene names)
#'
#' @return matrix
#'
#' @examples
#' fixExcel(expr)
#'
#' @export
fixExcel <- function(x, col = NULL, type = "mat") {
  switch(type,
         mat = {
           if(is.null(col)) {
             names <- rownames(x)
           } else {
             names <- x[, col]
             }
           },
         vct = {names <- x}
  )

  if(!any(names %in% excel.genes$date)){
    stop("No dates in these gene names")
  } else {
    dates <- names[names %in% excel.genes$date]
    symb  <- excel.genes$gene[match(dates, excel.genes$date)]
    names[names %in% excel.genes$date] <- symb
  }

  switch(type,
         mat = {
           if(is.null(col)) {
            rownames(x) <- names
         } else {
           x[, col] <- names
           }
        },
         vct = {x <- names}
  )

  return(x)
}

#' Get names of differentially expressed genes from DESeq2 results
#'
#' By default, this function returns a list of two character vectors, one of DEGs with positive lfc, and one
#' of DEGs with negative lfc. If only one direction is desired, direction may be specified (as "up" or "down")
#'
#' @param res DESeq2 results, with desired contrast
#' @param direction optional; specifies whether to return up- or downregulated genes ("up" or "down")
#'
#' @return list of two character vectors
#'
#' @examples
#' getDEG(results)
#'
#' @export
getDEG <- function(res, direction=NULL) {
  res <- res %>% mutate(ENS = rownames(res))

  x <- mapIds(org.Hs.eg.db, res$ENS, "SYMBOL", "ENSEMBL")
  res <- res %>% mutate(SYMB = x)

  x <- mapIds(org.Hs.eg.db, res$ENS, "GENENAME", "ENSEMBL")
  res <- res %>% mutate(GNAME = x)

  res <- res %>% filter(padj < 0.05)

  if(is.null(direction)){
    deg <- list(up <- res %>% filter(log2FoldChange > 0),
                down <- res %>% filter(log2FoldChange < 0))
  } else if(direction == "up"){
    deg <- res %>% filter(log2FoldChange > 0)
  } else if (direction == "down") {
    deg  <- res %>% filter(log2FoldChange < 0)
  } else stop("direction must be 'up' or 'down'")

  if(class(deg) == "list"){
    deg <- lapply(deg, function(x) {
      select(x, SYMB) %>% .[[1]] %>% as.character(.)
    })
    names(deg) <- c("up", "down")
  } else {
    deg <- deg %>% select(SYMB) %>% .[[1]] %>% as.character(.)
  }
  return(deg)
}

#' Get names of differentially expressed genes from DESeq2 results, with log fold changes.
#'
#' This function differs from getDEG only in that it returns a 2-column matrix of DEG symbols and logfoldchanges,
#' rather than just a vector of DEG gene symbols.
#'
#' @param res DESeq2 results, with desired contrast
#' @param direction optional; specifies whether to return up- or downregulated genes ("up" or "down")
#'
#' @return list of two character vectors
#'
#' @examples
#' getDEGwithLFC(results)
#'
#' @export
getDEGwithLFC <- function(res, direction=NULL) {
  res <- res %>% mutate(ENS = rownames(res))

  res <- res %>% filter(padj < 0.05)

  x <- mapIds(org.Hs.eg.db, res$ENS, "SYMBOL", "ENSEMBL")
  res <- res %>% mutate(SYMB = x)
  res <- res %>% filter(!is.na(SYMB))

  x <- mapIds(org.Hs.eg.db, res$ENS, "GENENAME", "ENSEMBL")
  res <- res %>% mutate(GNAME = x)

  if(is.null(direction)){
    deg <- list(up <- res %>% filter(log2FoldChange > 0),
                down <- res %>% filter(log2FoldChange < 0))
  } else if(direction == "up"){
    deg <- res %>% filter(log2FoldChange > 0)
  } else if (direction == "down") {
    deg  <- res %>% filter(log2FoldChange < 0)
  } else stop("direction must be 'up' or 'down'")

  if(class(deg) == "list"){
    deg <- lapply(deg, function(x) {
      select(x, SYMB, log2FoldChange, padj) %>% arrange(desc(abs(log2FoldChange)))
    })
    names(deg) <- c("up", "down")
  } else {
    deg <- deg %>% select(x, SYMB, log2FoldChange, padj) %>% arrange(desc(abs(log2FoldChange)))
  }
  return(deg)
}
danielderrick/defunctions documentation built on Aug. 4, 2017, 6:23 p.m.