R/runSexLab.R

Defines functions runSexLab trainSexLab predSexLab .filterGenesByVar .geomMeanAcrossGenes .geomMeanScore

Documented in .filterGenesByVar .geomMeanAcrossGenes .geomMeanScore predSexLab runSexLab trainSexLab

#' Run sex labeling using default settings.
#'
#' @param expr_mat expression data matrix - rows are genes, columns are samples
#' @param platform the microarray platform (e.g. GPL570), if specified will use a model specific to that platform
#' @param is_rank TRUE if the data is already in rank form, otherwise will perform the conversion
#'
#' @return sex_lab - a list of sex labels (0/1 or "female"/"male")
runSexLab <- function(expr_mat, platform=NULL, is_rank=FALSE, numeric_lab=TRUE){
  # TODO
  #   assert that input is correct
  #   assert that the ranked data is actually ranked if it says it is
  error("This function is not implemented yet, please use predSexLab instead")
  # if it is not ranked, ranked the data
  if (!is_rank){

  }

  if (is.null(platform)){
    # use the standard fit
  } else {
    # read in the platform list

    # is it in the platform list? -- if not, throw and error

    # load the fit
  }

  # sex label
  sex_lab <- predSexLab(fit, test_dat, numeric_lab)
  return(sex_lab)
}


#' Train a sex labeling classifier using input training data.
#'
#' @param train_dat training data frame - rows are genes (entrez id), columns are samples, values are ranks
#' @param train_lab named list of training labels - 0 for female, 1 for male
#' @param female_genes a list of female-specific genes (entrez id), if not provided - default genes will be used
#' @param male_genes a list of male-specific genes (entrez id), if not provided, default genes will be used
#' @param cut_frac
#'
#' @return fit
trainSexLab <- function(train_dat, train_lab, female_genes = NULL, male_genes = NULL,
                        cut_frac = "1st Qu.") {
  require('pROC')

  # // TODO:
  # get rid of cut_frac

  .checkTrainInputFormat(train_dat, train_lab)

  # loads genes if not specified
  if (is.null(female_genes)) {
    female_genes <- sex_lab_genes$f
  }
  if (is.null(male_genes)) {
    male_genes <- sex_lab_genes$m
  }

  # make sure the genes are in the rownames
  .checkGeneFormat(train_dat, female_genes)
  .checkGeneFormat(train_dat, female_genes)

  # remove low variance probes
  m_genes <- .filterGenesByVar(train_dat, male_genes, cut_frac)
  f_genes <- .filterGenesByVar(train_dat, female_genes, cut_frac)

  # calculate scores for the training data
  scores <- .geomMeanScore(train_dat, f_genes, m_genes) # calculate the score
  preds <- scores$m - scores$f
  preds2 <- preds[!is.nan(preds)]
  labels <- train_lab[colnames(train_dat)][!is.nan(preds)]

  # compute the threshold
  roc_plot_train <- roc(labels, preds2, plot = FALSE)
  threshold_gpl <- coords(roc_plot_train, "best", best.method = "closest.topleft", ret = c("threshold"), transpose=TRUE)

  # fit object
  return(list(f = f_genes, m = m_genes, threshold = threshold_gpl))
}

#' Use a particular sex labeling fit to label an input dataset
#' @param fit a sex labeling fit object
#' @param expr_mat data frame to label - rows are genes, columns are samples, values are ranks
#' @param numeric_lab if FALSE, the output is "female/male", otherwise it is numeric (0/1)
#' @param scores whether to return the scores, defaults to FALSE
#' @param ret_expr whether to return the expression values, defaults to FALSE
#'
#' @return sex_lab - a list of sex labels (0/1 or "female"/"male")
#'       if `scores` is specified, the result is a data.frame with columns for the f, m scores
#'           and the sex label
#'       if `ret_expr` is specified, a list is returned, "sl" are the sex labels (as list or df),
#'          "f_mat" and "m_mat" contain the expression or ranks of the genes that go into the score
predSexLab <- function(fit, expr_mat, numeric_lab = TRUE,
                       scores=FALSE, ret_expr=FALSE) {
  require('pROC')

  # check input
  .checkFitFormat(fit)
  .checkExprMatFormat(expr_mat)

  # find the fit format
  f_genes <- intersect(fit$f, rownames(expr_mat))
  m_genes <- intersect(fit$m, rownames(expr_mat))

  # assert that there are some sex labeling genes
  .my_assert("The expresion matrix does not contain any of the f genes", !is.null(f_genes))
  .my_assert("The expresion matrix does not contain any of the m genes", !is.null(m_genes))

  threshold_gpl <- fit$threshold

  # caclulate the score
  scores_test <- .geomMeanScore(expr_mat, f_genes, m_genes) # calculate the score
  preds_test <- scores_test$m - scores_test$f
  if (numeric_lab) {
    sex_lab <- ifelse(preds_test > threshold_gpl, 1, 0)
  } else {
    sex_lab <- ifelse(preds_test > threshold_gpl, "male", "female")
  }

  # add sample names to the list of sex labels
  names(sex_lab) <- colnames(expr_mat)

  # add the scores to the output
  if (scores){
    scores_m <- scores_test$m
    scores_f <- scores_test$f
    names(sex_lab) <- NULL
    df <- data.frame(do.call(rbind, list(sex_lab, scores_m, scores_f)))
    colnames(df) <- colnames(expr_mat)
    rownames(df) <- c("sex", "score_m", "score_f")
    sex_lab <- data.frame(t(df))
  }

  # add the df chunks to the output
  if (ret_expr){
    f_mat <- expr_mat[f_genes,]
    m_mat <- expr_mat[m_genes,]
    return(list("sl"=sex_lab, "f_mat"=f_mat, "m_mat"=m_mat))
  }

  return(sex_lab)
}

# --------- Helper functions --------- #

#' Helper function to filter genes based on their standard deviations.
#'
#' Removes sex labeling genes with variance below the provided quartile cut. This can
#' be helpful for removing genes on a platform that have low variance,
#' and such are not informative for sex labeling.
#'
#' @param expr_mat expr_mat - matrix of expression genes
#' @param genes list of sex labeling genes
#' @param cut_frac the fraction
#'
#' @return a filtered set of sex labeling genes
.filterGenesByVar <- function(expr_mat, genes, cut_frac = "1st Qu.") {

  # if there is no cutoff, just return the list
  if (cut_frac == "Min.") {
    return(genes)
  }

  # otherwise, find the subset of genes, and filter
  genes <- intersect(genes, rownames(expr_mat))
  sds <- apply(expr_mat[genes, colnames(expr_mat)], 1, sd, na.rm = TRUE)
  cut <- summary(sds)[cut_frac]
  genes2 <- genes[sds > cut]
  genes2 <- genes2[!is.na(genes2)]
  return(genes2)
}



#' Helper function to calculate geometric mean of a set of genes across all samples in a matrix.
#'
#' Computes the geometric mean of the expression of a particular set of genes for every sample
#' (column) in a matrix. NAs are ignored during this calculation. The result is a vector of
#' geometric means, each corresponding to one sample (column) in the original matrix.
#'
#' @param expr_mat expression dataset, rows are genes, columns are samples
#' @param genes list of genes to calculate the geometric mean of
#'
#' @return a vector of geometric means
.geomMeanAcrossGenes <- function(expr_mat, genes){
  require('psych')
  sapply(1:ncol(expr_mat), function(x) {
    dat <- unlist(expr_mat[genes, x])
    return(geometric.mean(dat, na.rm = TRUE))
  })
}

#' Calculate the score geometric mean of male genes - female genes
#'
#' Relies on the psych package to calculate geometric means for each set of genes.
#' The end result is a score for the difference in means which is used for classification.
#'
#' @param expr_mat expression dataset: rows are genes, columns are samples
#' @param female_genes list of genes with female-specific expression
#' @param male_genes list of genes with male-specific expression
#'
#' @return a list of geometric mean scores for males ("m") and females ("f")
.geomMeanScore <- function(expr_mat, female_genes, male_genes) {

  mean_m <- .geomMeanAcrossGenes(expr_mat, male_genes)
  mean_f <- .geomMeanAcrossGenes(expr_mat, female_genes)

  return(list( "f"=mean_f, "m"=mean_m))
}
erflynn/exprsex documentation built on Feb. 23, 2020, 2:34 a.m.