R/FindImportantGenes.R

Defines functions mergeData getImportantGenesList getTopGenes TCGA_getImportantGenes TCGA_plotImportantGenes

Documented in TCGA_getImportantGenes TCGA_plotImportantGenes

# merge the data
mergeData = function( TCGA_preprocesed, RJ_classification )
{
  # initial datachecks
  if ( !is.data.frame(RJ_classification) )
  {
    stop( "You must pass in results as a dataframe from the 'classification' call from an RJclust object")
  }
  if ( ncol( RJ_classification ) != 1)
  {
    stop( "You must pass in results as a dataframe from the 'classification' call from an RJclust object with one column")
  }
  # now merge our X data with our classification so we can sort
  X_merged = merge( TCGA_preprocesed, RJ_classification, by.x = "row.names", by.y = "row.names", no.dups = TRUE )

  # sort data bsed on column X
  X_sorted = X_merged[order( X_merged[, ncol( X_merged ) ] ),] # sort by RJ classification (G)
  X_sorted[, ncol( X_sorted ) ] = as.integer( X_sorted[, ncol( X_sorted )] )

  # get rownames
  X_rownames = X_sorted[, 1]

  # sort data and return as matrix - remove first column of rownames from merge
  X_sorted = X_sorted[, -1]
  X_sorted = ( as.matrix( X_sorted ) ) # return as matrix for compatability with c++

  return( X_sorted )
}

# get the list of importnat genes from running in C
getImportantGenesList = function( sortedData, cluster )
{
  temp_return = getImportantGenes_c( sortedData, cluster )
  # return the answer in list format
  return( temp_return[, 1] )
}

# get the most important genes
getTopGenes = function( sums, geneNames, threshold = NULL )
{
  # if the threshold is null, assume the threshold is 2 standard deviations
  if (  is.null( threshold ) )
  {
    threshold = 2*sd( sums ) + mean( sums )
  }

  # get the outlier genes greater than the threshold
  outliers_index = which( sums > threshold )

  # bind the data with the genes and sort
  data = cbind.data.frame( geneNames[outliers_index], sums[outliers_index] )
  colnames( data ) = c( "Genes", "Sums" )
  data[, 2] = sort( data[,2], decreasing = T )

  return( data )
}


#' TCGA_getImportantGenes
#'
#' @param TCGA_data Original TCGA dataset ( unprocessed )
#' @param classificationResults 'cluster' results from running RJalgorithm
#' @param threshold_clean Percentage of 0s that will cause a gene ( column ) to be discarded
#' @param cluster Which cluster you want analyzed ( an integer )
#' @param threshold_importance What value to consdier a gene "important" ( default is a gene more than 2 times the standard deviation )
#' @param num_return Number of important genes to be returned (won't apply if num_return is lower than the number under threshold_importance)
#'
#' @return Returns a list of genes and their sums of importance
#' @export
#'
#' @examples
#' data(OV)
#' X = TCGA_cleanData(OV)
#' G = RJclust(X, 3)$classification
#' important_genes = TCGA_getImportantGenes(OV, G, 0, 1)
TCGA_getImportantGenes = function( TCGA_data, classificationResults, threshold_clean, cluster, threshold_importance = NULL, num_return = NULL )
{
  # turn G int oa dataframe if it is not alrady
  if ( !is.data.frame( classificationResults ) )
  {
    classificationResults = as.data.frame( classificationResults )
  }

  # clean original data
  data = TCGA_cleanData( TCGA_data, threshold_clean )
  genes = colnames( data )

  # get sorted data
  X_sorted = mergeData( data, classificationResults )

  # get important genes
  geneSums = getImportantGenesList( X_sorted, cluster )

  # get top genes
  importanceList = getTopGenes( geneSums, genes, threshold_importance )

  # if there is no specificed number to return, return all
  if ( is.null( num_return ) )
  {
    num_return = nrow( importanceList )
  }

  # if num_return is too large, warn the user
  if ( !is.null( num_return ) & num_return > nrow( data ) )
  {
    warning( paste( "Cannot return", num_return, "because there are only", nrow( importanceList ), "to plot. Instead returning 50" ) )
    num_return = 50
  }

  importanceList = importanceList[1:num_return, ]

  meanGene = mean( importanceList[, 2 ])

  return( list(importanceList = importanceList, mean = meanGene ) )
}

#' TCGA_plotImportantGenes
#'
#' @param geneSums importanceList result from from TCGA_getImportantGenes
#' @param numPlotted Number of genes to be plotted ( default is 15 )
#' @param mean Mean result from TCGA_GetImportantGenes
#'
#' @return Barplot of gene importance
#' @export
#'
#' @examples
#' data(OV)
#' X = TCGA_cleanData(OV)
#' G = RJclust(X, 3)$classification
#' important_genes = TCGA_getImportantGenes(OV, G, 0, 1)
#' TCGA_plotImportantGenes( important_genes$importanceList, 20 )
TCGA_plotImportantGenes = function(  geneSums, numPlotted = 15, mean = NULL )
{
  # check that the number of genes is reasonable to plot
  if ( numPlotted > nrow( geneSums ) )
  {
    warning( paste( "Cannot plot", numPlotted, "because there are only", nrow( geneSums ), "to plot. Instead plotting 15" ) )
    numPlotted = 15
  }

  # set up labels nad data
  label = "Genes With Largest Importance"
  x = geneSums[1:numPlotted, 2]
  namesPlot = geneSums[1:numPlotted, 1]

  # plot if no mean is provdied
  if ( is.null( mean ) )
  {
    barplot( x, main = label, names.arg = namesPlot, las = 2 )
  }

  # if mean is supplied, add a horizontal line for the mean
  else
  {
    barplot( x, main = label, names.arg = namesPlot, las = 2 )
    abline( h = mean, col = "red" )
    legend( "topright", legend = c( "Mean of all patients" ), fill = c( "red" ) )
  }
}
rshudde/RJclust documentation built on Dec. 8, 2019, 4:06 p.m.