# merge the data
mergeData = function( TCGA_preprocesed, RJ_classification )
{
# initial datachecks
if ( !is.data.frame(RJ_classification) )
{
stop( "You must pass in results as a dataframe from the 'classification' call from an RJclust object")
}
if ( ncol( RJ_classification ) != 1)
{
stop( "You must pass in results as a dataframe from the 'classification' call from an RJclust object with one column")
}
# now merge our X data with our classification so we can sort
X_merged = merge( TCGA_preprocesed, RJ_classification, by.x = "row.names", by.y = "row.names", no.dups = TRUE )
# sort data bsed on column X
X_sorted = X_merged[order( X_merged[, ncol( X_merged ) ] ),] # sort by RJ classification (G)
X_sorted[, ncol( X_sorted ) ] = as.integer( X_sorted[, ncol( X_sorted )] )
# get rownames
X_rownames = X_sorted[, 1]
# sort data and return as matrix - remove first column of rownames from merge
X_sorted = X_sorted[, -1]
X_sorted = ( as.matrix( X_sorted ) ) # return as matrix for compatability with c++
return( X_sorted )
}
# get the list of importnat genes from running in C
getImportantGenesList = function( sortedData, cluster )
{
temp_return = getImportantGenes_c( sortedData, cluster )
# return the answer in list format
return( temp_return[, 1] )
}
# get the most important genes
getTopGenes = function( sums, geneNames, threshold = NULL )
{
# if the threshold is null, assume the threshold is 2 standard deviations
if ( is.null( threshold ) )
{
threshold = 2*sd( sums ) + mean( sums )
}
# get the outlier genes greater than the threshold
outliers_index = which( sums > threshold )
# bind the data with the genes and sort
data = cbind.data.frame( geneNames[outliers_index], sums[outliers_index] )
colnames( data ) = c( "Genes", "Sums" )
data[, 2] = sort( data[,2], decreasing = T )
return( data )
}
#' TCGA_getImportantGenes
#'
#' @param TCGA_data Original TCGA dataset ( unprocessed )
#' @param classificationResults 'cluster' results from running RJalgorithm
#' @param threshold_clean Percentage of 0s that will cause a gene ( column ) to be discarded
#' @param cluster Which cluster you want analyzed ( an integer )
#' @param threshold_importance What value to consdier a gene "important" ( default is a gene more than 2 times the standard deviation )
#' @param num_return Number of important genes to be returned (won't apply if num_return is lower than the number under threshold_importance)
#'
#' @return Returns a list of genes and their sums of importance
#' @export
#'
#' @examples
#' data(OV)
#' X = TCGA_cleanData(OV)
#' G = RJclust(X, 3)$classification
#' important_genes = TCGA_getImportantGenes(OV, G, 0, 1)
TCGA_getImportantGenes = function( TCGA_data, classificationResults, threshold_clean, cluster, threshold_importance = NULL, num_return = NULL )
{
# turn G int oa dataframe if it is not alrady
if ( !is.data.frame( classificationResults ) )
{
classificationResults = as.data.frame( classificationResults )
}
# clean original data
data = TCGA_cleanData( TCGA_data, threshold_clean )
genes = colnames( data )
# get sorted data
X_sorted = mergeData( data, classificationResults )
# get important genes
geneSums = getImportantGenesList( X_sorted, cluster )
# get top genes
importanceList = getTopGenes( geneSums, genes, threshold_importance )
# if there is no specificed number to return, return all
if ( is.null( num_return ) )
{
num_return = nrow( importanceList )
}
# if num_return is too large, warn the user
if ( !is.null( num_return ) & num_return > nrow( data ) )
{
warning( paste( "Cannot return", num_return, "because there are only", nrow( importanceList ), "to plot. Instead returning 50" ) )
num_return = 50
}
importanceList = importanceList[1:num_return, ]
meanGene = mean( importanceList[, 2 ])
return( list(importanceList = importanceList, mean = meanGene ) )
}
#' TCGA_plotImportantGenes
#'
#' @param geneSums importanceList result from from TCGA_getImportantGenes
#' @param numPlotted Number of genes to be plotted ( default is 15 )
#' @param mean Mean result from TCGA_GetImportantGenes
#'
#' @return Barplot of gene importance
#' @export
#'
#' @examples
#' data(OV)
#' X = TCGA_cleanData(OV)
#' G = RJclust(X, 3)$classification
#' important_genes = TCGA_getImportantGenes(OV, G, 0, 1)
#' TCGA_plotImportantGenes( important_genes$importanceList, 20 )
TCGA_plotImportantGenes = function( geneSums, numPlotted = 15, mean = NULL )
{
# check that the number of genes is reasonable to plot
if ( numPlotted > nrow( geneSums ) )
{
warning( paste( "Cannot plot", numPlotted, "because there are only", nrow( geneSums ), "to plot. Instead plotting 15" ) )
numPlotted = 15
}
# set up labels nad data
label = "Genes With Largest Importance"
x = geneSums[1:numPlotted, 2]
namesPlot = geneSums[1:numPlotted, 1]
# plot if no mean is provdied
if ( is.null( mean ) )
{
barplot( x, main = label, names.arg = namesPlot, las = 2 )
}
# if mean is supplied, add a horizontal line for the mean
else
{
barplot( x, main = label, names.arg = namesPlot, las = 2 )
abline( h = mean, col = "red" )
legend( "topright", legend = c( "Mean of all patients" ), fill = c( "red" ) )
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.