Nothing
cluster_GO <- function(
go_id, result, eSet, f=result$factor, subset=NULL,
method_dist="euclidean", method_hclust="average", cex=0.8,
main=paste(go_id, result$GO[result$GO$go_id == go_id, "name_1006"]),
xlab="Distance", cex.main=1, main.Lsplit=NULL, ...){
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","GO") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the GO identifier is not present in the results
if (! go_id %in% result$GO$go_id){
# Return an error and stop
stop(go_id, " was not found in result$mapping$go_id.")
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# Fetch the list of genes associated with the go_id
gene_ids <- list_genes(go_id=go_id, result=result, data.only=TRUE)
# Fetch and transform the expression data for those genes
genes_expr <- t(exprs(eSet)[gene_ids,])
# Hierarchical clustering using dist and hclust
# (The clearest method to read the labels and control their size)
di <- dist(genes_expr, method=method_dist)
cl <- hclust(di, method=method_hclust, ...)
# If requested, split the main title to lines with fewer than a given
# count of characters, while respecting space-separated words
if (!is.null(main.Lsplit)){
if (is.numeric(main.Lsplit)){
main <- string_Lsplit(string = main, line.length = main.Lsplit)
}
else{
stop("main.Lsplit should be a numeric value or NULL.")
}
}
# Rows are samples, label them according to the user's chosen factor
sample_labels <- pData(eSet)[,f]
# Save the current plotting parameters
op <- par(no.readonly=TRUE)
# whatever happens, restore original setting when leaving function
on.exit(par(op))
# Change the font size of the title
par(cex.main=cex.main)
plot(cl, hang=-1, label=sample_labels, cex=cex, main=main, xlab=xlab, ...)
}
expression_plot <- function(
gene_id, result, eSet, x_var, f=result$factor, subset=NULL,
xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
col.palette="Accent",
col=brewer.pal(n=length(levels(pData(eSet)[,f])), name=col.palette),
level=0.95, title=NULL, title.size=2, axis.title.size=20,
axis.text.size=15, axis.text.angle=0,
legend.title.size=20, legend.text.size=15, legend.key.size=30){
# if the feature identifier is absent from the dataset
if (!gene_id %in% rownames(eSet)){
# suggest close matches if any
matches <- agrep(
pattern=gene_id,
x=rownames(eSet),
max.distance=1,
fixed=TRUE,
value=TRUE
)
if (length(matches) > 0){
cat(gene_id, "not found in dataset. Did you mean:", fill=TRUE)
return(matches)
}
else{
stop(gene_id, "not found in dataset. No close match either.")
}
}
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the X variable requested does not exist in the sample annotations
if (! x_var %in% colnames(pData(eSet))){
# Return an error and stop
stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
}
# If the factor requested does not exist in the sample annotations
if (! f %in% colnames(pData(eSet))){
# Return an error and stop
stop("f: ", f, " is not an existing column in pData(eSet).")
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# Build the title message from the combination of gene_id and gene_symbol
title <- paste(
gene_id,
" = ",
result$genes[gene_id, "external_gene_name"]
)
# Assemble a data frame containing the necessary information for ggplot
df <- data.frame(
Expression=exprs(eSet)[gene_id,],
Factor=pData(eSet)[,f],
X=pData(eSet)[,x_var]
)
# Generate the plot
gg <- ggplot(df) +
geom_smooth(
aes_string(
x = "X", y = "Expression",
group = "Factor", color = "Factor", fill = "Factor"
),
level = level
) +
labs(title = title, x = xlab, y = ylab) +
theme(
plot.title=element_text(size = rel(title.size)),
axis.title=element_text(size = axis.title.size),
axis.text=element_text(size = axis.text.size),
axis.text.x=element_text(angle = axis.text.angle),
legend.text=element_text(size = legend.text.size),
legend.title=element_text(size = legend.title.size),
legend.key.size = unit(legend.key.size, "points")
) +
scale_colour_manual(values = col, name = f) +
scale_fill_manual(values = col, name = f)
# If a non-default ylim was given, apply it
if (!is.null(ylim)){
gg <- gg + scale_y_continuous(limits = ylim)
}
# Return the plot
return(gg)
}
expression_plot_symbol <- function(
gene_symbol, result, eSet, x_var, f=result$factor, subset=NULL,
index=0, xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
col.palette="Accent",
col=brewer.pal(n=length(levels(pData(eSet)[,f])), name=col.palette),
level=0.95, titles=c(), title.size=2, axis.title.size=20,
axis.text.size=15, axis.text.angle=0,
legend.title.size=20, legend.text.size=20, legend.key.size=30){
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
#
# If the X variable requested does not exist in the sample annotations
if (! x_var %in% colnames(pData(eSet))){
stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
}
# If the factor requested does not exist in the sample annotations
if (! f %in% colnames(pData(eSet))){
stop("f: ", f, " is not an existing column in pData(eSet).")
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# If the gene name provided is empty
if (gene_symbol == ''){
stop("Empty gene name given.")
}
# the GO_analyse result provided contains the annotation of each feature
# identifier
# present in the dataset to a gene name, if any
cat(
"Fetching feature identifier(s) annotated to", gene_symbol, "...",
fill=TRUE
)
mapping <- data.frame(
gene_id=rownames(result$genes),
external_gene_name=result$genes$external_gene_name,
stringsAsFactors=FALSE
)
# if the gene name is absent from the mapping table
if(!gene_symbol %in% mapping$external_gene_name){
# suggest close matches if any
matches <- agrep(
pattern=gene_symbol,
x=mapping$external_gene_name,
fixed=TRUE, value=TRUE
)
# if we do have one or more close matches to the symbol
if (length(matches) > 0){
# list them to the user for help and stop the function
cat(gene_symbol, "not found in dataset. Did you mean:", fill=TRUE)
return(matches)
}
# if we don't have close matches in the dataset, tell the user and
# stop the function
else{
stop(
paste(
gene_symbol,
"not found in dataset. No close match either."
)
)
}
}
# At this stage we know the gene symbol has at least one corresponding
# feature identifier in the Ensembl BioMart, fetch all identifier(s)
# corresponding to that gene symbol
gene_ids <- mapping[mapping$external_gene_name == gene_symbol, "gene_id"]
# However, we still don't know how many of those identifiers are present
# in the expression dataset. Remove the feature identifiers absent from
# our dataset as we cannot plot them
gene_ids_present <- gene_ids[gene_ids %in% rownames(eSet)]
# If none of the feature identifiers are present in the dataset
if (length(gene_ids_present) == 0){
cat(
"Feature identifiers were found for", gene_symbol, "\n",
"but none of them were found in the expression dataset.\n",
"Feature identifiers were:"
)
return(gene_ids)
}
# At this stage we are finally sure that at least one of the feature
# identifiers corresponding to the gene name are also present in the
# expression dataset. This is the best moment to generate as many titles
# as there are feature identifier(s) annotated to the given gene symbol
# If the user left the default vector of titles (empty)
if (is.null(titles)){
# Create a smart title for each plot
for (ensembl in gene_ids_present){
titles <- c(titles, paste(gene_symbol, " = ", ensembl))
}
}
# If the user changed the default vector of titles
else{
# if the number of titles does not match the number of plots
if(length(titles) != length(gene_ids_present)){
# return an error and stop
stop(
"The number of titles (", length(titles),
") does not match the number of plots (",
length(gene_ids_present), ")."
)
}
}
# If there are strictly more than 1 gene id associated with the gene
# symbol
if (length(gene_ids_present) > 1){
# Tell the user
cat("Multiple gene ids found for", gene_symbol, fill=TRUE)
cat("Indices are:", fill=TRUE)
print(gene_ids_present)
# if the user did not change the default index value (0)
# the function will plot all Ensembl ids in a lattice
if (index==0){
# A first time user might not know that how to plot a single plot
cat(
"Use argument 'index=1' to plot the first gene id alone,",
"and so on.", fill=TRUE
)
# Prepare a grid to plot multiple graphs while optimising the
# number of columns and rows
columns <- ceiling(sqrt(length(gene_ids_present)))
# Store all the plots in a list
plots <- list()
for (i in seq(1,length(gene_ids_present))){
cat("Plotting", gene_ids_present[i], fill=TRUE)
plots[[i]] <- expression_plot(
gene_id=gene_ids_present[i],
eSet=eSet,
x_var=x_var, result=result, f=f, xlab=xlab, ylab=ylab,
ylim=ylim, col.palette=col.palette, col=col, level=level,
title=titles[i], title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.text.size=legend.text.size,
legend.title.size=legend.title.size,
legend.key.size=legend.key.size
)
}
# Plot all the graphs in the optimised lattice, using the
# feature-based plotting function
multiplot(plotlist=plots, cols=columns)
return(gene_ids_present)
}
# If the user gave a non-zero index
else{
# If the index is out of bound
if (abs(index) > length(gene_ids_present)){
# Return an error
print("Index is out of bound.")
cat("Indices are:", fill=TRUE)
}
# If the index is acceptable
else{
# Plot the corresponding graph
cat("Plotting", gene_ids_present[index], fill=TRUE)
expression_plot(
gene_id=gene_ids_present[index],
eSet=eSet, x_var=x_var,
result=result, f=f, xlab=xlab, ylab=ylab,
ylim=ylim, col.palette=col.palette, col=col, level=level,
title=titles[index], title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.text.size=legend.text.size,
legend.title.size=legend.title.size,
legend.key.size=legend.key.size
)
}
}
}
# If there is a unique gene id associated to the gene symbol
else{
cat("Unique gene id found for", gene_symbol, fill=TRUE)
cat("Plotting", gene_ids_present, fill=TRUE)
expression_plot(
gene_id=gene_ids_present,
eSet=eSet, x_var=x_var, result=result, f=f, xlab=xlab, ylab=ylab,
ylim=ylim, col.palette=col.palette, col=col,
level=level, title=titles, title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.text.size=legend.text.size,
legend.title.size=legend.title.size,
legend.key.size=legend.key.size
)
}
}
expression_profiles <- function(
gene_id, result, eSet, x_var, seriesF, subset=NULL,
colourF=result$factor, linetypeF=colourF, line.size=1.5,
xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
col.palette="Accent",
col=brewer.pal(
n=length(levels(pData(eSet)[,colourF])),
name=col.palette),
lty=1:length(levels(pData(eSet)[,linetypeF])),
title=NULL, title.size=2, axis.title.size=20,
axis.text.size=15, axis.text.angle=0,
legend.title.size=20, legend.text.size=15,
legend.key.size=30){
# if the feature identifier is absent from the dataset
if (!gene_id %in% rownames(eSet)){
# suggest close matches if any
matches <- agrep(
pattern=gene_id, x=rownames(eSet), max.distance=1,
fixed=TRUE, value=TRUE
)
if (length(matches) > 0){
cat(gene_id, "not found in dataset. Did you mean:", fill=TRUE)
return(matches)
}
else{
stop(gene_id, "not found in dataset. No close match either.")
}
}
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the X variable requested does not exist in the sample annotations
if (! x_var %in% colnames(pData(eSet))){
# Return an error and stop
stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
}
# If the series factor requested does not exist in the sample
# annotations
if (! seriesF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"seriesF: ", seriesF,
" is not an existing column in pData(eSet)."
)
}
# If the colour factor requested does not exist in the sample
# annotations
if (! colourF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"colourF: ", colourF,
" is not an existing column in pData(eSet)."
)
}
# If the linetype factor requested does not exist in the sample
# annotations
if (! linetypeF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"linetypeF: ", linetypeF,
" is not an existing column in pData(eSet)."
)
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# Build the title message from the combination of gene_id and gene_symbol
title <- paste(
gene_id,
" = ",
result$genes[gene_id, "external_gene_name"]
)
# Assemble a data frame containing the necessary information for ggplot
df <- data.frame(
Expression=exprs(eSet)[gene_id,],
X=pData(eSet)[,x_var],
Profile=pData(eSet)[,seriesF],
LineType=pData(eSet)[,linetypeF],
Colour=pData(eSet)[,colourF]
)
# Generate the plot
gg <- ggplot(data=df) +
geom_line(
aes_string(
x = "X", y = "Expression",
group = "Profile", linetype = "LineType", colour = "Colour"
),
size = line.size
) +
labs(title = title, x = xlab, y = ylab) +
theme(
plot.title=element_text(size = rel(title.size)),
axis.title=element_text(size = axis.title.size),
axis.text=element_text(size = axis.text.size),
axis.text.x=element_text(angle = axis.text.angle),
legend.text=element_text(size = legend.text.size),
legend.title=element_text(size = legend.title.size),
legend.key.size = unit(legend.key.size, "points")
) +
scale_colour_manual(values = col, name = colourF) +
scale_linetype_manual(values = lty, name = linetypeF)
# If a non-default ylim was given, apply it
if (!is.null(ylim)){
gg <- gg + scale_y_continuous(limits = ylim)
}
# Return the plot
return(gg)
}
expression_profiles_symbol <- function(
gene_symbol, result, eSet, x_var, seriesF, subset=NULL,
colourF=result$factor, linetypeF=colourF, line.size=1.5,
index=0, xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
col.palette="Accent",
col=brewer.pal(
n=length(levels(pData(eSet)[,colourF])), name=col.palette),
lty=1:length(levels(pData(eSet)[,linetypeF])),
titles=c(), title.size=2, axis.title.size=20,
axis.text.size=15, axis.text.angle=0,
legend.title.size=20, legend.text.size=15,
legend.key.size=30){
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the X variable requested does not exist in the sample annotations
if (! x_var %in% colnames(pData(eSet))){
stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
}
# If the series factor requested does not exist in the sample
# annotations
if (! seriesF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"seriesF: ", seriesF,
" is not an existing column in pData(eSet)."
)
}
# If the colour factor requested does not exist in the sample
# annotations
if (! colourF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"colourF: ", colourF,
" is not an existing column in pData(eSet)."
)
}
# If the linetype factor requested does not exist in the sample
# annotations
if (! linetypeF %in% colnames(pData(eSet))){
# Return an error and stop
stop(
"linetypeF: ", linetypeF,
" is not an existing column in pData(eSet)."
)
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# If the gene name provided is empty
if (gene_symbol == ''){
stop("Empty gene name given.")
}
# the GO_analyse result provided contains the annotation of each feature
# identifier present in the dataset to a gene name, if any
cat(
"Fetching feature identifier(s) annotated to", gene_symbol, "...",
fill=TRUE
)
mapping <- data.frame(
gene_id=rownames(result$genes),
external_gene_name=result$genes$external_gene_name,
stringsAsFactors=FALSE
)
# if the gene name is absent from the mapping table
if(!gene_symbol %in% mapping$external_gene_name){
# suggest close matches if any
matches <- agrep(
pattern=gene_symbol, x=mapping$external_gene_name,
fixed=TRUE, value=TRUE)
# if we do have one or more close matches to the symbol
if (length(matches) > 0){
# list them to the user for help and stop the function
cat(gene_symbol, "not found in dataset. Did you mean:", fill=TRUE)
return(matches)
}
# if we don't have close matches in the dataset, tell the user and
# stop the function
else{
stop(
paste(
gene_symbol,
"not found in dataset. No close match either."
)
)
}
}
# At this stage we know the gene symbol has at least one corresponding
# feature identifier in the Ensembl BioMart, fetch all identifier(s)
# corresponding to that gene symbol
gene_ids <- mapping[mapping$external_gene_name == gene_symbol, "gene_id"]
# However, we still don't know how many of those identifiers are present
# in the expression dataset. Remove the feature identifiers absent from
# our dataset as we cannot plot them
gene_ids_present <- gene_ids[gene_ids %in% rownames(eSet)]
# If none of the feature identifiers are present in the dataset
if (length(gene_ids_present) == 0){
cat(
"Feature identifiers were found for", gene_symbol, "\n",
"but none of them were found in the expression dataset.\n",
"Feature identifiers were:"
)
print(gene_ids)
# return the number of plots printed
return(0)
}
# At this stage we are finally sure that at least one of the feature
# identifiers corresponding to the gene name are also present in the
# expression dataset. This is the best moment to generate as many titles
# as there are feature identifier(s) annotated to the given gene symbol
# If the user left the default vector of titles (empty)
if (is.null(titles)){
# Create a smart title for each plot
for (ensembl in gene_ids_present){
titles <- c(titles, paste(gene_symbol, " = ", ensembl))
}
}
# If the user changed the default vector of titles
else{
# if the number of titles does not match the number of plots
if(length(titles) != length(gene_ids_present)){
# return an error and stop
stop(
"The number of titles (", length(titles),
") does not match the number of plots (",
length(gene_ids_present), ")."
)
}
}
# If there are strictly more than 1 gene id associated with the gene
# symbol
if (length(gene_ids_present) > 1){
# Tell the user
cat("Multiple gene ids found for", gene_symbol, fill=TRUE)
cat("Indices are:", fill=TRUE)
print(gene_ids_present)
# if the user did not change the default index value (0)
# the function will plot all Ensembl ids in a lattice
if (index==0){
# A first time user might not know that how to plot a single plot
cat(
"Use argument 'index=1' to plot the first gene id alone,",
"and so on.", fill=TRUE
)
# Prepare a grid to plot multiple graphs while optimising the
# number of columns and rows
columns <- ceiling(sqrt(length(gene_ids_present)))
# Store all the plots in a list
plots <- list()
for (i in seq(1,length(gene_ids_present))){
cat("Plotting", gene_ids_present[i], fill=TRUE)
plots[[i]] <- expression_profiles(
gene_id=gene_ids_present[i], result=result, eSet=eSet,
x_var=x_var, seriesF=seriesF, colourF=colourF,
linetypeF=linetypeF, line.size=line.size,
xlab=xlab, ylab=ylab, ylim=ylim,
col.palette=col.palette,
col=col, lty=lty, title=titles[i], title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.title.size=legend.title.size,
legend.text.size=legend.text.size,
legend.key.size=legend.key.size
)
}
# Plot all the graphs in the optimised lattice, using the
# feature-based plotting function
multiplot(plotlist=plots, cols=columns)
return(gene_ids_present)
}
# If the user gave a non-zero index
else{
# If the index is out of bound
if (abs(index) > length(gene_ids_present)){
# Return an error
print("Index is out of bound.")
cat("Indices are:", fill=TRUE)
}
# If the index is acceptable
else{
# Plot the corresponding graph
cat("Plotting", gene_ids_present[index], fill=TRUE)
expression_profiles(
gene_id=gene_ids_present[index], result=result, eSet=eSet,
x_var=x_var, seriesF=seriesF, colourF=colourF,
linetypeF=linetypeF, line.size=line.size,
xlab=xlab, ylab=ylab, ylim=ylim,
col.palette=col.palette,
col=col, lty=lty, title=titles[i], title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.title.size=legend.title.size,
legend.text.size=legend.text.size,
legend.key.size=legend.key.size
)
}
}
}
# If there is a unique gene id associated to the gene symbol
else{
cat("Unique gene id found for", gene_symbol, fill=TRUE)
cat("Plotting", gene_ids_present, fill=TRUE)
expression_profiles(
gene_id=gene_ids_present, result=result, eSet=eSet,
x_var=x_var, seriesF=seriesF, colourF=colourF,
linetypeF=linetypeF, line.size=line.size,
xlab=xlab, ylab=ylab, ylim=ylim,
col.palette=col.palette,
col=col, lty=lty, title=titles[i], title.size=title.size,
axis.title.size=axis.title.size,
axis.text.size=axis.text.size,
axis.text.angle=axis.text.angle,
legend.title.size=legend.title.size,
legend.text.size=legend.text.size,
legend.key.size=legend.key.size
)
}
}
heatmap_GO <- function(
go_id, result, eSet, f=result$factor, subset=NULL, gene_names=TRUE,
NA.names=FALSE, margins=c(7 ,5),
scale="none", cexCol=1.2, cexRow=0.5,
labRow=NULL,
cex.main=1, trace="none", expr.col=bluered(75),
row.col.palette="Accent",
row.col=c(),
main=paste(
go_id, result$GO[result$GO$go_id == go_id,"name_1006"]
),
main.Lsplit=NULL,
...){
# if the result provided does not contain the slots required for this
# function
if (! all(c("factor","GO","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the GO identifier is not present in the results
if (! go_id %in% result$GO$go_id){
# Return an error and stop
stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# Also update the RowSideColors of the heatmap.2 to a shorter vector
# because the number of samples has been reduced
# NOTE: if the user wants to provide his own color array he should
# either give the exact number of colors as he expects in the
# subsetted ExpressionSet, or rather use the row.col.palette
# argument to provide the palette, rather than the array of colors
if (length(row.col) != ncol(eSet)){
row.col <- brewer.pal(
n = length(unique(pData(eSet)[,f])),
name = row.col.palette
)
}
# Fetch the list of genes associated with the go_id
gene_ids <- list_genes(go_id=go_id, result=result, data.only=TRUE)
# Fetch and format the expression data for those genes
genes_expr <- t(exprs(eSet)[gene_ids,])
# Rows are samples, if the user didn't provide custom labels
if (is.null(labRow)){
# label them according to the user's choson factor
labRow <- pData(eSet)[,f]
}
else{
# If the user provided row labels, they have to match the number of
# sample
# except if length is 1, then assume it is a factor from phenoData
if (length(labRow) == 1){
labRow = pData(eSet)[,labRow]
}
else if (length(labRow) != ncol(eSet)){
stop(
"The number of custom row labels provided (",
length(labRow),
") does not match the number of samples (",
ncol(eSet), ".")
}
}
# Columns are features, label them by identifier or name
if (gene_names){
gene_labels <- result$genes[gene_ids, "external_gene_name"]
# Default: if some feature identifiers have no associated gene name
# fill the blanks with the feature identifier
if (any(gene_labels == '') & !NA.names){
gene_labels[gene_labels == ''] <- gene_ids[gene_labels == '']
}
# if the user prefers to leave blank gene names, do nothing
}
else{
gene_labels <- gene_ids
}
# If requested, split the main title to lines with fewer than a given
# count of characters, while respecting space-separated words
if (!is.null(main.Lsplit)){
if (is.numeric(main.Lsplit)){
main <- string_Lsplit(string=main, line.length=main.Lsplit)
}
else{
stop("main.Lsplit should be a numeric value or NULL.")
}
}
# A vector detailing the color of each sample must be prepared
samples.col <- row.col[as.factor(pData(eSet)[,f])]
# Save the current plotting parameters
op <- par(no.readonly=TRUE)
# whatever happens, restore original setting when leaving function
on.exit(par(op))
# Change the font size of the title
par(cex.main=cex.main)
# Plot the heatmap of the data
heatmap.2(
genes_expr, labRow=labRow, labCol=gene_labels,
scale=scale, cexCol=cexCol, cexRow=cexRow, main=main,
trace=trace, RowSideColors=samples.col, col=expr.col,
margins = margins, ...)
}
hist_scores <- function(
result,
main=paste(
"Distribution of average scores in",
deparse(substitute(result))), xlab="Average score",
...){
# if the result provided does not contain the slots required for this
# function
if (! all(c("GO") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
hist(result$GO$ave_score, main=main, xlab=xlab, ...)
}
list_genes <- function(go_id, result, data.only=TRUE){
# if the result provided does not contain the slots required for this
# function
if (! all(c("mapping","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the go_id requested is not present in the dataset
if (!go_id %in% result$mapping$go_id){
# return an error and stop
stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
}
# List of gene_ids annotated to the GO term
gene_ids <- result$mapping[result$mapping$go_id == go_id,"gene_id"]
# BioMart can have more genes associated with the GO term
# than the number actually present in the dataset
if (data.only){
# Filter out those not in the dataset
gene_ids <- gene_ids[gene_ids %in% rownames(result$genes)]
}
# return the list of gene_ids associated with it
return(gene_ids)
}
plot_design <- function(
go_id, result, eSet, subset=NULL,
factors=colnames(pData(eSet)), main="", main.Lsplit=NULL, ...){
# if the result provided does not contain the slots required for this
# function
if (! all(c("GO") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the GO identifier is not present in the results
if (! go_id %in% result$GO$go_id){
# Return an error and stop
stop("go_id: ", go_id, " was not found in result$GO$go_id.")
}
# if the user changed the default value
# check that all given factors exist in colnames(eSet)
if (any(factors != colnames(pData(eSet)))){
for(f in factors){
if (!f %in% colnames(pData(eSet))){
# Otherwise, stop and return which of them is not a valid
# factor name
stop(f, " is not an existing column in pData(eSet).")
}
}
}
# Subset the data to the given values of the given factors, if existing
if (!is.null(subset)){
eSet <- subEset(eSet=eSet, subset=subset)
}
# Fetch the list of genes associated with the go_id
# If the user gave the output of a GO_analyse command as result=
# that list contains the mapping between feature and GO_id
gene_ids_present <- list_genes(
go_id=go_id, result=result, data.only=TRUE
)
GO_name <- result$GO[result$GO$go_id == go_id, "name_1006"]
# Prepare a temporary data frame plot.design-friendly
df <- data.frame(
t(exprs(eSet)[gene_ids_present,]),
pData(eSet)[,factors]
)
# If no custom title was given
if (main == ""){
# Generate a smart one (careful: the same title will be used for all
# genes in the GO term)
# Smart title is the name of the GO term
main <- paste(go_id, GO_name)
# If requested, split the main title to lines with fewer than a given
# count of characters, while respecting space-separated words
if (!is.null(main.Lsplit)){
if (is.numeric(main.Lsplit)){
main <- string_Lsplit(string=main, line.length=main.Lsplit)
}
else{
stop("main.Lsplit should be a numeric value or NULL.")
}
}
}
# Perform a plot.design of all the genes in the data frame (= in the GO
# term and in the dataset)
plot.design(df, main=main, ...)
}
pValue_GO = function(
result, N=1000, ranked.by=result$rank.by, rank.by='P',
FUN.GO=result$FUN.GO)
{
if(length(ranked.by) != 1){
stop('Invalid ranked.by argument. Choose one ranking mathod.')
}
if(length(rank.by) != 1){
stop('Invalid rank.by argument. Choose one ranking mathod.')
}
if (! ranked.by %in% c('Rank','Score','rank','score')){
stop('Invalid ranked.by argument. See man page for details.')
}
if (ranked.by == 'rank'){
ranked.by <- 'Rank'
}
if (ranked.by == 'score'){
ranked.by <- 'Score'
}
if (! rank.by %in% c('Rank','Score','rank','score','p','P', 'p.val')){
stop('Invalid ranked.by argument. See man page for details.')
}
if (rank.by == 'rank'){
ranked.by <- 'Rank'
}
else if (rank.by == 'score'){
ranked.by <- 'Score'
}
else if(rank.by %in% c('p', 'P')){
rank.by <- 'p.val'
}
# Create a reference copy of the ontology scores
real.go <- result$GO[,c('go_id', 'ave_rank', 'ave_score')]
# Prepare a counter for p-value
real.go$p.count <- 0
# Fetch all the genes with expression data and their rank/score
random.genes <- result$genes[,c('Score', 'Rank')]
# Add the annotated genes absent from the expression data with 0 score
# and max rank (just like in GO_analyse)
all.annotated.genes <- unique(result$mapping$gene_id)
NA.genes <- all.annotated.genes[
! all.annotated.genes %in% rownames(random.genes)
]
random.genes <- rbind(
random.genes,
data.frame(
row.names = NA.genes,
Score=rep(0, length(NA.genes)),
Rank=rep(nrow(random.genes)+1, length(NA.genes))))
# For each iteration
for(i in 1:N){
# Progress bar
progress.bar(x = i, max = N)
# Randomise the gene order using the latest shuffled copy,
# alternatively re-use the original copy every time?)
rownames(random.genes) <- sample(
x = rownames(random.genes),
size = nrow(random.genes),
replace = FALSE) # replace = F is default, but let's be clear
# Merge the randomised genes with the table of ontologies
random.genes2GO <- merge(
x = random.genes, y = result[['mapping']],
by.x = 'row.names', by.y = 'gene_id',
all.y = TRUE, sort = FALSE)
# compute the new average rank/score
if(ranked.by == 'Rank'){
# Replace NAs (annotated genes without expression data)
# by max rank + 1
random.genes2GO$Rank[is.na(random.genes2GO$Rank)] <-
max(random.genes2GO$Rank, na.rm = TRUE) + 1
# Calculate the randomised average rank
random.aveRank <- aggregate(
Rank ~ go_id, data = random.genes2GO, FUN = FUN.GO
)
# Combine the real and randomised results for comparison
# (ignore ontologies with 0 terms,
# these will be given p-value of 1 later)
compare.aveRanks <- merge(
x = real.go, y = random.aveRank,
by = 'go_id', sort = FALSE,
all = FALSE) # all=F is default, but let's be clear
# get the list of genes where random is better than real
worst.GO <- compare.aveRanks$go_id[
compare.aveRanks$Rank <= compare.aveRanks$ave_rank
]
# for all the above, add 1 to the p.count
real.go$p.count[real.go$go_id %in% worst.GO] <-
real.go$p.count[real.go$go_id %in% worst.GO] + 1
} else if(ranked.by == 'Score'){
# Replace NAs (annotated genes without expression data) by score 0
random.genes2GO$Score[is.na(random.genes2GO$Score)] <- 0
# Calculate the randomised average rank
random.aveScore <- aggregate(
Score ~ go_id, data = random.genes2GO, FUN = FUN.GO
)
# Combine the real and randomised results for comparison
# (ignore ontologies with 0 terms,
# these will be given p-value of 1 later)
compare.aveScore <- merge(
x = real.go, y = random.aveScore,
by = 'go_id', sort = FALSE,
all = FALSE) # all=F is default, but let's be clear
# get the list of ontologies where random is better than real
worst.GO <- compare.aveScore$go_id[
compare.aveScore$Score <= random.aveScore$ave_score
]
# for all the above, add 1 to the p.count
real.go$p.count[real.go$go_id %in% worst.GO] <-
real.go$p.count[real.go$go_id %in% worst.GO] + 1
}
}
# Divide the total p.count by the number of iterations (1)
real.go$p.val <- real.go$p.count / N
# Keep only the go_id and p.val columns for merging with the GO table
real.go <- real.go[,c('go_id', 'p.val')]
# Merge the p.val column with the GO table of the GO_analyse output
GO.new <- merge(
x = result$GO, y = real.go,
by = 'go_id', all.x = TRUE, sort = FALSE)
# Set the p-value to 1 for ontologies not annotated with any gene
GO.new$p.val[!GO.new$go_id %in% unique(result$mapping$go_id)] <- 1
# reorder columns
GO.new <- GO.new[,c(
"go_id","ave_rank","ave_score","total_count","data_count","p.val",
"name_1006","namespace_1003")]
# Replace the GO table in the GO_analyse output
result$GO <- GO.new
# Use the rerank function to reorder according to user choice
result <- rerank(result=result, rank.by=rank.by)
# Add/update a slot in the GO_analyse output that states the number of
# iterations used to compute the p-value
result$p.iterations <- N
# return the update GO_analyse output
return(result)
}
quantiles_scores <- function(
result, probs=c(0.9, 0.95, 0.99, 0.999, 0.9999), quartiles=FALSE
){
# if the result provided does not contain the slots required for this
# function
if (! all(c("GO") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If user changes to "quartiles=TRUE" then the following will be true
if (quartiles){
quantile(x=result$GO$ave_score)
}
# Default is to give range of top 10%, 5%, 1%, 0.1% and 0.01%
else {
quantile(x=result$GO$ave_score, probs=probs)
}
}
rerank <- function(result, rank.by = 'rank'){
# if the result provided does not contain the slots required for this
# function
if (! all(c("GO","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If p-value filter was requested, check that the p.value step was indeed
# performed
if ('p.val' %in% rank.by){
if (! 'p.val' %in% colnames(result$GO)){
stop('"p.val" column absent from result$GO.
Is it a pValue_GO() output?')
}
}
# Reorder the GO and gene tables accordin to the user's choice
if (rank.by == "rank"){
result$GO <- result$GO[order(result$GO$ave_rank),]
result$genes <- result$genes[order(result$genes$Rank),]
}
else if (rank.by == "score"){
result$GO <- result$GO[order(result$GO$ave_score, decreasing=TRUE),]
result$genes <- result$genes[
order(
result$genes$Score, decreasing=TRUE
),
]
} else if (rank.by == "p.val"){
result$GO <- result$GO[order(result$GO$p.val),]
}
else{
stop("Invalid ranking method: ", rank.by)
}
# update the rank.by slot
result$rank.by <- rank.by
return(result)
}
subEset <- function(eSet, subset=list()){
# subset should be a list of factor names with the associated values
# to retain for that factor
if (!class(subset) == "list"){
stop("'subset=' argument should be a list.")
}
if (length(subset) > 0){
for (f_filter in names(subset)){
# Check that the name of the list item is a column name in
# the phenodata
if (!f_filter %in% colnames(pData(eSet))){
stop(f_filter, " is not a valid column in pData(eSet).")
}
if (length(subset[[f_filter]]) == 0){
stop("No value provided for filter ", f_filter)
}
for (v_filter in subset[[f_filter]]){
# Check that the value is an existing value
# in the phenodata
if(!v_filter %in% pData(eSet)[,f_filter]){
stop(
v_filter,
" is not a valid value of eSet$",
f_filter
)
}
}
# at this stage, column and values exist
# Subset the eSet to the
eSet <- eSet[,pData(eSet)[,f_filter] %in% subset[[f_filter]]]
}
# Clean possible empty levels in factors not used for subsetting
for (name in colnames(pData(eSet))){
# Update the factor levels
if ("factor" %in% class(pData(eSet)[,name])){
pData(eSet)[,name] <- droplevels(pData(eSet)[,name])
}
}
}
else{
message("Empty list of filters given. Returning the original eSet")
}
return(eSet)
}
subset_scores <- function(result, ...){
# if the result provided does not contain the slots required for this
# function
if (! all(c('GO', 'mapping', 'genes') %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# Save the list of filter and value for easier referencing
filters <- list(...)
# If p-value filter was requested, check that the p.value step was indeed
# performed
if ('p.val' %in% names(filters)){
if (! 'p.val' %in% colnames(result$GO)){
stop('"p.val" column absent from result$GO.
Is it a pValue_GO() output?')
}
}
# Deal with supported synonyms
names(filters) <- replace(
names(filters), names(filters) == 'data', 'data_count')
names(filters) <- replace(
names(filters), names(filters) == 'total', 'total_count')
names(filters) <- replace(
names(filters), names(filters) == 'rank', 'ave_rank')
names(filters) <- replace(
names(filters), names(filters) == 'score', 'ave_score')
names(filters) <- replace(
names(filters), names(filters) == 'namespace', 'namespace_1003')
names(filters) <- replace(
names(filters), names(filters) == 'P', 'p.val')
names(filters) <- replace(
names(filters), names(filters) == 'p', 'p.val')
# Create/update a "filter.GO" slot in the result object
if (is.null(result$filters.GO)){
result$filters.GO <- list()
}
# prepares a table where the filtering results will be saved
filtered <- data.frame(row.names=result$GO$go_id)
# For each filter (careful about the direction of testing)
for (filter in names(filters)){
# Save the filter status of each row for this filter
# Also: warn the user if applying conflicting filters to a previously
# filtered object
# All filters for superiror/equal values
if (filter %in% c('total_count', 'data_count', 'ave_score')){
# If the filter was not applied yet, add it
if (!filter %in% names(result$filters.GO)){
result$filters.GO[[filter]] <- filters[[filter]]
}
# If the filter was applied already on the result object
else{
# if the result object was already filtered for an
# equal or larger value
if (result$filters.GO[[filter]] >= filters[[filter]]){
warning(
'result object was already filter for an equal or ',
'higher cutoff of filter ', filter, ': ',
result$filters.GO[[filter]], '. Ignoring filter.'
)
next
}
# if filtering for a higher value than previously
else{
# update the filter
result$filters.GO[[filter]] <- filters[[filter]]
}
}
filtered[,filter] <- result$GO[,filter] >= filters[[filter]]
}
# All filters for superiror/equal values
else if (filter %in% c('ave_rank', 'p.val')){
# If the filter was not applied yet, add it
if (!filter %in% names(result$filters.GO)){
result$filters.GO[[filter]] <- filters[[filter]]
}
# If the filter was applied already on the result object
else{
# if the result object was already filtered for an
# equal or lower value
if (result$filters.GO[[filter]] <= filters[[filter]]){
warning(
'result object was already filter for an equal or ',
'lower cutoff of filter ', filter, ': ',
result$filters.GO[[filter]], '. Ignoring filter.'
)
next
}
# if filtering for a lower value than previously
else{
# update the filter
result$filters.GO[[filter]] <- filters[[filter]]
}
}
filtered[,filter] <- result$GO[,filter] <= filters[[filter]]
}
## Filters on the namespace of the GO term
else if (filter == 'namespace_1003'){
# If the filter was not applied yet, add it
if (!filter %in% names(result$filters.GO)){
result$filters.GO[[filter]] <- filters[[filter]]
}
# If the filter was applied already on the result object
else{
# if filtering for the same ontology
if (result$filters.GO[[filter]] == filters[[filter]]){
# ignore the
note(
'result object was already filter for the same',
'namespace.'
)
next
}
# if filtering for a different ontology:
# the new one is not present anyway
else{
stop(
'result object was already filter for a different ',
'namespace: ', result$filters.GO[[filter]],
'. No more data for namespace: ', filters[[filter]]
)
}
}
# GO namespace filtering should offer shortcuts
if (filters[[filter]] %in% c('biological_process', 'BP')){
filtered[,filter] <- result$GO$namespace_1003 ==
'biological_process'
}
else if (filters[[filter]] %in% c('molecular_function', 'MF')){
filtered[,filter] <- result$GO$namespace_1003 ==
'molecular_function'
}
else if (filters[[filter]] %in% c('cellular_component', 'CC')){
filtered[,filter] <- result$GO$namespace_1003 ==
'cellular_component'
}
else{
stop(
"Valid values for filter ", filter, " are ",
"'biological_process', 'BP',",
"'molecular_function', 'MF',",
"'cellular_component', and'CC'."
)
}
}
## other filter names cause an error
else{
stop(filter, " is not a valid filter.")
}
}
# Only the rows passing all filters will be kept
# (all test summarised in one yes if all test are passed)
filtered$merge <- apply(X=filtered, MARGIN=1, FUN=all)
# Filter the different slots of results to remain consistent between them
# and save memory
## Subset the score table according to the above filters
result$GO <- result$GO[filtered$merge,]
## Subset the gene/GO mapping to keep only the GO terms left in the score
## table
result$mapping <- result$mapping[
result$mapping$go_id %in% result$GO$go_id,
]
## Subset the anova table to keep only the genes annotated to the genes
## left in the mapping table
result$genes <- result$genes[
rownames(result$genes) %in% result$mapping$gene_id,
]
return(result)
}
table_genes <- function(go_id, result, data.only=FALSE, order.by='rank'){
# if the result provided does not contain the slots required for this
# function
if (! all(c("mapping","genes") %in% names(result))){
stop("'result=' argument misses required slots.
Is it a GO_analyse() output?")
}
# If the go_id requested is not present in the dataset
if (!go_id %in% result$mapping$go_id){
# return an error and stop
stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
}
# Otherwise fetch the list of gene_ids associated with it
gene_ids <- list_genes(go_id=go_id, result=result, data.only=data.only)
# Then fetch the results of those genes
res_table <- result$genes[gene_ids,]
# If gene absent from dataset were also requested (data.only=FALSE)
# Those do not have analysis results and therefore return NA
# Leave NAs for the results, but put their name again as the row name
rownames(res_table) <- gene_ids
# Sorting
if (order.by %in% c('rank', 'score')){
res_table <- res_table[order(res_table$Rank),]
}
else if (order.by == 'gene_id'){
res_table <- res_table[order(rownames(res_table)),]
}
else if (
order.by %in%
c('name', 'external_gene_name','external_gene_id')
){
res_table <- res_table[order(res_table$external_gene_name),]
}
# Return the information for all those genes
return(res_table)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.