R/post_analysis.R

Defines functions cluster_GO expression_plot expression_plot_symbol expression_profiles expression_profiles_symbol heatmap_GO hist_scores list_genes overlap_GO plot_design pValue_GO quantiles_scores rerank subEset subset_scores table_genes

Documented in cluster_GO expression_plot expression_plot_symbol expression_profiles expression_profiles_symbol heatmap_GO hist_scores list_genes overlap_GO plot_design pValue_GO quantiles_scores rerank subEset subset_scores table_genes

cluster_GO <- function(
    go_id, result, eSet, f=result$factor, subset=NULL,
    method_dist="euclidean", method_hclust="average", cex=0.8,
    main=paste(go_id, result$GO[result$GO$go_id == go_id, "name_1006"]),
    xlab="Distance", cex.main=1, main.Lsplit=NULL, ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","GO") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the GO identifier is not present in the results
    if (! go_id %in% result$GO$go_id){
        # Return an error and stop
        stop(go_id, " was not found in result$mapping$go_id.")
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # Fetch the list of genes associated with the go_id
    gene_ids <- list_genes(go_id=go_id, result=result, data.only=TRUE)
    # Fetch and transform the expression data for those genes
    genes_expr <- t(exprs(eSet)[gene_ids,])
    # Hierarchical clustering using dist and hclust
    # (The clearest method to read the labels and control their size)
    di <- dist(genes_expr, method=method_dist)
    cl <- hclust(di, method=method_hclust, ...)
    # If requested, split the main title to lines with fewer than a given
    # count of characters, while respecting space-separated words
    if (!is.null(main.Lsplit)){
        if (is.numeric(main.Lsplit)){ 
            main <- string_Lsplit(string=main, line.length=main.Lsplit)
        }
        else{
            stop("main.Lsplit should be a numeric value or NULL.")
        }
    }
    # Rows are samples, label them according to the user's chosen factor
    sample_labels <- pData(eSet)[,f]
    # Save the current plotting parameters
    op <- par(no.readonly=TRUE)
    # whatever happens, restore original setting when leaving function
    on.exit(par(op))
    # Change the font size of the title
    par(cex.main=cex.main)
    plot(cl, hang=-1, label=sample_labels, cex=cex, main=main, xlab=xlab, ...)
}

expression_plot <- function(
    gene_id, result, eSet, x_var, f=result$factor, subset=NULL,
    xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
    col.palette="Accent",
    col=brewer.pal(n=length(levels(pData(eSet)[,f])), name=col.palette),
    level=0.95, title=NULL, title.size=2, axis.title.size=20, 
    axis.text.size=15, axis.text.angle=0,
    legend.title.size=20, legend.text.size=15, legend.key.size=30){
    # if the feature identifier is absent from the dataset
    if (!gene_id %in% rownames(eSet)){
        # suggest close matches if any
        matches <- agrep(
            pattern=gene_id,
            x=rownames(eSet),
            max.distance=1,
            fixed=TRUE,
            value=TRUE
            )
        if (length(matches) > 0){
            cat(gene_id, "not found in dataset. Did you mean:", fill=TRUE)
            return(matches)
        }
        else{
            stop(gene_id, "not found in dataset. No close match either.")
        }
    }
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the X variable requested does not exist in the sample annotations
    if (! x_var %in% colnames(pData(eSet))){
        # Return an error and stop
        stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
    }
    # If the factor requested does not exist in the sample annotations
    if (! f %in% colnames(pData(eSet))){
        # Return an error and stop
        stop("f: ", f, " is not an existing column in pData(eSet).")
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # Build the title message from the combination of gene_id and gene_symbol
    title <- paste(
        gene_id,
        " = ",
        result$genes[gene_id, "external_gene_name"]
        )
    # Assemble a data frame containing the necessary information for ggplot
    df <- data.frame(
        Expression=exprs(eSet)[gene_id,],
        Factor=pData(eSet)[,f],
        X=pData(eSet)[,x_var]
        )
    # Generate the plot
    gg <- ggplot(df) +
        geom_smooth(
            aes(x=X, y=Expression, group=Factor, color=Factor, fill=Factor),
            level=level
            ) +
        labs(title=title, x=xlab, y=ylab) +
        theme(
            plot.title=element_text(size=rel(title.size)),
            axis.title=element_text(size=axis.title.size),
            axis.text=element_text(size=axis.text.size),
            axis.text.x=element_text(angle=axis.text.angle),
            legend.text=element_text(size=legend.text.size),
            legend.title=element_text(size=legend.title.size),
            legend.key.size=unit(legend.key.size, "points")
            ) +
        scale_colour_manual(values=col, name=f) + 
        scale_fill_manual(values=col, name=f)
    # If a non-default ylim was given, apply it
    if (!is.null(ylim)){
        gg <- gg + scale_y_continuous(limits=ylim)
    }
    # Return the plot
    return(gg)
}

expression_plot_symbol <- function(
    gene_symbol, result, eSet, x_var, f=result$factor, subset=NULL,
    index=0, xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)), 
    col.palette="Accent",
    col=brewer.pal(n=length(levels(pData(eSet)[,f])), name=col.palette),
    level=0.95, titles=c(), title.size=2, axis.title.size=20,
    axis.text.size=15, axis.text.angle=0,
    legend.title.size=20, legend.text.size=20, legend.key.size=30){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # 
    # If the X variable requested does not exist in the sample annotations
    if (! x_var %in% colnames(pData(eSet))){
        stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
    }
    # If the factor requested does not exist in the sample annotations
    if (! f %in% colnames(pData(eSet))){
        stop("f: ", f, " is not an existing column in pData(eSet).")
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # the GO_analyse result provided contains the annotation of each feature
    # identifier
    # present in the dataset to a gene name, if any
    cat(
        "Fetching feature identifier(s) annotated to", gene_symbol, "...",
        fill=TRUE
        )
    mapping <- data.frame(
        gene_id=rownames(result$genes), 
        external_gene_name=result$genes$external_gene_name,
        stringsAsFactors=FALSE
        )
    # if the gene name is absent from the mapping table
    if(!gene_symbol %in% mapping$external_gene_name){
        # suggest close matches if any
        matches <- agrep(
            pattern=gene_symbol,
            x=mapping$external_gene_name,
            fixed=TRUE, value=TRUE
            )
        # if we do have one or more close matches to the symbol
        if (length(matches) > 0){
            # list them to the user for help and stop the function
            cat(gene_symbol, "not found in dataset. Did you mean:", fill=TRUE)
            return(matches)
        }
        # if we don't have close matches in the dataset, tell the user and
        # stop the function
        else{
            stop(
                paste(
                    gene_symbol,
                    "not found in dataset. No close match either."
                    )
                )
        }
    }
    # At this stage we know the gene symbol has at least one corresponding
    # feature identifier in the Ensembl BioMart, fetch all identifier(s)
    # corresponding to that gene symbol
    gene_ids <- mapping[mapping$external_gene_name == gene_symbol, "gene_id"]
    # However, we still don't know how many of those identifiers are present
    # in the expression dataset. Remove the feature identifiers absent from
    # our dataset as we cannot plot them
    gene_ids_present <- gene_ids[gene_ids %in% rownames(eSet)]
    # If none of the feature identifiers are present in the dataset
    if (length(gene_ids_present) == 0){
        cat(
            "Feature identifiers were found for", gene_symbol, "\n",
            "but none of them were found in the expression dataset.\n",
            "Feature identifiers were:"
            )
        return(gene_ids)
    }
    # At this stage we are finally sure that at least one of the feature
    # identifiers corresponding to the gene name are also present in the
    # expression dataset. This is the best moment to generate as many titles
    # as there are feature identifier(s) annotated to the given gene symbol
    # If the user left the default vector of titles (empty)
    if (is.null(titles)){
        # Create a smart title for each plot
        for (ensembl in gene_ids_present){
            titles <- c(titles, paste(gene_symbol, " = ", ensembl))
        }
    }
    # If the user changed the default vector of titles
    else{
        # if the number of titles does not match the number of plots
        if(length(titles) != length(gene_ids_present)){
            # return an error and stop
            stop(
                "The number of titles (", length(titles),
                ") does not match the number of plots (",
                length(gene_ids_present), ")."
                )
        }
    }
    # If there are strictly more than 1 gene id associated with the gene
    # symbol
    if (length(gene_ids_present) > 1){
        # Tell the user
        cat("Multiple gene ids found for", gene_symbol, fill=TRUE)
        cat("Indices are:", fill=TRUE)
        print(gene_ids_present)
        # if the user did not change the default index value (0)
        # the function will plot all Ensembl ids in a lattice
        if (index==0){
            # A first time user might not know that how to plot a single plot
            cat(
                "Use argument 'index=1' to plot the first gene id alone,",
                "and so on.", fill=TRUE
                )
            # Prepare a grid to plot multiple graphs while optimising the
            # number of columns and rows
            columns <- ceiling(sqrt(length(gene_ids_present)))
            # Store all the plots in a list
            plots <- list()
            for (i in seq(1,length(gene_ids_present))){
                cat("Plotting", gene_ids_present[i], fill=TRUE)
                plots[[i]] <- expression_plot(
                    gene_id=gene_ids_present[i],
                    eSet=eSet,
                    x_var=x_var, result=result, f=f, xlab=xlab, ylab=ylab,
                    ylim=ylim, col.palette=col.palette, col=col, level=level,
                    title=titles[i], title.size=title.size,
                    axis.title.size=axis.title.size,
                    axis.text.size=axis.text.size,
                    axis.text.angle=axis.text.angle,
                    legend.text.size=legend.text.size,
                    legend.title.size=legend.title.size,
                    legend.key.size=legend.key.size
                    )
            }
            # Plot all the graphs in the optimised lattice, using the
            # feature-based plotting function
            multiplot(plotlist=plots, cols=columns)
            return(gene_ids_present)
        }
        # If the user gave a non-zero index
        else{
            # If the index is out of bound
            if (abs(index) > length(gene_ids_present)){
                # Return an error
                print("Index is out of bound.")
                cat("Indices are:", fill=TRUE)
            }
            # If the index is acceptable
            else{
                # Plot the corresponding graph
                cat("Plotting", gene_ids_present[index], fill=TRUE)
                expression_plot(
                    gene_id=gene_ids_present[index],
                    eSet=eSet, x_var=x_var,
                    result=result, f=f, xlab=xlab, ylab=ylab,
                    ylim=ylim, col.palette=col.palette, col=col, level=level,
                    title=titles[index], title.size=title.size,
                    axis.title.size=axis.title.size,
                    axis.text.size=axis.text.size,
                    axis.text.angle=axis.text.angle,
                    legend.text.size=legend.text.size,
                    legend.title.size=legend.title.size,
                    legend.key.size=legend.key.size
                    )
            }
        }
    }
    # If there is a unique gene id associated to the gene symbol
    else{
        cat("Unique gene id found for", gene_symbol, fill=TRUE)
        cat("Plotting", gene_ids_present, fill=TRUE)
        expression_plot(
            gene_id=gene_ids_present,
            eSet=eSet, x_var=x_var, result=result, f=f, xlab=xlab, ylab=ylab,
            ylim=ylim, col.palette=col.palette, col=col,
            level=level, title=titles, title.size=title.size,
            axis.title.size=axis.title.size,
            axis.text.size=axis.text.size,
            axis.text.angle=axis.text.angle,
            legend.text.size=legend.text.size,
            legend.title.size=legend.title.size,
            legend.key.size=legend.key.size
            )
    }
}

expression_profiles <- function(
    gene_id, result, eSet, x_var, seriesF, subset=NULL,
    colourF=result$factor, linetypeF=colourF, line.size=1.5,
    xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
    col.palette="Accent",
    col=brewer.pal(
        n=length(levels(pData(eSet)[,colourF])),
        name=col.palette),
    lty=1:length(levels(pData(eSet)[,linetypeF])),
    title=NULL, title.size=2, axis.title.size=20,
    axis.text.size=15, axis.text.angle=0,
    legend.title.size=20, legend.text.size=15,
    legend.key.size=30){
    # if the feature identifier is absent from the dataset
    if (!gene_id %in% rownames(eSet)){
        # suggest close matches if any
        matches <- agrep(
            pattern=gene_id, x=rownames(eSet), max.distance=1,
            fixed=TRUE, value=TRUE
            )
        if (length(matches) > 0){
            cat(gene_id, "not found in dataset. Did you mean:", fill=TRUE)
            return(matches)
        }
        else{
            stop(gene_id, "not found in dataset. No close match either.")
        }
    }
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the X variable requested does not exist in the sample annotations
    if (! x_var %in% colnames(pData(eSet))){
        # Return an error and stop
        stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
    }
    # If the series factor requested does not exist in the sample
    # annotations
    if (! seriesF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "seriesF: ", seriesF,
            " is not an existing column in pData(eSet)."
            )
    }
    # If the colour factor requested does not exist in the sample
    # annotations
    if (! colourF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "colourF: ", colourF,
            " is not an existing column in pData(eSet)."
            )
    }
    # If the linetype factor requested does not exist in the sample
    # annotations
    if (! linetypeF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "linetypeF: ", linetypeF,
            " is not an existing column in pData(eSet)."
            )
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # Build the title message from the combination of gene_id and gene_symbol
    title <- paste(
        gene_id,
        " = ",
        result$genes[gene_id, "external_gene_name"]
        )
    # Assemble a data frame containing the necessary information for ggplot
    df <- data.frame(
        Expression=exprs(eSet)[gene_id,],
        X=pData(eSet)[,x_var],
        Profile=pData(eSet)[,seriesF],
        LineType=pData(eSet)[,linetypeF],
        Colour=pData(eSet)[,colourF]
        )
    # Generate the plot
    gg <- ggplot(data=df) +
        geom_line(
            aes(x=X,
                y=Expression,
                group=Profile,
                linetype=LineType,
                colour=Colour
                ),
            size=line.size) +
        labs(title=title, x=xlab, y=ylab) +
        theme(
            plot.title=element_text(size=rel(title.size)),
            axis.title=element_text(size=axis.title.size),
            axis.text=element_text(size=axis.text.size),
            axis.text.x=element_text(angle=axis.text.angle),
            legend.text=element_text(size=legend.text.size), 
            legend.title=element_text(size=legend.title.size),
            plot.title=element_text(size=rel(2)),
            legend.key.size=unit(legend.key.size, "points")
            ) +
        scale_colour_manual(values=col, name=colourF) + 
        scale_linetype_manual(values=lty, name=linetypeF)
    # If a non-default ylim was given, apply it
    if (!is.null(ylim)){
        gg <- gg + scale_y_continuous(limits=ylim)
    }
    # Return the plot
    return(gg)
}

expression_profiles_symbol <- function(
    gene_symbol, result, eSet, x_var, seriesF, subset=NULL,
    colourF=result$factor, linetypeF=colourF, line.size=1.5,
    index=0, xlab=x_var, ylab="log2(cpm)", ylim=range(exprs(eSet)),
    col.palette="Accent",
    col=brewer.pal(
        n=length(levels(pData(eSet)[,colourF])), name=col.palette),
    lty=1:length(levels(pData(eSet)[,linetypeF])),
    titles=c(), title.size=2, axis.title.size=20,
    axis.text.size=15, axis.text.angle=0,
    legend.title.size=20, legend.text.size=15,
    legend.key.size=30){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the X variable requested does not exist in the sample annotations
    if (! x_var %in% colnames(pData(eSet))){
        stop("x_var: ", x_var, " is not an existing column in pData(eSet).")
    }
    # If the series factor requested does not exist in the sample
    # annotations
    if (! seriesF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "seriesF: ", seriesF,
            " is not an existing column in pData(eSet)."
            )
    }
    # If the colour factor requested does not exist in the sample
    # annotations
    if (! colourF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "colourF: ", colourF,
            " is not an existing column in pData(eSet)."
            )
    }
    # If the linetype factor requested does not exist in the sample
    # annotations
    if (! linetypeF %in% colnames(pData(eSet))){
        # Return an error and stop
        stop(
            "linetypeF: ", linetypeF,
            " is not an existing column in pData(eSet)."
            )
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # the GO_analyse result provided contains the annotation of each feature
    # identifier present in the dataset to a gene name, if any
    cat(
        "Fetching feature identifier(s) annotated to", gene_symbol, "...",
        fill=TRUE
        )
    mapping <- data.frame(
        gene_id=rownames(result$genes),
        external_gene_name=result$genes$external_gene_name,
        stringsAsFactors=FALSE
        )
    # if the gene name is absent from the mapping table
    if(!gene_symbol %in% mapping$external_gene_name){
        # suggest close matches if any
        matches <- agrep(
            pattern=gene_symbol, x=mapping$external_gene_name,
            fixed=TRUE, value=TRUE)
        # if we do have one or more close matches to the symbol
        if (length(matches) > 0){
            # list them to the user for help and stop the function
            cat(gene_symbol, "not found in dataset. Did you mean:", fill=TRUE)
            return(matches)
        }
        # if we don't have close matches in the dataset, tell the user and
        # stop the function
        else{
            stop(
                paste(
                    gene_symbol,
                    "not found in dataset. No close match either."
                    )
                )
        }
    }
    # At this stage we know the gene symbol has at least one corresponding
    # feature identifier in the Ensembl BioMart, fetch all identifier(s)
    # corresponding to that gene symbol
    gene_ids <- mapping[mapping$external_gene_name == gene_symbol, "gene_id"]
    # However, we still don't know how many of those identifiers are present
    # in the expression dataset. Remove the feature identifiers absent from
    # our dataset as we cannot plot them
    gene_ids_present <- gene_ids[gene_ids %in% rownames(eSet)]
    # If none of the feature identifiers are present in the dataset
    if (length(gene_ids_present) == 0){
        cat(
            "Feature identifiers were found for", gene_symbol, "\n",
            "but none of them were found in the expression dataset.\n",
            "Feature identifiers were:"
            )
        print(gene_ids)
        # return the number of plots printed
        return(0)
    }
    # At this stage we are finally sure that at least one of the feature
    # identifiers corresponding to the gene name are also present in the
    # expression dataset. This is the best moment to generate as many titles
    # as there are feature identifier(s) annotated to the given gene symbol
    # If the user left the default vector of titles (empty)
    if (is.null(titles)){
        # Create a smart title for each plot
        for (ensembl in gene_ids_present){
            titles <- c(titles, paste(gene_symbol, " = ", ensembl))
        }
    }
    # If the user changed the default vector of titles
    else{
        # if the number of titles does not match the number of plots
        if(length(titles) != length(gene_ids_present)){
            # return an error and stop
            stop(
                "The number of titles (", length(titles),
                ") does not match the number of plots (",
                length(gene_ids_present), ")."
                )
        }
    }
    # If there are strictly more than 1 gene id associated with the gene
    # symbol
    if (length(gene_ids_present) > 1){
        # Tell the user
        cat("Multiple gene ids found for", gene_symbol, fill=TRUE)
        cat("Indices are:", fill=TRUE)
        print(gene_ids_present)
        # if the user did not change the default index value (0)
        # the function will plot all Ensembl ids in a lattice
        if (index==0){
            # A first time user might not know that how to plot a single plot
            cat(
                "Use argument 'index=1' to plot the first gene id alone,",
                "and so on.", fill=TRUE
                )
            # Prepare a grid to plot multiple graphs while optimising the
            # number of columns and rows
            columns <- ceiling(sqrt(length(gene_ids_present)))
            # Store all the plots in a list
            plots <- list()
            for (i in seq(1,length(gene_ids_present))){
                cat("Plotting", gene_ids_present[i], fill=TRUE)
                plots[[i]] <- expression_profiles(
                    gene_id=gene_ids_present[i], result=result, eSet=eSet,
                    x_var=x_var, seriesF=seriesF, colourF=colourF,
                    linetypeF=linetypeF, line.size=line.size,
                    xlab=xlab, ylab=ylab, ylim=ylim,
                    col.palette=col.palette,
                    col=col, lty=lty, title=titles[i], title.size=title.size,
                    axis.title.size=axis.title.size,
                    axis.text.size=axis.text.size,
                    axis.text.angle=axis.text.angle,
                    legend.title.size=legend.title.size,
                    legend.text.size=legend.text.size,
                    legend.key.size=legend.key.size
                    )
            }
            # Plot all the graphs in the optimised lattice, using the
            # feature-based plotting function
            multiplot(plotlist=plots, cols=columns)
            return(gene_ids_present)
        }
        # If the user gave a non-zero index
        else{
            # If the index is out of bound
            if (abs(index) > length(gene_ids_present)){
                # Return an error
                print("Index is out of bound.")
                cat("Indices are:", fill=TRUE)
            }
            # If the index is acceptable
            else{
                # Plot the corresponding graph
                cat("Plotting", gene_ids_present[index], fill=TRUE)
                expression_profiles(
                    gene_id=gene_ids_present[index], result=result, eSet=eSet,
                    x_var=x_var, seriesF=seriesF, colourF=colourF,
                    linetypeF=linetypeF, line.size=line.size,
                    xlab=xlab, ylab=ylab, ylim=ylim,
                    col.palette=col.palette,
                    col=col, lty=lty, title=titles[i], title.size=title.size,
                    axis.title.size=axis.title.size,
                    axis.text.size=axis.text.size,
                    axis.text.angle=axis.text.angle,
                    legend.title.size=legend.title.size,
                    legend.text.size=legend.text.size,
                    legend.key.size=legend.key.size
                    )
            }
        }
    }
    # If there is a unique gene id associated to the gene symbol
    else{
        cat("Unique gene id found for", gene_symbol, fill=TRUE)
        cat("Plotting", gene_ids_present, fill=TRUE)
        expression_profiles(
            gene_id=gene_ids_present, result=result, eSet=eSet,
            x_var=x_var, seriesF=seriesF, colourF=colourF,
            linetypeF=linetypeF, line.size=line.size,
            xlab=xlab, ylab=ylab, ylim=ylim,
            col.palette=col.palette,
            col=col, lty=lty, title=titles[i], title.size=title.size,
            axis.title.size=axis.title.size,
            axis.text.size=axis.text.size,
            axis.text.angle=axis.text.angle,
            legend.title.size=legend.title.size,
            legend.text.size=legend.text.size,
            legend.key.size=legend.key.size
            )
    }
}

heatmap_GO <- function(
    go_id, result, eSet, f=result$factor, subset=NULL, gene_names=TRUE,
    NA.names=FALSE, margins=c(7 ,5),
    scale="none", cexCol=1.2, cexRow=0.5, 
    labRow=NULL,
    cex.main=1, trace="none", expr.col=bluered(75), 
    row.col.palette="Accent",
    row.col=c(),
    main=paste(
        go_id, result$GO[result$GO$go_id == go_id,"name_1006"]
        ),
    main.Lsplit=NULL,
    ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("factor","GO","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the GO identifier is not present in the results
    if (! go_id %in% result$GO$go_id){
        # Return an error and stop
        stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
    }
    # Subset the data to the given values of the given factors, if existing
    if (!is.null(subset)){
        eSet <- subEset(eSet=eSet, subset=subset)
    }
    # Also update the RowSideColors of the heatmap.2 to a shorter vector
    # because the number of samples has been reduced
    # NOTE: if the user wants to provide his own color array he should
    # either give the exact number of colors as he expects in the
    # subsetted ExpressionSet, or rather use the row.col.palette 
    # argument to provide the palette, rather than the array of colors
    if (length(row.col) != ncol(eSet)){
        row.col <- brewer.pal(
            n = length(unique(pData(eSet)[,f])),
            name = row.col.palette
            )
    }
    # Fetch the list of genes associated with the go_id
    gene_ids <- list_genes(go_id=go_id, result=result, data.only=TRUE)
    # Fetch and format the expression data for those genes
    genes_expr <- t(exprs(eSet)[gene_ids,])
    # Rows are samples, if the user didn't provide custom labels
    if (is.null(labRow)){
        # label them according to the user's choson factor
        labRow <- pData(eSet)[,f]
    }
    else{
        # If the user provided row labels, they have to match the number of
        # sample
        # except if length is 1, then assume it is a factor from phenoData
        if (length(labRow) == 1){
            labRow = pData(eSet)[,labRow]
        }
        else if (length(labRow) != ncol(eSet)){
            stop(
                "The number of custom row labels provided (",
                length(labRow),
                ") does not match the number of samples (",
                ncol(eSet), ".")
        }
    }
    # Columns are features, label them by identifier or name
    if (gene_names){
        gene_labels <- result$genes[gene_ids, "external_gene_name"]
        # Default: if some feature identifiers have no associated gene name
        # fill the blanks with the feature identifier
        if (any(gene_labels == '') & !NA.names){
            gene_labels[gene_labels == ''] <- gene_ids[gene_labels == '']
        }
        # if the user prefers to leave blank gene names, do nothing
    }
    else{
        gene_labels <- gene_ids
    }
    # If requested, split the main title to lines with fewer than a given
    # count of characters, while respecting space-separated words
    if (!is.null(main.Lsplit)){
        if (is.numeric(main.Lsplit)){ 
            main <- string_Lsplit(string=main, line.length=main.Lsplit)
        }
        else{
            stop("main.Lsplit should be a numeric value or NULL.")
        }
    }
    # A vector detailing the color of each sample must be prepared
    samples.col <- row.col[as.factor(pData(eSet)[,f])]
    # Save the current plotting parameters
    op <- par(no.readonly=TRUE)
    # whatever happens, restore original setting when leaving function
    on.exit(par(op))
    # Change the font size of the title
    par(cex.main=cex.main)
    # Plot the heatmap of the data
    heatmap.2(
        genes_expr, labRow=labRow, labCol=gene_labels,
        scale=scale, cexCol=cexCol, cexRow=cexRow, main=main,
        trace=trace, RowSideColors=samples.col, col=expr.col,
        margins = margins, ...)
}

hist_scores <- function(
    result,
    main=paste(
        "Distribution of average scores in",
        deparse(substitute(result))), xlab="Average score",
    ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("GO") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    hist(result$GO$ave_score, main=main, xlab=xlab, ...)
}

list_genes <- function(go_id, result, data.only=TRUE){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("mapping","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the go_id requested is not present in the dataset
    if (!go_id %in% result$mapping$go_id){
        # return an error and stop
        stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
    }
    # List of gene_ids annotated to the GO term
    gene_ids <- result$mapping[result$mapping$go_id == go_id,"gene_id"]
    # BioMart can have more genes associated with the GO term
    # than the number actually present in the dataset
    if (data.only){
        # Filter out those not in the dataset
        gene_ids <- gene_ids[gene_ids %in% rownames(result$genes)]
    }
    # return the list of gene_ids associated with it
    return(gene_ids)
}

overlap_GO <- function(go_ids, result, filename=NULL, mar=rep(0.1, 4), ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("GO") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # Check that all GO terms are present in the result variable
    if (!all(go_ids %in% result$GO$go_id)){
        stop("Some go_id(s) do not exist in result$GO$go_id.")
    }
    # Check that 2-5 GO terms are given, otherwise venn.diagram would crash
    if (length(go_ids) > 5){
        stop("venn.diagram() supports at most 5 groups: Too many given.")
    }
    if (length(go_ids) < 2){
        stop("venn.diagram() requires at least 2 groups: Too few given.")
    }
    # Creates a list of 2-5 gene lists to compare
    gene_sets <- list()
    for (index in 1:length(go_ids)){
        gene_sets[[index]] <- list_genes(go_id=go_ids[[index]], result=result)
    }
    # generate the venn diagram (potentially to a file)
    venn <- venn.diagram(
        x=gene_sets, filename=filename, category.names=go_ids, mar=mar,
        ...)
    # If no filename was given
    if (is.null(filename)){
        # Erases the current window if any
        grid.newpage()
        # Print the venn diagram to the screen
        grid.draw(venn)
    }
}

plot_design <- function(
    go_id, result, eSet,
    factors=colnames(pData(eSet)), main="", main.Lsplit=NULL, ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("GO") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the GO identifier is not present in the results
    if (! go_id %in% result$GO$go_id){
        # Return an error and stop
        stop("go_id: ", go_id, " was not found in result$GO$go_id.")
    }
    # if the user changed the default value
    # check that all given factors exist in colnames(eSet)
    if (any(factors != colnames(pData(eSet)))){
        for(f in factors){
            if (!f %in% colnames(pData(eSet))){
                # Otherwise, stop and return which of them is not a valid
                # factor name
                stop(f, " is not an existing column in pData(eSet).")
            }
        } 
    }
    # Fetch the list of genes associated with the go_id
    # If the user gave the output of a GO_analyse command as result=
    # that list contains the mapping between feature and GO_id
    gene_ids_present <- list_genes(
        go_id=go_id, result=result, data.only=TRUE
        )
    GO_name <- result$GO[result$GO$go_id == go_id, "name_1006"]
    # Prepare a temporary data frame plot.design-friendly
    df <- data.frame(
        t(exprs(eSet)[gene_ids_present,]),
        pData(eSet)[,factors]
        )
    # If no custom title was given
    if (main == ""){
        # Generate a smart one (careful: the same title will be used for all
        # genes in the GO term)
        # Smart title is the name of the GO term
        main <- paste(go_id, GO_name)
        # If requested, split the main title to lines with fewer than a given
        # count of characters, while respecting space-separated words
        if (!is.null(main.Lsplit)){
            if (is.numeric(main.Lsplit)){ 
                main <- string_Lsplit(string=main, line.length=main.Lsplit)
            }
            else{
                stop("main.Lsplit should be a numeric value or NULL.")
            }
        }
    }
    # Perform a plot.design of all the genes in the data frame (= in the GO
    # term and in the dataset)
    plot.design(df, main=main, ...)
}

pValue_GO = function(
    result, N=1000, ranked.by=result$rank.by, rank.by='P',
    FUN.GO=result$FUN.GO)
    {
    if(length(ranked.by) != 1){
        stop('Invalid ranked.by argument. Choose one ranking mathod.')
    }
    if(length(rank.by) != 1){
        stop('Invalid rank.by argument. Choose one ranking mathod.')
    }
    if (! ranked.by %in% c('Rank','Score','rank','score')){
        stop('Invalid ranked.by argument. See man page for details.')
    }
    if (ranked.by == 'rank'){
        ranked.by <- 'Rank'
    }
    if (ranked.by == 'score'){
        ranked.by <- 'Score'
    }
    if (! rank.by %in% c('Rank','Score','rank','score','p','P', 'p.val')){
        stop('Invalid ranked.by argument. See man page for details.')
    }
    if (rank.by == 'rank'){
        ranked.by <- 'Rank'
    }
    else if (rank.by == 'score'){
        ranked.by <- 'Score'
    }
    else if(rank.by %in% c('p', 'P')){
        rank.by <- 'p.val'
    }
    # Create a reference copy of the ontology scores
    real.go <- result$GO[,c('go_id', 'ave_rank', 'ave_score')]
    # Prepare a counter for p-value
    real.go$p.count <- 0
    # Fetch all the genes with expression data and their rank/score
    random.genes <- result$genes[,c('Score', 'Rank')]
    # Add the annotated genes absent from the expression data with 0 score
    # and max rank (just like in GO_analyse)
    all.annotated.genes <- unique(result$mapping$gene_id)
    NA.genes <- all.annotated.genes[
        ! all.annotated.genes %in% rownames(random.genes)
        ]
    random.genes <- rbind(
        random.genes,
        data.frame(
            row.names = NA.genes,
            Score=rep(0, length(NA.genes)),
            Rank=rep(nrow(random.genes)+1, length(NA.genes))))
    # For each iteration
    for(i in 1:N){
        # Progress bar
        progress.bar(x = i, max = N)
        # Randomise the gene order using the latest shuffled copy,
        # alternatively re-use the original copy every time?)
        rownames(random.genes) <- sample(
            x = rownames(random.genes),
            size = nrow(random.genes),
            replace = F) # replace = F is default, but let's be clear
        # Merge the randomised genes with the table of ontologies
        random.genes2GO <- merge(
            x = random.genes, y = result[['mapping']],
            by.x = 'row.names', by.y = 'gene_id',
            all.y = T, sort = F)
        # compute the new average rank/score
        if(ranked.by == 'Rank'){
            # Replace NAs (annotated genes without expression data)
            # by max rank + 1
            random.genes2GO$Rank[is.na(random.genes2GO$Rank)] <-
                max(random.genes2GO$Rank, na.rm = T) + 1
            # Calculate the randomised average rank
            random.aveRank <- aggregate(
                Rank ~ go_id, data = random.genes2GO, FUN = FUN.GO
                )
            # Combine the real and randomised results for comparison
            # (ignore ontologies with 0 terms,
            # these will be given p-value of 1 later)
            compare.aveRanks <- merge(
                x = real.go, y = random.aveRank,
                by = 'go_id', sort = F,
                all = F) # all=F is default, but let's be clear
            # get the list of genes where random is better than real
            worst.GO <- compare.aveRanks$go_id[
                compare.aveRanks$Rank <= compare.aveRanks$ave_rank
                ]
            # for all the above, add 1 to the p.count
            real.go$p.count[real.go$go_id %in% worst.GO] <-
                real.go$p.count[real.go$go_id %in% worst.GO] + 1     
        } else if(ranked.by == 'Score'){
            # Replace NAs (annotated genes without expression data) by score 0
            random.genes2GO$Score[is.na(random.genes2GO$Score)] <- 0
            # Calculate the randomised average rank
            random.aveScore <- aggregate(
                Score ~ go_id, data = random.genes2GO, FUN = FUN.GO
                )
            # Combine the real and randomised results for comparison
            # (ignore ontologies with 0 terms,
            # these will be given p-value of 1 later)
            compare.aveScore <- merge(
                x = real.go, y = random.aveScore,
                by = 'go_id', sort = F,
                all = F) # all=F is default, but let's be clear
            # get the list of ontologies where random is better than real
            worst.GO <- compare.aveScore$go_id[
                compare.aveScore$Score <= random.aveScore$ave_score
                ]
            # for all the above, add 1 to the p.count
            real.go$p.count[real.go$go_id %in% worst.GO] <-
                real.go$p.count[real.go$go_id %in% worst.GO] + 1     
        }
    }
    # Divide the total p.count by the number of iterations (1)
    real.go$p.val <- real.go$p.count / N
    # Keep only the go_id and p.val columns for merging with the GO table
    real.go <- real.go[,c('go_id', 'p.val')]
    # Merge the p.val column with the GO table of the GO_analyse output
    GO.new <- merge(
        x = result$GO, y = real.go,
        by = 'go_id', all.x = T, sort = F)
    # Set the p-value to 1 for ontologies not annotated with any gene
    GO.new$p.val[!GO.new$go_id %in% unique(result$mapping$go_id)] <- 1
    # reorder columns
    GO.new <- GO.new[,c(
        "go_id","ave_rank","ave_score","total_count","data_count","p.val",
        "name_1006","namespace_1003")]
    # Replace the GO table in the GO_analyse output
    result$GO <- GO.new
    # Use the rerank function to reorder according to user choice
    result <- rerank(result=result, rank.by=rank.by)
    # Add/update a slot in the GO_analyse output that states the number of
    # iterations used to compute the p-value
    result$p.iterations <- N
    # return the update GO_analyse output
    return(result)
}

quantiles_scores <- function(
    result, probs=c(0.9, 0.95, 0.99, 0.999, 0.9999), quartiles=FALSE
    ){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("GO") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If user changes to "quartiles=TRUE" then the following will be true
    if (quartiles){
        quantile(x=result$GO$ave_score)
    }
    # Default is to give range of top 10%, 5%, 1%, 0.1% and 0.01%
    else {
        quantile(x=result$GO$ave_score, probs=probs)
    }
}

rerank <- function(result, rank.by = 'rank'){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("GO","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If p-value filter was requested, check that the p.value step was indeed
    # performed
    if ('p.val' %in% rank.by){
        if (! 'p.val' %in% colnames(result$GO)){
            stop('"p.val" column absent from result$GO.
    Is it a pValue_GO() output?')
        }
    }
    # Reorder the GO and gene tables accordin to the user's choice
    if (rank.by == "rank"){
        result$GO <- result$GO[order(result$GO$ave_rank),]
        result$genes <- result$genes[order(result$genes$Rank),]
    }
    else if (rank.by == "score"){
        result$GO <- result$GO[order(result$GO$ave_score, decreasing=TRUE),]
        result$genes <- result$genes[
            order(
                result$genes$Score, decreasing=TRUE
                ),
            ]
    } else if (rank.by == "p.val"){
        result$GO <- result$GO[order(result$GO$p.val),]
    }
    else{
        stop("Invalid ranking method: ", rank.by)
    }
    # update the rank.by slot
    result$rank.by <- rank.by
    return(result)
}

subEset <- function(eSet, subset=list()){
    # subset should be a list of factor names with the associated values
    # to retain for that factor
    if (!class(subset) == "list"){
        stop("'subset=' argument should be a list.")
    }
    if (length(subset) > 0){
        for (f_filter in names(subset)){
            # Check that the name of the list item is a column name in
            # the phenodata
            if (!f_filter %in% colnames(pData(eSet))){
                stop(f_filter, " is not a valid column in pData(eSet).")
            }
            if (length(subset[[f_filter]]) == 0){
                stop("No value provided for filter ", f_filter)
            }
            for (v_filter in subset[[f_filter]]){
                # Check that the value is an existing value 
                # in the phenodata
                if(!v_filter %in% pData(eSet)[,f_filter]){
                    stop(
                        v_filter,
                        " is not a valid value of eSet$",
                        f_filter
                        )
                }
            }
            # at this stage, column and values exist
            # Subset the eSet to the
            eSet <- eSet[,pData(eSet)[,f_filter] %in% subset[[f_filter]]]
            # Update the factor levels
            if ("factor" %in% class(pData(eSet)[,f_filter])){
                pData(eSet)[,f_filter] <- factor(pData(eSet)[,f_filter])
            }
        }
    }
    else{
        message("Empty list of filters given. Returning the original eSet")
    }
    return(eSet)
}

subset_scores <- function(result, ...){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c('GO', 'mapping', 'genes') %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # Save the list of filter and value for easier referencing
    filters <- list(...)
    # If p-value filter was requested, check that the p.value step was indeed
    # performed
    if ('p.val' %in% names(filters)){
        if (! 'p.val' %in% colnames(result$GO)){
            stop('"p.val" column absent from result$GO.
    Is it a pValue_GO() output?')
        }
    }
    # Deal with supported synonyms
    names(filters) <- replace(
        names(filters), names(filters) == 'data', 'data_count')
    names(filters) <- replace(
        names(filters), names(filters) == 'total', 'total_count')
    names(filters) <- replace(
        names(filters), names(filters) == 'rank', 'ave_rank')
    names(filters) <- replace(
        names(filters), names(filters) == 'score', 'ave_score')
    names(filters) <- replace(
        names(filters), names(filters) == 'namespace', 'namespace_1003')
    names(filters) <- replace(
        names(filters), names(filters) == 'P', 'p.val')
    names(filters) <- replace(
        names(filters), names(filters) == 'p', 'p.val')
    # Create/update a "filter.GO" slot in the result object
    if (is.null(result$filters.GO)){
        result$filters.GO <- list()
    }
    # prepares a table where the filtering results will be saved
    filtered <- data.frame(row.names=result$GO$go_id)
    # For each filter (careful about the direction of testing)
    for (filter in names(filters)){
        # Save the filter status of each row for this filter
        # Also: warn the user if applying conflicting filters to a previously
        # filtered object
        # All filters for superiror/equal values
        if (filter %in% c('total_count', 'data_count', 'ave_score')){
            # If the filter was not applied yet, add it
            if (!filter %in% names(result$filters.GO)){
                result$filters.GO[[filter]] <- filters[[filter]]
            }
            # If the filter was applied already on the result object
            else{
                # if the result object was already filtered for an
                # equal or larger value
                if (result$filters.GO[[filter]] >= filters[[filter]]){
                    warning(
                        'result object was already filter for an equal or ',
                        'higher cutoff of filter ', filter, ': ',
                        result$filters.GO[[filter]], '. Ignoring filter.'
                        )
                    next
                }
                # if filtering for a higher value than previously
                else{
                    # update the filter
                    result$filters.GO[[filter]] <- filters[[filter]]
                }
            }
            filtered[,filter] <- result$GO[,filter] >= filters[[filter]]
        }
        # All filters for superiror/equal values
        else if (filter %in% c('ave_rank', 'p.val')){
            # If the filter was not applied yet, add it
            if (!filter %in% names(result$filters.GO)){
                result$filters.GO[[filter]] <- filters[[filter]]
            }
            # If the filter was applied already on the result object
            else{
                # if the result object was already filtered for an
                # equal or lower value
                if (result$filters.GO[[filter]] <= filters[[filter]]){
                    warning(
                        'result object was already filter for an equal or ',
                        'lower cutoff of filter ', filter, ': ',
                        result$filters.GO[[filter]], '. Ignoring filter.'
                        )
                    next
                }
                # if filtering for a lower value than previously
                else{
                    # update the filter
                    result$filters.GO[[filter]] <- filters[[filter]]
                }
            }
            filtered[,filter] <- result$GO[,filter] <= filters[[filter]]
        }
        ## Filters on the namespace of the GO term
        else if (filter == 'namespace_1003'){
            # If the filter was not applied yet, add it
            if (!filter %in% names(result$filters.GO)){
                result$filters.GO[[filter]] <- filters[[filter]]
            }
            # If the filter was applied already on the result object
            else{
                # if filtering for the same ontology
                if (result$filters.GO[[filter]] == filters[[filter]]){
                    # ignore the 
                    note(
                        'result object was already filter for the same',
                        'namespace.'
                        )
                    next
                }
                # if filtering for a different ontology:
                # the new one is not present anyway
                else{
                    stop(
                        'result object was already filter for a different ',
                        'namespace: ', result$filters.GO[[filter]],
                        '. No more data for namespace: ', filters[[filter]]
                        )
                }
            }
            # GO namespace filtering should offer shortcuts
            if (filters[[filter]] %in% c('biological_process', 'BP')){
                filtered[,filter] <- result$GO$namespace_1003 ==
                    'biological_process'
            }
            else if (filters[[filter]] %in% c('molecular_function', 'MF')){
                filtered[,filter] <- result$GO$namespace_1003 ==
                    'molecular_function'
            }
            else if (filters[[filter]] %in% c('cellular_component', 'CC')){
                filtered[,filter] <- result$GO$namespace_1003 ==
                    'cellular_component'
            }
            else{
                stop(
                    "Valid values for filter ", filter, " are ",
                    "'biological_process', 'BP',",
                    "'molecular_function', 'MF',",
                    "'cellular_component', and'CC'."
                    )
            }
        }
        ## other filter names cause an error
        else{
            stop(filter, " is not a valid filter.")
        }
    }
    # Only the rows passing all filters will be kept
    # (all test summarised in one yes if all test are passed)
    filtered$merge <- apply(X=filtered, MARGIN=1, FUN=all)
    # Filter the different slots of results to remain consistent between them
    # and save memory
    ## Subset the score table according to the above filters
    result$GO <- result$GO[filtered$merge,]
    ## Subset the gene/GO mapping to keep only the GO terms left in the score
    ## table
    result$mapping <- result$mapping[
        result$mapping$go_id %in% result$GO$go_id,
        ]
    ## Subset the anova table to keep only the genes annotated to the genes
    ## left in the mapping table
    result$genes <- result$genes[
        rownames(result$genes) %in% result$mapping$gene_id,
        ]
    return(result)
}

table_genes <- function(go_id, result, data.only=FALSE, order.by='rank'){
    # if the result provided does not contain the slots required for this
    # function
    if (! all(c("mapping","genes") %in% names(result))){
        stop("'result=' argument misses required slots.
    Is it a GO_analyse() output?")
    }
    # If the go_id requested is not present in the dataset
    if (!go_id %in% result$mapping$go_id){
        # return an error and stop
        stop("go_id: ", go_id, " was not found in result$mapping$go_id.")
    }
    # Otherwise fetch the list of gene_ids associated with it
    gene_ids <- list_genes(go_id=go_id, result=result, data.only=data.only)
    # Then fetch the results of those genes
    res_table <- result$genes[gene_ids,]
    # If gene absent from dataset were also requested (data.only=FALSE)
    # Those do not have analysis results and therefore return NA
    # Leave NAs for the results, but put their name again as the row name
    rownames(res_table) <- gene_ids
    # Sorting
    if (order.by %in% c('rank', 'score')){
        res_table <- res_table[order(res_table$Rank),]
    }
    else if (order.by == 'gene_id'){
        res_table <- res_table[order(rownames(res_table)),]
    }
    else if (
        order.by %in%
            c('name', 'external_gene_name','external_gene_id')
        ){
        res_table <- res_table[order(res_table$external_gene_name),]
    }
    # Return the information for all those genes
    return(res_table)
}
kevinrue/GOexpress-release documentation built on May 20, 2019, 9:08 a.m.