R/exportPathway.R

Defines functions exportPathway

Documented in exportPathway

#' Export the functions of a SQM object into KEGG pathway maps
#'
#' This function is a wrapper for the pathview package (Luo \emph{et al.}, 2017. \emph{Nucleic acids research}, 45:W501-W508). It will generate annotated KEGG pathway maps showing which reactions are present in the different samples. It will also generate legends with the color scales for each sample in separate png files. 
#'
#' @param SQM A SQM, SQMbunch or SQMlite object.
#' @param pathway_id character. The five-number KEGG pathway identifier. A list of all pathway identifiers can be found in \url{https://www.genome.jp/kegg/pathway.html}.
#' @param count character. Either \code{"abund"} for raw abundances, \code{"percent"} for percentages, \code{"bases"} for raw base counts, \code{"tpm"} for TPM normalized values or \code{"copy_number"} for copy numbers (default \code{"copy_number"}). Note that a given count type might not available in this object (e.g. TPM or copy number in SQMlite objects originating from a SQM reads project).
#' @param samples character. An optional vector with the names of the samples to export. If absent, all samples will be exported (default \code{NULL}).
#' @param split_samples logical. Generate a different output file for each sample (default \code{FALSE}).
#' @param sample_colors character. An optional vector with the plotting colors for each sample (default \code{NULL}).
#' @param log_scale logical. Use a base 10 logarithmic transformation for the color scale. Will have no effect if \code{fold_change_groups} is provided (default \code{FALSE}).
#' @param fold_change_groups list. An optional list containing two vectors of samples. If provided, the function will generate a single plot displaying the log2 fold-change between the average abundances of both groups of samples ( log(second group / first group) ) (default \code{NULL}).
#' @param fold_change_colors character. An optional vector with the plotting colors of both groups in the fold-change plot. Will be ignored if \code{fold_change_group} is not provided. 
#' @param max_scale_value numeric. Maximum value to include in the color scale. By default it is the maximum value in the selected samples (if plotting abundances in samples) or the maximum absolute log2 fold-change (if plotting fold changes) (default \code{NULL}).
#' @param color_bins numeric. Number of bins used to generate the gradient in the color scale (default \code{10}).
#' @param output_suffix character. Suffix to be added to the output files (default \code{"pathview"}).
#' @param output_dir character. Directory in which to write the output files (default \code{"."}).
#' @return No return value, but Pathview figures are produced in the current working directory.
#' @seealso \code{\link{plotFunctions}} for plotting the most functions taxa of a SQM object.
#' @examples
#' \donttest{
#' data(Hadza)
#' 
#' exportPathway(Hadza, "00910", count = 'copy_number',
#'               output_dir = tempdir(),
#'               output_suffix = "nitrogen_metabolism",
#'               sample_colors = c("red", "blue"))
#' exportPathway(Hadza, "00250", count = 'tpm',
#'               output_dir = tempdir(),
#'               output_suffix = "ala_asp_glu_metabolism_FoldChange", 
#'               fold_change_groups = list(c("H1"), c("H12")), max_scale_value=2)
#'}
#' @importFrom graphics plot rasterImage text
#' @importFrom grDevices colorRampPalette dev.off png
#' @export
exportPathway = function(SQM, pathway_id, count = 'copy_number', samples = NULL, split_samples = FALSE, sample_colors = NULL, log_scale = FALSE, fold_change_groups = NULL, fold_change_colors = NULL, max_scale_value = NULL, color_bins = 10, output_dir = '.', output_suffix = 'pathview')
    {
    ### Check params.
    if(!inherits(SQM, c('SQM', 'SQMbunch', 'SQMlite'))) { stop('The first argument must be a SQM or a SQMlite object') }
    if(!count %in% c('abund', 'percent', 'bases', 'tpm', 'copy_number'))
        {
        stop('count must be "abund", "percent", "bases", "tpm" or "copy_number"')
        }
    if( count == 'bases' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        stop('There are no tables containing aggregated bases per function in your project, possibly because it was it was generated by sqm_reads.pl')
        }
    if( count == 'tpm' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        stop('There are no tables containing TPM per function in your project, possibly because it was it was generated by sqm_reads.pl')
        }
    if( count == 'copy_number' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        warning('There are no copy number tables in your project, possibly because the single copy genes were not present in the metagenome or the project was generated by sqm_reads.pl. Will use "count=\'percent\' instead".')
        count = 'percent'
        }
    if(!is.null(fold_change_groups))
        {
        if(length(fold_change_groups)!=2 | typeof(fold_change_groups)!='list')
            {
            stop('fold_change_groups must be a length 2 list')
            }
        if(any(!fold_change_groups[[1]] %in% SQM$misc$samples) | any(!fold_change_groups[[2]] %in% SQM$misc$samples))
            {
            stop('The vectors withing fold_change_groups must contain only valid sample names')
            }
        }
    if(is.null(fold_change_colors)) { fold_change_colors = c('red', 'green') }
    if(!is.null(max_scale_value) & !is.numeric(max_scale_value)) { stop('max_scale_value must be numeric') }
    if(!is.null(color_bins) & !is.numeric(color_bins)) { stop('color_bins must be numeric') }
    color_bins = ceiling(color_bins) # Make sure it is a positive integer.

    check.samples(SQM, samples)
 
    if(count %in% c('percent', 'copy_number')) { pseudocount = 0.001 } else { pseudocount = 1 }
 
    ### Select data matrix.
    if(count=='percent') { mat = 100 * t(t(SQM$functions$KEGG$abund) / colSums(SQM$functions$KEGG$abund))
    } else { mat = SQM$functions$KEGG[[count]] }

    if(ncol(mat) > 24)
        {
        warning('We\'ve found that pathview fails when trying to display more than 24 different items, so unless you are grouping your samples with fold_change_groups this will likely not work')
        }
    ### Do stuff.
    if(!is.null(samples)) { mat = mat[,samples,drop=FALSE] }

    if(!is.null(sample_colors))
        {
        if(length(sample_colors) != ncol(mat)) { stop('The number of samples does not correspond with the number of sample colors') }
    } else { sample_colors = rep('red', ncol(mat)) }

    # Taken from https://support.bioconductor.org/p/58110/
    # Download and parse pathway info.
    pathview::download.kegg(pathway.id = pathway_id, species = 'ko', kegg.dir = output_dir)
    xml.file = sprintf('%s/ko%s.xml', output_dir, pathway_id)
    node.data = pathview::node.info(xml.file)
    # Map our data.
    plot.data.gene = pathview::node.map(mol.data=mat, node.data, node.types="ortholog", entrez.gnodes=FALSE)
     
    # pathview::node.map also added the abundances of the different KOs mapping to the same reaction, and assigned it to one of the KOs (the first one?)
    # So plot.data.gene is missing KOs, but has the abundances for each reaction right. We will use that data.
    submat = as.matrix(plot.data.gene[,colnames(mat),drop=FALSE])
    submat[is.na(submat)] = 0

    zeros = submat==0

    ### Do we have to calculate log2FC or not?
    if(is.null(fold_change_groups))
        {
        # Change to log-scale if required.
        if(log_scale) { submat = log(submat+pseudocount, 10) }
    } else
        {
        # Calculate fold change and overwrite submat and plot.data.gene.
        submat = submat + pseudocount
        log2FC = log(rowMeans(submat[,fold_change_groups[[2]],drop=FALSE]) / rowMeans(submat[,fold_change_groups[[1]],drop=FALSE]), 2)
        submat = cbind(submat[,0], log2FC = log2FC)
	zeros = submat==0
	plot.data.gene = cbind(plot.data.gene[,!colnames(plot.data.gene) %in% SQM$misc$samples], log2FC = log2FC)
	}

    ### Set the maximum and minimum scale values.
    if(is.null(fold_change_groups))
        {
        if(is.null(max_scale_value)) { max_scale_value = max(submat)
        } else { if(log_scale) { max_scale_value = log(max_scale_value, 10) } }
        min_scale_value = min(submat)
    } else
        {
        if(is.null(max_scale_value)) { max_scale_value = max(abs(submat)) }
	min_scale_value = -max_scale_value # make it symmetric.
        }
    
    # Turn into discrete color indices.
    breaks = seq(min_scale_value, max_scale_value, length.out=color_bins)
    submat_color = matrix(findInterval(submat, breaks, all.inside=T), nrow=nrow(submat), dimnames=dimnames(submat)) # The lowest index in the interval is 1. This contains zeros as well as low values.
    if(is.null(fold_change_groups))
        {
        submat_color[zeros] = 0
        submat_color = submat_color + 1 # Now the lowest index is 1 (white colour), but it contains only the zeros. Positive values will be from 2 onwards.
        }

    # Customize colors for each sample (or for the logFC) and create legends.
    cols.ts.gene = matrix(NA, nrow=nrow(submat), ncol=ncol(submat), dimnames = dimnames(submat))
    bg.col = "#FFFFFF"
    nice_label = c(abund='Raw abundance', percent = 'Percentage', bases='Bases', tpm='TPM', copy_number='Copy number')[count]
    for(i in 1:ncol(cols.ts.gene))
        {
        if(is.null(fold_change_groups))
            {
            gradient = colorRampPalette(c(bg.col, sample_colors[i]))(color_bins+1) # (color_bins+1) bc the first color will be always white, we want it to be used only for pure zeros.
            if(!log_scale) { true_breaks = breaks } else { true_breaks = 10 ** breaks; true_breaks[1] = 0 } # Breaks for the legend text
        } else 
            {
            gradient = colorRampPalette(c(fold_change_colors[1], bg.col, fold_change_colors[2]))(color_bins)
            true_breaks = breaks # Breaks for the legend text
            }
        cols.ts.gene[,i] = gradient[submat_color[,i]]
        filename = sprintf('%s/ko%s.%s.%s.legend.png', output_dir, pathway_id, output_suffix, colnames(cols.ts.gene)[i])
        message(sprintf('Info: Writing legend file %s\n', filename))
        png(filename)
        plot(c(0,2),c(0,1),type = 'n', axes = FALSE, xlab = '', ylab = '', main = sprintf('%s - %s', colnames(cols.ts.gene)[i], nice_label[count]))
        text(x=1.5, y = seq(0,1,l=color_bins), labels = signif(true_breaks,3))
        rasterImage(rev(gradient), 0, 0, 1,1)
        dev.off()
        }
    cols.ts.gene[zeros] = bg.col # In log2FC plots, if we have an even number of color bins, rxns with zero FC (i.e. absent rxns) will not be exactly white.

    ### PLOT
    thiswd = getwd()
    on.exit(setwd(thiswd))
    setwd(output_dir)
    # Are reactions represented as arrows in this map?? If so, keggview.native may fail
    if('line' %in% unique(node.data$shape[node.data$type=='ortholog']))
        {
        warning('This map uses lines to represent reactions, and we\'ve had isues using keggview native plot in this case.
                 We will switch to graph mode instead')
	parseKGML2Graph2 = utils::getFromNamespace("parseKGML2Graph2", "pathview") 
        gR1 = parseKGML2Graph2(xml.file, genes = FALSE, 
                               expand = FALSE, split.group = FALSE)
        plot.data.cpd=pathview::node.map(NULL, node.data, node.types="compound")
	plot.data.cpd$labels=pathview::cpdkegg2name(plot.data.cpd$labels)[,2]
	mapped.cnodes=rownames(plot.data.cpd)
	node.data$labels[mapped.cnodes]=plot.data.cpd$labels
        pathview::keggview.graph(plot.data.gene = plot.data.gene,
                                 cols.ts.gene = cols.ts.gene, node.data=node.data,
                                 pathway.name = sprintf('ko%s', pathway_id),
                                 kegg.dir = output_dir,
				 path.graph = gR1, map.cpdname = TRUE,
				 cex = 0.15,
                                 same.layer = TRUE, plot.col.key = FALSE, multi.state=!split_samples, out.suffix = output_suffix)
    } else {
        pathview::keggview.native(plot.data.gene = plot.data.gene,
                                 cols.ts.gene = cols.ts.gene, node.data=node.data,
                                 pathway.name = sprintf('ko%s', pathway_id),
                                 kegg.dir = output_dir,
                                 same.layer = TRUE, plot.col.key = FALSE, multi.state=!split_samples, out.suffix = output_suffix)
        }
    file.remove(sprintf('%s/ko%s.png', output_dir, pathway_id))
    file.remove(sprintf('%s/ko%s.xml', output_dir, pathway_id))
    return(invisible(NULL)) 
    }

Try the SQMtools package in your browser

Any scripts or data that you put into this service are public.

SQMtools documentation built on April 3, 2025, 6:16 p.m.