R/exportPathway.R
In SQMtools: Analyze Results Generated by the 'SqueezeMeta' Pipeline

Documented in exportPathway

require(ggplot2)
#' Export the functions of a SQM object into KEGG pathway maps
#'
#' This function is a wrapper for the pathview package (Luo \emph{et al.}, 2017. \emph{Nucleic acids research}, 45:W501-W508). It will generate annotated KEGG pathway maps showing which reactions are present in the different samples. It will also generate legends with the color scales for each sample in separate png files. 
#'
#' @param SQM A SQM, SQMbunch or SQMlite object.
#' @param pathway_id character. The five-number KEGG pathway identifier. A list of all pathway identifiers can be found in \url{https://www.genome.jp/kegg/pathway.html}.
#' @param count character. Either \code{"abund"} for raw abundances, \code{"percent"} for percentages, \code{"bases"} for raw base counts, \code{"tpm"} for TPM normalized values or \code{"copy_number"} for copy numbers (default \code{"copy_number"}). Note that a given count type might not available in this object (e.g. TPM or copy number in SQMlite objects originating from a SQM reads project).
#' @param samples character. An optional vector with the names of the samples to export. If absent, all samples will be exported (default \code{NULL}).
#' @param split_samples logical. Generate a different output file for each sample (default \code{FALSE}).
#' @param sample_colors character. An optional vector with the plotting colors for each sample (default \code{NULL}).
#' @param log_scale logical. Use a base 10 logarithmic transformation for the color scale. Will have no effect if \code{fold_change_groups} is provided (default \code{FALSE}).
#' @param fold_change_groups list. An optional list containing two vectors of samples. If provided, the function will generate a single plot displaying the log2 fold-change between the median abundances of both groups of samples ( log(second group / first group) ) (default \code{NULL}).
#' @param fold_change_colors character. An optional vector with the plotting colors of both groups in the fold-change plot. Will be ignored if \code{fold_change_group} is not provided. 
#' @param max_scale_value numeric. Maximum value to include in the color scale. By default it is the maximum value in the selected samples (if plotting abundances in samples) or the maximum absolute log2 fold-change (if plotting fold changes) (default \code{NULL}).
#' @param color_bins numeric. Number of bins used to generate the gradient in the color scale (default \code{10}).
#' @param rescale_percent logical. Calculate percent counts over the number of reads in the input object, instead of over the total number of reads in the original project (default \code{FALSE}).
#' @param output_suffix character. Suffix to be added to the output files (default \code{"pathview"}).
#' @param output_dir character. Directory in which to write the output files (default \code{"."}).
#' @return A \code{ggplot} if \code{split_samples = FALSE} and the  \href{https://CRAN.R-project.org/package=ggpattern}{ggpattern} package is installed, otherwise nothing. Additionally, Pathview figures will be written in the directory specified by \code{output_dir}.
#' @seealso \code{\link{plotFunctions}} for plotting the most functions taxa of a SQM object.
#' @examples
#' \donttest{
#' data(Hadza)
#' 
#' exportPathway(Hadza, "00910", count = 'copy_number',
#'               output_dir = tempdir(),
#'               output_suffix = "nitrogen_metabolism",
#'               sample_colors = c("red", "blue"))
#' exportPathway(Hadza, "00250", count = 'tpm',
#'               output_dir = tempdir(),
#'               output_suffix = "ala_asp_glu_metabolism_FoldChange", 
#'               fold_change_groups = list(c("H1"), c("H12")), max_scale_value=2)
#'}
#' @importFrom graphics plot rasterImage text
#' @importFrom grDevices colorRampPalette dev.off png
#' @importFrom utils installed.packages
#' @export
exportPathway = function(SQM, pathway_id, count = 'copy_number', samples = NULL, split_samples = FALSE, sample_colors = NULL, log_scale = FALSE, fold_change_groups = NULL, fold_change_colors = NULL, max_scale_value = NULL, color_bins = 10, rescale_percent = FALSE, output_dir = '.', output_suffix = 'pathview')
    {
    ### Check params.
    if(!inherits(SQM, c('SQM', 'SQMbunch', 'SQMlite'))) { stop('The first argument must be a SQM or a SQMlite object') }
    if(!count %in% c(names(SQM$functions$KEGG), 'percent')) # We could also add custom counts here by assigning e.g. a CLR matrix to
        {                                     #  SQM$functions$KEGG$clr
        stop('count must be "abund", "percent", "bases", "tpm" or "copy_number"')
        }
    if( count == 'bases' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        stop('There are no tables containing aggregated bases per function in your project, possibly because it was it was generated by sqm_reads.pl')
        }
    if( count == 'tpm' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        stop('There are no tables containing TPM per function in your project, possibly because it was it was generated by sqm_reads.pl')
        }
    if( count == 'copy_number' & is.null(SQM[['functions']][['KEGG']][[count]]) )
        {
        warning('There are no copy number tables in your project, possibly because the single copy genes were not present in the metagenome or the project was generated by sqm_reads.pl. Will use "count=\'percent\' instead".')
        count = 'percent'
        }
    if(!is.null(fold_change_groups))
        {
        if(length(fold_change_groups)!=2 | typeof(fold_change_groups)!='list')
            {
            stop('fold_change_groups must be a length 2 list')
            }
        if(any(!fold_change_groups[[1]] %in% SQM$misc$samples) | any(!fold_change_groups[[2]] %in% SQM$misc$samples))
            {
            stop('The vectors withing fold_change_groups must contain only valid sample names')
            }
        }
    if(is.null(fold_change_colors)) { fold_change_colors = c('red', 'green') }
    if(!is.null(max_scale_value) & !is.numeric(max_scale_value)) { stop('max_scale_value must be numeric') }
    if(!is.null(color_bins) & !is.numeric(color_bins)) { stop('color_bins must be numeric') }
    color_bins = ceiling(color_bins) # Make sure it is a positive integer.

    check.samples(SQM, samples)
 
    #if(count %in% c('percent', 'copy_number')) { pseudocount = 0.001 } else { pseudocount = 1 }
    if(!count %in% c('abund', 'bases')) { pseudocount = 0.001 } else { pseudocount = 1 }
    ### Select data matrix.
    if(count == 'percent')
        {
	if(rescale_percent)
            {
            total_counts = colSums(SQM$functions$KEGG$abund)
        } else
            {
            total_counts = SQM$total_reads
            }
        mat = 100 * t(t(SQM$functions$KEGG$abund) / total_counts)
	mat[is.na(mat)] = 0 # the line above will generate NAs if some sample has 0 total counts (can happen if this is a subset)
    } else
        {
        mat = SQM$functions$KEGG[[count]]
        }
    ### Do stuff.
    if(!is.null(samples)) { mat = mat[,samples,drop=FALSE] }

    if(ncol(mat) > 24 & is.null(fold_change_groups))
        {
        warning('We\'ve found that pathview fails when trying to display more than 24 different items, so unless you are grouping your samples with fold_change_groups this will likely not work')
        }

    if(!is.null(sample_colors))
        {
        if(length(sample_colors) != ncol(mat)) { stop('The number of samples does not correspond with the number of sample colors') }
    } else { sample_colors = rep('red', ncol(mat)) }

    # Taken from https://support.bioconductor.org/p/58110/
    # Download and parse pathway info.
    pathview::download.kegg(pathway.id = pathway_id, species = 'ko', kegg.dir = output_dir)
    xml.file = sprintf('%s/ko%s.xml', output_dir, pathway_id)
    node.data = pathview::node.info(xml.file)
    # Map our data.
    plot.data.gene = pathview::node.map(mol.data=mat, node.data, node.types="ortholog", entrez.gnodes=FALSE)
     
    # pathview::node.map also added the abundances of the different KOs mapping to the same reaction, and assigned it to one of the KOs (the first one?)
    # So plot.data.gene is missing KOs, but has the abundances for each reaction right. We will use that data.
    submat = as.matrix(plot.data.gene[,colnames(mat),drop=FALSE])
    submat[is.na(submat)] = 0

    zeros = submat==0

    ### Do we have to calculate log2FC or not?
    if(is.null(fold_change_groups))
        {
        # Change to log-scale if required.
        if(log_scale) { submat = log(submat+pseudocount, 10) }
    } else
        {
        # Calculate fold change and overwrite submat and plot.data.gene.
        submat = submat + pseudocount
        log2FC = log(rowMedians(submat[,fold_change_groups[[2]],drop=FALSE]) / rowMedians(submat[,fold_change_groups[[1]],drop=FALSE]), 2)
        submat = cbind(submat[,0], log2FC = log2FC)
	zeros = submat==0
	plot.data.gene = cbind(plot.data.gene[,!colnames(plot.data.gene) %in% SQM$misc$samples], log2FC = log2FC)
	}

    ### Set the maximum and minimum scale values.
    if(is.null(fold_change_groups))
        {
        if(is.null(max_scale_value)) { max_scale_value = max(submat)
        } else { if(log_scale) { max_scale_value = log(max_scale_value, 10) } }
        min_scale_value = min(submat)
    } else
        {
        if(is.null(max_scale_value)) { max_scale_value = max(abs(submat)) }
	min_scale_value = -max_scale_value # make it symmetric.
        }
    
    # Turn into discrete color indices.
    breaks = seq(min_scale_value, max_scale_value, length.out=color_bins)
    submat_color = matrix(findInterval(submat, breaks, all.inside=T), nrow=nrow(submat), dimnames=dimnames(submat)) # The lowest index in the interval is 1. This contains zeros as well as low values.
    if(is.null(fold_change_groups))
        {
        submat_color[zeros] = 0
        submat_color = submat_color + 1 # Now the lowest index is 1 (white colour), but it contains only the zeros. Positive values will be from 2 onwards.
        }

    # Customize colors for each sample (or for the logFC) and create legends.
    cols.ts.gene = matrix(NA, nrow=nrow(submat), ncol=ncol(submat), dimnames = dimnames(submat))
    bg.col = "#FFFFFF"
    nice_label = c(abund='Raw abundance', percent = 'Percentage', bases='Bases', tpm='TPM', copy_number='Copy number')[count]
    for(i in 1:ncol(cols.ts.gene))
        {
        if(is.null(fold_change_groups))
            {
            gradient = colorRampPalette(c(bg.col, sample_colors[i]))(color_bins+1) # (color_bins+1) bc the first color will be always white, we want it to be used only for pure zeros.
            if(!log_scale) { true_breaks = breaks } else { true_breaks = 10 ** breaks; true_breaks[1] = 0 } # Breaks for the legend text
        } else 
            {
            gradient = colorRampPalette(c(fold_change_colors[1], bg.col, fold_change_colors[2]))(color_bins)
            true_breaks = breaks # Breaks for the legend text
            }
        cols.ts.gene[,i] = gradient[submat_color[,i]]
        filename = sprintf('%s/ko%s.%s.%s.legend.png', output_dir, pathway_id, output_suffix, colnames(cols.ts.gene)[i])
        message(sprintf('Info: Writing legend file %s\n', filename))
        png(filename)
        plot(c(0,2),c(0,1),type = 'n', axes = FALSE, xlab = '', ylab = '', main = sprintf('%s - %s', colnames(cols.ts.gene)[i], nice_label[count]))
        text(x=1.5, y = seq(0,1,l=color_bins), labels = signif(true_breaks,3))
        rasterImage(rev(gradient), 0, 0, 1,1)
        dev.off()
        }
    cols.ts.gene[zeros] = bg.col # In log2FC plots, if we have an even number of color bins, rxns with zero FC (i.e. absent rxns) will not be exactly white.

    ### PLOT
    # Switch to the output directory (since pathwview will always write to pwd)
    thiswd = getwd()
    on.exit(setwd(thiswd)) # Make sure to return to the original working directory even on exit, even if we fail
    setwd(output_dir)
    # Are reactions represented as arrows in this map?? If so, keggview.native may fail
    if('line' %in% unique(node.data$shape[node.data$type=='ortholog']))
        {
        warning('This map uses lines to represent reactions, and we\'ve had isues using keggview native plot in this case.
                 We will switch to graph mode instead')
	parseKGML2Graph2 = utils::getFromNamespace('parseKGML2Graph2', 'pathview') 
        gR1 = parseKGML2Graph2(xml.file, genes = FALSE, 
                               expand = FALSE, split.group = FALSE)
        plot.data.cpd=pathview::node.map(NULL, node.data, node.types='compound')
	plot.data.cpd$labels=pathview::cpdkegg2name(plot.data.cpd$labels)[,2]
	mapped.cnodes=rownames(plot.data.cpd)
	node.data$labels[mapped.cnodes]=plot.data.cpd$labels
        pathview::keggview.graph(plot.data.gene = plot.data.gene,
                                 cols.ts.gene = cols.ts.gene, node.data=node.data,
                                 pathway.name = sprintf('ko%s', pathway_id),
                                 kegg.dir = output_dir,
				 path.graph = gR1, map.cpdname = TRUE,
				 cex = 0.15,
                                 same.layer = TRUE, plot.col.key = FALSE, multi.state=!split_samples, out.suffix = output_suffix)
    } else {
        pathview::keggview.native(plot.data.gene = plot.data.gene,
                                 cols.ts.gene = cols.ts.gene, node.data=node.data,
                                 pathway.name = sprintf('ko%s', pathway_id),
                                 kegg.dir = output_dir,
                                 same.layer = TRUE, plot.col.key = FALSE, multi.state=!split_samples, out.suffix = output_suffix)
        }
    file.remove(sprintf('%s/ko%s.png', output_dir, pathway_id))
    file.remove(sprintf('%s/ko%s.xml', output_dir, pathway_id))
    
    ggp = NULL
    if(!split_samples)
        {
        if('ggpattern' %in% rownames(installed.packages()) & 'magick' %in% rownames(installed.packages()))
            {
            nice_labels= c(abund='Raw abundance', percent = 'Percentage', bases='Bases',
                           cpm = 'Coverage per million reads', tpm='TPM', copy_number='Copy number')
            if(count %in% names(nice_labels)) { nice_label = nice_labels[count] } else { nice_label = count }
            geom_tile_pattern = utils::getFromNamespace('geom_tile_pattern', 'ggpattern')
            if(!is.null(fold_change_groups))
                {
                if(!is.null(names(fold_change_groups))) { group_names = names(fold_change_groups)
                } else { group_names = c('Group 1', 'Group 2') }
                scale_color = ggplot2::scale_color_gradient2(high = fold_change_colors[2],
                                                    mid = 'white',
                                                    low = fold_change_colors[1])
                scale_fill = ggplot2::scale_fill_manual(values = fold_change_colors)
                col_lab = sprintf('Log2FC %s', nice_label)
                imgfile = sprintf('ko%s.%s.png', pathway_id, output_suffix)
            } else
                {
                group_names = colnames(mat)
                if(length(samples)==1)
                    {
                    imgfile = sprintf('ko%s.%s.png', pathway_id, output_suffix)
                } else
                    {
                    imgfile = sprintf('ko%s.%s.multi.png', pathway_id, output_suffix)
                    }
                if(length(unique(sample_colors))==1) { high = sample_colors[1] } else { high = 'black' }
                scale_color = ggplot2::scale_color_gradient(high = high, low = 'white')
                scale_fill = ggplot2::scale_fill_manual(values = sample_colors)
                if(log_scale) { col_lab = sprintf('Log10 %s', nice_label) } else { col_lab = nice_label }
                }
            # Put the image created by pathview into a ggplot with the right legend
            # We will create a fake plot using geom_point() to get our color gradient legend
            #  and geom_tile_pattern to get both the image at the center of the plot, and the fill sample/group legend
            # We create a fake data.frame with the sample/group info and the max/min gradient values
            df = data.frame(scale = rep(c(min_scale_value, max_scale_value), length(group_names)),
                            val = rep(0, 2 * length(group_names)),
                            Group = factor(rep(group_names, 2), levels=group_names))
            # Due to how ggpattern and/or magick works, it seemed to be caching my image files
            # This meant that if I ran the command several times but kept the name of the output file the same
            #  it would keep including the first image even after deleting everything
            # So I will just copy the image to /tmp/ with a random name every time and make geom_tile_pattern
            #  read from there
            # Note that the plot can not be made if the file is no longer there (e.g. because we saved the whole
            #  session and then tried to remake the plot somewhere else)  
            imgfile_rand = sprintf('%s.png', tempfile())
            file.copy(from=imgfile, to=imgfile_rand)

            val = Group = scale = x = NULL # to appease R CMD check (they are later used by ggplot2's aes in the context of df,
                                           #  but that syntax bother's R CMD check bc it thinks they are undefined variables)
            # Now do the plot
            ggp = ggplot2::ggplot(df, ggplot2::aes(x='a', y=val, fill=Group, col = scale)) + 
                    ggplot2::geom_point() + ggplot2::geom_bar(stat='identity', width=0) +
                    geom_tile_pattern(data = data.frame(x='a', val=0),
                                      mapping = ggplot2::aes(x=x, y=val), fill = 'white', inherit.aes = FALSE,
                                      pattern='image', pattern_filename=imgfile_rand) +
                    scale_color + scale_fill +
                    # Make sure that the legends appear in a consistent order
                    ggplot2::guides(fill = ggplot2::guide_legend(order = 1)) +
                    ggplot2::labs(color = col_lab) + 
                    # We would use theme_void() to simplify, but also messes up legend position a bit so we do it manually
                    ggplot2::theme(axis.line        = ggplot2::element_blank(),
                                   axis.text.x      = ggplot2::element_blank(),
                                   axis.text.y      = ggplot2::element_blank(),
                                   axis.ticks       = ggplot2::element_blank(),
                                   axis.title.x     = ggplot2::element_blank(),
                                   axis.title.y     = ggplot2::element_blank(),
                                   panel.background = ggplot2::element_blank(),
                                   panel.border     = ggplot2::element_blank(),
                                   panel.grid.major = ggplot2::element_blank(),
                                   panel.grid.minor = ggplot2::element_blank(),
                                   plot.background  = ggplot2::element_blank())
        } else
            {
            warning(sprintf('The `ggpattern` and `magick` packages need to be installed in order to produce a ggplot\n
                             However, the image files were written to %s', output_dir))
            }
        }
    if(is.null(ggp)) { return(invisible(NULL)) } else { return(ggp) }
    }