R/vis.R

Defines functions hpaVisSubcell hpaVisPatho hpaVisTissue

Documented in hpaVisPatho hpaVisSubcell hpaVisTissue

###########################
## Visualize tissue data ##
###########################

#' Visualize tissue data
#'
#' Visualize the expression of protein of interest in each target tissue by cell
#' types.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{normal_tissue} dataset. Use HPA
#'   histology data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')}. You can also mix
#'   HGNC gene symbols and ensemnl ids (start with ENSG) and they will be
#'   converted to HGNC gene symbols.
#' @param targetTissue Vector of strings of normal tissues. Default to breast.
#' @param targetCellType Vector of strings of normal cell types. Default to all.
#' @param color Vector of 4 colors used to depict different expression levels.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The tissue data is visualized as a heatmap: x axis
#'   contains inquired protein and y axis contains tissue/cells of interest.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'   tissueList <- c('breast', 'cerebellum', 'skin 1')
#'
#'   ## A typical function call
#'   hpaVisTissue(data=hpa_histology_data,
#'                targetGene=geneList,
#'                targetTissue=tissueList)
#'
#' @import dplyr
#' @import ggplot2
#' @export

hpaVisTissue <- function(data=NULL, 
                         targetGene=NULL, 
                         targetTissue=NULL, 
                         targetCellType=NULL,
                         color=c('#ffffb2', '#fecc5c', '#fd8d3c', '#e31a1c'),
                         customTheme=FALSE) {
    
    infoDisp <- FALSE
    
    # Check if data is provided or not
    if (is.null(data)) {
        message(paste0('No data provided. Use version ', 
                       hpa_histology_data$metadata$HPAversion,
                       "."))
        data = HPAanalyze::hpa_histology_data
    }
    
    # Check if targetGene is provided
    if (is.null(targetGene)) {
        message('targetGene variable not specified, default to TP53, EGFR, CD44, PTEN and IDH1.')
        targetGene <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')
        infoDisp <- TRUE
    }
    
    # Check if targetTissue is provided
    if (is.null(targetTissue)) {
        message('targetTissue variable not specified, default to breast.')
        targetTissue <- 'breast'
        infoDisp <- TRUE
    }
    
    # Check if targetCellType is provided
    if (is.null(targetCellType)) {
        message('targetCellType variable not specified, visualize all.')
        targetCellType <- NULL
        infoDisp <- TRUE
    }
    
    # Show a message if any parameter is not defined
    if (infoDisp) {
        message('Use hpaListParam() to list possible values for target variables.')
    }
    
    targetGene <- gene_ensembl_convert(targetGene, "gene")
    
    plotData <- data$normal_tissue %>%
        filter(gene %in% targetGene) %>%
        filter(tissue %in% targetTissue)
    
    if(!is.null(targetCellType)) {
        plotData <- filter(plotData, cell_type %in% targetCellType)
    }
    
    plotData <- mutate(plotData,
                       tissue_cell=paste0(tissue, ' / ', cell_type),
                       level=factor(level, 
                                    levels=c('High', 'Medium', 
                                             'Low', 'Not detected')))
    
    levelColors <- c('Not detected'=color[1],
                     'Low'=color[2],
                     'Medium'=color[3],
                     'High'=color[4])
    
    plot <- ggplot(plotData, aes(x=gene, y=tissue_cell)) +
        geom_tile(aes(fill=level)) +
        scale_x_discrete(limits=targetGene) +
        scale_fill_manual(values=levelColors)
    
    if(!customTheme) {
        plot <- plot + 
            ylab('Tissue / Cell') +
            xlab('Genes') +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x=element_text(angle=90, hjust=1)) +
            coord_equal()
    }
    
    
    
    return(plot)       
}



##############################
## Visualize pathology data ##
##############################

#' Visualize pathology data
#'
#' Visualize the expression of genes of interest in each cancer.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{pathology} dataset. Use HPA histology
#'   data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')}. You can also mix
#'   HGNC gene symbols and ensemnl ids (start with ENSG) and they will be
#'   converted to HGNC gene symbols.
#' @param targetCancer Vector of strings of normal tissues. The function will
#'   plot all available cancer by default.
#' @param color Vector of 4 colors used to depict different expression levels.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The pathology data is visualized as multiple bar
#'   graphs, one for each type of cancer. For each bar graph, x axis contains
#'   the inquired protein and y axis contains the proportion of patients.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'   cancerList <- c('breast cancer', 'glioma', 'melanoma')
#'
#'   ## A typical function call
#'   hpaVisPatho(data=hpa_histology_data,
#'                  targetGene=geneList)
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom stats reshape
#' @export

hpaVisPatho <- function(data=NULL, 
                        targetGene=NULL, 
                        targetCancer=NULL, 
                        color=c('#ffffb2', '#fecc5c', '#fd8d3c', '#e31a1c'),
                        customTheme=FALSE) {
    
    infoDisp <- FALSE
    
    # Check if data is provided or not
    if (is.null(data)) {
        message(paste0('No data provided. Use version ', 
                       hpa_histology_data$metadata$HPAversion,
                       "."))
        data = HPAanalyze::hpa_histology_data
    }
    
    # Check if targetGene is provided
    if (is.null(targetGene)) {
        message('targetGene variable not specified, default to TP53, EGFR, CD44, PTEN and IDH1.')
        targetGene <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')
        infoDisp <- TRUE
    }
    
    # Show a message if any parameter is not defined
    if (infoDisp) {
        message('Use hpaListParam() to list possible values for target variables.')
    }
    
    plotData <- data$pathology %>%
        filter(gene %in% targetGene)
    
    if(!is.null(targetCancer)) {
        plotData <- filter(plotData, cancer %in% targetCancer)
    }
    
    targetGene <- gene_ensembl_convert(targetGene, "gene")
    
    plotData <- plotData %>%
        select(gene, cancer, high, medium, low, not_detected) %>%
        rename('High'='high', 'Medium'='medium', 
               'Low'='low', 'Not detected'='not_detected') %>%
        ## The old way used tidyr::gather
        # gather(key = "level", value = "patient_count", -gene, -cancer)
        ## The new way uses stats::reshape
        as.data.frame() %>%
        reshape(direction = "long",
                varying = list(3:6),
                v.names = "patient_count",
                timevar = "level",
                times = c("High", "Medium", "Low", "Not detected")
        )
    
    #re-level
    plotData$level <- factor(plotData$level,
                             levels = c("High", "Medium", "Low", "Not detected"))
    
    levelColors <- c('Not detected'=color[1],
                     'Low'=color[2],
                     'Medium'=color[3],
                     'High'=color[4])
    
    plot <- ggplot(plotData, aes(x=gene, y=patient_count, fill=level)) +
        geom_bar(stat='identity', position='fill') +
        scale_x_discrete(limits=targetGene) +
        scale_fill_manual(values=levelColors) +
        facet_wrap(~ cancer)
    
    if(!customTheme) {
        plot <- plot + 
            ylab('Patient proportions') +
            xlab('Genes') +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x=element_text(angle=90, hjust=1))
    }
    
    return(plot)       
}


############################
## Visualize subcell data ##
############################

#' Visualize subcellular location data
#'
#' Visualize the the confirmed subcellular locations of genes of interest.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{subcellular_location} dataset. Use
#'   HPA histology data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')}. You can also mix
#'   HGNC gene symbols and ensemnl ids (start with ENSG) and they will be
#'   converted to HGNC gene symbols.
#' @param reliability Vector of string indicate which reliability scores you want to plot. The
#'   default is everything \code{c("enhanced", "supported", "approved",
#'   "uncertain")}.
#' @param color Vector of 2 colors used to depict if the protein expresses in a
#'   location or not.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The subcellular location data is visualized as a
#'   tile graph, in which the x axis includes the inquired proteins and the y
#'   axis contain the subcellular locations.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'
#'   ## A typical function call
#'   hpaVisSubcell(data=hpa_histology_data,
#'                   targetGene=geneList)
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom tibble as_tibble
#' @export

hpaVisSubcell <- function(data=NULL, 
                          targetGene=NULL,
                          reliability = c("enhanced", "supported", "approved", "uncertain"),
                          color=c('#ffffb2', '#e31a1c'),
                          customTheme=FALSE) {
    
    infoDisp <- FALSE
    
    # Check if data is provided or not
    if (is.null(data)) {
        message(paste0('No data provided. Use version ', 
                       hpa_histology_data$metadata$HPAversion,
                       "."))
        data = HPAanalyze::hpa_histology_data
    }
    
    # Check if targetGene is provided
    if (is.null(targetGene)) {
        message('targetGene variable not specified, default to TP53, EGFR, CD44, PTEN and IDH1.')
        targetGene <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1')
        infoDisp <- TRUE
    }
    
    # Show a message if any parameter is not defined
    if (infoDisp) {
        message('Use hpaListParam() to list possible values for target variables.')
    }
    
    targetGene <- gene_ensembl_convert(targetGene, "gene")
    
    plotData <- data$subcellular_location %>%
        filter(gene %in% targetGene) %>%
        mutate(sub_location = NA)
    
    if ("enhanced" %in% reliability) plotData <- 
        mutate(plotData, 
               sub_location =  paste(sub_location, enhanced, sep = ";"))
    if ("supported" %in% reliability) plotData <- 
        mutate(plotData, 
               sub_location =  paste(sub_location, supported, sep = ";"))
    if ("approved" %in% reliability) plotData <- 
        mutate(plotData, 
               sub_location =  paste(sub_location, approved, sep = ";"))
    if ("uncertain" %in% reliability) plotData <- 
        mutate(plotData, 
               sub_location =  paste(sub_location, uncertain, sep = ";"))
    
    # plotData <-  plotData %>%
    #     mutate(sub_location=strsplit(sub_location, ';')) %>%
    #     tidyr::unnest(sub_location) %>%
    #     select(sub_location, gene) %>%
    #     filter(sub_location != "NA") %>%
    #     table() %>%
    #     as_tibble() %>%
    #     mutate(n=factor(n, levels=c('0', '1')))
    
    ## Use apply(as_tibble) %>% bind_rows instead of unnest
    plotData <-  plotData %>%
        mutate(sub_location=strsplit(sub_location, ';')) %>%
        apply(MARGIN = 1, FUN = as_tibble) %>% bind_rows() %>%
        select(sub_location, gene) %>%
        filter(sub_location != "NA") %>%
        table() %>%
        as_tibble() %>%
        mutate(n=factor(n, levels=c('0', '1')))

    levelColors <- c('0'=color[1],
                     '1'=color[length(color)])

    plot <- ggplot(plotData, aes(x=gene, y=sub_location)) +
        geom_tile(aes(fill=n), colour="grey50") +
        scale_x_discrete(limits=targetGene) +
        scale_fill_manual(values=levelColors,
                          name="Detected",
                          breaks = c(0, 1),
                          labels = c("No", "Yes"))

    if(!customTheme) {
        plot <- plot +
            ylab('Subcellular locations') +
            xlab('Genes') +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x=element_text(angle=45, hjust=1)) +
            coord_equal()
    }

    return(plot)
}

Try the HPAanalyze package in your browser

Any scripts or data that you put into this service are public.

HPAanalyze documentation built on Nov. 26, 2020, 2:01 a.m.