knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
\newcommand{\diag}{\mathrm{diag}} \newcommand{\tr}{\mathrm{tr}} \newcommand{\E}{\mathrm{E}}
Consider a set of $k$ real matrices $D_i \in \mathbb{R}^{m_i \times n}$, each with full column rank. Each $D_i$ represents the mean expression of $n$ tissues (columns) in species $i$ with $m_i$ genes (rows) annotated for that species. The $p\mbox{-th}$ column of each $D_i$ matrix corresponds to the average gene-expression profile of a functionally equivalent tissue representing a similar phenotype across species. The number of rows is different for each $D_i$ matrix, but they all have the same number of columns.
In OSBF, we estimate an orthonormal basis $V$ for the common expression space based on the inter-tissue correlations within each species. Let $X_i \in \mathbb{R}^{m_i \times n}$ be the standardized gene expression matrix for species $i$. We can write [ X_i = C_iD_i{S_i}^{-1}, ] where [ C_i = I_{m_i} - {m_i}^{-1} 1_{m_i} {1_{m_i}}^T ] is a centering matrix and $S_i = \diag(s_1,\ldots,s_n)$ is a diagonal scaling matrix, where $s_j$ is the standard deviation of $j$-th column of $D_i$. Within species $i$, the correlation between expression profiles for the $n$ tissues is [ R_i = X_i^TX_i/m_i. ] Each $R_i$ matrix has a dimension of $n \times n$ independent of the number of genes annotated in the species. Since the corresponding column of each $D_i$ matrix represents a similar phenotype, we can define an expected correlation matrix across the species:
[ E = \E(R) = \textstyle\frac{1}{k}\sum_{i=1}^{k} R_i. ]
The shared right basis matrix $V$ represents the common expression space and is determined from the eigenvalue decomposition of $E$, where $E=V \Lambda V^T$. The right basis matrix $V$ is identical in all the $k$ matrix factorizations, and the different dimensions of $V$ represent the correlation relationship between tissues independent of the fact to which species tissues belong. Once the $V$ is estimated, the species-specific left basis matrix with orthonormal columns $U_i$ (eigengenes) and the $\Delta_i$ that minimize the factorization error are computed.
# load SBF package library(SBF)
Additional packages required for the workflow
# install packages pkgs <- c("data.table", "dplyr", "matrixStats") require_install <- pkgs[!(pkgs %in% row.names(installed.packages()))] if (length(require_install)) install.packages(require_install) suppressPackageStartupMessages({ library(data.table) library(dplyr) library(matrixStats) })
In this workflow, we will work with gene expression profile of six tissues from eight species.
species <- c("Homo_sapiens", "Pan_troglodytes", "Macaca_mulatta", "Mus_musculus", "Rattus_norvegicus", "Bos_taurus", "Sus_scrofa", "Gallus_gallus") species_short <- sapply(species, getSpeciesShortName) species_short # common tissues present in all 8 species tissues <- c("brain", "heart", "kidney", "liver", "lung", "testis")
Download the processed RNA-Seq counts file ("counts.tar.gz") from https://doi.org/10.6084/m9.figshare.20216858 Uncompress the .tar.gz file and add it to the working directory.
# set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" counts_list <- metadata_list <- list() require(data.table) for (sp in species) { # read logTPM counts for each species counts <- data.table::fread(paste0(path, "counts/", sp, "_logTPM.tsv"), sep = "\t", header = TRUE, data.table = FALSE, nThread = 4) row.names(counts) <- counts$V1 counts$V1 <- NULL col_fields <- data.table::tstrsplit(colnames(counts), "_") metadata <- data.frame( project = col_fields[[1]], species = col_fields[[2]], tissue = col_fields[[3]], gsm = col_fields[[4]], name = colnames(counts), stringsAsFactors = FALSE) metadata_sel <- metadata[metadata$tissue %in% tissues, , drop = FALSE] counts_sel <- counts[, colnames(counts) %in% metadata_sel$name, drop = FALSE] metadata_sel$ref <- seq_len(nrow(metadata_sel)) metadata_sel$key <- paste0(metadata_sel$species, "_", metadata_sel$tissue) counts_list[[sp]] <- counts_sel metadata_list[[sp]] <- metadata_sel }
The dimensions and the number of RNA-Seq profiles for different species.
sapply(counts_list, dim)
Now, for each species, let us compute the mean expression profile for each
tissue. We will use calcAvgCounts
function from the SBF
package.
avg_counts <- list() for (sp in species) { avg_counts[[sp]] <- calcAvgCounts(counts_list[[sp]], metadata_list[[sp]]) } # check tissue columns are matching in each species c_tissues <- as.data.frame(sapply(avg_counts, function(x) { data.table::tstrsplit(colnames(x), "_")[[2]] })) if (!all(apply(c_tissues, 1, function(x) all(x == x[1])))) { stop("Error! tissues not matching") }
The dimension of mean expression profiles
sapply(avg_counts, dim)
Remove genes not expressed.
# remove empty rows removeZeros <- function(df) { return(df[rowSums(df) > 0, ]) } avg_counts <- lapply(avg_counts, removeZeros) sapply(avg_counts, dim) # update counts_list counts_list_sub <- list() for (sp in names(avg_counts)) { counts_list_sub[[sp]] <- counts_list[[sp]][row.names(avg_counts[[sp]]), , drop = FALSE] }
We will perform OSBF in two ways.
cat("\nOSBF with optimizing V = FALSE started\n") cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") t1 <- proc.time() # decrease tol to minimize error osbf_noVupdate <- SBF(avg_counts, transform_matrix = TRUE, orthogonal = TRUE, optimizeV = FALSE, tol = 1e-3)#, tol = 1e-10) t2 <- proc.time() cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") cat("OSBF with optimizing V = FALSE finished\n") cat("Time taken:\n") t2 - t1
cat("\nOSBF with optimizing V=TRUE started\n") cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") t1 <- proc.time() # decrease tol to minimize error osbf <- SBF(avg_counts, transform_matrix = TRUE, orthogonal = TRUE, optimizeV = TRUE, tol = 1e-3)#, tol = 1e-10) t2 <- proc.time() cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") cat("OSBF with optimizing V=TRUE finished\n") cat("Time taken:\n") t2 - t1
Note: We use tolerance threshold = 1e-3 for fast computation. Reduce the tolerance threshold (default value 1e-10) to minimize the factorization error futher. Lowering the threshold value increases the computing time.
The final factorization error and number of updates taken:
cat("\n", sprintf("%-27s:", "Final error [No V update]"), sprintf("%16.2f", osbf_noVupdate$error)) cat("\n", sprintf("%-27s:", "Final error [With V update]"), sprintf("%16.2f", osbf$error)) cat("\n", sprintf("%-27s:", "# of update [No V update]"), sprintf("%16d", osbf_noVupdate$error_pos)) cat("\n", sprintf("%-27s:", "# of update [With V update]"), sprintf("%16d", osbf$error_pos))
Optimization with updating $V$ achieves a lower decomposition error.
osbf_noVupdate$error / osbf$error
Let us plot the decomposition error vs. updates.
par(mfrow = c(1, 3)) plot(x = seq_len(length(osbf$error_vec)), y = osbf$error_vec, xlab = "# of updates (step 1 -> )", ylab = "Factorization error", col = "red") plot(x = 10:length(osbf$error_vec), y = osbf$error_vec[10:length(osbf$error_vec)], xlab = "# of updates (step 10 -> )", ylab = "Factorization error", col = "red") plot(x = 50:length(osbf$error_vec), y = osbf$error_vec[50:length(osbf$error_vec)], xlab = "# of updates (step 50 -> )", ylab = "Factorization error", col = "red")
Percentage of information ($p_{ij}$) represented by a common space dimension is defined as $\textstyle{p_{ij} = \delta_{ij}^2 / \sum_{j=1}^6 \delta_{ij}^2 \times 100}$, $\mbox{ where } \textstyle{\Delta_i = \diag(\delta_{i1},\ldots, \delta_{i6})}$
cat("\nPercentage for each delta [No V update]:") percentInfo_noVupdate <- calcPercentInfo(osbf_noVupdate) for (i in names(osbf_noVupdate$delta)) { cat("\n", sprintf("%-25s:", i), sprintf("%8.2f", percentInfo_noVupdate[[i]])) } percentInfo <- calcPercentInfo(osbf) for (i in names(osbf$delta)) { cat("\n", sprintf("%-25s:", i), sprintf("%8.2f", percentInfo[[i]])) }
The percentage of information represented by different dimensions of the two approaches looks very similar.
Project individual profiles and average counts to common space by computing
$D_i^T U_i \Delta^{-1}$. We will projectCounts
function from the SBF
package for this.
# project profles using no V update estimates # we can project both mean expression profiles as well as individual expression # profiles df_proj_avg_noVupdate <- projectCounts(avg_counts, osbf_noVupdate) meta <- data.table::tstrsplit(row.names(df_proj_avg_noVupdate), "_") df_proj_avg_noVupdate$tissue <- factor(meta[[2]]) df_proj_avg_noVupdate$species <- factor(meta[[1]]) df_proj_avg_noVupdate <- df_proj_avg_noVupdate %>% mutate(species = factor(species, levels = species_short)) df_proj_noVupdate <- projectCounts(counts_list_sub, osbf_noVupdate) meta1 <- data.table::tstrsplit(row.names(df_proj_noVupdate), "_") df_proj_noVupdate$tissue <- factor(meta1[[3]]) df_proj_noVupdate$species <- factor(meta1[[2]]) df_proj_noVupdate <- df_proj_noVupdate %>% mutate(species = factor(species, levels = species_short))
Now let us also project profiles with the updated V
# project using V update estimates df_proj_avg <- projectCounts(avg_counts, osbf) meta <- data.table::tstrsplit(row.names(df_proj_avg), "_") df_proj_avg$tissue <- factor(meta[[2]]) df_proj_avg$species <- factor(meta[[1]]) df_proj_avg <- df_proj_avg %>% mutate(species = factor(species, levels = species_short)) df_proj <- projectCounts(counts_list_sub, osbf) meta1 <- data.table::tstrsplit(row.names(df_proj), "_") df_proj$tissue <- factor(meta1[[3]]) df_proj$species <- factor(meta1[[2]]) df_proj <- df_proj %>% mutate(species = factor(species, levels = species_short))
# install packages pkgs <- c("RColorBrewer") require_install <- pkgs[!(pkgs %in% row.names(installed.packages()))] if (length(require_install)) install.packages(require_install) pkgs <- c("ComplexHeatmap") require_install <- pkgs[!(pkgs %in% row.names(installed.packages()))] if (length(require_install)) { if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ComplexHeatmap") } suppressPackageStartupMessages({ library(ComplexHeatmap) library(RColorBrewer) })
Compute distances between projected profiles and perform clustering.
data_noVupdate <- df_proj_noVupdate data_noVupdate$tissue <- NULL data_noVupdate$species <- NULL data_noVupdate <- as.matrix(data_noVupdate) data_noVupdate_dist <- as.matrix(dist(data_noVupdate, method = "euclidean")) meta <- data.table::tstrsplit(colnames(data_noVupdate_dist), "_") ht <- ComplexHeatmap::HeatmapAnnotation(tissue = meta[[3]], species = meta[[2]], col = list(tissue = c("brain" = "#1B9E77", "heart" = "#D95F02", "kidney" = "#7570B3", "liver" = "#E7298A", "lung" = "#66A61E", "testis" = "#E6AB02"), species = c("hsapiens" = "#66C2A5", "ptroglodytes" = "#FC8D62", "mmulatta" = "#8DA0CB", "mmusculus" = "#E78AC3", "rnorvegicus" = "#A6D854", "btaurus" = "#FFD92F", "sscrofa" = "#E5C494", "ggallus" = "#B3B3B3")), #show_annotation_name = F, annotation_name_side = "left") mypalette <- RColorBrewer::brewer.pal(9, "Blues") morecolors <- colorRampPalette(mypalette) myheatmap1 <- ComplexHeatmap::Heatmap(as.matrix(data_noVupdate_dist), cluster_rows = TRUE, clustering_method_rows = "centroid", cluster_columns = TRUE, clustering_method_columns = "centroid", top_annotation = ht, col = morecolors(50), show_row_names = FALSE, show_column_names = FALSE, name = "distance") myheatmap1
Compute distances between projected profiles in the common space estimated using optimized $V$ and perform clustering.
data <- df_proj data$tissue <- NULL data$species <- NULL data <- as.matrix(data) data_dist <- as.matrix(dist(data, method = "euclidean")) meta <- data.table::tstrsplit(colnames(data_dist), "_") ht <- ComplexHeatmap::HeatmapAnnotation(tissue = meta[[3]], species = meta[[2]], col = list(tissue = c("brain" = "#1B9E77", "heart" = "#D95F02", "kidney" = "#7570B3", "liver" = "#E7298A", "lung" = "#66A61E", "testis" = "#E6AB02"), species = c("hsapiens" = "#66C2A5", "ptroglodytes" = "#FC8D62", "mmulatta" = "#8DA0CB", "mmusculus" = "#E78AC3", "rnorvegicus" = "#A6D854", "btaurus" = "#FFD92F", "sscrofa" = "#E5C494", "ggallus" = "#B3B3B3")), #show_annotation_name = F, annotation_name_side = "left") mypalette <- RColorBrewer::brewer.pal(9, "Blues") morecolors <- colorRampPalette(mypalette) myheatmap <- ComplexHeatmap::Heatmap(as.matrix(data_dist), cluster_rows = TRUE, clustering_method_rows = "centroid", cluster_columns = TRUE, clustering_method_columns = "centroid", top_annotation = ht, col = morecolors(50), show_row_names = FALSE, show_column_names = FALSE, name = "distance") myheatmap
Gene expression profiles cluster by tissue type independent of the species of origin.
Next, we will explore the 2D projection plots in the common space. We will first define a custom theme that we will use for the plots.
# install packages pkgs <- c("grid", "ggthemes", "ggplot2") require_install <- pkgs[!(pkgs %in% row.names(installed.packages()))] if (length(require_install)) install.packages(require_install) suppressPackageStartupMessages({ library(grid) library(ggthemes) library(ggplot2) })
We will use the following custom theme for the ggplots.
# custom theme function for ggplot2 customTheme <- function(base_size = 10, base_family = "helvetica") { require(grid) require(ggthemes) (ggthemes::theme_foundation(base_size = base_size) + ggplot2::theme(plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), text = element_text(), panel.background = element_rect(colour = NA), plot.background = element_rect(colour = NA), panel.border = element_rect(colour = NA), axis.title = element_text(size = rel(1)), axis.title.y = element_text(angle = 90, vjust = 2), axis.title.x = element_text(vjust = -0.2), axis.text = element_text(), axis.line = element_line(colour = "black"), axis.ticks = element_line(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.key = element_rect(colour = NA), legend.position = "top", legend.direction = "horizontal", legend.key.size = unit(0.2, "cm"), legend.spacing = unit(0, "cm"), legend.title = element_text(face = "italic"), plot.margin = unit(c(10, 5, 5, 5), "mm"), strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), strip.text = element_text(face = "bold"))) }
Let us first check the projected libraries in dimension 1 and 2.
# 2D plot for Dim1 and Dim2 [No V update] sel_colors <- RColorBrewer::brewer.pal(8, "Dark2")[seq_len(length(unique(df_proj_noVupdate$tissue)))] i <- 1 j <- 2 ggplot2::ggplot(df_proj_noVupdate, aes(x = df_proj_noVupdate[, i], y = df_proj_noVupdate[, j], col = tissue, shape = species, fill = tissue)) + xlab(paste("OSBF Dim", i)) + ylab(paste("OSBF Dim", j)) + geom_point(size = 1.5) + scale_color_manual(values = sel_colors) + scale_shape_manual(values = c(21:25, 3:7)) + scale_fill_manual(values = sel_colors) + customTheme() + theme(legend.title = element_blank())
The same with optimized V
# 2D plot for Dim1 and Dim2 [With V update] i <- 1 j <- 2 ggplot2::ggplot(df_proj, aes(x = df_proj[, i], y = df_proj[, j], col = tissue, shape = species, fill = tissue)) + xlab(paste("OSBF Dim", i)) + ylab(paste("OSBF Dim", j)) + geom_point(size = 1.5) + scale_color_manual(values = sel_colors) + scale_shape_manual(values = c(21:25, 3:7)) + scale_fill_manual(values = sel_colors) + customTheme() + theme(legend.title = element_blank())
Let us plot the projected libraries in dimensions 2 and 3.
# 2D plot for Dim2 and Dim3 [No V update] i <- 2 j <- 3 ggplot2::ggplot(df_proj_noVupdate, aes(x = df_proj_noVupdate[, i], y = df_proj_noVupdate[, j], col = tissue, shape = species, fill = tissue)) + xlab(paste("OSBF Dim", i)) + ylab(paste("OSBF Dim", j)) + geom_point(size = 1.5) + scale_color_manual(values = sel_colors) + scale_shape_manual(values = c(21:25, 3:7)) + scale_fill_manual(values = sel_colors) + customTheme() + theme(legend.title = element_blank())
# 2D plot for Dim2 and Dim3 [With V update] i <- 2 j <- 3 ggplot2::ggplot(df_proj, aes(x = df_proj[, i], y = df_proj[, j], col = tissue, shape = species, fill = tissue)) + xlab(paste("OSBF Dim", i)) + ylab(paste("OSBF Dim", j)) + geom_point(size = 1.5) + scale_color_manual(values = sel_colors) + scale_shape_manual(values = c(21:25, 3:7)) + scale_fill_manual(values = sel_colors) + customTheme() + theme(legend.title = element_blank())
Save all 2D plots
sel_colors <- c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "mediumturquoise", "#E6AB02", "darkmagenta", "#666666", "black", "darkolivegreen2") outputname <- "human_mouse_blood" finished <- c() for (i in 1:(ncol(df_proj) - 2)) { for (j in 1:(ncol(df_proj) - 2)) { if (i == j) next if (j %in% finished) next ggplot(df_proj, aes(x = df_proj[, i], y = df_proj[, j], col = tissue, shape = species, fill = tissue)) + xlab(paste0("OSBF Dim ", i)) + ylab(paste0("OSBF Dim ", j)) + geom_point(size = 1.5) + scale_color_manual(values = sel_colors) + scale_shape_manual(values = c(21:25, 3:7)) + scale_fill_manual(values = sel_colors) + customTheme(base_size = 12) + theme(legend.title = element_blank()) #ggsave(filename = paste0(outdir, "2Dplots/opt_2Dplot_Dim_", i, "-", j, "_", # outputname, ".pdf"), device = "pdf", # width = 7, height = 7, useDingbats = FALSE) } finished <- c(finished, i) }
Similarly, we can check for other dimensions of the common space. We observe that both clustering and 2D projections plots in the common space with optimized V are similar to those without V update. So we will use the common space with optimized V for future analysis.
Functions to compute tissue specificity ($\tau$) and scaled average expression profile.
# function to compute Tau calc_tissue_specificity <- function(a) { a <- as.matrix(a) b <- a / matrixStats::rowMaxs(a) return(rowSums(1 - b) / (ncol(b) - 1)) } Tau <- lapply(avg_counts, function(x) { calc_tissue_specificity(x)}) avg_counts_scaled <- lapply(avg_counts, function(x) { t(scale(t(x)))}) combine_expr <- list() for (sp in names(avg_counts_scaled)) { x <- as.data.frame(avg_counts_scaled[[sp]]) x[["Tau"]] <- Tau[[sp]] combine_expr[[sp]] <- x }
From the 2D projection plot, we find that along the dimension 2, testis profiles lie on the negative axis and separate from the rest of the tissues. We will plot the relationship between $U_i$ loadings in dimension 2 and the testis expression.
sel_dim <- 2 sel_tissue <- "testis" species <- "Homo_sapiens" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr1 <- expr[, c(paste0(getSpeciesShortName(species), "_", sel_tissue), "Tau", "coef")] colnames(expr1) <- c("tissue_zscore", "Tau", "coef") head(expr1)
Dimension 2 $U_i$ loadings vs expression specificity ($\tau$) for humans
# plot scatter mid <- 0 p1 <- ggplot2::ggplot(expr1, aes(x = Tau, y = coef, col = tissue_zscore)) + theme_bw() + geom_point(size = 0.5) + xlab("Expression specificity") + ylab(paste0("Dim", sel_dim, " Coefficient")) + scale_color_gradient2(midpoint = mid, low = "blue", mid = "white", high = "red", space = "Lab") + scale_y_continuous(limits = c(-1 * max(abs(expr1$coef)), max(abs(expr1$coef))), breaks = seq(-1 * round(max(abs(expr$coef)), 2), round(max(abs(expr$coef)), 2), by = 0.01)) + customTheme() + theme(legend.position = "right", legend.direction = "vertical") + labs(title = paste0(getScientificName(species)), color = "Z-score") + theme(legend.key.size = unit(0.5, "cm"), plot.title = element_text(face = "italic")) p1
Dimension 2 $U_i$ loadings vs expression specificity ($\tau$) for pig.
sel_dim <- 2 sel_tissue <- "testis" species <- "Sus_scrofa" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr1 <- expr[, c(paste0(getSpeciesShortName(species), "_", sel_tissue), "Tau", "coef")] colnames(expr1) <- c("tissue_zscore", "Tau", "coef") # plot scatter mid <- 0 p2 <- ggplot2::ggplot(expr1, aes(x = Tau, y = coef, col = tissue_zscore)) + theme_bw() + geom_point(size = 0.5) + xlab("Expression specificity") + ylab(paste0("Dim", sel_dim, " Coefficient")) + scale_color_gradient2(midpoint = mid, low = "blue", mid = "white", high = "red", space = "Lab") + scale_y_continuous(limits = c(-1 * max(abs(expr1$coef)), max(abs(expr1$coef))), breaks = seq(-1 * round(max(abs(expr$coef)), 2), round(max(abs(expr$coef)), 2), by = 0.01)) + customTheme() + theme(legend.position = "right", legend.direction = "vertical") + labs(title = paste0(getScientificName(species)), color = "Z-score") + theme(legend.key.size = unit(0.5, "cm"), plot.title = element_text(face = "italic")) p2
Download the GO files from: https://doi.org/10.6084/m9.figshare.19633131 We will perform the gene ontology analysis for genes with high coefficients. For GO analysis, we will use shared 1-to-1 orthologs in eight species as the background.
# load GO analysis files # set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" file <- "allwayOrthologs_hsapiens-ptroglodytes-mmulatta-mmusculus-rnorvegicus-btaurus-sscrofa-ggallus_ens94.tsv" orthologs <- read.table(file.path(path, "GOKeggFiles/", file), header = TRUE, sep = "\t") head(orthologs)
We will use the goseq Bioconductor package to perform GO enrichment analysis.
# install packages pkgs <- c("goseq") require_install <- pkgs[!(pkgs %in% row.names(installed.packages()))] if (length(require_install)) { if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("goseq") } suppressPackageStartupMessages({ library(goseq) })
Let us perform GO analysis for top testis-specific genes in Dimension 2. The testis-specific genes have negative loadings.
sel_dim <- 2 sel_tissue <- "testis" top_genes <- 100 # axis positive (pos) or negative (neg) sel_sign <- "neg"
We will perform GO analysis for human first.
species <- "Homo_sapiens" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr1 <- expr[, c(paste0(getSpeciesShortName(species), "_", sel_tissue), "Tau", "coef")] colnames(expr1) <- c("tissue_zscore", "Tau", "coef") if (sel_sign == "neg") { cat("\n selecting negative loadings") expr1_selsign <- expr1[expr1$coef < 0, ] } else { cat("\n selecting positive loadings") expr1_selsign <- expr1[expr1$coef >= 0, ] } expr1_selsign$score <- expr1_selsign$Tau * abs(expr1_selsign$coef) expr1_selsign$rank <- rank(-1 * expr1_selsign$score) expr1_selsign <- expr1_selsign[order(expr1_selsign$rank), ] genes_fg <- row.names(expr1_selsign[expr1_selsign$rank <= top_genes, ]) if (species == "Homo_sapiens") { cat("\nUsing orthologs (human IDs) as background") genes_bg <- orthologs$hsapiens } if (species == "Mus_musculus") { cat("\nUsing orthologs (mouse IDs) as background") genes_bg <- orthologs$mmusculus } genes_bg <- genes_bg[!genes_bg %in% genes_fg] genome <- "hg38" total_genes <- unique(c(genes_fg, genes_bg)) up_genes <- as.integer(total_genes %in% genes_fg) names(up_genes) <- total_genes
# set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" # load("hg38 length data") load(paste0(path, "GOKeggFiles/hg38_length.EnsemblV94.RData")) lengthData.up <- lengthData[names(up_genes)] # load("hg38 EnsembleID to GO data") load(paste0(path, "GOKeggFiles/hg38_geneID2GO.EnsemblID2GO.EnsembleV94.Robj")) pwf <- goseq::nullp(up_genes, bias.data = lengthData.up, plot.fit = FALSE) go <- goseq::goseq(pwf, "hg38", "ensGene", gene2cat = geneID2GO, test.cats = c("GO:BP")) go.sub <- go[go$ontology == "BP", ] go.sub$padj <- p.adjust(go.sub$over_represented_pvalue, method = "BH") go.sub[["ratio"]] <- round(go.sub[["numDEInCat"]] / go.sub[["numInCat"]], 4) go.sub <- go.sub[with(go.sub, order(padj, decreasing = c(FALSE))), ] go.sub$over_represented_pvalue <- NULL go.sub$under_represented_pvalue <- NULL
head(go.sub)
Barplot with top human GO terms and their p-value.
# GO enrichment plot for human go_out <- head(go.sub, n = 8) go_out$padj <- as.numeric(go_out$padj) go_out$term <- factor(go_out$term, levels = go_out$term) breaks <- round(c(0, 1 / 4, 2 / 4, 3 / 4, 1) * max(go_out[["ratio"]]), 2) go_plot <- ggplot2::ggplot(go_out, aes(x = term, y = ratio, fill = padj)) + geom_col() + scale_y_continuous(expand = c(0, 0), breaks = breaks, limits = c(0, max(go_out[["ratio"]] + 0.05))) + scale_x_discrete() + coord_flip() + scale_color_gradient(low = "blue", high = "red") + ylab(paste0("Ratio of genes in GO category (", getScientificName(species), ")")) + xlab("") + customTheme() + theme(legend.position = "right", legend.direction = "vertical", plot.margin = unit(c(10, 5, 5, 5), "mm")) go_plot
GO analysis for mouse.
species <- "Mus_musculus" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr1 <- expr[, c(paste0(getSpeciesShortName(species), "_", sel_tissue), "Tau", "coef")] colnames(expr1) <- c("tissue_zscore", "Tau", "coef") if (sel_sign == "neg") { cat("\n selecting negative loadings") expr1_selsign <- expr1[expr1$coef < 0, ] } else { cat("\n selecting positive loadings") expr1_selsign <- expr1[expr1$coef >= 0, ] } expr1_selsign$score <- expr1_selsign$Tau * abs(expr1_selsign$coef) expr1_selsign$rank <- rank(-1 * expr1_selsign$score) expr1_selsign <- expr1_selsign[order(expr1_selsign$rank), ] genes_fg <- row.names(expr1_selsign[expr1_selsign$rank <= top_genes, ]) if (species == "Homo_sapiens") { cat("\nUsing orthologs (human IDs) as background") genes_bg <- orthologs$hsapiens } if (species == "Mus_musculus") { cat("\nUsing orthologs (mouse IDs) as background") genes_bg <- orthologs$mmusculus } genes_bg <- genes_bg[!genes_bg %in% genes_fg] genome <- "mm10" total_genes <- unique(c(genes_fg, genes_bg)) up_genes <- as.integer(total_genes %in% genes_fg) names(up_genes) <- total_genes
# set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" # load("mm10 length data") load(paste0(path, "/GOKeggFiles/mm10_length_EnsemblV94.RData")) lengthData.up <- lengthData[names(up_genes)] # load("mm10 EnsembleID to GO data") load(paste0(path, "/GOKeggFiles/mm10_geneID2GO.EnsemblID2GO.EnsembleV94.Robj")) pwf <- goseq::nullp(up_genes, bias.data = lengthData.up, plot.fit = FALSE) go <- goseq::goseq(pwf, "mm10", "ensGene", gene2cat = mouse_geneID2GO, test.cats = c("GO:BP")) go.sub <- go[go$ontology == "BP", ] go.sub$padj <- p.adjust(go.sub$over_represented_pvalue, method = "BH") go.sub[["ratio"]] <- round(go.sub[["numDEInCat"]] / go.sub[["numInCat"]], 4) go.sub <- go.sub[with(go.sub, order(padj, decreasing = c(FALSE))), ] go.sub$over_represented_pvalue <- NULL go.sub$under_represented_pvalue <- NULL
head(go.sub)
Barplot with top mouse GO terms and their p-value.
# GO enrichment plot for mouse go_out <- head(go.sub, n = 8) go_out$padj <- as.numeric(go_out$padj) go_out$term <- factor(go_out$term, levels = go_out$term) breaks <- round(c(0, 1 / 4, 2 / 4, 3 / 4, 1) * max(go_out[["ratio"]]), 2) go_plot <- ggplot2::ggplot(go_out, aes(x = term, y = ratio, fill = padj)) + geom_col() + scale_y_continuous(expand = c(0, 0), breaks = breaks, limits = c(0, max(go_out[["ratio"]] + 0.05))) + scale_x_discrete() + coord_flip() + scale_color_gradient(low = "blue", high = "red") + ylab(paste0("Ratio of genes in GO category (", getScientificName(species), ")")) + xlab("") + customTheme() + theme(legend.position = "right", legend.direction = "vertical", plot.margin = unit(c(10, 5, 5, 5), "mm")) go_plot
The OSBF's $V$ represents the space "shared by all species". It is a space independent of species representing relationships between the column phenotypes. In all tissues/cell types, essential cellular processes related to translation, cellular transport, and cell cycle are common in all species. Since this is the most dominant shared property across all columns and species, our dimension 1 represents this. Next, there are commonalities between the same cell types/tissues across species. The shared properties of the same tissue/cell type across species are not as prominent as the basic cellular processes, and hence reflected in the rest of the dimensions of OSBF. We will now explore individual dimensions and identify the genes with significant contribution to each of these dimensions.
First, we will identify those genes with significant contribution to different tissue types. We will first create a null distribution of scores for the coefficient and identify genes of interest with respect to the null.
We will create random average counts based on shuffled reads. Then we will
apply TPM normalization for these counts using normalizeTPM
function
from the SBF
package.
Download raw counts files from
https://doi.org/10.6084/m9.figshare.20216858
# set seed s1 <- 135 s2 <- 13835 species <- c("Homo_sapiens", "Pan_troglodytes", "Macaca_mulatta", "Mus_musculus", "Rattus_norvegicus", "Bos_taurus", "Sus_scrofa", "Gallus_gallus") species_short <- sapply(species, getSpeciesShortName) tissues <- c("brain", "heart", "kidney", "liver", "lung", "testis") # set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" counts_list_shuff <- metadata_list_shuff <- avg_counts_shuff <- list() for (sp in species) { # reading raw counts counts <- read.table(paste0(path, "counts/", sp, "_rawcounts.tsv"), header = TRUE, sep = "\t", row.names = 1) info <- data.table::tstrsplit(colnames(counts), "_") metadata <- data.frame(project = info[[1]], species = info[[2]], tissue = info[[3]], gsm = info[[4]], name = colnames(counts), stringsAsFactors = FALSE) metadata$ref <- seq_len(nrow(metadata)) metadata$key <- paste0(metadata$species, "_", metadata$tissue) metadata$tissue_factor <- factor(metadata$tissue) counts_avg <- calcAvgCounts(counts, metadata) cnames <- colnames(counts_avg) rnames <- row.names(counts_avg) set.seed(s1) counts_avg <- as.data.frame(apply(counts_avg, 2, sample)) set.seed(s2) counts_avg <- as.data.frame(t(apply(counts_avg, 1, sample))) colnames(counts_avg) <- cnames row.names(counts_avg) <- rnames # normalize the shuffled counts to log TPM # set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" gene_length <- read.table(paste0(path, "ensembl94_annotation/", sp, "_genelength.tsv"), sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE) if (!all(row.names(counts_avg) %in% row.names(gene_length))) stop("Error") gene_length$Length <- gene_length$Length / 1e3 gene_length <- gene_length[row.names(counts_avg), , drop = TRUE] names(gene_length) <- row.names(counts_avg) counts_tpm <- normalizeTPM(rawCounts = counts_avg, gene_len = gene_length) min_tpm <- 1 counts_tpm[counts_tpm < min_tpm] <- 1 counts_tpm <- log2(counts_tpm) info <- data.table::tstrsplit(colnames(counts_tpm), "_") metadata <- data.frame( species = info[[1]], tissue = info[[2]], name = colnames(counts_tpm), stringsAsFactors = FALSE) metadata$key <- paste0(metadata$species, "_", metadata$tissue) avg_counts_shuff[[sp]] <- calcAvgCounts(counts_tpm, metadata) # write.table(avg_counts_shuff[[sp]],file=paste0(mainDir, "shuffledavgCounts/avg_logTPM_", # species,"_SHUFF_counts_seed",z,"_",z1,".tsv"), # sep="\t",quote=F,col.names=NA) counts_list_shuff[[sp]] <- counts_tpm metadata_list_shuff[[sp]] <- metadata } sapply(counts_list_shuff, dim) # remove zero counts avg_counts_shuff <- lapply(avg_counts_shuff, removeZeros) sapply(avg_counts_shuff, dim)
Depending upon the shuffled counts this could take a while. Decrease
tol
for lower factorization error.
cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") # decrease tol to minimize error osbf_shuf <- SBF(avg_counts_shuff, transform_matrix = TRUE, orthogonal = TRUE, tol = 1e-2) cat(format(Sys.time(), "%a %b %d %X %Y"), "\n") osbf_shuf$error
Compute Tau and scaled expression for the null datasets
Tau_null <- lapply(avg_counts_shuff, function(x) {calc_tissue_specificity(x)}) avg_counts_shuff_scaled <- lapply(avg_counts_shuff, function(x) { t(scale(t(x))) }) combine_expr_null <- list() for (sp in names(avg_counts_shuff_scaled)) { x <- as.data.frame(avg_counts_shuff_scaled[[sp]]) x[["Tau"]] <- Tau_null[[sp]] combine_expr_null[[sp]] <- x }
Now let us find genes with significant loadings in dimension 2. We will use the emphirical null generated from the shuffled counts to get the p-value.
sel_dim <- 2 sel_tissue <- "testis" # axis positive (pos) or negative (neg) sel_sign <- "neg" species <- "Homo_sapiens" species_short <- "hsapiens" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr1 <- expr[, c(paste0(species_short, "_", sel_tissue), "Tau", "coef")] colnames(expr1) <- c("tissue_zscore", "Tau", "coef") # null loadings for the same dimensions expr_null <- combine_expr_null[[species]] null_u <- osbf_shuf$u[[species]] expr_null[["coef"]] <- null_u[, sel_dim, drop = TRUE] expr1_null <- expr_null[, c(paste0(species_short, "_", sel_tissue), "Tau", "coef")] if (sel_sign == "pos") { expr1 <- expr1[expr1$coef >= 0, ] expr1_null <- expr1_null[expr1_null$coef >= 0, ] } else if (sel_dim == "neg") { expr1 <- expr1[expr1$coef < 0, ] expr1_null <- expr1_null[expr1_null$coef < 0, ] } expr1$score <- expr1$Tau * abs(expr1$coef) expr1$rank <- rank(-1 * expr1$score) expr1 <- expr1[order(expr1$rank), ] expr1_null$score <- expr1_null$Tau * abs(expr1_null$coef) expr1$pvalue <- sapply(expr1$score, function(x) { sum(as.integer(expr1_null$score > x)) / length(expr1_null$score) }) head(expr1)
Find the number of genes with significant p-value
# cut off for the p-value alpha <- 1e-3 summary(expr1$pvalue <= alpha)
Lets check the top 10 genes for dimension 2. Download annotation files from https://doi.org/10.6084/m9.figshare.20216876
# set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" gene_info <- read.table(paste0(path, "ensembl94_annotation/", species_short, "_genes_completeinfo.tsv"), sep = "\t", header = TRUE, quote = "\"") gene_info <- gene_info[!duplicated(gene_info$ensembl_gene_id), ] gene_info <- gene_info[gene_info$ensembl_gene_id %in% row.names(expr1), ] row.names(gene_info) <- gene_info$ensembl_gene_id gene_info <- gene_info[row.names(expr1), ] expr1$gene_name <- gene_info$external_gene_name expr1$biotype <- gene_info$gene_biotype head(expr1, n = 10)
We see that that the top genes are well studied genes with testis-specific expression.
Lets plot the $U_1$ loadings vs expression specificity
Lets plot for human
sel_dim <- 1 species <- "Homo_sapiens" expr <- combine_expr[[species]] osbf_coef <- osbf$u[[species]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] # plot scatter mid <- 0.5 p1 <- ggplot2::ggplot(expr, aes(x = Tau, y = coef, col = Tau)) + theme_bw() + geom_point(size = 0.5) + xlab("Expression specificity") + ylab(paste0("Dim", sel_dim, " Coefficient")) + scale_color_gradient2(midpoint = mid, low = "blue", mid = "white", high = "red", space = "Lab") + customTheme() + theme(legend.position = "right", legend.direction = "vertical") + labs(title = getScientificName(species), color = "Tau") + theme(legend.key.size = unit(0.5, "cm"), plot.title = element_text(face = "italic")) p1
Now let us find genes with significant loadings in dimension 1. We will use the empirical null generated from the shuffled counts to get the p-value.
sel_dim <- 1 combine_expr_Dim1 <- list() for (sp in names(combine_expr)) { expr <- combine_expr[[sp]] osbf_coef <- osbf$u[[sp]] expr[["coef"]] <- osbf_coef[, sel_dim, drop = TRUE] expr_null <- combine_expr_null[[sp]] null_u <- osbf_shuf$u[[sp]] expr_null[["coef"]] <- null_u[, sel_dim, drop = TRUE] expr$score <- abs(expr$coef) expr$rank <- rank(-1 * expr$score) expr_null$score <- abs(expr_null$coef) expr[[paste0("Dim", sel_dim, "_pval")]] <- sapply(expr$score, function(x) { sum(as.integer(expr_null$score > x)) / length(expr_null$score) }) combine_expr_Dim1[[sp]] <- expr }
# cut off for the p-value alpha <- 1e-3 sapply(combine_expr_Dim1, function(x) {summary(x$Dim1_pval <= alpha)})
Now lets find the details of these genes.
# set the path to the working directory. Change this accordingly path <- "~/Dropbox/0.Analysis/0.paper/" file <- "allwayOrthologs_hsapiens-ptroglodytes-mmulatta-mmusculus-rnorvegicus-btaurus-sscrofa-ggallus_ens94.tsv" orthologs <- read.table(file.path(path, "GOKeggFiles/", file), header = TRUE, sep = "\t") Dim1_genes <- list() for (sp in names(combine_expr_Dim1)) { x <- combine_expr_Dim1[[sp]][combine_expr_Dim1[[sp]]$Dim1_pval <= alpha, ] x <- x[order(x$rank), c("rank", "score", "Tau", "Dim1_pval"), drop = FALSE] # get gene details gene_info <- read.table(paste0(path, "ensembl94_annotation/", getSpeciesShortName(sp), "_genes_completeinfo.tsv"), sep = "\t", header = TRUE, quote = "\"") gene_info <- gene_info[!duplicated(gene_info$ensembl_gene_id), ] gene_info <- gene_info[gene_info$ensembl_gene_id %in% row.names(x), ] row.names(gene_info) <- gene_info$ensembl_gene_id gene_info <- gene_info[row.names(x), ] gene_info <- gene_info[, c("ensembl_gene_id", "external_gene_name", "gene_biotype", "chromosome_name")] colnames(gene_info) <- c("id", "gene_name", "biotype", "chr") species_allway_orthogs <- orthologs[, getSpeciesShortName(sp), drop = TRUE] ortholog_status <- rep(FALSE, nrow(x)) ortholog_status[row.names(x) %in% species_allway_orthogs] <- TRUE x$allwayOrtholog <- as.factor(ortholog_status) df <- cbind(gene_info, x) row.names(df) <- NULL Dim1_genes[[sp]] <- df }
Top Dim1 human genes
head(Dim1_genes[["Homo_sapiens"]])
Top Dim1 mouse genes
head(Dim1_genes[["Mus_musculus"]])
Summary stats of dimension 1 genes
cnames <- c("Dim", "species", "nTotal", "nCoding", "nMt", "nAllwayOrtholog") summary_stats <- data.frame(matrix(NA, nrow = 0, ncol = length(cnames))) colnames(summary_stats) <- cnames for (sp in names(Dim1_genes)) { df <- Dim1_genes[[sp]] stats <- data.frame(matrix(0L, nrow = 1, ncol = length(cnames))) colnames(stats) <- cnames stats$Dim <- "Dim1" stats$species <- getSpeciesShortName(sp) stats$nTotal <- nrow(df) stats$nCoding <- round(nrow(df[df$biotype == "protein_coding", , drop = FALSE]) * 100 / nrow(df), 2) stats$nMt <- round(nrow(df[tolower(df$chr) == "mt", , drop = FALSE]) * 100 / nrow(df), 2) stats$nAllwayOrtholog <- round(nrow(df[df$allwayOrtholog == TRUE, , drop = FALSE]) * 100 / nrow(df), 2) summary_stats <- rbind(summary_stats, stats) } summary_stats
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.