R/correlations.R

Defines functions select_genes_int store_micro write_cor ensembl2symbol plot_cor sign_cor relevant filter_values readSGCCA boots_corr plot_single_cor pathsPerMicro

Documented in boots_corr ensembl2symbol filter_values pathsPerMicro plot_cor plot_single_cor readSGCCA relevant select_genes_int sign_cor store_micro write_cor

#' Enrichment by microorganisms
#'
#' Function to split by microorganism and compare if they are enriched in some
#' pathways
#' @param all_genes All the genes that are found relevant (weight != 0)
#' @param x The table of microoganisms, genes, correlations and adjusted p-values.
#' @return A list of tables with the pathways enriched for every microorganism
#' @export
pathsPerMicro <- function(x, all_genes){
  genesOrig <- trimVer(names(all_genes))
  x[, "Gene"] <- trimVer(x[, "Gene"])
  perMicro <- split(x, x[, "Microorganism"])
  if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
    stop("Install Org.Hs.eg.db from Bioconductor", call. = FALSE)
  }
  if (!requireNamespace("reactome.db", quietly = TRUE)) {
    stop("Install reatome.db from Bioconductor", call. = FALSE)
  }
  entrezID <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys = genesOrig, keytype = "ENSEMBL",
                     column = "ENTREZID")
  entrezID <- entrezID[!is.na(entrezID)]
  paths2genes <- access_reactome()
  genes <- unlist(paths2genes, use.names = FALSE)
  pathways <- rep(names(paths2genes), lengths(paths2genes))
  message("Calculating the enrichment")
  T2G <- cbind.data.frame(pathways, genes)
  lapply(perMicro, function(x) {
    significant <- x$Gene
    relevant <- entrezID[significant]
    relevant <- relevant[!is.na(relevant)]
    if (length(relevant) <= 10) {
      return(NULL)
    }
    enrich <- clusterProfiler::enricher(gene = relevant ,
                                        # universe = entrezID,
                                        TERM2GENE = T2G)
    as.data.frame(enrich)
    enrich <- as.data.frame(enrich)
    if (nrow(enrich) >= 1) {
      enrich$Description <- AnnotationDbi::mapIds(reactome.db::reactome.db, keys = rownames(enrich),
                                   keytype = "PATHID", column = "PATHNAME")
    }
    enrich
  })
}
#' Plot correlations
#'
#' @param x Gene expression matrix
#' @param gene Name of the gene to plot
#' @param y Microorganism matrix
#' @param microorganism Name of the microorganism
#' @param colr Factor to use for colors
#' @param case Factor to use for points shape
#' @param cor_val Title of the correlation (usually the correlation value)
#' @export
plot_single_cor <- function(x, gene, y, microorganism, colr, case, cor_val) {
  genes <- trimVer(gene)
  symbol <- tryCatch({AnnotationDbi::mapIds(
    org.Hs.eg.db::org.Hs.eg.db, keys = genes, keytype = "ENSEMBL",
    column = "SYMBOL"
  )},  error = function(e){NA})

  if (is.na(symbol)) {
    return(NA)
  }

  x_s <- x[gene, ]
  x_s[x_s == 0] <- NA

  y_s <- y[microorganism, ]
  y_s[y_s == 0] <- NA
  pch_start <- 15

  # main <- cor(x_s, y_s, method = "spearman", use = "pairwise.complete.obs")
  plot(x_s, y_s, xlab = paste(symbol, collapse = " "), ylab = microorganism, main = cor_val, col = colr,
       pch = pch_start + as.numeric(case))
  legend("bottomleft", fill = as.factor(levels(colr)), legend = levels(colr))
  legend("topright", pch = pch_start + as.numeric(as.factor(levels(case))),
         legend = levels(case))
  # legend("topleft", lty = "solid", col = as.factor(levels(bg)),
  # legend = levels(bg), lwd = 2)
}


#' Bootstrapping to asses how probable is to have such enrichment in those p-values matrix
#' @param pvalues Vector of pvalues
#' @param component The named component different from 0 from a SGCCA object
#' @param iter The number of permutations
#' @return The numeric pvalue of the bootstrap.
#' @export
boots_corr <- function(pvalues, component, iter = 10000) {
  b <- vapply(seq_len(iter), function(x){
    sel <- sample(x = seq_len(ncol(pvalues)), size = length(component))
    sum(pvalues[, sel] < 0.05, na.rm = TRUE)
  }, numeric(1L))
  n <- sum(pvalues[, colnames(pvalues) %in% names(component)] < 0.05, na.rm = TRUE)
  sum(b >= n)/length(b)
}

#' Read component
#'
#' @param file The .RDS file were the sgcca object is stored
#' @param set The set for which we want the component
#' @param component The number of the component we want.
#' @return The vector of the component without those variables with weight 0
#' @export
readSGCCA <- function(file, set = 1, component = 1) {
  # Load data from all the patients
  sgcca.centroid <- readRDS(file)

  # Find outliers/important genes
  comp1 <- sgcca.centroid$a[[set]][, component]
  outliers <- comp1 != 0
  comp1[outliers]
}

#' Filter correlations
#'
#' Filter the correlations based on the pvalue
#' @param comp1 A component with names
#' @param cors A matrix with the correlations
#' @param pval A matrix with the pvalues
#' @param threshold The numeric threshold of the filter
#' @return A list with matrices of correlations and pvalues that pass the filter
#' @export
#' @seealso [sign_cor()], [relevant()]
filter_values <- function(comp1, cors, pval, threshold) {

  keepGenes <- colnames(cors) %in% names(comp1)
  cors <- cors[, keepGenes]
  pval <- pval[, keepGenes]

  cors <- cors[, !is.na(colnames(cors))]
  pval <- pval[, !is.na(colnames(pval))]

  message("Dimensions ", paste0(dim(cors), collapse = ", "))

  # Genes below the threshold
  keepCols <- apply(pval, 2, function(x){any(x < threshold)})
  keepRows <- apply(pval, 1, function(x){any(x < threshold)})

  keepCols[is.na(keepCols)] <- FALSE
  keepRows[is.na(keepRows)] <- FALSE

  if (sum(keepCols) == 0 || sum(keepRows) == 0) {
    stop("No relevant correlations with this threshold")
  }

  cors <- cors[keepRows, keepCols]
  pval <- pval[keepRows, keepCols]
  if (is.null(pval) || is.null(cors)) {
    stop("No relevant correlations with this threshold")
  }
  message("Dimensions ", paste0(dim(cors), collapse = ", "))

  list(cors = cors, pval = pval)
}



#' List the correlations
#'
#' Prepare the correlations to be shared
#' @inheritParams filter_values
#' @return A data.frame with microorganisms, genes, their correlations and the
#' pvalue
#' @export
#' @seealso [filter_values()], [sign_cor()]
relevant <- function(comp1, cors, pval, threshold = 0.05) {
  l <- filter_values(comp1, cors, pval, threshold)
  pval <- l$pval
  cors <- l$cors
  if (ncol(pval) == 0) {
    stop("No relevant correlations with this threshold")
  }
  ind <- as.data.frame(which(pval < threshold, arr.ind = TRUE),
                       stringAsFactors = FALSE)
  rownames(ind) <- seq_len(nrow(ind)) # TODO test
  cor_pval <- apply(ind, 1, function(x){
    c("cors" = cors[x[1], x[2]],
      "pvalue" = pval[x[1], x[2]])
  })
  ind$row <- rownames(cors)[ind$row]
  ind$col <- colnames(cors)[ind$col]
  ind <- cbind.data.frame(ind, t(cor_pval))
  colnames(ind) <- c("Microorganism", "Gene", "Correlation", "pvalue")

  ind <- ind[!duplicated(ind), ]
  ind <- ind[order(ind$Microorganism, ind$pvalue, decreasing = c(TRUE, FALSE)), ]
  rownames(ind) <- seq_len(nrow(ind))
  ind
}


#' List the correlations
#'
#' Prepare the correlations to be shared (without filtering them)
#' @inheritParams filter_values
#' @return A data.frame with microorganisms, genes, their correlations and the
#' pvalue
#' @export
#' @seealso [filter_values()], [relevant()]
sign_cor <- function(cors, pval, threshold = 0.05) {
  if (sum(pval < threshold, na.rm = TRUE) == 0) {
    stop("No relevant correlations with this threshold")
  }

  ind <- as.data.frame(which(pval < threshold, arr.ind = TRUE),
                       stringAsFactors = FALSE)
  rownames(ind) <- seq_len(nrow(ind)) # TODO test
  cor_pval <- apply(ind, 1, function(x){
    c("cors" = cors[x[1], x[2]],
      "pvalue" = pval[x[1], x[2]])
  })
  ind$row <- rownames(cors)[ind$row]
  ind$col <- colnames(cors)[ind$col]
  ind <- cbind(ind, t(cor_pval))
  colnames(ind) <- c("Microorganism", "Gene", "Correlation", "pvalue")

  ind <- ind[!duplicated(ind), ]
  ind <- ind[order(ind$Microorganism, ind$pvalue, decreasing = c(TRUE, FALSE)), ]
  rownames(ind) <- seq_len(nrow(ind))
  ind
}

#' Plot a correlation with ggplot
#'
#' Use filter_values to decide how to and create an html file
#' @param file The name of the file with the model
#' @inheritParams filter_values
#' @param label Label for the output html file
#' @note Expects genes in rows and species at the columns in `cors` and
#' `pvalue`
#' @return Create a html file at Figures/heatmap...html
#' @export
plot_cor <- function(file, cors, pval, threshold, label) {
  if (!requireNamespace("heatmaply", quietly = TRUE)) {
    stop("Install heatmaply from CRAN", call. = FALSE)
  }
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("Install ggplot2 from CRAN", call. = FALSE)
  }
  l <- filter_values(file, cors, pval, threshold)
  cors <- l$cors

  cors <- cors[!duplicated(rownames(cors)), ]
  colors_g <- ggplot2::scale_fill_gradient2(low = "blue", high = "red",
                                            midpoint = 0, limits = c(-1, 1))
  heatmaply::heatmaply(cors, name = "Cor",
            ylab = "Genes",
            xlab = "Genus",
            scale_fill_gradient_fun = colors_g,
            file = paste0("Figures/heatmap", label,".html"))

}


#' Translate ensmbl to symbols
#'
#' @param x A data.frame with a column named Gene
#' @return The same data.frame with gene names without duplications and empty
#' symbols
#' @export
ensembl2symbol <- function(x) {
  all_samples_symbol <- x
  all_samples_symbol$Gene <- trimVer(all_samples_symbol$Gene)
  all_samples_symbol$Gene <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys = all_samples_symbol$Gene,
                                    keytype = "ENSEMBL", column = "SYMBOL")
  all_samples_symbol <- all_samples_symbol[!is.na(all_samples_symbol$Gene), ]
  all_samples_symbol[!duplicated(all_samples_symbol), ]
}

#' Translate symbols and store
#'
#' @inheritParams ensembl2symbol
#' @param file The name of the file were to store them
#' @seealso [ensembl2symbol()]
#' @return NULL
#' @export
write_cor <- function(x, file){
  all_samples_symbol <- ensembl2symbol(x)
  write.csv(all_samples_symbol, file = file, row.names = FALSE, na = "")
}


#' Store result by microorganism
#'
#' @param x Name of microorganism
#' @param o2 Filter samples
#' @param label Label for the files
#' @return NULL
#' @export
store_micro <- function(x, o2, label) {
  if (nrow(o2[[x]]) > 1 && !is.null(o2[[x]])) {
    write.csv(o2[[x]], row.names = FALSE, na = "",
              file = paste0(x, label, ".csv"))
  }
}

#' Filter genes
#'
#' @param file Name of the file with folder
#' @param expr Matrix of expression
#' @return A Matrix with the filtered genes
#' @export
select_genes_int <- function(file, expr) {
  # Load data from all the patients

  comp1 <- readSGCCA(file)

  keepGenes <- rownames(expr) %in% names(comp1)
  expr[keepGenes, ]
}
llrs/integration-helper documentation built on Sept. 24, 2021, 10:57 a.m.