Case study 1: Distinguishing vertical and horizontal pleiotropy for SNP-protein associations

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

This RMarkdown document demonstrates how key elements from the notebook for case study 1 in the EpiGraphDB paper can be achieved using the R package. For detailed explanations of the case study please refer to the paper or the case study notebook.

Context

Mendelian randomization (MR), a technique to evaluate the causal role of modifiable exposures on a health outcome using a set of genetic instruments, tacitly assumes that a genetic variant (e.g. SNP) is only related to an outcome of interest through an exposure (i.e. the "exclusion restriction criterion"). Horizontal pleiotropy, where a SNP is associated with multiple phenotypes independently of the exposure of interest, potentially violates this assumption delivering misleading conclusions. In contrast, vertical pleiotropy, where a SNP is associated with multiple phenotypes on the same biological pathway, does not violate this assumption.

Here, we use external evidence on biological pathways and protein-protein interactions to assess the pleiotropic profile for a genetic variant based on the genes (and proteins) with which that variant is associated. In a graph representation, we will show that:

This case study goes as follows:

library("magrittr")
library("dplyr")
library("purrr")
library("glue")
library("igraph")
library("epigraphdb")

Here we configure the parameters used in the case study example. In this example we will look at the variant rs12720356, a SNP located in chromosome 19 that has been associated with Crohn’s disease and psoriasis.**

SNP <- "rs12720356"

GENELIST <- c("ZGLP1", "FDX1L", "MRPL4", "ICAM5", "TYK2", "GRAP2", "KRI1", "TMED1", "ICAM1")

PPI_N_INTERMEDIATE_PROTEINS <- 1

Mapping genes to proteins

The first step of the analysis is to map each of these genes to their protein product.

get_gene_protein <- function(genelist) {
  endpoint <- "/mappings/gene-to-protein"
  params <- list(
    gene_name_list = genelist %>% I()
  )
  r <- query_epigraphdb(
    route = endpoint,
    params = params,
    mode = "table",
    method = "POST"
  )
  protein_df <- r
  if (nrow(protein_df) > 0) {
    res_df <- protein_df %>%
      select(gene_name = `gene.name`, uniprot_id = `protein.uniprot_id`)
  } else {
    res_df <- tibble() %>% set_names(c("gene_name", "uniprot_id"))
  }

  res_df
}

gene_protein_df <- get_gene_protein(genelist = GENELIST)
gene_protein_df

Identifying the involvement in the same biological pathways

For each protein we retrieve the pathways they are involved in.

get_protein_pathway <- function(gene_protein_df) {
  endpoint <- "/protein/in-pathway"
  params <- list(
    uniprot_id_list = gene_protein_df %>% pull(`uniprot_id`) %>% I()
  )
  df <- query_epigraphdb(route = endpoint, params = params, mode = "table", method = "POST")

  if (nrow(df) > 0) {
    res_df <- gene_protein_df %>%
      select(`uniprot_id`) %>%
      left_join(df, by = c("uniprot_id"))
  } else {
    res_df <- gene_protein_df %>%
      select(`uniprot_id`) %>%
      mutate(pathway_count = NA_integer_, pathway_reactome_id = NA_character_)
  }
  res_df <- res_df %>%
    mutate(
      pathway_count = ifelse(is.na(pathway_count), 0L, as.integer(pathway_count)),
      pathway_reactome_id = ifelse(is.na(pathway_reactome_id), c(), pathway_reactome_id)
    )

  res_df
}
pathway_df <- get_protein_pathway(gene_protein_df = gene_protein_df)
pathway_df

Now for each pair of proteins we match the pathways they have in common.

get_shared_pathway <- function(pathway_df) {
  # For the protein-pathway data
  # Get protein-protein permutations where they share pathways

  per_permutation <- function(pathway_df, permutation) {
    df <- pathway_df %>% filter(uniprot_id %in% permutation)
    primary_pathway <- pathway_df %>%
      filter(uniprot_id == permutation[1]) %>%
      pull(pathway_reactome_id) %>%
      unlist()
    assoc_pathway <- pathway_df %>%
      filter(uniprot_id == permutation[2]) %>%
      pull(pathway_reactome_id) %>%
      unlist()
    intersect(primary_pathway, assoc_pathway)
  }

  pairwise_permutations <- pathway_df %>%
    pull(`uniprot_id`) %>%
    gtools::permutations(n = length(.), r = 2, v = .)
  shared_pathway_df <- tibble(
    protein = pairwise_permutations[, 1],
    assoc_protein = pairwise_permutations[, 2]
  ) %>%
    mutate(
      shared_pathway = map2(`protein`, `assoc_protein`, function(x, y) {
        per_permutation(pathway_df = pathway_df, permutation = c(x, y))
      }),
      combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
        comb <- sort(c(x, y))
        paste(comb, collapse = ",")
      }),
      count = map_int(`shared_pathway`, function(x) na.omit(x) %>% length()),
      connected = count > 0
    )

  shared_pathway_df
}
shared_pathway_df <- get_shared_pathway(pathway_df)
n_pairs <- length(shared_pathway_df %>% filter(count > 0))
print(glue::glue("Num. shared_pathway pairs: {n_pairs}"))
shared_pathway_df %>% arrange(desc(count))

We can further query EpiGraphDB regarding the detailed pathway information using GET /meta/nodes/Pathway/search.

get_pathway_info <- function(reactome_id) {
  endpoint <- "/meta/nodes/Pathway/search"
  params <- list(id = reactome_id)
  df <- query_epigraphdb(route = endpoint, params = params, mode = "table")
  df
}

pathway <- shared_pathway_df %>%
  pull(shared_pathway) %>%
  unlist() %>%
  unique()

pathway_info <- pathway %>% map_df(get_pathway_info)
pathway_info %>% print()

In order to extract protein groups from the shared pathways, the last step for this query is to convert the shared pathway data into a graph where

We then count the number of nodes in each connected community and plot the graph.

protein_df_to_graph <- function(df) {
  df_connected <- df %>%
    filter(connected) %>%
    distinct(`combination`, .keep_all = TRUE)
  nodes <- df %>%
    pull(protein) %>%
    unique()
  graph <- igraph::graph_from_data_frame(
    df_connected,
    directed = FALSE, vertices = nodes
  )
  graph$layout <- igraph::layout_with_kk
  graph
}

graph_to_protein_groups <- function(graph) {
  graph %>%
    igraph::components() %>%
    igraph::groups() %>%
    tibble(group_member = .) %>%
    mutate(group_size = map_int(`group_member`, length)) %>%
    arrange(desc(group_size))
}

pathway_protein_graph <- shared_pathway_df %>% protein_df_to_graph()
pathway_protein_groups <- pathway_protein_graph %>% graph_to_protein_groups()
pathway_protein_groups %>% str()
plot(pathway_protein_graph)

Retrieving shared protein-protein interactions

We can complement the information given by pathway ontologies with data of interactions between proteins. For each pair of proteins, we retrieve shared protein-protein interactions, either a direct interaction or an interaction where there's one mediator protein in the middle (see parameters section).

get_ppi <- function(gene_protein_df, n_intermediate_proteins = 0) {
  endpoint <- "/protein/ppi/pairwise"
  params <- list(
    uniprot_id_list = gene_protein_df %>% pull("uniprot_id") %>% I(),
    n_intermediate_proteins = n_intermediate_proteins
  )
  df <- query_epigraphdb(
    route = endpoint,
    params = params,
    mode = "table",
    method = "POST"
  )

  if (nrow(df) > 0) {
    res_df <- gene_protein_df %>%
      select(protein = uniprot_id) %>%
      left_join(df, by = c("protein"))
  } else {
    res_df <- gene_protein_df %>%
      select(protein = uniprot_id) %>%
      mutate(assoc_protein = NA_character_, path_size = NA_integer_)
  }
  res_df
}
ppi_df <- get_ppi(gene_protein_df, n_intermediate_proteins = PPI_N_INTERMEDIATE_PROTEINS)
ppi_df

Like in the query for pathway data, we convert and analyse these results as a graph.

get_shared_ppi <- function(ppi_df) {
  per_row <- function(ppi_df, x, y) {
    res <- ppi_df %>%
      filter(
        `protein` == x,
        `assoc_protein` == y
      ) %>%
      nrow() %>%
      `>`(0)
    res
  }

  pairwise_permutations <- ppi_df %>%
    pull(`protein`) %>%
    unique() %>%
    gtools::permutations(n = length(.), r = 2, v = .)
  shared_ppi_df <- tibble(
    protein = pairwise_permutations[, 1],
    assoc_protein = pairwise_permutations[, 2]
  ) %>%
    mutate(
      shared_ppi = map2_lgl(`protein`, `assoc_protein`, function(x, y) {
        per_row(ppi_df = ppi_df, x = x, y = y)
      }),
      combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
        comb <- sort(c(x, y))
        paste(comb, collapse = ",")
      }),
      connected = shared_ppi
    )

  shared_ppi_df
}

shared_ppi_df <- get_shared_ppi(ppi_df)
n_ppi_pairs <- shared_ppi_df %>%
  filter(shared_ppi) %>%
  nrow()
print(glue::glue("Num. shared_ppi pairs: {n_ppi_pairs}"))
shared_ppi_df
ppi_protein_graph <- protein_df_to_graph(shared_ppi_df)
ppi_protein_groups <- graph_to_protein_groups(ppi_protein_graph)
ppi_protein_groups %>% str()
plot(ppi_protein_graph)

sessionInfo

sessionInfo()


Try the epigraphdb package in your browser

Any scripts or data that you put into this service are public.

epigraphdb documentation built on March 29, 2021, 5:11 p.m.