knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6,
  fig.height = 3,
  fig.align = "center",
  tidy.opts=list(width.cutoff=80),
  tidy = TRUE
)

Setup

Install these packages if you don't already have them:

install.packages("tidyverse")
install.packages("readxl")

# You'll need Bioconductor to get DESeq2
install.packages("BiocManager")
BiocManager::install('DESeq2')

# Install mpraprofiler itself
devtools::install_github("WeirauchLab/mpraprofiler")

Then, load these libraries:

suppressMessages(library(tidyverse))
suppressMessages(library(readxl))
suppressMessages(library(DESeq2))
library(mpraprofiler)
library(tidyverse)
library(readxl)
library(DESeq2)
library(mpraprofiler)

Data Preparation for DESeq2

All the sample data are in the data folder, which is the SLE MPRA dataset

# Read all the data
# Here we use data from the package. You can replace with your own data.
ext_data <- system.file("extdata", package = "mpraprofiler")
file_list <- list.files(ext_data, pattern = "*.xlsx")
file_loc <- paste(ext_data, file_list, sep = "/")
raw_counts <- read_xlsx(file_loc[2], sheet = "Raw MPRA count data ")

# Only include variants with >30 unique tags
filter_snps <- raw_counts %>% filter(Unique_Tag >=30) %>% pull(Variant) %>% unique()
filter_counts <- raw_counts %>% filter(Variant %in% filter_snps)

# Build DESeq2 required matrix
cts <- filter_counts[, 6:ncol(filter_counts)]
row.names(cts) <- filter_counts$Oligo_ID
cts <- as.matrix(cts)

# This is wgat the required format for DESeq2 looks like
head(cts)

DESeq2 analysis

You may refer to DESeq2 package for more details

Create the design matrix for DESeq2.

coldata <- data.frame("condition" = c("ctrl", rep("sle",3)), row.names = colnames(cts))

# check the design matrix has the same order as the count matrix.
all(rownames(coldata) == colnames(cts))

Perform DESeq2 analysis.

# library("DESeq2")
dds <- DESeqDataSetFromMatrix(
                              countData = cts,
                              colData = coldata,
                              design = ~condition
)

dds <- DESeq(dds)

# sle vs ctrl result
dds_result <- results(dds, contrast=c("condition","sle","ctrl"))

# plot
fold_enhancer_plot(dds_result, xmin = 0.5)

Define enhancers

sle enhancer: More than 50% change with padj < 0.05

dds_result_all <- as.data.frame(dds_result)

# only care about the SLE variant
anno <- read_xlsx(file_loc[1])
sle_variant <- anno %>% filter(SLE_Variant==1) %>% pull(Variant) %>% unique()
dds_result_sle <- dds_result_all %>% rownames_to_column("Oligo_ID") %>% left_join(raw_counts[,1:4], by = "Oligo_ID") %>% filter(Variant %in% sle_variant)

# 853 enhancer alleles 
dds_sle_enhancer <- dds_result_sle %>% filter(log2FoldChange>=log2(1.5)& padj< 0.05)
dds_enAllele <- dds_sle_enhancer %>% pull(Oligo_ID) %>% unique()
length(dds_enAllele)

# 482 enhancer variant
dds_enVar <-  dds_sle_enhancer %>% pull(Variant) %>% unique()
length(dds_enVar)

Allelic Analysis

Prepare the data: add 0.5 to avoid infinite problem

# just use the dataset prepared for DESeq2
cts <- cts + 0.5

# this step calculates the non-ref vs ref ratio for all the variants and sample
cts_nr_r_ratio <- nr_r_ratio(cts)
head(cts_nr_r_ratio)

normalization over the control and do log2 transformation

# you need to indicate the exp data, ctrl data and the annotation data
allelic_all <- allelic_compare(cts_nr_r_ratio[, 2:4], cts_nr_r_ratio[, 1], cts_nr_r_ratio[, c("snp", "compare_nrvsr")]) # exp, ctrl, annotation

get the max fold change for each variant

max_fold <- max_fold(dds_result_all)

get the p-value for the significant enhancer

# the allelic log2 normalization data (allelic_compare) and the list of enhancer variant.
allelic_p <- allelic_compare_p(allelic_all, dds_enVar)

get the significant allielic variant : Over than 25% change in either direction and padj < 0.05

allelic_data <- allelic_p %>% filter(pFDR < 0.05 & (log2_aver >= log2(1.25) | log2_aver <= -log2(1.25)))

# allelic enVar

allelic_enVar <- allelic_data %>%  pull(snp) %>%  unique()

Plot

# only sle
plot_data <- inner_join(allelic_all,max_fold, by = "snp") %>% left_join(allelic_p[,c("snp","p_value", "pFDR")], by = "snp") %>% filter(snp %in% sle_variant) %>% mutate(pos = if_else(snp %in% allelic_enVar & (log2_aver >= log2(1.25) | log2_aver <= -log2(1.25)), 2, if_else(snp %in% allelic_enVar & (log2_aver < log2(1.25) & log2_aver > -log2(1.25)), 1,0)))

# with indicator line
allelic_enhancer_dot_plot(plot_data, log2 = "log2_aver", max_fold = "max_fold", label = "pos")


lux2ht/mpraprofiler documentation built on July 4, 2023, 5:45 a.m.