Xiaoming Lu 2022-10-19
Install these packages if you don’t already have them:
install.packages("tidyverse")
install.packages("readxl")
# You'll need Bioconductor to get DESeq2
install.packages("BiocManager")
BiocManager::install("DESeq2")
# Install mpraprofiler itself
devtools::install_github("WeirauchLab/mpraprofiler")
Then, load these libraries:
library(tidyverse)
library(readxl)
library(DESeq2)
library(mpraprofiler)
All the sample data are in the data folder, which is the SLE MPRA dataset
# Read all the data Here we use data from the package. You can replace with
# your own data.
ext_data <- system.file("extdata", package = "mpraprofiler")
file_list <- list.files(ext_data, pattern = "*.xlsx")
file_loc <- paste(ext_data, file_list, sep = "/")
raw_counts <- read_xlsx(file_loc[2], sheet = "Raw MPRA count data ")
# Only include variants with >30 unique tags
filter_snps <- raw_counts %>%
filter(Unique_Tag >= 30) %>%
pull(Variant) %>%
unique()
filter_counts <- raw_counts %>%
filter(Variant %in% filter_snps)
# Build DESeq2 required matrix
cts <- filter_counts[, 6:ncol(filter_counts)]
row.names(cts) <- filter_counts$Oligo_ID
cts <- as.matrix(cts)
# This is wgat the required format for DESeq2 looks like
head(cts)
#> Plasmid_Control MPRA_Replicate_1 MPRA_Replicate_2
#> rs10015639_Non-Ref_G 5307 18618 21334
#> rs10015639_Ref_A 4616 15084 19510
#> rs10026154_Non-Ref_A 2471 8266 11033
#> rs10026154_Ref_G 2619 8814 12235
#> rs10032244_Non-Ref_G 7268 26675 30581
#> rs10032244_Ref_A 12719 44067 53640
#> MPRA_Replicate_3
#> rs10015639_Non-Ref_G 13640
#> rs10015639_Ref_A 12108
#> rs10026154_Non-Ref_A 6383
#> rs10026154_Ref_G 7060
#> rs10032244_Non-Ref_G 20141
#> rs10032244_Ref_A 33421
You may refer to DESeq2
package for more details
Create the design matrix for DESeq2.
coldata <- data.frame(condition = c("ctrl", rep("sle", 3)), row.names = colnames(cts))
# check the design matrix has the same order as the count matrix.
all(rownames(coldata) == colnames(cts))
#> [1] TRUE
Perform DESeq2 analysis.
# library('DESeq2')
dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~condition)
#> Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
#> design formula are characters, converting to factors
dds <- DESeq(dds)
# sle vs ctrl result
dds_result <- results(dds, contrast = c("condition", "sle", "ctrl"))
# plot
fold_enhancer_plot(dds_result, xmin = 0.5)
#> Warning: Removed 77 rows containing non-finite values (stat_bin).
#> Warning: Removed 70 rows containing non-finite values (stat_bin).
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 50 rows containing missing values (geom_bar).
#> Warning: Removed 193 rows containing missing values (geom_bar).
#> Warning: Removed 302 rows containing missing values (geom_bar).
dds_result_all <- as.data.frame(dds_result)
# only care about the SLE variant
anno <- read_xlsx(file_loc[1])
sle_variant <- anno %>%
filter(SLE_Variant == 1) %>%
pull(Variant) %>%
unique()
dds_result_sle <- dds_result_all %>%
rownames_to_column("Oligo_ID") %>%
left_join(raw_counts[, 1:4], by = "Oligo_ID") %>%
filter(Variant %in% sle_variant)
# 853 enhancer alleles
dds_sle_enhancer <- dds_result_sle %>%
filter(log2FoldChange >= log2(1.5) & padj < 0.05)
dds_enAllele <- dds_sle_enhancer %>%
pull(Oligo_ID) %>%
unique()
length(dds_enAllele)
#> [1] 853
# 482 enhancer variant
dds_enVar <- dds_sle_enhancer %>%
pull(Variant) %>%
unique()
length(dds_enVar)
#> [1] 482
# just use the dataset prepared for DESeq2
cts <- cts + 0.5
# this step calculates the non-ref vs ref ratio for all the variants and sample
cts_nr_r_ratio <- nr_r_ratio(cts)
head(cts_nr_r_ratio)
#> Plasmid_Control_nr/r_ratio MPRA_Replicate_1_nr/r_ratio
#> 1 1.1496805 1.2342802
#> 2 0.9435007 0.9378297
#> 3 0.5714454 0.6053327
#> 4 0.8011571 0.4502152
#> 5 1.2491842 1.6643535
#> 6 1.0270453 1.0082466
#> MPRA_Replicate_2_nr/r_ratio MPRA_Replicate_3_nr/r_ratio snp
#> 1 1.0934881 1.1265227 rs10015639
#> 2 0.9017613 0.9041144 rs10026154
#> 3 0.5701196 0.6026510 rs10032244
#> 4 0.4588089 0.4590293 rs10036748
#> 5 1.4130758 1.4801842 rs10048743
#> 6 0.8621627 1.0566792 rs10077437
#> compare_nrvsr
#> 1 rs10015639_G_A_Non-Ref
#> 2 rs10026154_A_G_Non-Ref
#> 3 rs10032244_G_A_Non-Ref
#> 4 rs10036748_T_C_Non-Ref
#> 5 rs10048743_T_G_Non-Ref
#> 6 rs10077437_A_G_Non-Ref
# you need to indicate the exp data, ctrl data and the annotation data
allelic_all <- allelic_compare(cts_nr_r_ratio[, 2:4], cts_nr_r_ratio[, 1], cts_nr_r_ratio[,
c("snp", "compare_nrvsr")]) # exp, ctrl, annotation
max_fold <- max_fold(dds_result_all)
# the allelic log2 normalization data (allelic_compare) and the list of
# enhancer variant.
allelic_p <- allelic_compare_p(allelic_all, dds_enVar)
allelic_data <- allelic_p %>%
filter(pFDR < 0.05 & (log2_aver >= log2(1.25) | log2_aver <= -log2(1.25)))
# allelic enVar
allelic_enVar <- allelic_data %>%
pull(snp) %>%
unique()
# only sle
plot_data <- inner_join(allelic_all, max_fold, by = "snp") %>%
left_join(allelic_p[, c("snp", "p_value", "pFDR")], by = "snp") %>%
filter(snp %in% sle_variant) %>%
mutate(pos = if_else(snp %in% allelic_enVar & (log2_aver >= log2(1.25) | log2_aver <=
-log2(1.25)), 2, if_else(snp %in% allelic_enVar & (log2_aver < log2(1.25) &
log2_aver > -log2(1.25)), 1, 0)))
# with indicator line
allelic_enhancer_dot_plot(plot_data, log2 = "log2_aver", max_fold = "max_fold", label = "pos")
#> Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
#> use `guide = "none"` instead.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.