packages <- c(
    "tidyverse",
    "printr",
    "ggthemes",
    "miR34AasRNAproject",
    "grid",
    "gtable"
)
purrr::walk(packages, library, character.only = TRUE)
rm(packages)



Introduction

asRNAs are known to be capable of exerting regulatory functions on both a transcriptional and post-transcriptional level and therefore, the cellular localization of the asRNA can be informative concerning the method of regulation.



Methods

Quantified RNAseq data from 11 cell lines from the GRCh38 assembly was downloaded from ENCODE and quantifications for miR34a asRNA (ENSG00000234546), ACTB (ENSG00000075624), GAPDH (ENSG00000111640), and MALAT1 (ENSG00000251562) were extracted. Cell lines for which data was downloaded include: A549, GM12878, HeLa-S3, HepG2, HT1080, K562 MCF-7, NCI-H460, SK-MEL-5, SK-N-DZ, SK-N-SH. Initial exploratory analysis revealed that several cell lines should not be included in the final figure for the following reasons: The SK-N-SH has a larger proportion of GAPDH in the nucleus than cytoplasm. The variation of miR34a asRNA expression is too large for SK-MEL-5. K562, HT1080, SK-N-DZ, and NCI-H460 have no or low miR34a asRNA expression. In addition, both the cytoplasmic markers ACTB and GAPDH were analyzed for their ability to differentiate between the nuclear and cytoplasmic fractions, and GAPDH was choosed for the final analysis due its superior performance. Furthermore, only polyadenylated libraries were used in the final analysis, due to the fact that the cellular compartment enrichment was seen to be improved in these samples, and all analyzed genes are reported to be polyadenylated (MALAT1: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2722846/). Only samples with 2 biological replicates were retained. For each cell type, gene, and biological replicate the fraction of transcripts per million (TPM) in each cellular compartment was calculate as the fraction of TPM in the specific compartment by the total TPM. The mean and standard deviation for the fraction was subsequently calculated for each cell type and cellular compartment and this information was represented in the final figure.



Results

data <- getData("Supplementary Figure 2d")

#SK-N-SH has a larger proportion of GAPDH in the nucleus than cytoplasm
#The variation in miR34a asRNA expression is way too large for SK-MEL-5
#K562, HT1080, SK-N-DZ, and NCI-H460 have no or low miR34a asRNA expression
#use GAPDH instead of ACTB for cytoplasmic fraction, seems to be a better representation.

fdata <- data %>% 
  filter(!`Biosample term name` %in% c("K562", "HT1080", "SK-N-DZ", "NCI-H460", "SK-MEL-5", "SK-N-SH")) %>% 
  filter(gene_name != "ACTB") %>%
  select(`Biosample term name`, gene_id, gene_name, fraction, `Library made from`, `Biological replicate(s)`, TPM) %>%
  group_by(`Biosample term name`, gene_id, gene_name, fraction) %>%
  mutate(tmp = n() == 2) %>%
  filter(tmp | `Library made from` == "polyadenylated mRNA") %>%
  ungroup() %>%
  select(-tmp) %>%
  group_by(`Biosample term name`, gene_id, gene_name, `Biological replicate(s)`) %>%
  mutate(frac = TPM / sum(TPM)) %>%
  ungroup()

stats <- fdata %>%
  group_by(`Biosample term name`, gene_id, gene_name, fraction) %>%
  filter(n() == 2) %>%
  summarize(
    n = n(),
    mean = mean(frac),
    sd_min = mean(frac) + sd(frac),
    sd_max = mean(frac) - sd(frac)
  )  %>%
  ungroup()

#prepare for plotting 
stats <- mutate(stats, gene_name = parse_factor(gene_name, levels = c("GAPDH", "MALAT1", "lncTAM34a")))
markdown <- ggplot(data = NULL) +
  geom_point(
    data = stats,
    aes(x = gene_name, y = mean, color = fraction),
    position = position_dodge(width = 1)
  ) +
  facet_wrap(~`Biosample term name`, ncol = 1) +
  geom_errorbar(
    data = stats, 
    aes(x = gene_name, ymax = sd_max, ymin = sd_min, colour = fraction), 
    width = 0,
    position = position_dodge(width = 1)
  ) +
  geom_vline(
    mapping = NULL, 
    xintercept=c(1.5, 2.5), 
    color = "darkgrey", 
    linetype = 3
  ) +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    y = "% TPM",
    x = "Gene"
  ) +
  guides(colour = guide_legend(title = "Fraction"))

plotRmarkdown(markdown) +
  theme(
    legend.position = "top",
    axis.title.y = element_text(margin = margin(r = 20, "cm"))
  )

pdf <- ggplot(data = NULL) +
  geom_point(
    data = stats,
    aes(x = gene_name, y = mean, color = fraction),
    position = position_dodge(width = 0.45),
    size = 0.17
  ) +
  facet_wrap(~`Biosample term name`, ncol = 1) +
  geom_errorbar(
    data = stats, 
    aes(x = gene_name, ymax = sd_max, ymin = sd_min, colour = fraction), 
    width = 0,
    position = position_dodge(width = 0.45),
    size = 0.25
  ) +
  geom_vline(
    mapping = NULL,
    xintercept=c(1.5, 2.5), 
    color = "darkgrey", 
    linetype = 3,
    size = 0.35
  ) +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    y = "% TPM",
    x = "Gene"
  ) +
  guides(colour = guide_legend(title = "Fraction"))

figure <- plotPDF(pdf) +
  theme(
    axis.text.y = element_text(size = 4),
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 5, margin = margin(5.5, 5.5, 2.5, 5.5, unit = "pt"))
  )

# path <- file.path("~/GitHub/miR34a_asRNA_project/inst", fileMap(type = "pdf")["Figure 1-Supplement 2e"][[1]])
# ggsave(
#   plot = figure,
#   filename = path,
#   device = cairo_pdf,
#   height = 60,
#   width = 83,
#   units = "mm"
# )
RNAseq data from five fractionated cell lines in the ENCODE project showing the percentage of transcripts per million (TPM) for miR34a asRNA. MALAT1 (nuclear localization) and GAPDH (cytoplasmic localization) are included as fractionation controls. Points represent the mean and horizontal lines represent the standard deviation from two biological replicates.



Conclusions

The analysis revealed that the miR34a asRNA transcript localizes to both the nucleus and cytoplasm but primarily resides in the nucleus.





sessionInfo()


GranderLab/miR34a_asRNA_project documentation built on May 26, 2019, 7:26 a.m.