# example driver 2
require(coproanalysis)
##############
# Process data
##############
# load data (stored in package, no file path required)
dat <- loadCoproData(chromStart = 0.33, chromEnd = 0.66)
# normalize reads
normalizeCoProData(dat)
# identify the TSNs
tsnIds <- identifyTSNsR(dat)
# remove bases that are not TSNs
dat <- dat[ID5 %in% tsnIds]
# Identify the TSSs
identifyTSSs(dat)
# convert to tibble
dat <- as_tibble(dat)
# require each TSS to have at least one capped read (are the TSSs without a capped read similar on the other variables to those with one? Make violin plots.)
dat <- dat %>% group_by(tssId) %>% filter(sum(U) >= 1)
##############
# Summarize data and fit model
##############
pauseLoc <- 40
idVars <- c("tssId", "ID5s", "chrom", "chromLocation", "strand")
modelVars <- c("totalReads", "nTSNs", "nDistinctPauseSites", "dispersion", "relUncap", "pauseIdx", "medLength")
# For each tss, compute the value of the model variables
datSum <- dat %>% summarize(
ID5s = paste0(as.character(unique(ID5[R > 0])), collapse = ","),
chrom = seqnames[R > 0][1],
chromLocation = start[R > 0][1],
strand = strand[R > 0][1],
totalReads = sum(R[R > 0]),
nTSNs = length(unique(ID5[R > 0])),
nDistinctPauseSites = length(unique(ID3[R > 0])),
dispersion = calculateDispersion(R[R > 0], ID5[R > 0]),
relUncap = sum(U[U > 0])/totalReads,
pauseIdx = mean(rep(width[R > 0], R[R > 0]) < pauseLoc),
medLength = median(rep(width[R > 0], R[R > 0]))) %>%
mutate_at(c("totalReads", "nTSNs", "nDistinctPauseSites", "medLength", "relUncap"), log) %>% # log transform these vars
mutate_at(vars(-idVars), function(x) (x - mean(x))/sd(x)) # scale all vars except for the identification vars
corMat <- cor(select(datSum, -idVars))
plot1 <- ggcorr(data = select(datSum, modelVars) %>% rename("N reads" = totalReads, "N TSNs" = nTSNs, "N pause sites" = nDistinctPauseSites, "Dispersion" = dispersion, "Frac uncapped" = relUncap, "Pause idx" = pauseIdx, "Med length" = medLength), palette = "RdBu", label = TRUE, hjust = 0.7, label_round = 2)
# get the bootstrap samples
bootSampRes <- getBootstrapSamples(dat = select(datSum, -idVars), nFactors = 2, rotation = "varimax", B = 5000)
bootSamps <- bootSampRes$bootSamples
fit <- bootSampRes$baseFA
fit$loadings <- fit$loadings * -1 # facilitate interpretation of factor loadings
# In what follows, we make a few plots.
# Compute standard errors for the model parameters as well as communalities using the bootstrap samples.
loadingsSE <- getSEFromBootStats(l = map(bootSamps, function(x) x[["loadings"]])) # factor loadings
varSE <- getSEFromBootStats(l = map(bootSamps, function(x) as.matrix(data.frame(specVar = x[["specificVariances"]], com = x[["communalities"]]))))
vars <- c("N reads", "N TSNs", "N pause sites", "Dispersion", "Frac. uncapped", "Pause idx.", "Median length")
# make plot of factor loadings
myCols <- c("royalblue4", "red4")
myFactors <- c("Activity", "Pause profile")
loadingsSE$row <- factor(x = loadingsSE$row, levels = 1:length(fit$specificVariances), labels = vars)
loadingsSE$col <- factor(x = loadingsSE$col, levels = 1:2, labels = myFactors)
estimatedLoadings <- numeric()
for (i in 1:ncol(fit$loadings)) estimatedLoadings <- c(estimatedLoadings, fit$loadings[,i])
loadingsSE$estimatedLoadings <- estimatedLoadings
plot2 <- ggplot(data = loadingsSE, mapping = aes(x = row, y = estimatedLoadings, col = col)) + geom_hline(yintercept = 0, lwd = 0.5, col = "grey") + geom_point(size = 2.5) + geom_errorbar(aes(ymin = estimatedLoadings - SE * 4, ymax = estimatedLoadings + SE * 4, width = 0)) + theme_bw() + theme(axis.text.x=element_text(angle = 45, hjust = 1), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + ylab("Correlation coefficient") + xlab("Variable") + scale_color_manual(name = "Latent factor", values = myCols)
# Make plot of specific variances and communalities
varTypes <- c("Unexplained \n by factors", "Explained \n by factors")
myCols <- c("darkgoldenrod3", "chartreuse4")
varSE$row <- factor(x = varSE$row, levels = 1:length(fit$specificVariances), labels = vars)
varSE$col <- factor(x = varSE$col, levels = 1:2, labels = varTypes)
varSE$estimate <- c(fit$specificVariances, fit$communalities)
plot3 <- ggplot(data = varSE, mapping = aes(x = row, y = estimate, col = col)) + geom_point() + geom_point(size = 2.5) + geom_errorbar(aes(ymin = estimate - SE * 4, ymax = estimate + SE * 4, width = 0)) + theme_bw() + theme(axis.text.x=element_text(angle = 45, hjust = 1), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + xlab("Variable") + ylab("Variance") + scale_color_manual(name = "Variance", values = myCols)
# Make a plot of the factor scores
fscores <- getFactorScores(fit$loadings, select(datSum, -idVars))
fscores <- select(fscores, Activity = V1, Pause_profile = V2)
fscores <- mutate(fscores, chrom = datSum$chrom, tssId = datSum$tssId)
plot4 <- ggplot() + geom_point(data = fscores[sample(1:nrow(fscores)),], mapping = aes(x = Activity, y = Pause_profile, col = chrom), alpha = 0.95, cex = 0.5) + theme(legend.position = "none") + ylab("Pause profile") + theme_bw() + theme(legend.position = "none")
# find some of the more extreme TSSs to use as examples; useful function is findTssOnFactorScorePlot.
# High activity and late pause: TSS ID 8237, RPLP2 protein-coding gene
filter(datSum, tssId == 8237) %>% select(modelVars) %>% calculatePercentiles(datSum)
exampleTSSs <- 8237
exampleTSSsTibble <- bind_cols(data.frame(tssId = exampleTSSs, geneName = c("RPLP2")),
slice(fscores, sapply(exampleTSSs, function(x) {which(fscores$tssId == x)})) %>% select(Activity, Pause_profile))
plot5 <- plot4 + geom_point(data = exampleTSSsTibble, mapping = aes(x = Activity, y = Pause_profile), shape = 3, cex = 4) + geom_text(data = exampleTSSsTibble, mapping = aes(x = Activity, y = Pause_profile, label = geneName), nudge_y = 0.4) + theme(legend.position = "none") + ylab("Pause profile") + theme_bw() + theme(legend.position = "none")
# save plots
ggsave(filename = "/Users/timbarry/iCloud Drive (Archive)/Documents/ADAResearch/Presentations/presAugust/factorLoadings.png", plot = plot2, device = "png", width = 6, height = 4, scale = 0.8)
ggsave(filename = "/Users/timbarry/iCloud Drive (Archive)/Documents/ADAResearch/Presentations/presAugust/variancePartition.png", plot = plot3, device = "png", width = 6, height = 4, scale = 0.8)
ggsave(filename = "/Users/timbarry/iCloud Drive (Archive)/Documents/ADAResearch/Presentations/presAugust/lowDim.png", plot = plot5, device = "png", width = 6, height = 5, scale = 0.7)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.