list.of.packages <- c('ggplot2', 'dplyr', 'plyr', 'RColorBrewer',
'BSgenome.Dmelanogaster.UCSC.dm6', 'deconstructSigs',
'reshape', 'data.table', 'ggpubr', 'plotly', 'grid', 'VennDiagram')
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)){
cat('Installing missing packages...\n')
install.packages(new.packages)
}
cat('Silently loading packages...')
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(plyr))
suppressMessages(library(RColorBrewer))
suppressMessages(library(BSgenome.Dmelanogaster.UCSC.dm6))
suppressMessages(library(deconstructSigs))
suppressMessages(library(reshape))
suppressMessages(library(data.table))
suppressMessages(library(ggpubr))
suppressMessages(library(plotly))
suppressMessages(library(grid))
suppressMessages(library(VennDiagram))
set.seed(42)
#' getData
#'
#' Function to clean cnv files
#' @param infile File to process [Required]
#' @keywords get
#' @import dplyr
#' @export
#' @return Dataframe
getData <- function(infile = "data/annotated_snvs.txt", expression_data='data/isc_genes_rnaSeq.csv'){
snv_data<-read.delim(infile, header = T)
colnames(snv_data)=c("sample", "chrom", "pos", "ref", "alt", "tri", "trans", "decomposed_tri", "grouped_trans", "a_freq", "caller", "variant_type", "status", "snpEff_anno", "feature", "gene", "id")
# Read in tissue specific expression data
seq_data<-read.csv(header = F, expression_data)
colnames(seq_data)<-c('id', 'fpkm')
snv_data <- join(snv_data,seq_data,"id", type = 'left')
snv_data$fpkm <- ifelse(is.na(snv_data$fpkm), 0, round(as.numeric(snv_data$fpkm), 1))
# Order by FPKM
snv_data<- dplyr::arrange(snv_data, desc(fpkm))
# Find vars called by both Mu and Var
# Must also filter one of these calls out...
snv_data$dups<-duplicated(snv_data[,1:3])
snv_data<-mutate(snv_data, caller = ifelse(dups == "TRUE", 'varscan2_mutect2' , as.character(caller)))
##############
## Filters ###
##############
# Filter for calls made by both V and M
# snv_data<-filter(snv_data, caller == 'mutect2' | caller == 'varscan2_mutect2')
# Filter for old/new data
# cat("Filtering for old/new data\n")
# snv_data <- filter(snv_data, !grepl("^A|H", sample))
# Filter for genes expressed in RNA-Seq data
# cat("Filtering out non-expressed genes\n")
# snv_data<-filter(snv_data, !is.na(fpkm) & fpkm > 0.1)
# Filter for genes NOT expressed in RNA-Seq data
# cat("Filtering out expressed genes\n")
# snv_data<-filter(snv_data, fpkm == 0)
# Filter on allele freq
# cat("Filtering on allele frequency\n")
#snv_data<-filter(snv_data, is.na(a_freq))
# snv_data<-filter(snv_data, a_freq >= 0.20)
# Filter out samples
# snv_data<-filter(snv_data, sample != "A373R1" & sample != "A373R7" & sample != "A512R17" )
# snv_data <- filter(snv_data, !sample %in% c("A373R1", "A373R7", "A512R17", "A373R11", "A785-A788R1", "A785-A788R11", "A785-A788R3", "A785-A788R5", "A785-A788R7", "A785-A788R9"))
# snv_data<-filter(snv_data, sample != "A373R11" & sample != 'A373R13')
# snv_data <- snv_data %>%
# filter(!sample %in% c("A373R1", "A373R7", "A512R17", "A373R11", "D050R07-2")) %>%
# droplevels()
# snv_data <- snv_data %>%
# filter(sample %in% c("D050R01", "D050R03", "D050R05", "D050R07-1")) %>%
# droplevels()
dir.create(file.path("plots"), showWarnings = FALSE)
return(snv_data)
}
#' cleanTheme
#'
#' Clean theme for plotting
#' @param base_size Base font size [Default 12]
#' @import ggplot2
#' @keywords theme
#' @export
cleanTheme <- function(base_size = 12){
theme(
plot.title = element_text(hjust = 0.5, size = 20),
panel.background = element_blank(),
plot.background = element_rect(fill = "transparent",colour = NA),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5),
axis.text = element_text(size=12),
axis.title = element_text(size=30)
)
}
#' genTris
#'
#' This function returns all possible trinucleotide combinations
#' @keywords trinucleotides
#' @export
#' @return Character string containing all 96 trinucleotides
#' genTris()
genTris <- function(){
all.tri = c()
for(i in c("A", "C", "G", "T")){
for(j in c("C", "T")){
for(k in c("A", "C", "G", "T")){
if(j != k){
for(l in c("A", "C", "G", "T")){
tmp = paste(i, "[", j, ">", k, "]", l, sep = "")
all.tri = c(all.tri, tmp)
}
}
}
}
}
all.tri <- all.tri[order(substr(all.tri, 3, 5))]
return(all.tri)
}
#' setCols
#'
#' Get colours for n levels
#' @import RColorBrewer
#' @param df Dataframe [Required]
#' @param col Column of snv_dataframe. Colours will be set to levels(df$cols) [Required]
#' @keywords cols
#' @export
setCols <- function(df, col){
names<-levels(df[[col]])
cat("Setting colour levles:", names, "\n")
level_number<-length(names)
mycols<-brewer.pal(level_number, "Set2")
names(mycols) <- names
colScale <- scale_fill_manual(name = col,values = mycols)
return(colScale)
}
#' snvStats
#'
#' Calculate some basic stats for snv snv_data
#' @import dplyr
#' @keywords stats
#' @export
snvStats <- function(){
snv_data<-getData()
cat("sample", "snvs", sep='\t', "\n")
rank<-sort(table(snv_data$sample), decreasing = TRUE)
rank<-as.array(rank)
total=0
scores=list()
for (i in 1:nrow(rank)){
cat(names(rank[i]), rank[i], sep='\t', "\n")
total<-total + rank[i]
scores[i]<-rank[i]
}
cat('--------------', '\n')
scores<-unlist(scores)
mean<-as.integer(mean(scores))
med<-as.integer(median(scores))
cat('total', total, sep='\t', '\n')
cat('samples', nrow(rank), sep='\t', '\n')
cat('--------------', '\n')
cat('mean', mean, sep='\t', '\n')
cat('median', med, sep='\t', '\n')
cat('\n')
all_ts<-nrow(filter(snv_data, trans == "A>G" | trans == "C>T" | trans == "G>A" | trans == "T>C"))
all_tv<-nrow(filter(snv_data, trans != "A>G" & trans != "C>T" & trans != "G>A" & trans != "T>C"))
ts_tv<-round((all_ts/all_tv), digits=2)
cat("ts/tv = ", ts_tv, sep='', '\n')
}
#' rainfall
#'
#' Plot log10 distances between snvs as rainfall plot
#' @import ggplot2
#' @keywords rainfall
#' @export
rainfall <- function(){
snv_data <- getData()
distances <- do.call(rbind, lapply(split(snv_data[order(snv_data$chrom, snv_data$pos),], snv_data$chrom[order(snv_data$chrom, snv_data$pos)]),
function(a)
data.frame(a,
dist=c(diff(a$pos), NA),
logdist = c(log10(diff(a$pos)), NA))
)
)
distances$logdist[is.infinite(distances$logdist)] <- 0
distances<-filter(distances, chrom != 4)
p<-ggplot(distances)
p<-p + geom_point(aes(pos/1000000, logdist, colour = grouped_trans))
p <- p + cleanTheme() +
theme(axis.text.x = element_text(angle=45, hjust = 1),
panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
strip.text = element_text(size=20)
)
p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 6)
#p<-p + scale_x_continuous("Mbs", breaks = seq(0,33,by=1), limits = c(0, 33), expand = c(0.01, 0.01))
p<-p + scale_x_continuous("Mbs", breaks = seq(0,max(distances$pos),by=10))
rainfall_out<-paste("rainfall.pdf")
cat("Writing file", rainfall_out, "\n")
ggsave(paste("plots/", rainfall_out, sep=""), width = 20, height = 5)
p
}
#' samplesPlot
#'
#' Plot the snv distribution for each sample
#' @import ggplot2
#' @param count Output total counts instead of frequency if set [Default no]
#' @keywords spectrum
#' @export
samplesPlot <- function(count=NA){
snv_data<-getData()
mut_class<-c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
p<-ggplot(snv_data)
if(is.na(count)){
p<-p + geom_bar(aes(x = grouped_trans, y = (..count..)/sum(..count..), group = sample, fill = sample), position="dodge",stat="count")
p<-p + scale_y_continuous("Relative contribution to total mutation load", expand = c(0.0, .001))
tag='_freq'
}
else{
p<-p + geom_bar(aes(x = grouped_trans, y = ..count.., group = sample, fill = sample), position="dodge",stat="count")
p<-p + scale_y_continuous("Count", expand = c(0.0, .001))
tag='_count'
}
p<-p + scale_x_discrete("Mutation class", limits=mut_class)
p<-p + cleanTheme() +
theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.title = element_text(size=20),
strip.text.x = element_text(size = 10)
)
p<-p + facet_wrap(~sample, ncol = 4, scale = "free_x" )
samples_mut_spect<-paste("mutation_spectrum_samples", tag, ".pdf", sep = '')
cat("Writing file", samples_mut_spect, "\n")
ggsave(paste("plots/", samples_mut_spect, sep=""), width = 20, height = 10)
p
}
#calledSnvs
calledSnvs <- function(){
snv_data<-getData()
calls<-table(snv_data$caller)
calls<-as.data.frame(unlist(calls))
calls$Var1 <- as.factor(calls$Var1)
grid.newpage()
draw.pairwise.venn(area1 = calls$Freq[calls$Var1 == 'mutect2'],
area2 = calls$Freq[calls$Var1 == 'varscan2'],
cross.area = calls$Freq[calls$Var1 == 'varscan2_mutect2'],
category = c("Mutect2","Varscan2"),
#lty = rep('blank', 2),
lwd = rep(0.3, 2),
cex = rep(2, 3),
cat.cex = rep(2, 2),
fill = c("#E7B800", "#00AFBB"),
alpha = rep(0.4, 2),
cat.pos = c(0, 0),
#cat.dist = rep(0.025, 2)
ext.text = 'FALSE'
)
}
#' mutSigs
#'
#' Calculate and plot the mutational signatures accross samples using the package `deconstructSigs`
#' @param samples Calculates and plots mutational signatures on a per-sample basis [Default no]
#' @param pie Plot a pie chart shwoing contribution of each signature to overall profile [Default no]
#' @import deconstructSigs
#' @import BSgenome.Dmelanogaster.UCSC.dm6
#' @keywords signatures
#' @export
mutSigs <- function(samples=NULL, pie=FALSE, write=FALSE){
if(!exists('scaling_factor')){
cat("calculating trinucleotide frequencies in genome\n")
scaling_factor <-triFreq()
}
snv_data<-getData()
genome <- BSgenome.Dmelanogaster.UCSC.dm6
if(missing(samples)){
cat("Plotting for all samples\n")
snv_data$tissue = 'All'
sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "tissue", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)
sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = 'All',
contexts.needed = TRUE,
tri.counts.method = scaling_factor
)
if(write){
cat("Writing to file 'plots/all_signatures.pdf'\n")
pdf('plots/all_signatures.pdf', width = 20, height = 10)
plotSignatures(sig_plot)
dev.off()
}
plotSignatures(sig_plot)
if(pie){
makePie(sig_plot)
}
}
else{
sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "sample", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)
cat("sample", "snv_count", sep="\t", "\n")
for(s in levels(snv_data$sample)) {
snv_count<-nrow(filter(snv_data, sample == s))
if(snv_count > 50){
cat(s, snv_count, sep="\t", "\n")
sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = s,
contexts.needed = TRUE,
tri.counts.method = scaling_factor)
if(write){
outfile<-(paste('plots/', s, '_signatures.pdf', sep = ''))
cat("Writing to file", outfile, "\n")
pdf(outfile, width = 20, height = 10)
plotSignatures(sig_plot)
dev.off()
}
plotSignatures(sig_plot)
if(pie){
makePie(sig_plot)
}
}
}
}
}
#' sigTypes
#'
#' Calculate and plot the mutational signatures accross samples using the package `deconstructSigs`
#' @param samples Calculates and plots mutational signatures on a per-sample basis [Default no]
#' @param pie Plot a pie chart shwoing contribution of each signature to overall profile [Default no]
#' @import deconstructSigs
#' @import data.table
#' @import reshape
#' @import forcats
#' @import BSgenome.Dmelanogaster.UCSC.dm6
#' @keywords signatures
#' @export
sigTypes <- function(write=FALSE){
suppressMessages(require(BSgenome.Dmelanogaster.UCSC.dm6))
suppressMessages(require(deconstructSigs))
if(!exists('scaling_factor')){
cat("Calculating trinucleotide frequencies in genome\n")
scaling_factor <-triFreq()
}
snv_data<-getData()
genome <- BSgenome.Dmelanogaster.UCSC.dm6
sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "sample", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)
l = list()
for(s in levels(snv_data$sample)) {
snv_count<-nrow(filter(snv_data, sample == s))
if(snv_count > 50){
sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = s,
contexts.needed = TRUE,
tri.counts.method = scaling_factor)
l[[s]] <- sig_plot
}
}
mutSigs<-do.call(rbind, l)
mutSigs<-as.data.frame(mutSigs)
mutWeights<-mutSigs$weights
mutData<-melt(rbindlist(mutWeights, idcol = 'sample'),
id = 'sample', variable.name = 'signature', value.name = 'score')
mutData <- mutData %>%
filter(score > 0.1) %>%
group_by(sample) %>%
mutate(total = sum(score))
p <- ggplot(mutData)
p <- p + geom_bar(aes(fct_reorder(sample, -total), score, fill=signature),colour="black", stat = "identity")
p <- p + scale_x_discrete("Sample")
p <- p + scale_y_continuous("Signature contribution", expand = c(0.01, 0.01), breaks=seq(0, 1, by=0.1))
p <- p + cleanTheme() +
theme(axis.text.x = element_text(angle = 45, hjust=1),
axis.text = element_text(size=30)
)
if(write){
sigTypes<-paste("sigTypes.pdf")
cat("Writing file", sigTypes, "\n")
ggsave(paste("plots/", sigTypes, sep=""), width = 20, height = 10)
}
p
}
####
# sigTypesPie
####
sigPie <- function() {
df <- data.frame(
group = c("Sig3", "Sig5", "Sig8", "Unknown"),
value = c(21, 14, 25, 40),
cols = c('#DB8E00', '#64B200', '#00BD5C', '#00BADE'))
all <- data.frame(
group = c("Sig3", "Sig8", "Sig9", "Sig21", "Sig25", "Unknown"),
value = c(29, 17, 10, 7, 7, 30),
cols = c('#E68613', '#0CB702', '#00BE67', '#ED68ED', '#FF61CC', 'grey'))
bp <- ggplot(all, aes(x="", y=value, fill = cols)) +
geom_bar(width = 1, stat = "identity", colour = "white") +
scale_fill_manual(values = levels(all$cols), labels = levels(all$group))
pie <- bp + coord_polar("y", start=0)
pie + cleanTheme() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
}
#' mutSpectrum
#'
#' Plots the mutations spectrum for all samples combined
#' @import ggplot2
#' @keywords spectrum
#' @export
mutSpectrum <- function(write=FALSE, max_y=25){
snv_data<-getData()
cat("Showing global contribution of tri class to mutation load", "\n")
p <- ggplot(snv_data)
p <- p + geom_bar(aes(x = decomposed_tri, y = (..count..)/sum(..count..), group = decomposed_tri, fill = grouped_trans), position="dodge",stat="count")
p <- p + scale_y_continuous("Contribution to mutation load", limits = c(0, max_y/100), breaks=seq(0,max_y/100,by=0.025), labels=paste0(seq(0,max_y,by=2.5), "%"), expand = c(0.0, .0005))
p <- p + scale_x_discrete("Genomic context", expand = c(.005, .005))
p <- p + cleanTheme() +
theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.text.x = element_text(angle = 90, hjust=1),
axis.text.y = element_text(size=15),
axis.title = element_text(size=20),
strip.text.x = element_text(size = 15)
)
p <- p + facet_wrap(~grouped_trans, ncol = 6, scale = "free_x" )
p <- p + guides(grouped_trans = FALSE)
if(write){
mut_spectrum<-paste("mutation_spectrum.pdf")
cat("Writing file", mut_spectrum, "\n")
ggsave(paste("plots/", mut_spectrum, sep=""), width = 20, height = 5)
}
p
}
#' featureEnrichment
#'
#' Function to calculate enrichment of snv hits in genomic features
#' @description Calculate the enrichment of snv hits in genomic features
#' A 'features' file must be provided with the follwing format:
#' feature length percentage
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param features File containing total genomic lengths of features [Default 'data/genomic_features.txt']
#' @param genome_length The total legnth of the genome [Default 118274340 (mappable regions on chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
featureEnrichment <- function(features='data/genomic_features.txt', genome_length=118274340, write=FALSE){
genome_features<-read.delim(features, header = T)
snv_data<-getData()
mutCount<-nrow(snv_data)
# To condense exon counts into "exon"
snv_data$feature<-as.factor(gsub("exon_.*", "exon", snv_data$feature))
classCount<-table(snv_data$feature)
classLengths<-setNames(as.list(genome_features$length), genome_features$feature)
fun <- function(f) {
# Calculate the fraction of geneome occupied by each feature
featureFraction<-classLengths[[f]]/genome_length
# How many times should we expect to see this feature hit in our snv_data (given number of obs. and fraction)?
featureExpect<-(mutCount*featureFraction)
# observed/expected
fc<-classCount[[f]]/featureExpect
Log2FC<-log2(fc)
featureExpect<-round(featureExpect,digits=3)
# Binomial test
if(!is.null(classLengths[[f]])){
if(classCount[f] >= featureExpect){
stat<-binom.test(x = classCount[f], n = mutCount, p = featureFraction, alternative = "greater")
test<-"enrichment"
}
else{
stat<-binom.test(x = classCount[f], n = mutCount, p = featureFraction, alternative = "less")
test<-"depletion"
}
sig_val <- ifelse(stat$p.value <= 0.001, "***",
ifelse(stat$p.value <= 0.01, "**",
ifelse(stat$p.value <= 0.05, "*", "")))
p_val<-format.pval(stat$p.value, digits = 3, eps=0.0001)
list(feature = f, observed = classCount[f], expected = featureExpect, Log2FC = Log2FC, test = test, sig = sig_val, p_val = p_val)
}
}
enriched<-lapply(levels(snv_data$feature), fun)
enriched<-do.call(rbind, enriched)
featuresFC<-as.data.frame(enriched)
# Sort by FC value
featuresFC<-dplyr::arrange(featuresFC,desc(abs(as.numeric(Log2FC))))
featuresFC$Log2FC<-round(as.numeric(featuresFC$Log2FC), 1)
if(write){
featuresFC <- filter(featuresFC, observed >= 5)
first.step <- lapply(featuresFC, unlist)
second.step <- as.data.frame(first.step, stringsAsFactors = F)
ggpubr::ggtexttable(second.step, rows = NULL, theme = ttheme("mGreen"))
feat_enrichment_table <- paste("feature_enrichment_table.tiff")
cat("Writing to file: ", 'plots/', feat_enrichment_table, sep = '')
ggsave(paste("plots/", feat_enrichment_table, sep=""), width = 5.5, height = (nrow(featuresFC)/3), dpi=300)
} else{
return(featuresFC)
}
}
featureEnrichmentPlot <- function(write=FALSE) {
feature_enrichment<-featureEnrichment()
feature_enrichment$feature <- as.character(feature_enrichment$feature)
feature_enrichment$Log2FC <- as.numeric(feature_enrichment$Log2FC)
feature_enrichment <- transform(feature_enrichment, feature = reorder(feature, -Log2FC))
feature_enrichment <- filter(feature_enrichment, observed >= 5)
# Custom sorting
# feature_enrichment$feature <- factor(feature_enrichment$feature, levels=c("intron", "intergenic", "exon", "3UTR", "ncRNA", "5UTR"))
p<-ggplot(feature_enrichment)
p<-p + geom_bar(aes(feature, Log2FC, fill = as.character(test)), stat="identity")
p<-p + guides(fill=FALSE)
p<-p + ylim(-2,2)
p<-p + cleanTheme() +
theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.text.x = element_text(angle = 45, hjust=1),
axis.text = element_text(size=20)
)
if(write){
feat_plot <- paste("feat_plot.pdf")
cat("Writing file", feat_plot, "\n")
ggsave(paste("plots/", feat_plot, sep=""), width = 5, height = 10)
}
p
}
#' geneEnrichment
#'
#' Function to calculate fold change enrichment in a set of snv calls correcting for gene length
#' @description Calculate the enrichment of snv hits in length-corrected genes
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom start end tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param n The number of times we need to have seen a gene in our snv_data to view its enrichment score [Default 3]
#' @param genome_length The total legnth of the genome [Default 137547960 (chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr
#' @import ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
geneEnrichment <- function(gene_lengths_in="data/gene_lengths.txt", n=10, genome_length=118274340, write=FALSE){
snv_data <- getData() %>%
dplyr::filter(gene != "intergenic") %>%
droplevels()
snv_count<-nrow(snv_data)
gene_lengths <- read.delim(gene_lengths_in, header = T)
gene_lengths <- gene_lengths %>%
dplyr::filter(length > 1000) %>%
dplyr::select(gene, length) %>%
droplevels()
genes<-setNames(as.list(gene_lengths$length), gene_lengths$gene)
snv_data<-join(gene_lengths, snv_data, 'gene', type = 'left')
snv_data$fpkm <- ifelse(snv_data$fpkm=='NULL' | snv_data$fpkm=='NA' | is.na(snv_data$fpkm), 0, snv_data$fpkm)
snv_data$observed <- ifelse(is.numeric(snv_data$observed), snv_data$observed, 0)
hit_genes<-table(factor(snv_data$gene, levels = levels(snv_data$gene) ))
expression<-setNames(as.list(snv_data$fpkm), snv_data$gene)
fun <- function(g) {
# Calculate the fraction of geneome occupied by each gene
genefraction<-genes[[g]]/genome_length
# How many times should we expect to see this gene hit in our snv_data (given number of obs. and fraction of genome)?
gene_expect<-snv_count*(genefraction)
# observed/expected
fc<-hit_genes[[g]]/gene_expect
log2FC = log2(fc)
if (hit_genes[[g]] >= gene_expect) {
stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "greater")
test <- "enrichment"
} else {
stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "less")
test <- "depletion"
}
sig_val <- ifelse(stat$p.value <= 0.001, "***",
ifelse(stat$p.value <= 0.01, "**",
ifelse(stat$p.value <= 0.05, "*", "")))
sig_val <- ifelse(stat$p.value > 0.05, "-", sig_val)
p_val <- format.pval(stat$p.value, digits = 3)
gene_expect<-round(gene_expect,digits=3)
list(gene = g, length = genes[[g]], fpkm = expression[[g]], test=test, observed = hit_genes[g], expected = gene_expect, fc = fc, log2FC = log2FC, sig_val=sig_val, p_val=p_val)
}
enriched<-lapply(levels(snv_data$gene), fun)
enriched<-do.call(rbind, enriched)
genesFC<-as.data.frame(enriched)
# Filter for genes with few observations
genesFC <- genesFC %>%
dplyr::filter(observed >= n) %>%
dplyr::mutate(expected = round(as.numeric(expected),digits=3)) %>%
dplyr::mutate(log2FC = round(as.numeric(log2FC),digits=2)) %>%
dplyr::mutate(p_val = as.numeric(p_val)) %>%
dplyr::mutate(eScore = abs(log2FC) * -log10(p_val)) %>%
dplyr::mutate(eScore = round(as.numeric(eScore),digits=2)) %>%
dplyr::select(gene, observed, expected, log2FC, test, sig_val, p_val, eScore) %>%
dplyr::arrange(-eScore, p_val, -abs(log2FC)) %>%
droplevels()
if(write){
cat("printing")
first.step <- lapply(genesFC, unlist)
second.step <- as.data.frame(first.step, stringsAsFactors = F)
arrange(second.step,desc(as.integer(log2FC)))
ggpubr::ggtexttable(second.step, rows = NULL, theme = ttheme("mGreen"))
gene_enrichment_table <- paste("gene_enrichment_table.tiff")
ggsave(paste("plots/", gene_enrichment_table, sep=""), width = 5.2, height = (nrow(genesFC)/3), dpi=300)
} else{
return(genesFC)
}
}
# EnrichmentVolcano
#'
#' Plot the enrichment of SNVs in genes features
#' @keywords enrichment
#' @import dplyr
#' @import plotly
#' @export
EnrichmentVolcano <- function(d){
gene_enrichment <- d
minPval <- min(gene_enrichment$p_val[gene_enrichment$p_val>0])
gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval/abs(gene_enrichment$log2FC), gene_enrichment$p_val)
gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval, gene_enrichment$p_val)
maxLog2 <- max(abs(gene_enrichment$log2FC[is.finite(gene_enrichment$log2FC)]))
maxLog2 <- as.numeric(round_any(maxLog2, 1, ceiling))
ax <- list(
size = 25
)
ti <- list(
size = 25
)
p <- plot_ly(data = gene_enrichment,
x = ~log2FC,
y = ~-log10(p_val),
type = 'scatter',
# showlegend = FALSE,
mode = 'markers',
# height = 1200,
# width = 1000,
# frame = ~p_val,
text = ~paste("Gene: ", gene, "\n",
"Observed: ", observed, "\n",
"Expected: ", expected, "\n",
"P-val: ", p_val, "\n"),
color = ~log10(p_val),
colors = "Spectral",
size = ~-log10(p_val)
) %>%
layout(
xaxis = list(title="Log2(FC)", titlefont = ax, range = c(-maxLog2, maxLog2)),
yaxis = list(title="-Log10(p)", titlefont = ax)
)
p
}
#' snvinGene
#'
#' Plot all snvs found in a given gene
#' @description Plot all snvs found in a given gene.
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom start end tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param gene2plot Name of the gene to plot
#' @import ggplot2 dplyr
#' @keywords gene
#' @export
snvinGene <- function(gene_lengths="data/gene_lengths.txt", gene2plot='kuz', annotated=TRUE, col_by_status=TRUE, write=FALSE){
gene_lengths <- read.delim(gene_lengths, header = T)
region <- gene_lengths %>%
dplyr::filter(gene == gene2plot) %>%
droplevels()
gene_length <-(region$end-region$start)
wStart<-(region$start - gene_length/10)
wEnd<-(region$end + gene_length/10)
wChrom<-as.character(region$chrom)
wTss<-suppressWarnings(as.numeric(levels(region$tss))[region$tss])
snv_data<-getData() %>%
dplyr::filter(chrom == wChrom & pos >= wStart & pos <= wEnd)
if(nrow(snv_data) == 0){
stop(paste("There are no snvs in", gene2plot, "- Exiting", "\n"))
}
snv_data$colour_var <- snv_data$feature
if(annotated){
snv_data$colour_var <- snv_data$variant_type
if(col_by_status)
snv_data$colour_var <- snv_data$status
}
p <- ggplot(snv_data)
p <- p + geom_point(aes(pos/1000000, sample, colour = colour_var, size = 1.5), position=position_jitter(width=0, height=0.2))
p <- p + guides(size = FALSE, sample = FALSE)
p <- p + cleanTheme() +
theme(axis.title.y=element_blank(),
panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.text.y = element_text(size = 30)
)
p <- p + scale_x_continuous("Mbs", expand = c(0,0), breaks = seq(round(wStart/1000000, digits = 2),round(wEnd/1000000, digits = 2),by=0.05), limits=c(wStart/1000000, wEnd/1000000))
p <- p + annotate("rect", xmin=region$start/1000000, xmax=region$end/1000000, ymin=0, ymax=0.3, alpha=.2, fill="skyblue")
p <- p + geom_vline(xintercept = wTss/1000000, colour="red", alpha=.7, linetype="solid")
p <- p + geom_segment(aes(x = wTss/1000000, y = 0, xend= wTss/1000000, yend = 0.1), colour="red")
middle<-((wEnd/1000000+wStart/1000000)/2)
p <- p + annotate("text", x = middle, y = 0.15, label=gene2plot, size=6)
p <- p + ggtitle(paste("Chromosome:", wChrom))
if(write){
hit_gene<-paste(gene2plot, "_hits.pdf", sep='')
cat("Writing file", hit_gene, "\n")
ggsave(paste("plots/", hit_gene, sep=""), width = 10, height = 10)
}
p
}
#' featuresHit
#'
#' Show top hit features
#' @import ggplot2
#' @keywords features
#' @export
featuresHit <- function(..., write=FALSE){
snv_data<-getData(...)
# To condense exon counts into "exon"
snv_data$feature<-as.factor(gsub("exon_.*", "exon", snv_data$feature))
# Reoders descending
snv_data$feature<-factor(snv_data$feature, levels = names(sort(table(snv_data$feature), decreasing = TRUE)))
snv_data <- snv_data %>%
dplyr::group_by(feature) %>%
dplyr::add_tally() %>%
ungroup() %>%
dplyr::filter(n >= 5) %>%
droplevels()
#cols<-setCols(snv_data, "feature")
p <- ggplot(snv_data)
p <- p + geom_bar(aes(feature, fill = feature))
#p<-p + cols
p <- p + cleanTheme() +
theme(axis.title.x=element_blank(),
panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"))
p <- p + scale_x_discrete(expand = c(0.01, 0.01))
p <- p + scale_y_continuous(expand = c(0.01, 0.01))
# colour to a pub palette:
# p<-p + ggpar(p, palette = 'jco')
if(write){
features_outfile<-paste("hit_features_count.pdf")
cat("Writing file", features_outfile, "\n")
ggsave(paste("plots/", features_outfile, sep=""), width = 20, height = 10)
}
p
}
#' geneHit
#'
#' Show top hit genes
#' @import dplyr
#' @keywords gene
#' @param n Show top n hits [Default 10]
#' @export
geneHit <- function(..., n=10){
snv_data<-getData(...)
snv_data<-filter(snv_data, gene != "intergenic")
hit_count<-as.data.frame(sort(table(unlist(snv_data$gene)), decreasing = T))
colnames(hit_count)<- c("gene", "count")
head(hit_count, n)
}
#' triFreq
#'
#' This function counts the number of times each triunucleotide is found in a supplied genome
#' @param genome BS.genome file defaults to BSgenome.Dmelanogaster.UCSC.dm6
#' @param count Output total counts instead of frequency if set [Default no]
#' @import dplyr
#' @keywords trinucleotides
#' @export
#' @return Dataframe of trinucs and freqs (or counts if count=1)
triFreq <- function(genome=NULL, count=FALSE){
if(missing(genome)){
cat("No genome specfied, defaulting to 'BSgenome.Dmelanogaster.UCSC.dm6'\n")
library(BSgenome.Dmelanogaster.UCSC.dm6, quietly = TRUE)
genome <- BSgenome.Dmelanogaster.UCSC.dm6
}
params <- new("BSParams", X = Dmelanogaster, FUN = trinucleotideFrequency, exclude = c("M", "_"), simplify = TRUE)
snv_data<-as.data.frame(bsapply(params))
snv_data$genome<-as.integer(rowSums(snv_data))
snv_data$genome_adj<-(snv_data$genome*2)
if(count){
tri_count<-snv_data['genome_adj']
tri_count<-cbind(tri = rownames(tri_count), tri_count)
colnames(tri_count) <- c("tri", "count")
rownames(tri_count) <- NULL
return(tri_count)
}
else{
snv_data$x <- (1/snv_data$genome)
scaling_factor<-snv_data['x']
return(scaling_factor)
}
}
# Functions to calculate the distance
# from each breakpoint to user-provided loci (e.g. TSS)
#' generateData
#' Prepare data for dist2motif
#' @keywords simulate
#' @import ggplot2
#' @import dplyr
#' @import colorspace
#' @import RColorBrewer
#' @export
generateData <- function(..., breakpoints=NA, sim=NA, keep=NULL){
if(is.na(breakpoints)){
# if(!missing(keep)){
# real_data <- notchFilt(..., keep=keep)
# } else {
real_data <- getData(..., genotype=='somatic_tumour', !sample %in% c("A373R7", "A512R17", "A785-A788R1", "A785-A788R11", "A785-A788R3", "A785-A788R5", "A785-A788R7", "A785-A788R9"))
# }
real_data <- real_data %>%
dplyr::filter(chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" ) %>%
dplyr::mutate(pos = bp) %>%
dplyr::select(chrom, pos) %>%
droplevels()
} else{
real_data <- read.table(breakpoints, header = F)
if(is.null(real_data$V3)){
real_data$V3 <- real_data$V2 + 2
}
colnames(real_data) <- c("chrom", "start", "end")
real_data <- real_data %>%
dplyr::filter(chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" ) %>%
dplyr::mutate(pos = (end+start)/2) %>%
dplyr::select(chrom, pos) %>%
droplevels()
}
if (!is.na(sim)) {
byIteration <- list()
#run each iteration
for (i in 1:sim){
cat("Running simulation", i, "of", sim, "\n")
simByChrom <- list()
for (c in levels(real_data$chrom)){
hitCount <- nrow(real_data[real_data$chrom== c,])
hitCount <- (hitCount*10)
if (i == 1){
cat(paste("Simulating", hitCount, "breakpoints on chromosome", c), "\n")
}
bp_data <- bpSim(nSites = hitCount, byChrom = c)
bp_data$iteration <- i
simByChrom[[c]] <- bp_data
}
result <- as.data.frame(do.call(rbind, simByChrom))
rownames(result) <- NULL
byIteration[[i]] <- result
}
#combine each iteration into one data frame
# final <- dplyr::bind_rows(byIteration)
final <- as.data.frame(do.call(rbind, byIteration))
final$iteration <- as.factor(final$iteration)
return(final)
} else{
cat("Using real data", "\n")
real_data$iteration <- as.factor(1)
return(real_data)
}
}
#' dist2Motif2
#' Calculate the distance from each breakpoint to closest motif in a directory of files
#' @keywords motif
#' @import ggplot2 dplyr tidyr RColorBrewer
#' @export
dist2motif2 <- function(..., feature_file = NA, featureDir = 'rawdata/features/', sim=NA, keep=NULL, position = 'centre') {
snv_data <- generateData(..., confidence=='precise', breakpoints=breakpoints, sim=sim, keep=keep)
cat("Calculating distances to", position, 'of regions', sep = " ", "\n")
svCount <- table(bp_data$chrom)
bp_data <- subset(bp_data, chrom %in% names(svCount[svCount >= 5]))
# bp_data <- droplevels(bp_data)
minDist <- function(p) {
index <- which.min(abs(tss_df$pos - p))
closestTss <- tss_df$pos[index]
chrom <- as.character(tss_df$chrom[index])
dist <- (p - closestTss)
list(p, closestTss, dist, chrom)
}
scores <- list()
fileNames <- dir(featureDir, pattern = ".bed")
# cat("Analysing all files in directory:", bedFiles, "\n")
for (i in 1:length(fileNames)){
filename <- basename(tools::file_path_sans_ext(fileNames[i]))
parts <- unlist(strsplit(filename, split = '\\.'))
feature <- parts[1]
cat("Analysing file:", fileNames[i], 'with feature:', feature, "\n")
feature_locations <- read.table(paste(featureDir, fileNames[i], sep='/'), header = F)
feature_locations <- feature_locations[,c(1,2,3)]
colnames(feature_locations) <- c("chrom", "start", "end")
# fCount <- table(feature_locations$chrom)
#
# bp_data <- subset(bp_data, chrom %in% names(svCount[svCount >= 5]))
#
feature_locations <- feature_locations %>%
dplyr::filter(chrom %in% levels(bp_data$chrom))
if(position == 'centre'){
feature_locations <- feature_locations %>%
dplyr::mutate(end = as.integer(((end+start)/2)+1)) %>%
dplyr::mutate(pos = as.integer(end-1)) %>%
dplyr::select(chrom, pos)
} else if(position == 'edge'){
feature_locations <- feature_locations %>%
tidyr::gather(c, pos, start:end, factor_key=TRUE) %>%
dplyr::select(chrom, pos)
}
byIteration <- list()
for (j in levels(bp_data$iteration)){
byChrom <- list()
df1 <- dplyr::filter(bp_data, iteration == j)
for (c in levels(bp_data$chrom)) {
df <- dplyr::filter(df1, chrom == c)
tss_df <- dplyr::filter(feature_locations, chrom == c)
dist2tss <- lapply(df$pos, minDist)
dist2tss <- do.call(rbind, dist2tss)
new <- data.frame(matrix(unlist(dist2tss), nrow=nrow(df)))
new$iteration <- j
new$feature <- as.factor(feature)
colnames(new) <- c("bp", "closest_tss", "min_dist", "chrom", "iteration", "feature")
byChrom[[c]] <- new
}
perIter <- do.call(rbind, byChrom)
byIteration[[j]] <- perIter
}
dist2feat <- do.call(rbind, byIteration)
scores[[i]] <- dist2feat
}
final <- do.call(rbind, scores)
rownames(final) <- NULL
final$iteration <- as.factor(final$iteration)
final$chrom <- as.character(final$chrom)
final$min_dist <- as.numeric(as.character(final$min_dist))
return(final)
}
# distOverlay
#'
#' Calculate the distance from each breakpoint to closest motif
#' Overlay the same number of random simulated breakpoints
#' @keywords motif
#' @import dplyr
#' @import ggplot2
#' @import ggpubr
#' @import RColorBrewer
#' @export
distOverlay2 <- function(..., breakpoints = NA, featureDir = 'rawdata/features/', from='bps', lim=2.5, n=2, plot = TRUE, keep=NULL, position = 'centre') {
scaleFactor <- lim*1000
real_data <- dist2motif2(..., breakpoints = breakpoints, featureDir = featureDir, keep=keep, position = position)
sim_data <- dist2motif2(..., featureDir = featureDir, sim = n, position = position)
real_data$Source <- as.factor("Real")
sim_data$Source <- as.factor("Sim")
dummy_iterations <- list()
for (i in levels(sim_data$iteration)){
real_data$iteration <- as.factor(i)
dummy_iterations[[i]] <- real_data
}
real_data <- do.call(rbind, dummy_iterations)
rownames(real_data) <- NULL
real_data$iteration <- factor(real_data$iteration, levels = 1:n)
sim_data$iteration <- factor(sim_data$iteration, levels = 1:n)
# Perform significance testing
pVals_and_df <- simSig2(r = real_data, s = sim_data, max_dist = scaleFactor)
combined <- pVals_and_df[[1]]
pVals <- pVals_and_df[[2]]
if(plot==T){
print(plotdistanceOverlay2(..., d=combined, from=from, facetPlot=FALSE, byChrom=byChrom, lim=lim, n=n, position=position ))
print(pVals)
}else{
print(pVals)
return(list(combined, pVals))
}
}
#' plotdistanceOverlay
#'
#' Plot the distance overlay
#' @param d Dataframe containing combined real + sim data (d <- distOverlay())
#' @import dplyr ggplot2 RColorBrewer scales colorspace cowplot
#' @keywords distance
#' @export
plotdistanceOverlay2 <- function(..., d, from='bps', lim=2.5, n=2, position='centre', histo=FALSE, binWidth = 500){
grDevices::pdf(NULL)
scaleFactor <- lim*1000
scale <- "(Kb)"
lims <- c(as.numeric(paste("-", scaleFactor, sep = '')), scaleFactor)
brks <- c(as.numeric(paste("-", scaleFactor, sep = '')),
as.numeric(paste("-", scaleFactor/10, sep = '')),
scaleFactor/10,
scaleFactor)
labs <- as.character(brks/1000)
expnd <- c(0, 0)
new <- d %>%
mutate(iteration = as.factor(ifelse(Source=='Real', 0, iteration)))
real_fill <- '#3D9DEB'
iterFill <- colorspace::rainbow_hcl(n)
colours <- c(real_fill, iterFill)
plts <- list()
for (i in 1:(length(levels(new$feature)))){
d <- new %>%
filter(feature == levels(new$feature)[i])
p <- ggplot(d)
if(histo) {
p <- p + geom_histogram(data=d[d$Source=="Sim",], aes(min_dist, fill = Source, group = iteration), alpha = 0.1, binwidth = binWidth, position="identity")
p <- p + geom_histogram(data=d[d$Source=="Real",], aes(min_dist, fill = Source, group = iteration), alpha = 0.5, binwidth = binWidth, position="identity")
p <- p + scale_fill_manual(values=colours)
p <- p + scale_y_continuous(paste("Count per", binWidth, "bp bins"))
} else {
p <- p + geom_line(data=d[d$Source=="Real",], aes(min_dist, colour = iteration), size=2, stat='density')
p <- p + geom_line(aes(min_dist, group = interaction(iteration, Source), colour = iteration), alpha = 0.7, size=1, stat='density')
p <- p + scale_color_manual(values=colours)
}
p <- p + scale_x_continuous(
limits = lims,
breaks = brks,
expand = expnd,
labels = labs
)
p <- p +
theme(
legend.position = "none",
panel.background = element_blank(),
plot.background = element_rect(fill = "transparent", colour = NA),
axis.line.x = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 16),
axis.line.y = element_line(color = "black", size = 0.5),
plot.title = element_text(size=22, hjust = 0.5)
)
p <- p + labs(title = paste(d$feature, "\n", position))
plts[[i]] <- p
}
cat("Plotting", length(levels(new$feature)), "plots", "\n")
grDevices::dev.off()
cowplot::plot_grid(plotlist=plts)
}
simSig2 <- function(r, s, test=NA, max_dist=5000){
cat("Calculating descriptive statistics\n")
arrange_data <- function(x){
x <- x %>%
group_by(iteration, feature) %>%
dplyr::mutate( count = n(),
median = median(min_dist),
mean = mean(min_dist),
sd = sd(min_dist),
Source = Source) %>%
dplyr::filter(abs(min_dist) <= max_dist ) %>%
ungroup()
return(x)
}
simulated <- arrange_data(s)
real <- arrange_data(r)
combined <- suppressWarnings(dplyr::full_join(real, simulated))
combined$Source <- as.factor(combined$Source)
simbyFeat = list()
for (f in levels(combined$feature)){
pVals = list()
c <- dplyr::filter(combined, feature==f)
for(i in levels(c$iteration)){
df <- dplyr::filter(c, iteration==i)
rl <- dplyr::filter(df, Source == "Real")
sm <- dplyr::filter(df, Source == "Sim")
result1 <- tryCatch(suppressWarnings(ks.test(rl$min_dist, sm$min_dist)), error=function(err) NA)
result1 <- suppressWarnings(ks.test(rl$min_dist, sm$min_dist))
ksPval <- round(result1$p.value, 4)
result2 <- car::leveneTest(df$min_dist, df$Source, center='median')
result3 <- stats::bartlett.test(df$min_dist, df$Source)
bPval <- round(result3$p.value, 4)
lPval <- round(result2$`Pr(>F)`[1], 4)
rmed <- round(median(rl$min_dist)/1000, 2)
smed <- round(median(sm$min_dist)/1000, 2)
rsd <- round(sd(rl$min_dist)/1000, 2)
ssd <- round(sd(sm$min_dist)/1000, 2)
rKurtosis <- round(kurtosis(rl$min_dist), 2)
sKurtosis <- round(kurtosis(sm$min_dist), 2)
rSkew <- round(skewness(rl$min_dist), 2)
sSkew <- round(skewness(sm$min_dist), 2)
# fStat <- var.test(min_dist ~ Source , df, alternative = "two.sided")
# fRatio <- round(fStat$statistic, 2)
# fStat <- round(fStat$p.value, 4)
sig <- ifelse(lPval <= 0.001, "***",
ifelse(lPval <= 0.01, "**",
ifelse(lPval <= 0.05, "*", "")))
vals <- data.frame(iteration = i,
feature = f,
KS = ksPval,
Levenes = lPval,
# Bartlett = bPval,
# Fstat_ratio = fRatio,
# Fstat = fStat,
real_median = rmed,
sim_median = smed,
real_sd = rsd,
sim_sd = ssd,
real_kurtosis = rKurtosis,
sim_kurtosis = sKurtosis,
real_skew = rSkew,
sim_skew = sSkew,
sig = sig)
pVals[[i]] <- vals
}
pVals_df <- do.call(rbind, pVals)
simbyFeat[[f]] <- pVals_df
}
combined_sig_vals <- do.call(rbind, simbyFeat)
rownames(combined_sig_vals) <- NULL
combined_sig_vals <- combined_sig_vals %>%
arrange(Levenes, KS)
# print(pVals_df, row.names = FALSE)
## Boxplot per chrom
# colours <- c("#E7B800", "#00AFBB")
# cat("Plotting qq plot of min distances\n")
# qqnorm(combined$min_dist)
# qqline(combined$min_dist, col = 2)
# p <- ggplot(combined)
# p <- p + geom_boxplot(aes(chrom, min_dist, fill = Source), alpha = 0.6)
# p <- p + scale_y_continuous("Distance", limits=c(-5000, 5000))
# p <- p + facet_wrap(~iteration, ncol = 2)
# p <- p + scale_fill_manual(values = colours)
# p
return(list(combined, combined_sig_vals))
}
#################
## Development ##
#################
geneEnrichmentPlot <- function(n=0, highlight='kuz', write=FALSE) {
gene_enrichment<-geneEnrichment(n=n)
#gene_enrichment<-filter(gene_enrichment, fpkm > 0)
gene_enrichment <- gene_enrichment %>%
dplyr::mutate(gene = as.character(gene)) %>%
dplyr::mutate(log2FC = as.numeric(log2FC)) %>%
dplyr::mutate(test = as.character(ifelse(log2FC>=0, "enriched", "depleted")))
gene_enrichment <- transform(gene_enrichment, gene = reorder(gene, -abs(log2FC)))
highlightedGene <- dplyr::filter(gene_enrichment, gene == highlight)
highlightedGene <- droplevels(highlightedGene)
p <- ggplot(gene_enrichment)
p <- p + geom_bar(aes(gene, log2FC, fill = as.character(test)), stat="identity")
#p<-p + geom_bar(data=highlightedGene, aes(gene, log2FC, fill="red"), colour="black", stat="identity")
p <- p + guides(fill=FALSE)
p <- p + scale_x_discrete("Gene")
p <- p + cleanTheme() +
theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.text.x = element_text(angle = 90, hjust=1),
axis.text = element_text(size=7)
)
#p<-p + coord_flip()
#p<-p + scale_y_reverse()
if(write){
gene_enrichment_plot <- paste("gene_enrichment.pdf")
cat("Writing file", gene_enrichment_plot, "\n")
ggsave(paste("plots/", gene_enrichment_plot, sep=""), width = 25, height = 5)
}
p
}
### Development
geneLenPlot <- function(n=0,gene_lengths_in="data/gene_lengths.txt"){
gene_enrichment<-geneEnrichment(n=n)
gene_lengths<-read.delim(gene_lengths_in, header = T)
gene_enrichment$length<-as.numeric(gene_enrichment$length)
gene_enrichment$log2FC<-as.numeric(gene_enrichment$log2FC)
gene_enrichment$fc<-as.numeric(gene_enrichment$fc)
gene_enrichment$observed<-as.numeric(gene_enrichment$observed)
# Var in x explained by Y
# par(mfrow=c(2,2))
# plot(enrichment_lm)
# Set new col 'col' to indicate enrichment/depletion
gene_enrichment$col<-as.factor(ifelse(gene_enrichment$log2FC > 0, 'enrichment', 'depletion'))
# Only keep relevant cols
gene_enrichment<-gene_enrichment[,c("gene","fpkm","observed",'expected','fc', 'log2FC', 'col')]
gene_enrichment<-droplevels(gene_enrichment)
# Join both df on 'gene'
gene_lengths_df<-join(as.data.frame(gene_lengths), gene_enrichment, 'gene', type = "left")
# Clean up null/na vals
gene_lengths_df$fpkm <- ifelse(gene_lengths_df$fpkm=='NULL' | gene_lengths_df$fpkm=='NA' | is.na(gene_lengths_df$fpkm), 0, gene_lengths_df$fpkm)
gene_lengths_df$level <- ifelse(gene_lengths_df$fpkm == 0 , 'not_expressed', 'expressed')
gene_lengths_df$observed <- ifelse(gene_lengths_df$observed == 'NULL' | gene_lengths_df$observed == 'NA', 0, gene_lengths_df$observed)
gene_lengths_df$level<-as.factor(gene_lengths_df$level)
# Allow colouring of expressed/enriched/depleted
gene_lengths_df$col <- ifelse(is.na(gene_lengths_df$col), 'NA', as.character(gene_lengths_df$col))
gene_lengths_df$col <- ifelse(gene_lengths_df$fpkm > 0, 'expressed', gene_lengths_df$col)
# New col log10 length
gene_lengths_df$log10length<-log10(gene_lengths$length)
gene_lengths_df<-filter(gene_lengths_df, length >= 1000, length < 200000)
gene_lengths_df<-droplevels(gene_lengths_df)
#
# Linear model (predicter ~ predictor)
enrichment_lm <- lm(observed ~ length, data = gene_lengths_df)
# Exponential model
enrichment_exp <- lm(log(observed) ~ length, data = gene_lengths_df)
# plot( observed ~ length, data = gene_enrichment)
lmRsq<-round(summary(enrichment_lm)$adj.r.squared, 2)
expRsq<-round(summary(enrichment_exp)$adj.r.squared, 2)
#summary(pois <- glm(observed ~ length, family="poisson", data=gene_lengths_df))
gene_lengths_p<-filter(gene_lengths_df, col != 'NA')
p <- ggplot(gene_lengths_p, aes(log10length, observed))
p <- p + geom_jitter(aes( colour = col, alpha = 0.8))
p <- p + scale_color_manual(values=c("#F8766D", "#00AFBB", "#E7B800"))
p <- p + scale_x_continuous("Log10 Kb", limits=c(3, max(gene_lengths_p$log10length)))
p <- p + scale_y_continuous("Count", limits=c(0,max(gene_lengths_p$observed)))
p <- p + scale_size_continuous(range=c(0, abs(max(gene_lengths_p$log2FC))))
p <- p + annotate(x = 3.5, y = 20, geom="text", label = paste('Lin:R^2:', lmRsq), size = 7,parse = TRUE)
p <- p + annotate(x = 3.5, y = 17, geom="text", label = paste('Exp:R^2:', expRsq), size = 7,parse = TRUE)
# Default model is formula = y ~ x
# How much variation in X is explained by Y
# How muc var in length is explained by observation
p <- p + geom_smooth(method=lm, show.legend = FALSE) # linear
p <- p + geom_smooth(method=lm, formula = y ~ poly(x, 2), colour = "orange", show.legend = FALSE) #Quadratic
#p <- p + geom_smooth(method=glm, method.args = list(family = "poisson"), colour = "red", se=T)
#p <- p + geom_smooth(colour="orange") # GAM
p <- p + cleanTheme()
p <- p + geom_rug(aes(colour=col,alpha=.8),sides="b")
p <- p + guides(alpha = FALSE)
colours<-c( "#E7B800", "#00AFBB")
p2<-ggplot(gene_lengths_df)
p2<-p2 + geom_density(aes(log10length, fill=level),alpha = 0.4)
p2 <- p2 + cleanTheme()
p2 <- p2 + scale_x_continuous("Log10 Kb", limits=c(3, max(gene_lengths_df$log10length)))
p2 <- p2 + guides(alpha = FALSE)
#p2 <- p2 + geom_rug(inherit.aes = F, aes(log10length,colour=level),alpha=0.2, sides = "tb")
p2 <- p2 + geom_rug(data=subset(gene_lengths_df,level=="expressed"), aes(log10length,colour=level),alpha=0.7, sides = "b")
p2 <- p2 + geom_rug(data=subset(gene_lengths_df,level=="not_expressed"), aes(log10length,colour=level),alpha=0.2, sides = "t")
p2 <- p2 + scale_fill_manual(values=colours)
p2 <- p2 + scale_colour_manual(values=colours)
# p<-ggscatter(gene_lengths_p, x = "log10length", y = "observed",
# color = "col", size = "log2FC"
# )
# p2<-ggdensity(gene_lengths_df, x = "log10length",
# add = "mean", rug = TRUE,
# color = "level", fill = "level",
# palette = c("#00AFBB", "#E7B800")
# )
#
combined_plots <- ggarrange(p, p2,
labels = c("A", "B"),
ncol = 1, nrow = 2)
gene_len<-paste("gene_lengths_count_model_log10.pdf")
cat("Writing file", gene_len, "\n")
ggsave(paste("plots/", gene_len, sep=""), width = 10, height = 10)
combined_plots
}
getPromoter <- function(gene_lengths_in="data/gene_lengths.txt"){
gene_lengths<-read.delim(gene_lengths_in, header = T)
gene_lengths$promoter<-ifelse(gene_lengths$start<gene_lengths$end,
gene_lengths$start- 1500,
gene_lengths$end + 1500)
gene_lengths<-gene_lengths[,c("chrom", "promoter")]
colnames(gene_lengths)<-NULL
return(gene_lengths)
}
dist2Feat <- function(feature_file="data/tss_locations.txt",sim=NA, print=0,send=0, feature='tss'){
if(is.na(sim)){
snv_data<-getData()
}
else{
cat("Generating simulated snv_data\n")
hit_count<-nrow(getData())
snv_data<-snvSim(N=hit_count, write=print)
colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
snv_data<-droplevels(snv_data)
}
feature<-paste(toupper(substr(feature, 1, 1)), substr(feature, 2, nchar(feature)), sep='')
if(feature=='Promoter'){
feature_locations<-getPromoter()
cat("Getting gene promoter locations...\n")
}
else{
feature_locations<-read.delim(feature_file, header = F)
cat("Reading in file:", feature_file, sep =' ', "\n")
}
cat("Calculating distances to", feature, sep=' ', "\n")
colnames(feature_locations)<-c("chrom", "pos")
feature_locations$pos<-as.integer(feature_locations$pos)
# Will throw error if SVs don't exist on a chrom...
# Removes chroms with fewer than 10 observations
svCount <- table(snv_data$chrom)
snv_data <- subset(snv_data, chrom %in% names(svCount[svCount >= 10]))
snv_data<-droplevels(snv_data)
feature_locations <- subset(feature_locations, chrom %in% levels(snv_data$chrom))
feature_locations<-droplevels(feature_locations)
fun2 <- function(p) {
index<-which.min(abs(tss_df$pos - p))
closestTss<-tss_df$pos[index]
# browser()
chrom<-as.character(tss_df$chrom[index])
gene<-as.character(tss_df$gene[index])
dist<-(p-closestTss)
list(p, closestTss, dist, chrom, gene)
}
l <- list()
for (c in levels(snv_data$chrom)){
df<-filter(snv_data, chrom == c)
tss_df<-filter(feature_locations, chrom == c)
dist2tss<-lapply(df$pos, fun2)
dist2tss<-do.call(rbind, dist2tss)
dist2tss<-as.data.frame(dist2tss)
colnames(dist2tss)=c("bp", "closest_tss", "min_dist", "chrom", "closest_gene")
dist2tss$min_dist<-as.numeric(dist2tss$min_dist)
l[[c]] <- dist2tss
}
dist2tss<-do.call(rbind, l)
dist2tss<-as.data.frame(dist2tss)
dist2tss$chrom<-as.character(dist2tss$chrom)
dist2tss<-arrange(dist2tss,(abs(min_dist)))
if(send==1){
return(dist2tss)
}
else{
p<-ggplot(dist2tss)
p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
p<-p + scale_x_continuous(paste("Distance to", feature, "(Kb)", sep=' '),
limits=c(-10000, 10000),
breaks=c(-10000,-1000, 1000, 10000),
expand = c(.0005, .0005),
labels=c("-10", "-1", "1", "10") )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
#p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
p <- p + geom_rug(aes(min_dist, colour=chrom))
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
if(is.na(sim)){
distout<-paste("snv", feature, 'dist.pdf', sep='')
}
else{
distout<-paste("snv", feature, 'dist_sim.pdf', sep='')
}
cat("Writing file", distout, "\n")
ggsave(paste("plots/", distout, sep=""), width = 20, height = 10)
p
}
}
distOverlay <- function(feature_file="data/tss_locations.txt", feature='tss', lim=10,all=NA){
feature<-paste(toupper(substr(feature, 1, 1)), substr(feature, 2, nchar(feature)), sep='')
if(feature=='promoter'){
real_data<-dist2Feat(send=1, feature=feature)
sim_data<-dist2Feat(feature=feature, sim=1, send=1)
}
else{
real_data<-dist2Feat(feature_file=feature_file, send=1, feature=feature)
sim_data<-dist2Feat(feature_file=feature_file, feature=feature, sim=1, send=1)
}
real_data$Source<-"Real"
sim_data$Source<-"Sim"
sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
sim_data<-droplevels(sim_data)
real_data<-filter(real_data, chrom != "Y", chrom != 4)
real_data<-droplevels(real_data)
colours<-c( "#E7B800", "#00AFBB")
scale<-"(Kb)"
if(lim==0.1){
cat("Setting limits to -+100bp\n")
lims=c(-100, 100)
brks=c(-100, -10, 10, 100)
expnd = c(.0005, .0005)
labs=c("-100", "-10", "10", "100")
scale<-"(bp)"
}
else if(lim==0.5){
cat("Setting limits to -+0.5kb\n")
lims=c(-500, 500)
brks=c(-500, -100,100, 500)
expnd = c(.0005, .0005)
labs=c("-500", "-100", "100", "500")
scale<-"(bp)"
}
else if(lim==1){
cat("Setting limits to -+1kb\n")
lims=c(-1000, 1000)
brks=c(-1000, 1000)
expnd = c(.0005, .0005)
labs=c("-1", "1")
}
else{
cat("Setting limits to -+10kb\n")
lims=c(-10000, 10000)
brks=c(-10000,-1000, 1000, 10000)
expnd = c(.0005, .0005)
labs=c("-10", "-1", "1", "10")
}
p<-ggplot()
p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)
if(is.na(all)){
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
}
p<-p + scale_x_continuous(paste("Distance to", feature, scale, sep=' '),
limits=lims,
breaks=brks,
expand=expnd,
labels=labs )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")
p <- p + scale_fill_manual(values=colours)
p <- p + scale_colour_manual(values=colours)
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
overlay<-paste("snv", feature, 'dist_overlay.pdf', sep='')
cat("Writing file", overlay, "\n")
ggsave(paste("plots/", overlay, sep=""), width = 25, height = 10)
p
}
#' tssDist
#'
#' Plot distance to TSS distribution
#' @param tss_pos File containing all TSS positions in genome: "gene chrom tss" [Default 'data/tss_positions.txt']
#' @param sim Simulate random SNVs accross genomic intervals? [Default: NO]
#' @param print Write the simulated random SNVs to a bed file ('data/simulatedSNVs.bed')? [Default: NO]
#' @import ggplot2
#' @keywords tss
#' @export
tssDist <- function(tss_pos="data/tss_positions.txt",sim=NA, print=0,return=0){
tss_locations<-read.delim(tss_pos, header = T)
tss_locations$tss<-as.integer(tss_locations$tss)
if(is.na(sim)){
snv_data<-getData()
}
else{
cat("Generating simulated snv_data\n")
hit_count<-nrow(getData())
snv_data<-snvSim(N=hit_count, write=print)
colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
snv_data<-droplevels(snv_data)
}
# Will throw error if SVs don't exist on a chrom...
# Removes chroms with fewer than 20 observations
svCount <- table(snv_data$chrom)
snv_data <- subset(snv_data, chrom %in% names(svCount[svCount > 30]))
snv_data<-droplevels(snv_data)
tss_locations <- subset(tss_locations, chrom %in% levels(snv_data$chrom))
tss_locations<-droplevels(tss_locations)
fun2 <- function(p) {
index<-which.min(abs(tss_df$tss - p))
closestTss<-tss_df$tss[index]
chrom<-as.character(tss_df$chrom[index])
gene<-as.character(tss_df$gene[index])
dist<-(p-closestTss)
list(p, closestTss, dist, chrom, gene)
}
l <- list()
for (c in levels(snv_data$chrom)){
df<-filter(snv_data, chrom == c)
tss_df<-filter(tss_locations, chrom == c)
dist2tss<-lapply(df$pos, fun2)
dist2tss<-do.call(rbind, dist2tss)
dist2tss<-as.data.frame(dist2tss)
colnames(dist2tss)=c("bp", "closest_tss", "min_dist", "chrom", "closest_gene")
dist2tss$min_dist<-as.numeric(dist2tss$min_dist)
l[[c]] <- dist2tss
}
dist2tss<-do.call(rbind, l)
dist2tss<-as.data.frame(dist2tss)
dist2tss$chrom<-as.character(dist2tss$chrom)
dist2tss<-arrange(dist2tss,(abs(min_dist)))
# Removes chroms with fewer than 20 observations
# svCount <- table(dist2tss$chrom)
# dist2tss <- subset(dist2tss, chrom %in% names(svCount[svCount > 10]))
if(return==1){
return(dist2tss)
}
else{
p<-ggplot(dist2tss)
p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
p<-p + scale_x_continuous("Distance to TSS (Kb)",
limits=c(-10000, 10000),
breaks=c(-10000,-1000, 1000, 10000),
expand = c(.0005, .0005),
labels=c("-10", "-1", "1", "10") )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
#p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
p <- p + geom_rug(aes(min_dist, colour=chrom))
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
if(is.na(sim)){
tssDistout<-paste("bpTSSdist.pdf")
}
else{
tssDistout<-paste("bpTSSdist_sim.pdf")
}
cat("Writing file", tssDistout, "\n")
ggsave(paste("plots/", tssDistout, sep=""), width = 20, height = 10)
p
}
}
tssDistOverlay <- function(){
real_data<-tssDist(return=1)
real_data$Source<-"Real"
sim_data<-bpTssDist(sim=1, return=1)
sim_data$Source<-"Sim"
sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
sim_data<-droplevels(sim_data)
real_data<-filter(real_data, chrom != "Y", chrom != 4)
real_data<-droplevels(real_data)
colours<-c( "#E7B800", "#00AFBB")
p<-ggplot()
p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
p<-p + scale_x_continuous("Distance to TSS (Kb)",
limits=c(-100000, 100000),
breaks=c(-100000,-10000,-1000, 1000, 10000, 100000),
expand = c(.0005, .0005),
labels=c("-100", "-10", "-1", "1", "10", "100") )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
#p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")
p <- p + scale_fill_manual(values=colours)
p <- p + scale_colour_manual(values=colours)
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
tssDistout<-paste("bpTSSdist_overlay.pdf")
cat("Writing file", tssDistout, "\n")
ggsave(paste("plots/", tssDistout, sep=""), width = 20, height = 10)
p
}
#' snvSim
#'
#' Generate simulated SNV hits acroos genomic regions (e.g. mappable regions)
#' @param intervals File containing genomic regions within which to simulate SNVs [Default 'data/intervals.bed]
#' @param N Number of random SNVs to generate [Default nrow(snv_data)]
#' @import GenomicRanges
#' @keywords sim
#' @export
snvSim <- function(intervals="data/intervals.bed", N=1000, write=F){
suppressPackageStartupMessages(require(GenomicRanges))
intFile <- import.bed(intervals)
space <- sum(width(intFile))
positions <- sample(c(1:space), N)
cat("Simulating", N, "SNVs", sep = ' ', "\n")
new_b <- GRanges(seqnames=as.character(rep(seqnames(intFile), width(intFile))),
ranges=IRanges(start=unlist(mapply(seq, from=start(intFile), to=end(intFile))), width=1))
bedOut<-new_b[positions]
if(write){
export.bed(new_b[positions], "data/simulatedSNVs.bed")
}
remove(new_b)
return(data.frame(bedOut))
}
svDist <- function(svs="data/all_bps_filtered.txt",sim=NA, print=0){
svBreaks<-read.delim(svs, header = F)
colnames(svBreaks) <- c("event", "bp_no", "sample", "chrom", "bp", "gene", "feature", "type", "length")
svBreaks$bp<-as.integer(svBreaks$bp)
svBreaks<-filter(svBreaks, sample != "A373R1" & sample != "A373R7" & sample != "A512R17" )
svBreaks <- droplevels(svBreaks)
snv_data<-getData()
if(!is.na(sim)){
simrep<-nrow(snv_data)
cat("Generating simulated snv_data for", simrep, "SNVs", "\n")
snv_data<-snvSim(N=simrep, write=print)
snv_data$end<-NULL
snv_data$width<-NULL
snv_data$strand<-NULL
snv_data$sample <- as.factor(sample(levels(svBreaks$sample), size = nrow(snv_data), replace = TRUE))
#snv_data$type <- sample(levels(svBreaks$type), size = nrow(snv_data), replace = TRUE)
colnames(snv_data)<-c("chrom", "pos", "sample")
snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
snv_data<-droplevels(snv_data)
}
snv_data <- subset(snv_data, sample %in% levels(svBreaks$sample))
snv_data <- droplevels(snv_data)
snv_data <- subset(snv_data, chrom %in% levels(svBreaks$chrom))
snv_data <- droplevels(snv_data)
fun3 <- function(p) {
index<-which.min(abs(sv_df$bp - p))
closestBp<-as.numeric(sv_df$bp[index])
chrom<-as.character(sv_df$chrom[index])
gene<-as.character(sv_df$gene[index])
sample<-as.character(sv_df$sample[index])
type<-as.character(sv_df$type[index])
dist<-(p-closestBp)
list(p, closestBp, dist, chrom, gene, type, sample)
}
l <- list()
for (c in levels(snv_data$chrom)){
for (s in levels(snv_data$sample)){
df<-filter(snv_data, chrom == c & sample == s)
sv_df<-filter(svBreaks, chrom == c & sample == s)
dist2bp<-lapply(df$pos, fun3)
dist2bp<-do.call(rbind, dist2bp)
dist2bp<-as.data.frame(dist2bp)
colnames(dist2bp)=c("snp", "closest_bp", "min_dist", "chrom", "closest_gene", "type", "sample")
dist2bp$min_dist<-as.numeric(dist2bp$min_dist)
l[[s]] <- dist2bp
}
l[[c]] <- dist2bp
}
dist2bp<-do.call(rbind, l)
dist2bp<-as.data.frame(dist2bp)
dist2bp$chrom<-as.character(dist2bp$chrom)
dist2bp$type<-as.character(dist2bp$type)
snvCount <- table(dist2bp$chrom)
dist2bp <- subset(dist2bp, chrom %in% names(snvCount[snvCount > 25]))
dist2bp<-arrange(dist2bp,(abs(min_dist)))
dist2bp <- dist2bp %>% na.omit()
p<-ggplot(dist2bp)
p<-p + geom_density(aes(min_dist, fill=type), alpha = 0.3)
# p<-p + scale_x_continuous("Distance to SV BP (Kb)",
# limits=c(-10000000, 10000000),
# breaks=c(-10000000, -100000, -10000, -1000, 0, 1000, 10000, 100000, 10000000),
# expand = c(.0005, .0005),
# labels=c("-10000", "-100", "-10", "-1", 0, "1", "10", "100", "10000") )
p<-p + scale_y_continuous("Density", expand = c(0, 0))
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
p<-p + cleanTheme()
p<-p + facet_wrap(~chrom, ncol = 2, scales = "free_y")
p
# p<-ggplot(dist2bp)
# p<-p + geom_histogram(aes(as.numeric(min_dist, fill = type)), alpha = 0.6, binwidth = 1000)
# p<-p + scale_x_continuous("Distance to TSS", limits=c(-1000000, 1000000))
# p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
# p
}
g4Dist <- function(g4_pos="data/g4_positions.txt",sim=NA, print=0,return=0){
g4_pos="data/g4_positions.txt"
g4_locations<-read.delim(g4_pos, header = T)
g4_locations$g4<-as.integer(g4_locations$g4)
if(is.na(sim)){
snv_data<-getData()
}
else{
cat("Generating simulated snv_data\n")
hit_count<-nrow(getData())
snv_data<-snvSim(N=hit_count, write=print)
colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
snv_data<-droplevels(snv_data)
}
# Will throw error if SVs don't exist on a chrom...
# Removes chroms with fewer than 20 observations
svCount <- table(snv_data$chrom)
snv_data <- subset(snv_data, chrom %in% names(svCount[svCount > 30]))
snv_data<-droplevels(snv_data)
g4_locations <- subset(g4_locations, chrom %in% levels(snv_data$chrom))
g4_locations<-droplevels(g4_locations)
fun2 <- function(p) {
index<-which.min(abs(g4_df$g4 - p))
closestTss<-g4_df$g4[index]
chrom<-as.character(g4_df$chrom[index])
gene<-as.character(g4_df$gene[index])
dist<-(p-closestTss)
list(p, closestTss, dist, chrom, gene)
}
l <- list()
for (c in levels(snv_data$chrom)){
df<-filter(snv_data, chrom == c)
g4_df<-filter(g4_locations, chrom == c)
dist2g4<-lapply(df$pos, fun2)
dist2g4<-do.call(rbind, dist2g4)
dist2g4<-as.data.frame(dist2g4)
colnames(dist2g4)=c("bp", "closest_g4", "min_dist", "chrom", "closest_gene")
dist2g4$min_dist<-as.numeric(dist2g4$min_dist)
l[[c]] <- dist2g4
}
dist2g4<-do.call(rbind, l)
dist2g4<-as.data.frame(dist2g4)
dist2g4$chrom<-as.character(dist2g4$chrom)
dist2g4<-arrange(dist2g4,(abs(min_dist)))
# Removes chroms with fewer than 20 observations
# svCount <- table(dist2g4$chrom)
# dist2g4 <- subset(dist2g4, chrom %in% names(svCount[svCount > 10]))
if(return==1){
return(dist2g4)
}
else{
p<-ggplot(dist2g4)
p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
p<-p + scale_x_continuous("Distance to G4 (Kb)",
limits=c(-10000, 10000),
breaks=c(-10000,-1000, 1000, 10000),
expand = c(.0005, .0005),
labels=c("-10", "-1", "1", "10") )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
#p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
p <- p + geom_rug(aes(min_dist, colour=chrom))
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
if(is.na(sim)){
g4Distout<-paste("bpG4dist.pdf")
}
else{
g4Distout<-paste("bpG4dist_sim.pdf")
}
cat("Writing file", g4Distout, "\n")
ggsave(paste("plots/", g4Distout, sep=""), width = 20, height = 10)
p
}
}
g4DistOverlay <- function(){
real_data<-g4Dist(return=1)
real_data$Source<-"Real"
sim_data<-bpTssDist(sim=1, return=1)
sim_data$Source<-"Sim"
sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
sim_data<-droplevels(sim_data)
real_data<-filter(real_data, chrom != "Y", chrom != 4)
real_data<-droplevels(real_data)
colours<-c( "#E7B800", "#00AFBB")
p<-ggplot()
p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
p<-p + scale_x_continuous("Distance to G4 (Kb)",
limits=c(-10000, 10000),
breaks=c(-10000,-1000, 1000, 10000),
expand = c(.0005, .0005),
labels=c("-10", "-1", "1", "10") )
p<-p + scale_y_continuous("Density")
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
#p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")
p <- p + scale_fill_manual(values=colours)
p <- p + scale_colour_manual(values=colours)
p<-p + cleanTheme() +
theme(strip.text = element_text(size=20),
legend.position="top")
p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
g4Distout<-paste("bpG4dist_overlay.pdf")
cat("Writing file", g4Distout, "\n")
ggsave(paste("plots/", g4Distout, sep=""), width = 20, height = 10)
p
}
#' chromDist
#'
#' Plot genome-wide snv distribution
#' @import ggplot2
#' @keywords distribution
#' @export
chromDist <- function(object=NA, notch=0){
snv_data<-getData()
ext<-'.pdf'
if(is.na(object)){
object<-'grouped_trans'
cols<-setCols(snv_data, "grouped_trans")
}
if(notch){
snv_data<-exclude_notch()
ext<-'_excl.N.pdf'
}
cat("Plotting snvs by", object, "\n")
p<-ggplot(snv_data)
p<-p + geom_histogram(aes(pos/1000000, fill = get(object)), binwidth=0.1, alpha = 0.8)
p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 2)
p<-p + scale_x_continuous("Mbs", breaks = seq(0,33,by=1), limits = c(0, 33),expand = c(0.01, 0.01))
p<-p + scale_y_continuous("Number of snvs", expand = c(0.01, 0.01))
p<-p + cleanTheme() +
theme(axis.text.x = element_text(angle = 45, hjust=1),
axis.text = element_text(size=12),
axis.title = element_text(size=20),
strip.text.x = element_text(size = 15)
)
if (object == 'grouped_trans'){
p<-p + cols
}
chrom_outfile<-paste("snv_dist_genome_by_", object, ext, sep = "")
cat("Writing file", chrom_outfile, "\n")
ggsave(paste("plots/", chrom_outfile, sep=""), width = 20, height = 10)
p
}
mutationTypes <- function(allele_frequency = 0.1){
snv_data<-getData()
snv_data <- snv_data %>%
filter(dups!="TRUE") %>%
filter(a_freq>=allele_frequency)
library(dplyr)
mutCounts <- snv_data %>%
group_by(sample, grouped_trans) %>%
summarise(count=n()) %>%
mutate(perc=count/sum(count))
p <- ggplot(mutCounts)
p <- p + geom_bar(aes(sample, perc*100, fill=grouped_trans), stat="identity")
p <- p + cleanTheme() +
theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
axis.text.x = element_text(angle = 90, hjust=1),
axis.text.y = element_text(size=15),
axis.title = element_text(size=20),
strip.text.x = element_text(size = 15)
)
p <- p + ylab("% contribution")
p
}
###########
## Misc ##
###########
# Gene lengths on accross chroms
#gene_lengths="data/gene_lengths.txt"
#gene_lengths<-read.delim(gene_lengths, header = T)
#p<-ggplot(gene_lengths)
#p<-p + geom_point(aes( x=(start+(length/2)), y=log10(length), colour = chrom))
#p<-p + facet_wrap(~chrom, scale = "free_x")
#p
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.