#' tssDist
#'
#' Plot distance to TSS distribution
#' @param tss_pos File containing all TSS positions in genome: "gene chrom tss" [Default 'data/tss_positions.txt']
#' @param sim Simulate random SNVs accross genomic intervals? [Default: NO]
#' @param print Write the simulated random SNVs to a bed file ('data/simulatedSNVs.bed')? [Default: NO]
#' @import ggplot2
#' @keywords tss
#' @export
tssDist <- function(tss_pos="data/tss_positions.txt",sim=NA, print=0){
tss_locations<-read.delim(tss_pos, header = T)
tss_locations$tss<-as.integer(tss_locations$tss)
data<-getData()
if(!is.na(sim)){
simrep<-nrow(data)
cat("Generating simulated data for", simrep, "SNVs", "\n")
data<-snvSim(N=simrep, write=print)
colnames(data)<-c("chrom", "pos", "v3", "v4", "v5")
data<-filter(data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
data<-droplevels(data)
}
fun2 <- function(p) {
index<-which.min(abs(tss_df$tss - p))
closestTss<-tss_df$tss[index]
chrom<-as.character(tss_df$chrom[index])
gene<-as.character(tss_df$gene[index])
dist<-(p-closestTss)
list(p, closestTss, dist, chrom, gene)
}
l <- list()
for (c in levels(data$chrom)){
df<-filter(data, chrom == c)
tss_df<-filter(tss_locations, chrom == c)
dist2tss<-lapply(df$pos, fun2)
dist2tss<-do.call(rbind, dist2tss)
dist2tss<-as.data.frame(dist2tss)
colnames(dist2tss)=c("snp", "closest_tss", "min_dist", "chrom", "closest_gene")
dist2tss$min_dist<-as.numeric(dist2tss$min_dist)
l[[c]] <- dist2tss
}
dist2tss<-do.call(rbind, l)
dist2tss<-as.data.frame(dist2tss)
dist2tss$chrom<-as.character(dist2tss$chrom)
dist2tss<-arrange(dist2tss,(abs(min_dist)))
# Removes chroms with fewer than 20 observations
snvCount <- table(dist2tss$chrom)
dist2tss <- subset(dist2tss, chrom %in% names(snvCount[snvCount > 25]))
p<-ggplot(dist2tss)
p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
p<-p + scale_x_continuous("Distance to TSS (Kb)",
limits=c(-100000, 100000),
breaks=c(-100000, -10000, -1000, 0, 1000, 10000, 100000),
expand = c(.0005, .0005),
labels=c("-100", "-10", "-1", 0, "1", "10", "100") )
p<-p + scale_y_continuous("Density", expand = c(0, 0))
p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
p<-p + cleanTheme()
#p<-p + facet_wrap(~chrom, ncol = 2, scales = "free_y")
p
# p<-ggplot(dist2tss)
# p<-p + geom_histogram(aes(min_dist, fill = chrom), alpha = 0.6, bins=500)
# p<-p + scale_x_continuous("Distance to TSS", limits=c(-1000, 1000))
# p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
# p
}
#' snvSim
#'
#' Generate simulated SNV hits acroos genomic regions (e.g. mappable regions)
#' @param intervals File containing genomic regions within which to simulate SNVs [Default 'data/intervals.bed]
#' @param N Number of random SNVs to generate [Default nrow(data)]
#' @import GenomicRanges
#' @keywords sim
#' @export
snvSim <- function(intervals="data/intervals.bed", N=1000, write=F){
suppressPackageStartupMessages(require(GenomicRanges))
intFile <- import.bed(intervals)
space <- sum(width(intFile))
positions <- sample(c(1:space), N)
new_b <- GRanges(seqnames=as.character(rep(seqnames(intFile), width(intFile))),
ranges=IRanges(start=unlist(mapply(seq, from=start(intFile), to=end(intFile))), width=1))
bedOut<-new_b[positions]
if(write){
export.bed(new_b[positions], "data/simulatedSNVs.bed")
}
remove(new_b)
return(data.frame(bedOut))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.