mutationProfiles.R

list.of.packages <- c('ggplot2', 'dplyr', 'plyr', 'RColorBrewer',
                      'BSgenome.Dmelanogaster.UCSC.dm6', 'deconstructSigs',
                      'reshape', 'data.table', 'ggpubr', 'plotly', 'grid', 'VennDiagram')
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)){
  cat('Installing missing packages...\n')
  install.packages(new.packages)
}
cat('Silently loading packages...')
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(plyr))
suppressMessages(library(RColorBrewer))
suppressMessages(library(BSgenome.Dmelanogaster.UCSC.dm6))
suppressMessages(library(deconstructSigs))
suppressMessages(library(reshape))
suppressMessages(library(data.table))
suppressMessages(library(ggpubr))
suppressMessages(library(plotly))
suppressMessages(library(grid))
suppressMessages(library(VennDiagram))

set.seed(42)


#' getData
#'
#' Function to clean cnv files
#' @param infile File to process [Required]
#' @keywords get
#' @import dplyr
#' @export
#' @return Dataframe
getData <- function(infile = "data/annotated_snvs.txt", expression_data='data/isc_genes_rnaSeq.csv'){
  snv_data<-read.delim(infile, header = T)
  colnames(snv_data)=c("sample", "chrom", "pos", "ref", "alt", "tri", "trans", "decomposed_tri", "grouped_trans", "a_freq", "caller", "variant_type", "status", "snpEff_anno", "feature", "gene", "id")

  # Read in tissue specific expression data
  seq_data<-read.csv(header = F, expression_data)
  colnames(seq_data)<-c('id', 'fpkm')

  snv_data <- join(snv_data,seq_data,"id", type = 'left')

  snv_data$fpkm <- ifelse(is.na(snv_data$fpkm), 0, round(as.numeric(snv_data$fpkm), 1))

  # Order by FPKM
  snv_data<- dplyr::arrange(snv_data, desc(fpkm))

  # Find vars called by both Mu and Var
  # Must also filter one of these calls out...
  snv_data$dups<-duplicated(snv_data[,1:3])
  snv_data<-mutate(snv_data, caller = ifelse(dups == "TRUE", 'varscan2_mutect2' , as.character(caller)))

  ##############
  ## Filters ###
  ##############

  # Filter for calls made by both V and M
  # snv_data<-filter(snv_data, caller == 'mutect2' | caller == 'varscan2_mutect2')

  # Filter for old/new data
  # cat("Filtering for old/new data\n")
  # snv_data <- filter(snv_data, !grepl("^A|H", sample))

  # Filter for genes expressed in RNA-Seq data
  # cat("Filtering out non-expressed genes\n")
  # snv_data<-filter(snv_data, !is.na(fpkm) & fpkm > 0.1)

  # Filter for genes NOT expressed in RNA-Seq data
  # cat("Filtering out expressed genes\n")
  # snv_data<-filter(snv_data, fpkm == 0)

  # Filter on allele freq
  # cat("Filtering on allele frequency\n")
  #snv_data<-filter(snv_data, is.na(a_freq))
  # snv_data<-filter(snv_data, a_freq >= 0.20)

  # Filter out samples
  # snv_data<-filter(snv_data, sample != "A373R1" & sample != "A373R7" & sample != "A512R17" )
  # snv_data <- filter(snv_data, !sample %in% c("A373R1", "A373R7", "A512R17", "A373R11", "A785-A788R1", "A785-A788R11", "A785-A788R3", "A785-A788R5", "A785-A788R7", "A785-A788R9"))
  # snv_data<-filter(snv_data, sample != "A373R11" & sample != 'A373R13')
  # snv_data <- snv_data %>%
  #   filter(!sample %in% c("A373R1", "A373R7", "A512R17", "A373R11", "D050R07-2")) %>%
  #   droplevels()

  # snv_data <- snv_data %>%
  #   filter(sample %in% c("D050R01", "D050R03", "D050R05", "D050R07-1")) %>%
  #   droplevels()

  dir.create(file.path("plots"), showWarnings = FALSE)
  return(snv_data)
}


#' cleanTheme
#'
#' Clean theme for plotting
#' @param base_size Base font size [Default 12]
#' @import ggplot2
#' @keywords theme
#' @export

cleanTheme <- function(base_size = 12){
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    panel.background = element_blank(),
    plot.background = element_rect(fill = "transparent",colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    axis.text = element_text(size=12),
    axis.title = element_text(size=30)
  )
}


#' genTris
#'
#' This function returns all possible trinucleotide combinations
#' @keywords trinucleotides
#' @export
#' @return Character string containing all 96 trinucleotides
#' genTris()

genTris <- function(){
  all.tri = c()
  for(i in c("A", "C", "G", "T")){
    for(j in c("C", "T")){
      for(k in c("A", "C", "G", "T")){
        if(j != k){
          for(l in c("A", "C", "G", "T")){
            tmp = paste(i, "[", j, ">", k, "]", l, sep = "")
            all.tri = c(all.tri, tmp)
          }
        }
      }
    }
  }
  all.tri <- all.tri[order(substr(all.tri, 3, 5))]
  return(all.tri)
}


#' setCols
#'
#' Get colours for n levels
#' @import RColorBrewer
#' @param df Dataframe [Required]
#' @param col Column of snv_dataframe. Colours will be set to levels(df$cols) [Required]
#' @keywords cols
#' @export

setCols <- function(df, col){
  names<-levels(df[[col]])
  cat("Setting colour levles:", names, "\n")
  level_number<-length(names)
  mycols<-brewer.pal(level_number, "Set2")
  names(mycols) <- names
  colScale <- scale_fill_manual(name = col,values = mycols)
  return(colScale)
}


#' snvStats
#'
#' Calculate some basic stats for snv snv_data
#' @import dplyr
#' @keywords stats
#' @export

snvStats <- function(){
  snv_data<-getData()
  cat("sample", "snvs", sep='\t', "\n")
  rank<-sort(table(snv_data$sample), decreasing = TRUE)
  rank<-as.array(rank)

  total=0

  scores=list()
  for (i in 1:nrow(rank)){
    cat(names(rank[i]), rank[i], sep='\t', "\n")
    total<-total + rank[i]
    scores[i]<-rank[i]
  }


  cat('--------------', '\n')
  scores<-unlist(scores)

  mean<-as.integer(mean(scores))
  med<-as.integer(median(scores))

  cat('total', total, sep='\t', '\n')
  cat('samples', nrow(rank), sep='\t', '\n')

  cat('--------------', '\n')
  cat('mean', mean, sep='\t', '\n')
  cat('median', med, sep='\t', '\n')

  cat('\n')
  all_ts<-nrow(filter(snv_data, trans == "A>G" | trans == "C>T" | trans == "G>A" | trans == "T>C"))
  all_tv<-nrow(filter(snv_data, trans != "A>G" & trans != "C>T" & trans != "G>A" & trans != "T>C"))
  ts_tv<-round((all_ts/all_tv), digits=2)
  cat("ts/tv = ", ts_tv,  sep='', '\n')

}



#' rainfall
#'
#' Plot log10 distances between snvs as rainfall plot
#' @import ggplot2
#' @keywords rainfall
#' @export

rainfall <- function(){
  snv_data <- getData()

  distances <- do.call(rbind, lapply(split(snv_data[order(snv_data$chrom, snv_data$pos),], snv_data$chrom[order(snv_data$chrom, snv_data$pos)]),
                                   function(a)
                                     data.frame(a,
                                                dist=c(diff(a$pos), NA),
                                                logdist = c(log10(diff(a$pos)), NA))
  )
  )


  distances$logdist[is.infinite(distances$logdist)] <- 0
  distances<-filter(distances, chrom != 4)

  p<-ggplot(distances)
  p<-p + geom_point(aes(pos/1000000, logdist, colour = grouped_trans))
  p <- p + cleanTheme() +
    theme(axis.text.x = element_text(angle=45, hjust = 1),
          panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          strip.text = element_text(size=20)
    )

  p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 6)
  #p<-p + scale_x_continuous("Mbs", breaks = seq(0,33,by=1), limits = c(0, 33), expand = c(0.01, 0.01))
  p<-p + scale_x_continuous("Mbs", breaks = seq(0,max(distances$pos),by=10))

  rainfall_out<-paste("rainfall.pdf")
  cat("Writing file", rainfall_out, "\n")
  ggsave(paste("plots/", rainfall_out, sep=""), width = 20, height = 5)

  p
}



#' samplesPlot
#'
#' Plot the snv distribution for each sample
#' @import ggplot2
#' @param count Output total counts instead of frequency if set [Default no]
#' @keywords spectrum
#' @export

samplesPlot <- function(count=NA){
  snv_data<-getData()

  mut_class<-c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")

  p<-ggplot(snv_data)

  if(is.na(count)){
    p<-p + geom_bar(aes(x = grouped_trans, y = (..count..)/sum(..count..), group = sample, fill = sample), position="dodge",stat="count")
    p<-p + scale_y_continuous("Relative contribution to total mutation load", expand = c(0.0, .001))
    tag='_freq'
  }
  else{
    p<-p + geom_bar(aes(x = grouped_trans, y = ..count.., group = sample, fill = sample), position="dodge",stat="count")
    p<-p + scale_y_continuous("Count", expand = c(0.0, .001))
    tag='_count'
  }
  p<-p + scale_x_discrete("Mutation class", limits=mut_class)
  p<-p + cleanTheme() +
    theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.title = element_text(size=20),
          strip.text.x = element_text(size = 10)
    )
  p<-p + facet_wrap(~sample, ncol = 4, scale = "free_x" )

  samples_mut_spect<-paste("mutation_spectrum_samples", tag, ".pdf", sep = '')
  cat("Writing file", samples_mut_spect, "\n")
  ggsave(paste("plots/", samples_mut_spect, sep=""), width = 20, height = 10)
  p
}

#calledSnvs

calledSnvs <- function(){
  snv_data<-getData()
  calls<-table(snv_data$caller)
  calls<-as.data.frame(unlist(calls))
  calls$Var1 <- as.factor(calls$Var1)


  grid.newpage()
  draw.pairwise.venn(area1 = calls$Freq[calls$Var1 == 'mutect2'],
                     area2 = calls$Freq[calls$Var1 == 'varscan2'],
                     cross.area = calls$Freq[calls$Var1 == 'varscan2_mutect2'],
                     category = c("Mutect2","Varscan2"),
                     #lty = rep('blank', 2),
                     lwd = rep(0.3, 2),
                     cex = rep(2, 3),
                     cat.cex = rep(2, 2),
                     fill = c("#E7B800", "#00AFBB"),
                     alpha = rep(0.4, 2),
                     cat.pos = c(0, 0),
                     #cat.dist = rep(0.025, 2)
                     ext.text = 'FALSE'
  )

}




#' mutSigs
#'
#' Calculate and plot the mutational signatures accross samples using the package `deconstructSigs`
#' @param samples Calculates and plots mutational signatures on a per-sample basis [Default no]
#' @param pie Plot a pie chart shwoing contribution of each signature to overall profile [Default no]
#' @import deconstructSigs
#' @import BSgenome.Dmelanogaster.UCSC.dm6
#' @keywords signatures
#' @export

mutSigs <- function(samples=NULL, pie=FALSE, write=FALSE){

  if(!exists('scaling_factor')){
    cat("calculating trinucleotide frequencies in genome\n")
    scaling_factor <-triFreq()
  }

  snv_data<-getData()
  genome <- BSgenome.Dmelanogaster.UCSC.dm6

  if(missing(samples)){
    cat("Plotting for all samples\n")
    snv_data$tissue = 'All'
    sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "tissue", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)
    sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = 'All',
                              contexts.needed = TRUE,
                              tri.counts.method = scaling_factor
    )

    if(write){
      cat("Writing to file 'plots/all_signatures.pdf'\n")
      pdf('plots/all_signatures.pdf', width = 20, height = 10)
      plotSignatures(sig_plot)
      dev.off()
    }
    plotSignatures(sig_plot)

    if(pie){
      makePie(sig_plot)
    }
  }

  else{
    sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "sample", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)
    cat("sample", "snv_count", sep="\t", "\n")
    for(s in levels(snv_data$sample)) {
      snv_count<-nrow(filter(snv_data, sample == s))

      if(snv_count > 50){
        cat(s, snv_count, sep="\t", "\n")

        sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = s,
                                  contexts.needed = TRUE,
                                  tri.counts.method = scaling_factor)

        if(write){
          outfile<-(paste('plots/', s, '_signatures.pdf', sep = ''))
          cat("Writing to file", outfile, "\n")
          pdf(outfile, width = 20, height = 10)
          plotSignatures(sig_plot)
          dev.off()
        }
        plotSignatures(sig_plot)

        if(pie){
          makePie(sig_plot)
        }
      }
    }
  }
}


#' sigTypes
#'
#' Calculate and plot the mutational signatures accross samples using the package `deconstructSigs`
#' @param samples Calculates and plots mutational signatures on a per-sample basis [Default no]
#' @param pie Plot a pie chart shwoing contribution of each signature to overall profile [Default no]
#' @import deconstructSigs
#' @import data.table
#' @import reshape
#' @import forcats
#' @import BSgenome.Dmelanogaster.UCSC.dm6
#' @keywords signatures
#' @export

sigTypes <- function(write=FALSE){
  suppressMessages(require(BSgenome.Dmelanogaster.UCSC.dm6))
  suppressMessages(require(deconstructSigs))

  if(!exists('scaling_factor')){
    cat("Calculating trinucleotide frequencies in genome\n")
    scaling_factor <-triFreq()
  }

  snv_data<-getData()
  genome <- BSgenome.Dmelanogaster.UCSC.dm6

  sigs.input <- mut.to.sigs.input(mut.ref = snv_data, sample.id = "sample", chr = "chrom", pos = "pos", alt = "alt", ref = "ref", bsg = genome)

  l = list()
  for(s in levels(snv_data$sample)) {
    snv_count<-nrow(filter(snv_data, sample == s))

    if(snv_count > 50){

      sig_plot<-whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = s,
                                contexts.needed = TRUE,
                                tri.counts.method = scaling_factor)
      l[[s]] <- sig_plot
    }
  }

  mutSigs<-do.call(rbind, l)
  mutSigs<-as.data.frame(mutSigs)

  mutWeights<-mutSigs$weights

  mutData<-melt(rbindlist(mutWeights, idcol = 'sample'),
                id = 'sample', variable.name = 'signature', value.name = 'score')

  mutData <- mutData %>%
    filter(score > 0.1) %>%
    group_by(sample) %>%
    mutate(total = sum(score))

  p <- ggplot(mutData)
  p <- p + geom_bar(aes(fct_reorder(sample, -total), score, fill=signature),colour="black", stat = "identity")
  p <- p + scale_x_discrete("Sample")
  p <- p + scale_y_continuous("Signature contribution", expand = c(0.01, 0.01), breaks=seq(0, 1, by=0.1))
  p <- p + cleanTheme() +
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          axis.text = element_text(size=30)
    )

  if(write){
    sigTypes<-paste("sigTypes.pdf")
    cat("Writing file", sigTypes, "\n")
    ggsave(paste("plots/", sigTypes, sep=""), width = 20, height = 10)
  }
  p
}


####
# sigTypesPie
####

sigPie <- function() {
  df <- data.frame(
    group = c("Sig3", "Sig5", "Sig8", "Unknown"),
    value = c(21, 14, 25, 40),
    cols = c('#DB8E00', '#64B200', '#00BD5C', '#00BADE'))


  all <- data.frame(
    group = c("Sig3", "Sig8", "Sig9", "Sig21", "Sig25", "Unknown"),
    value = c(29, 17, 10, 7, 7, 30),
    cols = c('#E68613', '#0CB702', '#00BE67', '#ED68ED', '#FF61CC', 'grey'))

  bp <- ggplot(all, aes(x="", y=value, fill = cols)) +
    geom_bar(width = 1, stat = "identity", colour = "white") +
    scale_fill_manual(values = levels(all$cols), labels = levels(all$group))

  pie <- bp + coord_polar("y", start=0)
  pie + cleanTheme() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

}





#' mutSpectrum
#'
#' Plots the mutations spectrum for all samples combined
#' @import ggplot2
#' @keywords spectrum
#' @export
mutSpectrum <- function(write=FALSE, max_y=25){
  snv_data<-getData()
  cat("Showing global contribution of tri class to mutation load", "\n")

  p <- ggplot(snv_data)
  p <- p + geom_bar(aes(x = decomposed_tri, y = (..count..)/sum(..count..), group = decomposed_tri, fill = grouped_trans), position="dodge",stat="count")
  p <- p + scale_y_continuous("Contribution to mutation load", limits = c(0, max_y/100), breaks=seq(0,max_y/100,by=0.025), labels=paste0(seq(0,max_y,by=2.5), "%"), expand = c(0.0, .0005))
  p <- p + scale_x_discrete("Genomic context", expand = c(.005, .005))
  p <- p + cleanTheme() +
    theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.text.x = element_text(angle = 90, hjust=1),
          axis.text.y = element_text(size=15),
          axis.title = element_text(size=20),
          strip.text.x = element_text(size = 15)
    )
  p <- p + facet_wrap(~grouped_trans, ncol = 6, scale = "free_x" )
  p <- p + guides(grouped_trans = FALSE)
  if(write){
    mut_spectrum<-paste("mutation_spectrum.pdf")
    cat("Writing file", mut_spectrum, "\n")
    ggsave(paste("plots/", mut_spectrum, sep=""), width = 20, height = 5)
  }
  p
}



#' featureEnrichment
#'
#' Function to calculate enrichment of snv hits in genomic features
#' @description Calculate the enrichment of snv hits in genomic features
#' A 'features' file must be provided with the follwing format:
#' feature	length	percentage
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param features File containing total genomic lengths of features [Default 'data/genomic_features.txt']
#' @param genome_length The total legnth of the genome [Default 118274340 (mappable regions on chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
featureEnrichment <- function(features='data/genomic_features.txt', genome_length=118274340, write=FALSE){
  genome_features<-read.delim(features, header = T)
  snv_data<-getData()
  mutCount<-nrow(snv_data)

  # To condense exon counts into "exon"
  snv_data$feature<-as.factor(gsub("exon_.*", "exon", snv_data$feature))

  classCount<-table(snv_data$feature)
  classLengths<-setNames(as.list(genome_features$length), genome_features$feature)

  fun <- function(f) {
    # Calculate the fraction of geneome occupied by each feature
    featureFraction<-classLengths[[f]]/genome_length

    # How many times should we expect to see this feature hit in our snv_data (given number of obs. and fraction)?
    featureExpect<-(mutCount*featureFraction)

    # observed/expected
    fc<-classCount[[f]]/featureExpect
    Log2FC<-log2(fc)
    featureExpect<-round(featureExpect,digits=3)

    # Binomial test
    if(!is.null(classLengths[[f]])){
      if(classCount[f] >= featureExpect){
        stat<-binom.test(x = classCount[f], n = mutCount, p = featureFraction, alternative = "greater")
        test<-"enrichment"
      }
      else{
        stat<-binom.test(x = classCount[f], n = mutCount, p = featureFraction, alternative = "less")
        test<-"depletion"
      }
      sig_val <- ifelse(stat$p.value <= 0.001, "***",
                        ifelse(stat$p.value <= 0.01, "**",
                               ifelse(stat$p.value <= 0.05, "*", "")))

      p_val<-format.pval(stat$p.value, digits = 3, eps=0.0001)
      list(feature = f, observed = classCount[f], expected = featureExpect, Log2FC = Log2FC, test = test, sig = sig_val, p_val = p_val)
    }
  }

  enriched<-lapply(levels(snv_data$feature), fun)
  enriched<-do.call(rbind, enriched)
  featuresFC<-as.data.frame(enriched)
  # Sort by FC value
  featuresFC<-dplyr::arrange(featuresFC,desc(abs(as.numeric(Log2FC))))
  featuresFC$Log2FC<-round(as.numeric(featuresFC$Log2FC), 1)

  if(write){
    featuresFC <- filter(featuresFC, observed >= 5)
    first.step <- lapply(featuresFC, unlist)
    second.step <- as.data.frame(first.step, stringsAsFactors = F)
    ggpubr::ggtexttable(second.step, rows = NULL, theme = ttheme("mGreen"))
    feat_enrichment_table <- paste("feature_enrichment_table.tiff")
    cat("Writing to file: ", 'plots/', feat_enrichment_table, sep = '')
    ggsave(paste("plots/", feat_enrichment_table, sep=""), width = 5.5, height = (nrow(featuresFC)/3), dpi=300)
  } else{
    return(featuresFC)
  }
}


featureEnrichmentPlot <- function(write=FALSE) {
  feature_enrichment<-featureEnrichment()

  feature_enrichment$feature <- as.character(feature_enrichment$feature)
  feature_enrichment$Log2FC <- as.numeric(feature_enrichment$Log2FC)

  feature_enrichment <- transform(feature_enrichment, feature = reorder(feature, -Log2FC))

  feature_enrichment <- filter(feature_enrichment, observed >= 5)

  # Custom sorting
  # feature_enrichment$feature <- factor(feature_enrichment$feature, levels=c("intron", "intergenic", "exon", "3UTR", "ncRNA", "5UTR"))

  p<-ggplot(feature_enrichment)
  p<-p + geom_bar(aes(feature, Log2FC, fill = as.character(test)), stat="identity")
  p<-p + guides(fill=FALSE)
  p<-p + ylim(-2,2)
  p<-p + cleanTheme() +
    theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.text.x = element_text(angle = 45, hjust=1),
          axis.text = element_text(size=20)
    )

  if(write){
    feat_plot <- paste("feat_plot.pdf")
    cat("Writing file", feat_plot, "\n")
    ggsave(paste("plots/", feat_plot, sep=""), width = 5, height = 10)
  }
  p
}


#' geneEnrichment
#'
#' Function to calculate fold change enrichment in a set of snv calls correcting for gene length
#' @description Calculate the enrichment of snv hits in length-corrected genes
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom    start      end      tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param n The number of times we need to have seen a gene in our snv_data to view its enrichment score [Default 3]
#' @param genome_length The total legnth of the genome [Default 137547960 (chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr
#' @import ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
geneEnrichment <- function(gene_lengths_in="data/gene_lengths.txt", n=10, genome_length=118274340, write=FALSE){
  snv_data <- getData() %>%
    dplyr::filter(gene != "intergenic") %>%
    droplevels()

  snv_count<-nrow(snv_data)

  gene_lengths <- read.delim(gene_lengths_in, header = T)

  gene_lengths <- gene_lengths %>%
    dplyr::filter(length > 1000) %>%
    dplyr::select(gene, length) %>%
    droplevels()

  genes<-setNames(as.list(gene_lengths$length), gene_lengths$gene)

  snv_data<-join(gene_lengths, snv_data, 'gene', type = 'left')

  snv_data$fpkm <- ifelse(snv_data$fpkm=='NULL' | snv_data$fpkm=='NA' | is.na(snv_data$fpkm), 0, snv_data$fpkm)
  snv_data$observed <- ifelse(is.numeric(snv_data$observed), snv_data$observed, 0)

  hit_genes<-table(factor(snv_data$gene, levels = levels(snv_data$gene) ))
  expression<-setNames(as.list(snv_data$fpkm), snv_data$gene)

  fun <- function(g) {
    # Calculate the fraction of geneome occupied by each gene
    genefraction<-genes[[g]]/genome_length

    # How many times should we expect to see this gene hit in our snv_data (given number of obs. and fraction of genome)?
    gene_expect<-snv_count*(genefraction)

    # observed/expected
    fc<-hit_genes[[g]]/gene_expect
    log2FC = log2(fc)

    if (hit_genes[[g]] >= gene_expect) {
      stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "greater")
      test <- "enrichment"
    } else {
      stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "less")
      test <- "depletion"
    }

    sig_val <- ifelse(stat$p.value <= 0.001, "***",
                      ifelse(stat$p.value <= 0.01, "**",
                             ifelse(stat$p.value <= 0.05, "*", "")))

    sig_val <- ifelse(stat$p.value > 0.05, "-", sig_val)

    p_val <- format.pval(stat$p.value, digits = 3)

    gene_expect<-round(gene_expect,digits=3)
    list(gene = g, length = genes[[g]], fpkm = expression[[g]], test=test, observed = hit_genes[g], expected = gene_expect, fc = fc, log2FC = log2FC, sig_val=sig_val, p_val=p_val)
  }

  enriched<-lapply(levels(snv_data$gene), fun)
  enriched<-do.call(rbind, enriched)
  genesFC<-as.data.frame(enriched)
  # Filter for genes with few observations

  genesFC <- genesFC %>%
    dplyr::filter(observed >= n) %>%
    dplyr::mutate(expected = round(as.numeric(expected),digits=3)) %>%
    dplyr::mutate(log2FC = round(as.numeric(log2FC),digits=2)) %>%
    dplyr::mutate(p_val = as.numeric(p_val)) %>%
    dplyr::mutate(eScore = abs(log2FC) * -log10(p_val)) %>%
    dplyr::mutate(eScore = round(as.numeric(eScore),digits=2)) %>%
    dplyr::select(gene, observed, expected, log2FC, test, sig_val, p_val, eScore) %>%
    dplyr::arrange(-eScore, p_val, -abs(log2FC)) %>%
    droplevels()

  if(write){
    cat("printing")
    first.step <- lapply(genesFC, unlist)
    second.step <- as.data.frame(first.step, stringsAsFactors = F)
    arrange(second.step,desc(as.integer(log2FC)))

    ggpubr::ggtexttable(second.step, rows = NULL, theme = ttheme("mGreen"))

    gene_enrichment_table <- paste("gene_enrichment_table.tiff")
    ggsave(paste("plots/", gene_enrichment_table, sep=""), width = 5.2, height = (nrow(genesFC)/3), dpi=300)
  } else{
    return(genesFC)
  }
}


# EnrichmentVolcano
#'
#' Plot the enrichment of SNVs in genes features
#' @keywords enrichment
#' @import dplyr
#' @import plotly
#' @export

EnrichmentVolcano <- function(d){
  gene_enrichment <- d

  minPval <- min(gene_enrichment$p_val[gene_enrichment$p_val>0])

  gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval/abs(gene_enrichment$log2FC), gene_enrichment$p_val)

  gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval, gene_enrichment$p_val)


  maxLog2 <- max(abs(gene_enrichment$log2FC[is.finite(gene_enrichment$log2FC)]))
  maxLog2 <- as.numeric(round_any(maxLog2, 1, ceiling))

  ax <- list(
    size = 25
  )

  ti <- list(
    size = 25
  )

  p <- plot_ly(data = gene_enrichment,
               x = ~log2FC,
               y = ~-log10(p_val),
               type = 'scatter',
               # showlegend = FALSE,
               mode = 'markers',
               # height = 1200,
               # width = 1000,
               # frame = ~p_val,
               text = ~paste("Gene: ", gene, "\n",
                             "Observed: ", observed, "\n",
                             "Expected: ", expected, "\n",
                             "P-val: ", p_val, "\n"),
               color = ~log10(p_val),
               colors = "Spectral",
               size = ~-log10(p_val)
  ) %>%
    layout(
      xaxis = list(title="Log2(FC)", titlefont = ax, range = c(-maxLog2, maxLog2)),
      yaxis = list(title="-Log10(p)", titlefont = ax)
    )
  p
}


#' snvinGene
#'
#' Plot all snvs found in a given gene
#' @description Plot all snvs found in a given gene.
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom    start      end      tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param gene2plot Name of the gene to plot
#' @import ggplot2 dplyr
#' @keywords gene
#' @export

snvinGene <- function(gene_lengths="data/gene_lengths.txt", gene2plot='kuz', annotated=TRUE, col_by_status=TRUE, write=FALSE){
  gene_lengths <- read.delim(gene_lengths, header = T)

  region <- gene_lengths %>%
    dplyr::filter(gene == gene2plot) %>%
    droplevels()

  gene_length <-(region$end-region$start)
  wStart<-(region$start - gene_length/10)
  wEnd<-(region$end + gene_length/10)
  wChrom<-as.character(region$chrom)
  wTss<-suppressWarnings(as.numeric(levels(region$tss))[region$tss])

  snv_data<-getData() %>%
    dplyr::filter(chrom == wChrom & pos >= wStart & pos <= wEnd)

  if(nrow(snv_data) == 0){
    stop(paste("There are no snvs in", gene2plot, "- Exiting", "\n"))
  }

  snv_data$colour_var <- snv_data$feature

  if(annotated){
    snv_data$colour_var <- snv_data$variant_type
    if(col_by_status)
      snv_data$colour_var <- snv_data$status
  }

  p <- ggplot(snv_data)
  p <- p + geom_point(aes(pos/1000000, sample, colour = colour_var, size = 1.5), position=position_jitter(width=0, height=0.2))
  p <- p + guides(size = FALSE, sample = FALSE)
  p <- p + cleanTheme() +
    theme(axis.title.y=element_blank(),
          panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.text.y = element_text(size = 30)
    )
  p <- p + scale_x_continuous("Mbs", expand = c(0,0), breaks = seq(round(wStart/1000000, digits = 2),round(wEnd/1000000, digits = 2),by=0.05), limits=c(wStart/1000000, wEnd/1000000))
  p <- p + annotate("rect", xmin=region$start/1000000, xmax=region$end/1000000, ymin=0, ymax=0.3, alpha=.2, fill="skyblue")
  p <- p + geom_vline(xintercept = wTss/1000000, colour="red", alpha=.7, linetype="solid")

  p <- p + geom_segment(aes(x = wTss/1000000, y = 0, xend= wTss/1000000, yend = 0.1), colour="red")
  middle<-((wEnd/1000000+wStart/1000000)/2)
  p <- p + annotate("text", x = middle, y = 0.15, label=gene2plot, size=6)
  p <- p + ggtitle(paste("Chromosome:", wChrom))
  if(write){
    hit_gene<-paste(gene2plot, "_hits.pdf", sep='')
    cat("Writing file", hit_gene, "\n")
    ggsave(paste("plots/", hit_gene, sep=""), width = 10, height = 10)
  }
  p
}


#' featuresHit
#'
#' Show top hit features
#' @import ggplot2
#' @keywords features
#' @export

featuresHit <- function(..., write=FALSE){
  snv_data<-getData(...)

  # To condense exon counts into "exon"
  snv_data$feature<-as.factor(gsub("exon_.*", "exon", snv_data$feature))

  # Reoders descending
  snv_data$feature<-factor(snv_data$feature, levels = names(sort(table(snv_data$feature), decreasing = TRUE)))

  snv_data <- snv_data %>%
    dplyr::group_by(feature) %>%
    dplyr::add_tally() %>%
    ungroup() %>%
    dplyr::filter(n >= 5) %>%
    droplevels()
  #cols<-setCols(snv_data, "feature")

  p <- ggplot(snv_data)
  p <- p + geom_bar(aes(feature, fill = feature))
  #p<-p + cols
  p <- p + cleanTheme() +
    theme(axis.title.x=element_blank(),
          panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"))
  p <- p + scale_x_discrete(expand = c(0.01, 0.01))
  p <- p + scale_y_continuous(expand = c(0.01, 0.01))

  # colour to a pub palette:
  # p<-p + ggpar(p, palette = 'jco')

  if(write){
    features_outfile<-paste("hit_features_count.pdf")
    cat("Writing file", features_outfile, "\n")
    ggsave(paste("plots/", features_outfile, sep=""), width = 20, height = 10)
  }
  p
}


#' geneHit
#'
#' Show top hit genes
#' @import dplyr
#' @keywords gene
#' @param n Show top n hits [Default 10]
#' @export
geneHit <- function(..., n=10){
  snv_data<-getData(...)
  snv_data<-filter(snv_data, gene != "intergenic")

  hit_count<-as.data.frame(sort(table(unlist(snv_data$gene)), decreasing = T))

  colnames(hit_count)<- c("gene", "count")
  head(hit_count, n)
}



#' triFreq
#'
#' This function counts the number of times each triunucleotide is found in a supplied genome
#' @param genome BS.genome file defaults to BSgenome.Dmelanogaster.UCSC.dm6
#' @param count Output total counts instead of frequency if set [Default no]
#' @import dplyr
#' @keywords trinucleotides
#' @export
#' @return Dataframe of trinucs and freqs (or counts if count=1)

triFreq <- function(genome=NULL, count=FALSE){
  if(missing(genome)){
    cat("No genome specfied, defaulting to 'BSgenome.Dmelanogaster.UCSC.dm6'\n")
    library(BSgenome.Dmelanogaster.UCSC.dm6, quietly = TRUE)
    genome <- BSgenome.Dmelanogaster.UCSC.dm6
  }

  params <- new("BSParams", X = Dmelanogaster, FUN = trinucleotideFrequency, exclude = c("M", "_"), simplify = TRUE)
  snv_data<-as.data.frame(bsapply(params))
  snv_data$genome<-as.integer(rowSums(snv_data))
  snv_data$genome_adj<-(snv_data$genome*2)

  if(count){
    tri_count<-snv_data['genome_adj']
    tri_count<-cbind(tri = rownames(tri_count), tri_count)
    colnames(tri_count) <- c("tri", "count")
    rownames(tri_count) <- NULL
    return(tri_count)
  }
  else{
    snv_data$x <- (1/snv_data$genome)
    scaling_factor<-snv_data['x']
    return(scaling_factor)
  }
}










# Functions to calculate the distance
# from each breakpoint to user-provided loci (e.g. TSS)

#' generateData
#' Prepare data for dist2motif
#' @keywords simulate
#' @import ggplot2
#' @import dplyr
#' @import colorspace
#' @import RColorBrewer
#' @export
generateData <- function(..., breakpoints=NA, sim=NA, keep=NULL){
  if(is.na(breakpoints)){
    # if(!missing(keep)){
    #   real_data <- notchFilt(..., keep=keep)
    # } else {
    real_data <- getData(..., genotype=='somatic_tumour', !sample %in% c("A373R7", "A512R17", "A785-A788R1", "A785-A788R11", "A785-A788R3", "A785-A788R5", "A785-A788R7", "A785-A788R9"))
    # }
    real_data <- real_data %>%
      dplyr::filter(chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" ) %>%
      dplyr::mutate(pos = bp) %>%
      dplyr::select(chrom, pos) %>%
      droplevels()
  } else{
    real_data <- read.table(breakpoints, header = F)
    if(is.null(real_data$V3)){
      real_data$V3 <- real_data$V2 + 2
    }
    colnames(real_data) <- c("chrom", "start", "end")
    real_data <- real_data %>%
      dplyr::filter(chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" ) %>%
      dplyr::mutate(pos = (end+start)/2) %>%
      dplyr::select(chrom, pos) %>%
      droplevels()
  }

  if (!is.na(sim)) {
    byIteration <- list()
    #run each iteration
    for (i in 1:sim){
      cat("Running simulation", i, "of", sim, "\n")
      simByChrom <- list()

      for (c in levels(real_data$chrom)){
        hitCount <- nrow(real_data[real_data$chrom== c,])
        hitCount <- (hitCount*10)
        if (i == 1){
          cat(paste("Simulating", hitCount, "breakpoints on chromosome", c), "\n")
        }
        bp_data <- bpSim(nSites = hitCount, byChrom = c)
        bp_data$iteration <- i
        simByChrom[[c]] <- bp_data
      }
      result <- as.data.frame(do.call(rbind, simByChrom))
      rownames(result) <- NULL
      byIteration[[i]] <- result
    }

    #combine each iteration into one data frame
    # final <- dplyr::bind_rows(byIteration)
    final <- as.data.frame(do.call(rbind, byIteration))
    final$iteration <- as.factor(final$iteration)

    return(final)
  } else{
    cat("Using real data", "\n")
    real_data$iteration <- as.factor(1)
    return(real_data)
  }
}




#' dist2Motif2
#' Calculate the distance from each breakpoint to closest motif in a directory of files
#' @keywords motif
#' @import ggplot2 dplyr tidyr RColorBrewer
#' @export
dist2motif2 <- function(..., feature_file = NA, featureDir = 'rawdata/features/', sim=NA, keep=NULL, position = 'centre') {

  snv_data <- generateData(..., confidence=='precise', breakpoints=breakpoints, sim=sim, keep=keep)

  cat("Calculating distances to", position, 'of regions', sep = " ", "\n")

  svCount <- table(bp_data$chrom)
  bp_data <- subset(bp_data, chrom %in% names(svCount[svCount >= 5]))
  # bp_data <- droplevels(bp_data)

  minDist <- function(p) {
    index <- which.min(abs(tss_df$pos - p))
    closestTss <- tss_df$pos[index]
    chrom <- as.character(tss_df$chrom[index])
    dist <- (p - closestTss)
    list(p, closestTss, dist, chrom)
  }

  scores <- list()

  fileNames <- dir(featureDir, pattern = ".bed")
  # cat("Analysing all files in directory:", bedFiles, "\n")
  for (i in 1:length(fileNames)){
    filename <- basename(tools::file_path_sans_ext(fileNames[i]))
    parts <- unlist(strsplit(filename, split = '\\.'))
    feature <- parts[1]

    cat("Analysing file:", fileNames[i], 'with feature:', feature, "\n")

    feature_locations <- read.table(paste(featureDir, fileNames[i], sep='/'), header = F)
    feature_locations <- feature_locations[,c(1,2,3)]
    colnames(feature_locations) <- c("chrom", "start", "end")

    # fCount <- table(feature_locations$chrom)
    #
    # bp_data <- subset(bp_data, chrom %in% names(svCount[svCount >= 5]))
    #

    feature_locations <- feature_locations %>%
      dplyr::filter(chrom %in% levels(bp_data$chrom))


    if(position == 'centre'){
      feature_locations <- feature_locations %>%
        dplyr::mutate(end = as.integer(((end+start)/2)+1)) %>%
        dplyr::mutate(pos = as.integer(end-1)) %>%
        dplyr::select(chrom, pos)
    } else if(position == 'edge'){
      feature_locations <- feature_locations %>%
        tidyr::gather(c, pos, start:end, factor_key=TRUE) %>%
        dplyr::select(chrom, pos)
    }
    byIteration <- list()
    for (j in levels(bp_data$iteration)){
      byChrom <- list()
      df1 <- dplyr::filter(bp_data, iteration == j)
      for (c in levels(bp_data$chrom)) {
        df <- dplyr::filter(df1, chrom == c)
        tss_df <- dplyr::filter(feature_locations, chrom == c)
        dist2tss <- lapply(df$pos, minDist)
        dist2tss <- do.call(rbind, dist2tss)
        new <- data.frame(matrix(unlist(dist2tss), nrow=nrow(df)))
        new$iteration <- j
        new$feature <- as.factor(feature)
        colnames(new) <- c("bp", "closest_tss", "min_dist", "chrom", "iteration", "feature")
        byChrom[[c]] <- new
      }
      perIter <- do.call(rbind, byChrom)
      byIteration[[j]] <- perIter
    }
    dist2feat <- do.call(rbind, byIteration)
    scores[[i]] <- dist2feat
  }

  final <- do.call(rbind, scores)
  rownames(final) <- NULL
  final$iteration <- as.factor(final$iteration)
  final$chrom <- as.character(final$chrom)
  final$min_dist <- as.numeric(as.character(final$min_dist))


  return(final)
}


# distOverlay
#'
#' Calculate the distance from each breakpoint to closest motif
#' Overlay the same number of random simulated breakpoints
#' @keywords motif
#' @import dplyr
#' @import ggplot2
#' @import ggpubr
#' @import RColorBrewer
#' @export
distOverlay2 <- function(..., breakpoints = NA, featureDir = 'rawdata/features/', from='bps', lim=2.5, n=2, plot = TRUE, keep=NULL, position = 'centre') {
  scaleFactor <- lim*1000
  real_data <- dist2motif2(..., breakpoints = breakpoints, featureDir = featureDir, keep=keep, position = position)
  sim_data <- dist2motif2(..., featureDir = featureDir, sim = n, position = position)

  real_data$Source <- as.factor("Real")
  sim_data$Source <- as.factor("Sim")

  dummy_iterations <- list()
  for (i in levels(sim_data$iteration)){
    real_data$iteration <- as.factor(i)
    dummy_iterations[[i]] <- real_data
  }
  real_data <- do.call(rbind, dummy_iterations)
  rownames(real_data) <- NULL

  real_data$iteration <- factor(real_data$iteration, levels = 1:n)
  sim_data$iteration <- factor(sim_data$iteration, levels = 1:n)

  # Perform significance testing
  pVals_and_df <- simSig2(r = real_data, s = sim_data, max_dist = scaleFactor)

  combined <- pVals_and_df[[1]]
  pVals <- pVals_and_df[[2]]

  if(plot==T){
    print(plotdistanceOverlay2(..., d=combined, from=from, facetPlot=FALSE, byChrom=byChrom, lim=lim, n=n, position=position ))
    print(pVals)
  }else{
    print(pVals)
    return(list(combined, pVals))
  }
}


#' plotdistanceOverlay
#'
#' Plot the distance overlay
#' @param d Dataframe containing combined real + sim data (d <- distOverlay())
#' @import dplyr ggplot2 RColorBrewer scales colorspace cowplot
#' @keywords distance
#' @export
plotdistanceOverlay2 <- function(..., d, from='bps', lim=2.5, n=2, position='centre', histo=FALSE, binWidth = 500){
  grDevices::pdf(NULL)

  scaleFactor <- lim*1000
  scale <- "(Kb)"

  lims <- c(as.numeric(paste("-", scaleFactor, sep = '')), scaleFactor)
  brks <- c(as.numeric(paste("-", scaleFactor, sep = '')),
            as.numeric(paste("-", scaleFactor/10, sep = '')),
            scaleFactor/10,
            scaleFactor)
  labs <- as.character(brks/1000)
  expnd <- c(0, 0)

  new <- d %>%
    mutate(iteration = as.factor(ifelse(Source=='Real', 0, iteration)))

  real_fill <- '#3D9DEB'
  iterFill <- colorspace::rainbow_hcl(n)

  colours <- c(real_fill, iterFill)

  plts <- list()
  for (i in 1:(length(levels(new$feature)))){
    d <- new %>%
      filter(feature == levels(new$feature)[i])

    p <- ggplot(d)
    if(histo) {
      p <- p + geom_histogram(data=d[d$Source=="Sim",], aes(min_dist, fill = Source, group = iteration), alpha = 0.1, binwidth = binWidth,  position="identity")
      p <- p + geom_histogram(data=d[d$Source=="Real",], aes(min_dist, fill = Source, group = iteration), alpha = 0.5, binwidth = binWidth, position="identity")
      p <- p + scale_fill_manual(values=colours)
      p <- p + scale_y_continuous(paste("Count per", binWidth, "bp bins"))
    } else {
      p <- p + geom_line(data=d[d$Source=="Real",], aes(min_dist, colour = iteration), size=2, stat='density')
      p <- p + geom_line(aes(min_dist, group = interaction(iteration, Source), colour = iteration), alpha = 0.7, size=1, stat='density')
      p <- p + scale_color_manual(values=colours)
    }

    p <- p + scale_x_continuous(
      limits = lims,
      breaks = brks,
      expand = expnd,
      labels = labs
    )
    p <- p +
      theme(
        legend.position = "none",
        panel.background = element_blank(),
        plot.background = element_rect(fill = "transparent", colour = NA),
        axis.line.x = element_line(color = "black", size = 0.5),
        axis.text.x = element_text(size = 16),
        axis.line.y = element_line(color = "black", size = 0.5),
        plot.title = element_text(size=22, hjust = 0.5)
      )
    p <- p + labs(title = paste(d$feature, "\n", position))
    plts[[i]] <- p
  }
  cat("Plotting", length(levels(new$feature)), "plots", "\n")
  grDevices::dev.off()
  cowplot::plot_grid(plotlist=plts)
}


simSig2 <- function(r, s, test=NA, max_dist=5000){
  cat("Calculating descriptive statistics\n")
  arrange_data <- function(x){
    x <- x %>%
      group_by(iteration, feature) %>%
      dplyr::mutate( count = n(),
                     median = median(min_dist),
                     mean = mean(min_dist),
                     sd = sd(min_dist),
                     Source = Source) %>%
      dplyr::filter(abs(min_dist) <= max_dist ) %>%
      ungroup()
    return(x)
  }
  simulated <- arrange_data(s)
  real <- arrange_data(r)

  combined <- suppressWarnings(dplyr::full_join(real, simulated))
  combined$Source <- as.factor(combined$Source)

  simbyFeat = list()
  for (f in levels(combined$feature)){
    pVals = list()
    c <- dplyr::filter(combined, feature==f)
    for(i in levels(c$iteration)){
      df <- dplyr::filter(c, iteration==i)
      rl <- dplyr::filter(df, Source == "Real")
      sm <- dplyr::filter(df, Source == "Sim")
      result1 <- tryCatch(suppressWarnings(ks.test(rl$min_dist, sm$min_dist)), error=function(err) NA)
      result1 <- suppressWarnings(ks.test(rl$min_dist, sm$min_dist))
      ksPval <- round(result1$p.value, 4)

      result2 <- car::leveneTest(df$min_dist, df$Source, center='median')
      result3 <- stats::bartlett.test(df$min_dist, df$Source)
      bPval <- round(result3$p.value, 4)
      lPval <- round(result2$`Pr(>F)`[1], 4)
      rmed <- round(median(rl$min_dist)/1000, 2)
      smed <- round(median(sm$min_dist)/1000, 2)
      rsd <- round(sd(rl$min_dist)/1000, 2)
      ssd <- round(sd(sm$min_dist)/1000, 2)
      rKurtosis <- round(kurtosis(rl$min_dist), 2)
      sKurtosis <- round(kurtosis(sm$min_dist), 2)
      rSkew <- round(skewness(rl$min_dist), 2)
      sSkew <- round(skewness(sm$min_dist), 2)
      # fStat <- var.test(min_dist ~ Source , df, alternative = "two.sided")
      # fRatio <- round(fStat$statistic, 2)
      # fStat <- round(fStat$p.value, 4)

      sig <- ifelse(lPval <= 0.001, "***",
                    ifelse(lPval <= 0.01, "**",
                           ifelse(lPval <= 0.05, "*", "")))

      vals <- data.frame(iteration = i,
                         feature = f,
                         KS = ksPval,
                         Levenes = lPval,
                         # Bartlett = bPval,
                         # Fstat_ratio = fRatio,
                         # Fstat = fStat,
                         real_median = rmed,
                         sim_median = smed,
                         real_sd = rsd,
                         sim_sd = ssd,
                         real_kurtosis = rKurtosis,
                         sim_kurtosis = sKurtosis,
                         real_skew = rSkew,
                         sim_skew = sSkew,
                         sig = sig)
      pVals[[i]] <- vals
    }

    pVals_df <- do.call(rbind, pVals)
    simbyFeat[[f]] <- pVals_df
  }

  combined_sig_vals <- do.call(rbind, simbyFeat)

  rownames(combined_sig_vals) <- NULL
  combined_sig_vals <- combined_sig_vals %>%
    arrange(Levenes, KS)

  # print(pVals_df, row.names = FALSE)

  ## Boxplot per chrom

  # colours <- c("#E7B800", "#00AFBB")
  # cat("Plotting qq plot of min distances\n")
  # qqnorm(combined$min_dist)
  # qqline(combined$min_dist, col = 2)

  # p <- ggplot(combined)
  # p <- p + geom_boxplot(aes(chrom, min_dist, fill = Source), alpha = 0.6)
  # p <- p + scale_y_continuous("Distance", limits=c(-5000, 5000))
  # p <- p + facet_wrap(~iteration, ncol = 2)
  # p <- p + scale_fill_manual(values = colours)

  # p
  return(list(combined, combined_sig_vals))
}

















#################
## Development ##
#################

geneEnrichmentPlot <- function(n=0, highlight='kuz', write=FALSE) {
  gene_enrichment<-geneEnrichment(n=n)

  #gene_enrichment<-filter(gene_enrichment, fpkm > 0)

  gene_enrichment <- gene_enrichment %>%
    dplyr::mutate(gene = as.character(gene)) %>%
    dplyr::mutate(log2FC = as.numeric(log2FC)) %>%
    dplyr::mutate(test = as.character(ifelse(log2FC>=0, "enriched", "depleted")))

  gene_enrichment <- transform(gene_enrichment, gene = reorder(gene, -abs(log2FC)))

  highlightedGene <- dplyr::filter(gene_enrichment, gene == highlight)
  highlightedGene <- droplevels(highlightedGene)

  p <- ggplot(gene_enrichment)
  p <- p + geom_bar(aes(gene, log2FC, fill = as.character(test)), stat="identity")
  #p<-p + geom_bar(data=highlightedGene, aes(gene, log2FC, fill="red"), colour="black", stat="identity")
  p <- p + guides(fill=FALSE)
  p <- p + scale_x_discrete("Gene")
  p <- p + cleanTheme() +
    theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.text.x = element_text(angle = 90, hjust=1),
          axis.text = element_text(size=7)
    )
  #p<-p + coord_flip()
  #p<-p + scale_y_reverse()
  if(write){
    gene_enrichment_plot <- paste("gene_enrichment.pdf")
    cat("Writing file", gene_enrichment_plot, "\n")
    ggsave(paste("plots/", gene_enrichment_plot, sep=""), width = 25, height = 5)
  }
  p
}

### Development

geneLenPlot <- function(n=0,gene_lengths_in="data/gene_lengths.txt"){
  gene_enrichment<-geneEnrichment(n=n)
  gene_lengths<-read.delim(gene_lengths_in, header = T)

  gene_enrichment$length<-as.numeric(gene_enrichment$length)
  gene_enrichment$log2FC<-as.numeric(gene_enrichment$log2FC)
  gene_enrichment$fc<-as.numeric(gene_enrichment$fc)
  gene_enrichment$observed<-as.numeric(gene_enrichment$observed)


  #  Var in x explained by Y
  # par(mfrow=c(2,2))
  # plot(enrichment_lm)

  # Set new col 'col' to indicate enrichment/depletion
  gene_enrichment$col<-as.factor(ifelse(gene_enrichment$log2FC > 0, 'enrichment', 'depletion'))

  # Only keep relevant cols
  gene_enrichment<-gene_enrichment[,c("gene","fpkm","observed",'expected','fc', 'log2FC', 'col')]
  gene_enrichment<-droplevels(gene_enrichment)

  # Join both df on 'gene'
  gene_lengths_df<-join(as.data.frame(gene_lengths), gene_enrichment, 'gene', type = "left")

  # Clean up null/na vals
  gene_lengths_df$fpkm <- ifelse(gene_lengths_df$fpkm=='NULL' | gene_lengths_df$fpkm=='NA' | is.na(gene_lengths_df$fpkm), 0, gene_lengths_df$fpkm)
  gene_lengths_df$level <- ifelse(gene_lengths_df$fpkm == 0 , 'not_expressed', 'expressed')
  gene_lengths_df$observed <- ifelse(gene_lengths_df$observed == 'NULL' | gene_lengths_df$observed == 'NA', 0, gene_lengths_df$observed)
  gene_lengths_df$level<-as.factor(gene_lengths_df$level)

  # Allow colouring of expressed/enriched/depleted
  gene_lengths_df$col <- ifelse(is.na(gene_lengths_df$col), 'NA', as.character(gene_lengths_df$col))
  gene_lengths_df$col <- ifelse(gene_lengths_df$fpkm > 0, 'expressed', gene_lengths_df$col)

  # New col log10 length
  gene_lengths_df$log10length<-log10(gene_lengths$length)

  gene_lengths_df<-filter(gene_lengths_df, length >= 1000, length < 200000)
  gene_lengths_df<-droplevels(gene_lengths_df)
  #
  # Linear model (predicter ~ predictor)
  enrichment_lm <- lm(observed ~ length, data = gene_lengths_df)
  # Exponential model
  enrichment_exp <- lm(log(observed) ~ length, data = gene_lengths_df)

  # plot( observed ~ length, data = gene_enrichment)
  lmRsq<-round(summary(enrichment_lm)$adj.r.squared, 2)
  expRsq<-round(summary(enrichment_exp)$adj.r.squared, 2)

  #summary(pois <- glm(observed ~ length, family="poisson", data=gene_lengths_df))

  gene_lengths_p<-filter(gene_lengths_df, col != 'NA')

  p <- ggplot(gene_lengths_p, aes(log10length, observed))
  p <- p + geom_jitter(aes( colour = col, alpha = 0.8))
  p <- p + scale_color_manual(values=c("#F8766D", "#00AFBB", "#E7B800"))
  p <- p + scale_x_continuous("Log10 Kb", limits=c(3, max(gene_lengths_p$log10length)))
  p <- p + scale_y_continuous("Count", limits=c(0,max(gene_lengths_p$observed)))
  p <- p + scale_size_continuous(range=c(0, abs(max(gene_lengths_p$log2FC))))

  p <- p + annotate(x = 3.5, y = 20, geom="text", label = paste('Lin:R^2:', lmRsq), size = 7,parse = TRUE)
  p <- p + annotate(x = 3.5, y = 17, geom="text", label = paste('Exp:R^2:', expRsq), size = 7,parse = TRUE)

  # Default model is formula = y ~ x
  # How much variation in X is explained by Y
  # How muc var in length is explained by observation

  p <- p + geom_smooth(method=lm, show.legend = FALSE) # linear
  p <- p + geom_smooth(method=lm, formula = y ~ poly(x, 2), colour = "orange", show.legend = FALSE) #Quadratic
  #p <- p + geom_smooth(method=glm, method.args = list(family = "poisson"), colour = "red", se=T)
  #p <- p + geom_smooth(colour="orange") # GAM
  p <- p + cleanTheme()
  p <- p + geom_rug(aes(colour=col,alpha=.8),sides="b")

  p <- p + guides(alpha = FALSE)

  colours<-c( "#E7B800", "#00AFBB")
  p2<-ggplot(gene_lengths_df)
  p2<-p2 + geom_density(aes(log10length, fill=level),alpha = 0.4)
  p2 <- p2 + cleanTheme()
  p2 <- p2 + scale_x_continuous("Log10 Kb", limits=c(3, max(gene_lengths_df$log10length)))
  p2 <- p2 + guides(alpha = FALSE)
  #p2 <- p2 + geom_rug(inherit.aes = F, aes(log10length,colour=level),alpha=0.2, sides = "tb")


  p2 <- p2 + geom_rug(data=subset(gene_lengths_df,level=="expressed"), aes(log10length,colour=level),alpha=0.7, sides = "b")
  p2 <- p2 + geom_rug(data=subset(gene_lengths_df,level=="not_expressed"), aes(log10length,colour=level),alpha=0.2, sides = "t")


  p2 <- p2 + scale_fill_manual(values=colours)
  p2 <- p2 + scale_colour_manual(values=colours)

  # p<-ggscatter(gene_lengths_p, x = "log10length", y = "observed",
  #           color = "col", size = "log2FC"
  #           )

  # p2<-ggdensity(gene_lengths_df, x = "log10length",
  #           add = "mean", rug = TRUE,
  #           color = "level", fill = "level",
  #           palette = c("#00AFBB", "#E7B800")
  #           )
  #
  combined_plots <- ggarrange(p, p2,
                              labels = c("A", "B"),
                              ncol = 1, nrow = 2)

  gene_len<-paste("gene_lengths_count_model_log10.pdf")
  cat("Writing file", gene_len, "\n")
  ggsave(paste("plots/", gene_len, sep=""), width = 10, height = 10)

  combined_plots
}

getPromoter <- function(gene_lengths_in="data/gene_lengths.txt"){
  gene_lengths<-read.delim(gene_lengths_in, header = T)

  gene_lengths$promoter<-ifelse(gene_lengths$start<gene_lengths$end,
                                gene_lengths$start- 1500,
                                gene_lengths$end + 1500)


  gene_lengths<-gene_lengths[,c("chrom", "promoter")]
  colnames(gene_lengths)<-NULL
  return(gene_lengths)
}













dist2Feat <- function(feature_file="data/tss_locations.txt",sim=NA, print=0,send=0, feature='tss'){
  if(is.na(sim)){
    snv_data<-getData()
  }

  else{
    cat("Generating simulated snv_data\n")
    hit_count<-nrow(getData())
    snv_data<-snvSim(N=hit_count, write=print)
    colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
    snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
    snv_data<-droplevels(snv_data)
  }


  feature<-paste(toupper(substr(feature, 1, 1)), substr(feature, 2, nchar(feature)), sep='')

  if(feature=='Promoter'){
    feature_locations<-getPromoter()
    cat("Getting gene promoter locations...\n")
  }
  else{
    feature_locations<-read.delim(feature_file, header = F)
    cat("Reading in file:", feature_file, sep =' ', "\n")
  }

  cat("Calculating distances to", feature, sep=' ', "\n")

  colnames(feature_locations)<-c("chrom", "pos")

  feature_locations$pos<-as.integer(feature_locations$pos)

  # Will throw error if SVs don't exist on a chrom...

  # Removes chroms with fewer than 10 observations
  svCount <- table(snv_data$chrom)
  snv_data <- subset(snv_data, chrom %in% names(svCount[svCount >= 10]))
  snv_data<-droplevels(snv_data)

  feature_locations <- subset(feature_locations, chrom %in% levels(snv_data$chrom))
  feature_locations<-droplevels(feature_locations)


  fun2 <- function(p) {
    index<-which.min(abs(tss_df$pos - p))
    closestTss<-tss_df$pos[index]
    # browser()
    chrom<-as.character(tss_df$chrom[index])
    gene<-as.character(tss_df$gene[index])
    dist<-(p-closestTss)
    list(p, closestTss, dist, chrom, gene)
  }

  l <- list()

  for (c in levels(snv_data$chrom)){
    df<-filter(snv_data, chrom == c)
    tss_df<-filter(feature_locations, chrom == c)
    dist2tss<-lapply(df$pos, fun2)
    dist2tss<-do.call(rbind, dist2tss)
    dist2tss<-as.data.frame(dist2tss)

    colnames(dist2tss)=c("bp", "closest_tss", "min_dist", "chrom", "closest_gene")
    dist2tss$min_dist<-as.numeric(dist2tss$min_dist)
    l[[c]] <- dist2tss
  }

  dist2tss<-do.call(rbind, l)
  dist2tss<-as.data.frame(dist2tss)
  dist2tss$chrom<-as.character(dist2tss$chrom)

  dist2tss<-arrange(dist2tss,(abs(min_dist)))

  if(send==1){
    return(dist2tss)
  }
  else{
    p<-ggplot(dist2tss)
    p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
    p<-p + scale_x_continuous(paste("Distance to", feature, "(Kb)", sep=' '),
                              limits=c(-10000, 10000),
                              breaks=c(-10000,-1000, 1000, 10000),
                              expand = c(.0005, .0005),
                              labels=c("-10", "-1", "1", "10") )

    p<-p + scale_y_continuous("Density")
    p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
    #p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
    p <- p + geom_rug(aes(min_dist, colour=chrom))
    p<-p + cleanTheme() +
      theme(strip.text = element_text(size=20),
            legend.position="top")

    p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")

    if(is.na(sim)){
      distout<-paste("snv", feature, 'dist.pdf', sep='')
    }
    else{
      distout<-paste("snv", feature, 'dist_sim.pdf', sep='')
    }

    cat("Writing file", distout, "\n")
    ggsave(paste("plots/", distout, sep=""), width = 20, height = 10)

    p
  }
}


distOverlay <- function(feature_file="data/tss_locations.txt", feature='tss', lim=10,all=NA){
  feature<-paste(toupper(substr(feature, 1, 1)), substr(feature, 2, nchar(feature)), sep='')

  if(feature=='promoter'){
    real_data<-dist2Feat(send=1, feature=feature)
    sim_data<-dist2Feat(feature=feature, sim=1, send=1)
  }
  else{
    real_data<-dist2Feat(feature_file=feature_file, send=1, feature=feature)
    sim_data<-dist2Feat(feature_file=feature_file, feature=feature, sim=1, send=1)
  }

  real_data$Source<-"Real"
  sim_data$Source<-"Sim"

  sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
  sim_data<-droplevels(sim_data)
  real_data<-filter(real_data, chrom != "Y", chrom != 4)
  real_data<-droplevels(real_data)


  colours<-c( "#E7B800", "#00AFBB")

  scale<-"(Kb)"
  if(lim==0.1){
    cat("Setting limits to -+100bp\n")
    lims=c(-100, 100)
    brks=c(-100, -10, 10, 100)
    expnd = c(.0005, .0005)
    labs=c("-100", "-10", "10", "100")
    scale<-"(bp)"
  }

  else if(lim==0.5){
    cat("Setting limits to -+0.5kb\n")
    lims=c(-500, 500)
    brks=c(-500, -100,100, 500)
    expnd = c(.0005, .0005)
    labs=c("-500", "-100", "100", "500")
    scale<-"(bp)"
  }

  else if(lim==1){
    cat("Setting limits to -+1kb\n")
    lims=c(-1000, 1000)
    brks=c(-1000, 1000)
    expnd = c(.0005, .0005)
    labs=c("-1", "1")
  }
  else{
    cat("Setting limits to -+10kb\n")
    lims=c(-10000, 10000)
    brks=c(-10000,-1000, 1000, 10000)
    expnd = c(.0005, .0005)
    labs=c("-10", "-1", "1", "10")
  }


  p<-ggplot()
  p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
  p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)

  if(is.na(all)){
    p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")
  }

  p<-p + scale_x_continuous(paste("Distance to", feature, scale, sep=' '),
                            limits=lims,
                            breaks=brks,
                            expand=expnd,
                            labels=labs )
  p<-p + scale_y_continuous("Density")
  p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")

  p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
  p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")


  p <- p + scale_fill_manual(values=colours)
  p <- p + scale_colour_manual(values=colours)

  p<-p + cleanTheme() +
    theme(strip.text = element_text(size=20),
          legend.position="top")

  overlay<-paste("snv", feature, 'dist_overlay.pdf', sep='')
  cat("Writing file", overlay, "\n")
  ggsave(paste("plots/", overlay, sep=""), width = 25, height = 10)



  p


}




#' tssDist
#'
#' Plot distance to TSS distribution
#' @param tss_pos File containing all TSS positions in genome: "gene chrom tss" [Default 'data/tss_positions.txt']
#' @param sim Simulate random SNVs accross genomic intervals? [Default: NO]
#' @param print Write the simulated random SNVs to a bed file ('data/simulatedSNVs.bed')? [Default: NO]
#' @import ggplot2
#' @keywords tss
#' @export

tssDist <- function(tss_pos="data/tss_positions.txt",sim=NA, print=0,return=0){
  tss_locations<-read.delim(tss_pos, header = T)
  tss_locations$tss<-as.integer(tss_locations$tss)

  if(is.na(sim)){
    snv_data<-getData()
  }

  else{
    cat("Generating simulated snv_data\n")
    hit_count<-nrow(getData())
    snv_data<-snvSim(N=hit_count, write=print)
    colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
    snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
    snv_data<-droplevels(snv_data)
  }



  # Will throw error if SVs don't exist on a chrom...

  # Removes chroms with fewer than 20 observations
  svCount <- table(snv_data$chrom)
  snv_data <- subset(snv_data, chrom %in% names(svCount[svCount > 30]))
  snv_data<-droplevels(snv_data)

  tss_locations <- subset(tss_locations, chrom %in% levels(snv_data$chrom))
  tss_locations<-droplevels(tss_locations)


  fun2 <- function(p) {
    index<-which.min(abs(tss_df$tss - p))
    closestTss<-tss_df$tss[index]
    chrom<-as.character(tss_df$chrom[index])
    gene<-as.character(tss_df$gene[index])
    dist<-(p-closestTss)
    list(p, closestTss, dist, chrom, gene)
  }

  l <- list()

  for (c in levels(snv_data$chrom)){
    df<-filter(snv_data, chrom == c)
    tss_df<-filter(tss_locations, chrom == c)
    dist2tss<-lapply(df$pos, fun2)
    dist2tss<-do.call(rbind, dist2tss)
    dist2tss<-as.data.frame(dist2tss)

    colnames(dist2tss)=c("bp", "closest_tss", "min_dist", "chrom", "closest_gene")
    dist2tss$min_dist<-as.numeric(dist2tss$min_dist)
    l[[c]] <- dist2tss
  }

  dist2tss<-do.call(rbind, l)
  dist2tss<-as.data.frame(dist2tss)
  dist2tss$chrom<-as.character(dist2tss$chrom)

  dist2tss<-arrange(dist2tss,(abs(min_dist)))

  # Removes chroms with fewer than 20 observations
  # svCount <- table(dist2tss$chrom)
  # dist2tss <- subset(dist2tss, chrom %in% names(svCount[svCount > 10]))

  if(return==1){
    return(dist2tss)
  }
  else{
    p<-ggplot(dist2tss)
    p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
    p<-p + scale_x_continuous("Distance to TSS (Kb)",
                              limits=c(-10000, 10000),
                              breaks=c(-10000,-1000, 1000, 10000),
                              expand = c(.0005, .0005),
                              labels=c("-10", "-1", "1", "10") )
    p<-p + scale_y_continuous("Density")
    p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
    #p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
    p <- p + geom_rug(aes(min_dist, colour=chrom))
    p<-p + cleanTheme() +
      theme(strip.text = element_text(size=20),
            legend.position="top")
    p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")

    if(is.na(sim)){
      tssDistout<-paste("bpTSSdist.pdf")
    }
    else{
      tssDistout<-paste("bpTSSdist_sim.pdf")
    }
    cat("Writing file", tssDistout, "\n")
    ggsave(paste("plots/", tssDistout, sep=""), width = 20, height = 10)

    p
  }
}


tssDistOverlay <- function(){
  real_data<-tssDist(return=1)
  real_data$Source<-"Real"
  sim_data<-bpTssDist(sim=1, return=1)
  sim_data$Source<-"Sim"

  sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
  sim_data<-droplevels(sim_data)
  real_data<-filter(real_data, chrom != "Y", chrom != 4)
  real_data<-droplevels(real_data)


  colours<-c( "#E7B800", "#00AFBB")


  p<-ggplot()
  p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
  p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)
  p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")

  p<-p + scale_x_continuous("Distance to TSS (Kb)",
                            limits=c(-100000, 100000),
                            breaks=c(-100000,-10000,-1000, 1000, 10000, 100000),
                            expand = c(.0005, .0005),
                            labels=c("-100", "-10", "-1", "1", "10", "100") )
  p<-p + scale_y_continuous("Density")
  p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
  #p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)

  p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
  p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")


  p <- p + scale_fill_manual(values=colours)
  p <- p + scale_colour_manual(values=colours)

  p<-p + cleanTheme() +
    theme(strip.text = element_text(size=20),
          legend.position="top")

  p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")


  tssDistout<-paste("bpTSSdist_overlay.pdf")

  cat("Writing file", tssDistout, "\n")
  ggsave(paste("plots/", tssDistout, sep=""), width = 20, height = 10)



  p


}


#' snvSim
#'
#' Generate simulated SNV hits acroos genomic regions (e.g. mappable regions)
#' @param intervals File containing genomic regions within which to simulate SNVs [Default 'data/intervals.bed]
#' @param N Number of random SNVs to generate [Default nrow(snv_data)]
#' @import GenomicRanges
#' @keywords sim
#' @export

snvSim <- function(intervals="data/intervals.bed", N=1000, write=F){
  suppressPackageStartupMessages(require(GenomicRanges))

  intFile <- import.bed(intervals)
  space <- sum(width(intFile))
  positions <- sample(c(1:space), N)
  cat("Simulating", N, "SNVs", sep = ' ', "\n")
  new_b <- GRanges(seqnames=as.character(rep(seqnames(intFile), width(intFile))),
                   ranges=IRanges(start=unlist(mapply(seq, from=start(intFile), to=end(intFile))), width=1))
  bedOut<-new_b[positions]
  if(write){
    export.bed(new_b[positions], "data/simulatedSNVs.bed")
  }
  remove(new_b)
  return(data.frame(bedOut))
}



svDist <- function(svs="data/all_bps_filtered.txt",sim=NA, print=0){
  svBreaks<-read.delim(svs, header = F)
  colnames(svBreaks) <- c("event", "bp_no", "sample", "chrom", "bp", "gene", "feature", "type", "length")

  svBreaks$bp<-as.integer(svBreaks$bp)
  svBreaks<-filter(svBreaks, sample != "A373R1" & sample != "A373R7" & sample != "A512R17" )
  svBreaks <- droplevels(svBreaks)

  snv_data<-getData()

  if(!is.na(sim)){
    simrep<-nrow(snv_data)
    cat("Generating simulated snv_data for", simrep, "SNVs", "\n")
    snv_data<-snvSim(N=simrep, write=print)
    snv_data$end<-NULL
    snv_data$width<-NULL
    snv_data$strand<-NULL
    snv_data$sample <- as.factor(sample(levels(svBreaks$sample), size = nrow(snv_data), replace = TRUE))
    #snv_data$type <- sample(levels(svBreaks$type), size = nrow(snv_data), replace = TRUE)
    colnames(snv_data)<-c("chrom", "pos", "sample")
    snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
    snv_data<-droplevels(snv_data)
  }

  snv_data <- subset(snv_data, sample %in% levels(svBreaks$sample))
  snv_data <- droplevels(snv_data)

  snv_data <- subset(snv_data, chrom %in% levels(svBreaks$chrom))
  snv_data <- droplevels(snv_data)


  fun3 <- function(p) {
    index<-which.min(abs(sv_df$bp - p))
    closestBp<-as.numeric(sv_df$bp[index])
    chrom<-as.character(sv_df$chrom[index])
    gene<-as.character(sv_df$gene[index])
    sample<-as.character(sv_df$sample[index])
    type<-as.character(sv_df$type[index])

    dist<-(p-closestBp)
    list(p, closestBp, dist, chrom, gene, type, sample)
  }

  l <- list()

  for (c in levels(snv_data$chrom)){
    for (s in levels(snv_data$sample)){
      df<-filter(snv_data, chrom == c & sample == s)
      sv_df<-filter(svBreaks, chrom == c & sample == s)

      dist2bp<-lapply(df$pos, fun3)
      dist2bp<-do.call(rbind, dist2bp)
      dist2bp<-as.data.frame(dist2bp)

      colnames(dist2bp)=c("snp", "closest_bp", "min_dist", "chrom", "closest_gene", "type", "sample")
      dist2bp$min_dist<-as.numeric(dist2bp$min_dist)
      l[[s]] <- dist2bp
    }
    l[[c]] <- dist2bp
  }

  dist2bp<-do.call(rbind, l)
  dist2bp<-as.data.frame(dist2bp)
  dist2bp$chrom<-as.character(dist2bp$chrom)
  dist2bp$type<-as.character(dist2bp$type)

  snvCount <- table(dist2bp$chrom)
  dist2bp <- subset(dist2bp, chrom %in% names(snvCount[snvCount > 25]))

  dist2bp<-arrange(dist2bp,(abs(min_dist)))

  dist2bp <- dist2bp %>% na.omit()

  p<-ggplot(dist2bp)
  p<-p + geom_density(aes(min_dist, fill=type), alpha = 0.3)
  # p<-p + scale_x_continuous("Distance to SV BP (Kb)",
  #                           limits=c(-10000000, 10000000),
  #                           breaks=c(-10000000, -100000, -10000, -1000, 0, 1000, 10000, 100000, 10000000),
  #                           expand = c(.0005, .0005),
  #                           labels=c("-10000", "-100", "-10", "-1", 0, "1", "10", "100", "10000") )
  p<-p + scale_y_continuous("Density", expand = c(0, 0))
  p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
  p<-p + cleanTheme()
  p<-p + facet_wrap(~chrom, ncol = 2, scales = "free_y")
  p

  # p<-ggplot(dist2bp)
  # p<-p + geom_histogram(aes(as.numeric(min_dist, fill = type)), alpha = 0.6, binwidth = 1000)
  # p<-p + scale_x_continuous("Distance to TSS", limits=c(-1000000, 1000000))
  # p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
  # p

}





g4Dist <- function(g4_pos="data/g4_positions.txt",sim=NA, print=0,return=0){
  g4_pos="data/g4_positions.txt"
  g4_locations<-read.delim(g4_pos, header = T)
  g4_locations$g4<-as.integer(g4_locations$g4)

  if(is.na(sim)){
    snv_data<-getData()
  }

  else{
    cat("Generating simulated snv_data\n")
    hit_count<-nrow(getData())
    snv_data<-snvSim(N=hit_count, write=print)
    colnames(snv_data)<-c("chrom", "pos", "v3", "v4", "v5")
    snv_data<-filter(snv_data, chrom == "2L" | chrom == "2R" | chrom == "3L" | chrom == "3R" | chrom == "X" | chrom == "Y" | chrom == "4")
    snv_data<-droplevels(snv_data)
  }



  # Will throw error if SVs don't exist on a chrom...

  # Removes chroms with fewer than 20 observations
  svCount <- table(snv_data$chrom)
  snv_data <- subset(snv_data, chrom %in% names(svCount[svCount > 30]))
  snv_data<-droplevels(snv_data)

  g4_locations <- subset(g4_locations, chrom %in% levels(snv_data$chrom))
  g4_locations<-droplevels(g4_locations)


  fun2 <- function(p) {
    index<-which.min(abs(g4_df$g4 - p))
    closestTss<-g4_df$g4[index]
    chrom<-as.character(g4_df$chrom[index])
    gene<-as.character(g4_df$gene[index])
    dist<-(p-closestTss)
    list(p, closestTss, dist, chrom, gene)
  }

  l <- list()

  for (c in levels(snv_data$chrom)){
    df<-filter(snv_data, chrom == c)
    g4_df<-filter(g4_locations, chrom == c)
    dist2g4<-lapply(df$pos, fun2)
    dist2g4<-do.call(rbind, dist2g4)
    dist2g4<-as.data.frame(dist2g4)

    colnames(dist2g4)=c("bp", "closest_g4", "min_dist", "chrom", "closest_gene")
    dist2g4$min_dist<-as.numeric(dist2g4$min_dist)
    l[[c]] <- dist2g4
  }

  dist2g4<-do.call(rbind, l)
  dist2g4<-as.data.frame(dist2g4)
  dist2g4$chrom<-as.character(dist2g4$chrom)

  dist2g4<-arrange(dist2g4,(abs(min_dist)))

  # Removes chroms with fewer than 20 observations
  # svCount <- table(dist2g4$chrom)
  # dist2g4 <- subset(dist2g4, chrom %in% names(svCount[svCount > 10]))

  if(return==1){
    return(dist2g4)
  }
  else{
    p<-ggplot(dist2g4)
    p<-p + geom_density(aes(min_dist, fill = chrom), alpha = 0.3)
    p<-p + scale_x_continuous("Distance to G4 (Kb)",
                              limits=c(-10000, 10000),
                              breaks=c(-10000,-1000, 1000, 10000),
                              expand = c(.0005, .0005),
                              labels=c("-10", "-1", "1", "10") )
    p<-p + scale_y_continuous("Density")
    p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
    #p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)
    p <- p + geom_rug(aes(min_dist, colour=chrom))
    p<-p + cleanTheme() +
      theme(strip.text = element_text(size=20),
            legend.position="top")
    p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")

    if(is.na(sim)){
      g4Distout<-paste("bpG4dist.pdf")
    }
    else{
      g4Distout<-paste("bpG4dist_sim.pdf")
    }
    cat("Writing file", g4Distout, "\n")
    ggsave(paste("plots/", g4Distout, sep=""), width = 20, height = 10)

    p
  }
}



g4DistOverlay <- function(){
  real_data<-g4Dist(return=1)
  real_data$Source<-"Real"
  sim_data<-bpTssDist(sim=1, return=1)
  sim_data$Source<-"Sim"

  sim_data<-filter(sim_data, chrom != "Y", chrom != 4)
  sim_data<-droplevels(sim_data)
  real_data<-filter(real_data, chrom != "Y", chrom != 4)
  real_data<-droplevels(real_data)


  colours<-c( "#E7B800", "#00AFBB")


  p<-ggplot()
  p<-p + geom_density(data=real_data,aes(min_dist, fill = Source), alpha = 0.4)
  p<-p + geom_density(data=sim_data,aes(min_dist, fill = Source), alpha = 0.4)
  p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")

  p<-p + scale_x_continuous("Distance to G4 (Kb)",
                            limits=c(-10000, 10000),
                            breaks=c(-10000,-1000, 1000, 10000),
                            expand = c(.0005, .0005),
                            labels=c("-10", "-1", "1", "10") )
  p<-p + scale_y_continuous("Density")
  p<-p + geom_vline(xintercept = 0, colour="black", linetype="dotted")
  #p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 5)

  p <- p + geom_rug(data=real_data,aes(min_dist, colour=Source),sides="b")
  p <- p + geom_rug(data=sim_data,aes(min_dist, colour=Source),sides="t")


  p <- p + scale_fill_manual(values=colours)
  p <- p + scale_colour_manual(values=colours)

  p<-p + cleanTheme() +
    theme(strip.text = element_text(size=20),
          legend.position="top")

  p<-p + facet_wrap(~chrom, ncol = 3, scales = "free_y")


  g4Distout<-paste("bpG4dist_overlay.pdf")

  cat("Writing file", g4Distout, "\n")
  ggsave(paste("plots/", g4Distout, sep=""), width = 20, height = 10)



  p


}








#' chromDist
#'
#' Plot genome-wide snv distribution
#' @import ggplot2
#' @keywords distribution
#' @export

chromDist <- function(object=NA, notch=0){
  snv_data<-getData()
  ext<-'.pdf'
  if(is.na(object)){
    object<-'grouped_trans'
    cols<-setCols(snv_data, "grouped_trans")
  }

  if(notch){
    snv_data<-exclude_notch()
    ext<-'_excl.N.pdf'
  }

  cat("Plotting snvs by", object, "\n")

  p<-ggplot(snv_data)
  p<-p + geom_histogram(aes(pos/1000000, fill = get(object)), binwidth=0.1, alpha = 0.8)
  p<-p + facet_wrap(~chrom, scale = "free_x", ncol = 2)
  p<-p + scale_x_continuous("Mbs", breaks = seq(0,33,by=1), limits = c(0, 33),expand = c(0.01, 0.01))
  p<-p + scale_y_continuous("Number of snvs", expand = c(0.01, 0.01))
  p<-p + cleanTheme() +
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          axis.text = element_text(size=12),
          axis.title = element_text(size=20),
          strip.text.x = element_text(size = 15)
    )

  if (object == 'grouped_trans'){
    p<-p + cols
  }
  chrom_outfile<-paste("snv_dist_genome_by_", object, ext, sep = "")
  cat("Writing file", chrom_outfile, "\n")
  ggsave(paste("plots/", chrom_outfile, sep=""), width = 20, height = 10)

  p
}

mutationTypes <- function(allele_frequency = 0.1){
  snv_data<-getData()

  snv_data <- snv_data %>%
    filter(dups!="TRUE") %>%
    filter(a_freq>=allele_frequency)

  library(dplyr)
  mutCounts <- snv_data %>%
    group_by(sample, grouped_trans) %>%
    summarise(count=n()) %>%
    mutate(perc=count/sum(count))


  p <- ggplot(mutCounts)
  p <- p + geom_bar(aes(sample, perc*100, fill=grouped_trans),  stat="identity")

  p <- p + cleanTheme() +
    theme(panel.grid.major.y = element_line(color="grey80", size = 0.5, linetype = "dotted"),
          axis.text.x = element_text(angle = 90, hjust=1),
          axis.text.y = element_text(size=15),
          axis.title = element_text(size=20),
          strip.text.x = element_text(size = 15)
    )
  p <- p + ylab("% contribution")

  p
}


###########
##  Misc ##
###########

# Gene lengths on accross chroms

#gene_lengths="data/gene_lengths.txt"
#gene_lengths<-read.delim(gene_lengths, header = T)

#p<-ggplot(gene_lengths)
#p<-p + geom_point(aes( x=(start+(length/2)), y=log10(length), colour = chrom))
#p<-p + facet_wrap(~chrom, scale = "free_x")
#p
nriddiford/mutationProfiles documentation built on Nov. 7, 2021, 12:14 a.m.