R/variant_maps/variantFishers.R

#!/usr/bin/env Rscript

Fisher <- function(plot.type='mutation', gene.matrix.a, gene.matrix.b, plot.title.main, plot.title.a, plot.title.b, allosome='none', targets.file=NULL, suffix='', muts.exact=FALSE, threshold.a=TRUE, threshold.b=TRUE, gene.names=TRUE, gene.order=NULL) {

#------
# USAGE
#------

# gene.matrix: data frame with the following columns [group, gene, chrom, sample.names... ] - only sample names containing the string "_threshold" are run
#
# gene.matrix.a / gene.matrix.b: data frames with columns gene, start, end, sample1, sample2 ... [ sample columns should have copy number calls -2..2 ]
#
# plot.title.main: string used for plot title [ verbatim ], and file name [ "a vs b" -> "a_vs_b.pdf" ]
#
# plot.title.a / plot.title.b used for subtitle naming
#
# allosome:
#
#   "none"      = exclude X & Y chromosome
#   "merge"     = treat X & Y coordinates as homologous pair
#   "distinct"  = treat X & Y as seperate, sequential chromosomes
#
# targets.file: should genes be subset by a targets bed file [first 3 cols should be chrom, start, end]
#
# suffix: string appended to end of file names for versioning, use reserved string "now" to use current timestamp
#
# threshold.a / threshold.b: subset columns with those containing string 'threshold'
#
# plot.type:
#
#   "mutation"      = for Fishers exact on mutations
#   "copy number"   = for Fishers exact on copy number


    #---------------
    # INITIALIZATION
    #---------------

    # libraries
    pacman::p_load(DescTools,dplyr,readr,stringr,tidyr,broom,purrr,magrittr,rlist,crayon,colorspace,ggplot2,grid,gridExtra,RColorBrewer)

    # create output directory
    MakeDirs(c('summary/fishers_cn', 'summary/fishers_mut'))


    #----------
    # FUNCTIONS
    #----------

    message(blue('- initializing'))

    # create mutation prevalence object
    MutPrevalence <- function(sample.stats, plot.title, gene.names=TRUE, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(pct=present*100) %>%
            select(Gene=gene,`Prevalance (% cases)`=pct) %>%
            mutate(group=factor('present'))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`Prevalance (% cases)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=stats.melt) +

        scale_y_continuous(limits=c(0,100)) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') # y=0 axis

        if(gene.names==TRUE) {

            gg <- gg +
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=14),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=10),
                        axis.text.y         = element_text(size=10),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=10),
                        strip.text.x        = element_text(colour='white',size=10),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg +
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=14),
                        axis.title.x        = element_blank(),
                        axis.text.x         = element_blank(),
                        axis.ticks.x        = element_blank(),
                        axis.title.y        = element_text(vjust=4),
                        axis.text.y         = element_text(size=10),
                        legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=10),
                        strip.text.x        = element_text(colour='white',size=10),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt
    }


    # create gains prevalence object
    GainPrevalence <- function(sample.stats, plot.title, gene.names, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(above=above*100) %>%
            mutate(below=below*-100) %>%
            gather(group, pct, -gene) %>%
            select(Gene=gene,`Prevalance (% cases)`=pct,group) %>%
            mutate(group=factor(group,levels=c('above', 'below')))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`Prevalance (% cases)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=filter(stats.melt, group=='above')) +
        geom_bar(stat="identity", data=filter(stats.melt, group=='below')) +

        scale_y_continuous(limits=c(-100,100), labels=abs) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') +   # y=0 axis
        geom_vline(xintercept=head(chrom.n$chrom.n,-1)+0.5, colour='#666666', linetype=3)   # chromosome demarcation

        if(gene.names==TRUE) {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=Inf, vjust=2, size=3) +    # chromosome enumeration (top)
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=10),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=3),
                        axis.text.y         = element_text(size=7),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=7),
                        strip.text.x        = element_text(colour='white',size=7),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=-Inf, vjust=2, size=3) +   # chromosome enumeration (bottom)
            theme(  legend.title        = element_blank(),
                    panel.grid.major    = element_blank(),
                    panel.grid.minor    = element_blank(),
                    text                = element_text(size=10),
                    axis.title.x        = element_blank(),
                    axis.text.x         = element_blank(),
                    axis.ticks.x        = element_blank(),
                    axis.title.y        = element_text(vjust=4),
                    axis.text.y         = element_text(size=7),
                    legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                    legend.key.size     = unit(1.4, 'lines'),
                    legend.text         = element_text(size=7),
                    strip.text.x        = element_text(colour='white',size=7),
                    panel.background    = element_rect(fill=NA, color='black'),
                    plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt

    }


    # create amplifications prevalence object
    AmpPrevalence <- function(sample.stats, plot.title, gene.names, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(pct=above*100) %>%
            select(Gene=gene,`Prevalance (% cases)`=pct) %>%
            mutate(group=factor('above'))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`Prevalance (% cases)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=stats.melt) +

        scale_y_continuous(limits=c(0,100)) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') +   # y=0 axis
        geom_vline(xintercept=head(chrom.n$chrom.n,-1)+0.5, colour='#666666', linetype=3)   # chromosome demarcation

        if(gene.names==TRUE) {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=Inf, vjust=2, size=3) +    # chromosome enumeration (top)
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=10),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=5),
                        axis.text.y         = element_text(size=7),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=7),
                        strip.text.x        = element_text(colour='white',size=7),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=-Inf, vjust=2, size=3) +   # chromosome enumeration (bottom)
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=10),
                        axis.title.x        = element_blank(),
                        axis.text.x         = element_blank(),
                        axis.ticks.x        = element_blank(),
                        axis.title.y        = element_text(vjust=4),
                        axis.text.y         = element_text(size=7),
                        legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=7),
                        strip.text.x        = element_text(colour='white',size=7),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt
    }


    # create amplifications fishers object
    MutFishers <- function(sample.stats, plot.title, gene.names=TRUE, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(pct=present*-1) %>%
            select(Gene=gene,`P-value (-log10)`=pct) %>%
            mutate(group=factor('present'))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`P-value (-log10)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=filter(stats.melt, group=='present')) +

        scale_y_continuous(limits=c(0,10)) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') # y=0 axis

        if(gene.names==TRUE) {

            gg <- gg +
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=14),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=10),
                        axis.text.y         = element_text(size=10),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=10),
                        strip.text.x        = element_text(colour='white',size=10),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg +
            theme(  legend.title        = element_blank(),
                    panel.grid.major    = element_blank(),
                    panel.grid.minor    = element_blank(),
                    text                = element_text(size=10),
                    axis.title.x        = element_blank(),
                    axis.text.x         = element_blank(),
                    axis.ticks.x        = element_blank(),
                    axis.title.y        = element_text(vjust=4),
                    axis.text.y         = element_text(size=7),
                    legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                    legend.key.size     = unit(1.4, 'lines'),
                    legend.text         = element_text(size=7),
                    strip.text.x        = element_text(colour='white',size=7),
                    panel.background    = element_rect(fill=NA, color='black'),
                    plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt
    }


    # create gains fishers object
    GainFishers <- function(sample.stats, plot.title, gene.names, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(above=above*-1) %>%
            mutate(below=below) %>%
            gather(group, pct, -gene) %>%
            select(Gene=gene,`P-value (-log10)`=pct,group) %>%
            mutate(group=factor(group,levels=c('above', 'below')))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`P-value (-log10)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=filter(stats.melt, group=='above')) +
        geom_bar(stat="identity", data=filter(stats.melt, group=='below')) +

        scale_y_continuous(limits=c(-10,10), labels=abs) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') +   # y=0 axis
        geom_vline(xintercept=head(chrom.n$chrom.n,-1)+0.5, colour='#666666', linetype=3)   # chromosome demarcation

        if(gene.names==TRUE) {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=Inf, vjust=2, size=3) +  # chromosome enumeration (top)
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=10),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=5),
                        axis.text.y         = element_text(size=7),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=7),
                        strip.text.x        = element_text(colour='white',size=7),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=-Inf, vjust=2, size=3) +   # chromosome enumeration (bottom)
            theme(  legend.title        = element_blank(),
                    panel.grid.major    = element_blank(),
                    panel.grid.minor    = element_blank(),
                    text                = element_text(size=10),
                    axis.title.x        = element_blank(),
                    axis.text.x         = element_blank(),
                    axis.ticks.x        = element_blank(),
                    axis.title.y        = element_text(vjust=4),
                    axis.text.y         = element_text(size=7),
                    legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                    legend.key.size     = unit(1.4, 'lines'),
                    legend.text         = element_text(size=7),
                    strip.text.x        = element_text(colour='white',size=7),
                    panel.background    = element_rect(fill=NA, color='black'),
                    plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt
    }


    # create amplifications fishers object
    AmpFishers <- function(sample.stats, plot.title, gene.names, gene.order=NULL) {

        stats.melt <-
            sample.stats %>%
            mutate(pct=above*-1) %>%
            select(Gene=gene,`P-value (-log10)`=pct) %>%
            mutate(group=factor('above'))

        if(!is.null(gene.order)) {
            stats.melt <-
                stats.melt %>%
                rowwise %>%
                mutate(gene.ord=which(Gene==gene.order)) %>%
                arrange(gene.ord) %>%
                ungroup %>%
                mutate(Gene=factor(Gene, levels=unique(Gene))) %>%
                select(-gene.ord)
        }

        gg <- ggplot(stats.melt , aes(x=Gene, y=`P-value (-log10)`, fill=group, width=0.9)) +

        ggtitle(plot.title) +

        geom_bar(stat="identity", data=filter(stats.melt, group=='above')) +

        scale_y_continuous(limits=c(0,10)) +
        scale_fill_brewer(type='qualitative', palette='Accent') +

        geom_hline(aes(yintercept=0), colour='#666666') +   # y=0 axis
        geom_vline(xintercept=head(chrom.n$chrom.n,-1)+0.5, colour='#666666', linetype=3)   # chromosome demarcation

        if(gene.names==TRUE) {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=Inf, vjust=2, size=3) +    # chromosome enumeration (top)
                theme(  legend.title        = element_blank(),
                        panel.grid.major    = element_blank(),
                        panel.grid.minor    = element_blank(),
                        text                = element_text(size=10),
                        axis.title.x        = element_text(vjust=7),
                        axis.text.x         = element_text(angle=90,vjust=0.5,hjust=1,size=5),
                        axis.text.y         = element_text(size=7),
                        legend.key          = element_rect(colour='white', fill=NULL, size=0.1),
                        legend.key.size     = unit(1.4, 'lines'),
                        legend.text         = element_text(size=7),
                        strip.text.x        = element_text(colour='white',size=7),
                        panel.background    = element_rect(fill=NA, color='black'),
                        plot.margin         = unit(c(1,1,1,1),'cm'))

        } else {

            gg <- gg + annotate(geom='text', x=Midx(c(0,chrom.n$chrom.n))+0.5, label=1:max(as.numeric(chrom.n$chrom)), y=-Inf, vjust=2, size=3) +   # chromosome enumeration (bottom)
            theme(  legend.title        = element_blank(),
                    panel.grid.major    = element_blank(),
                    panel.grid.minor    = element_blank(),
                    text                = element_text(size=10),
                    axis.title.x        = element_blank(),
                    axis.text.x         = element_blank(),
                    axis.ticks.x        = element_blank(),
                    axis.title.y        = element_text(vjust=4),
                    axis.text.y         = element_text(size=7),
                    legend.key          = element_rect(colour='white',fill=NULL,size=0.1),
                    legend.key.size     = unit(1.4, 'lines'),
                    legend.text         = element_text(size=7),
                    strip.text.x        = element_text(colour='white',size=7),
                    panel.background    = element_rect(fill=NA, color='black'),
                    plot.margin         = unit(c(1,1,1,1),'cm'))

        }

        gt <- suppressMessages(ggplot_gtable(ggplot_build(gg)))
        gt$layout$clip[gt$layout$name=='panel'] <- 'off'
        gt
    }


    # fix file names
    FixName <- function(file.name) {
        file.name %>% gsub('\\s', '_', .) %>% gsub('/', '-', .)
    }


    # convert chrom vector to desired format
    ChromMod <- function(muts, allosome){

        if('X' %in% muts$chrom) { message(yellow('X chromosome labelling found'))}
        if('Y' %in% muts$chrom) { message(yellow('Y chromosome labelling found'))}

        if(allosome=='distinct') {
                muts %>%
                mutate(.,chrom=as.numeric(ifelse(chrom=='X', 23,chrom))) %>%
                mutate(chrom=as.numeric(ifelse(chrom=='Y', 24,chrom)))
        } else if(allosome =='merge') {
                muts %>% 
                mutate(chrom=as.numeric(ifelse(chrom %in% c('X','Y'), 23, chrom)))
        } else {
                muts %>%
                filter(!chrom %in% c('X','Y'))
        }
    }


    # use targets file to select genes
    SubsetTargets <- function(gene.matrix, targets, plot.type) {
        gene.matrix %>%
        mutate(chrom=ifelse(chrom=='X',23,ifelse(chrom=='Y',24,chrom))) %>%
        mutate(chrom=as.numeric(chrom)) %>%
        do({ if(plot.type=='copy number'){  # conditionally assign midpoints
                mutate(., mid=(start+end)/2)
            } else { 
                mutate(., mid=pos)
            } }) %>%
        rowwise %>%
        mutate(in.target={  # check if gene exists in target file
                any(targets[[chrom]]$start.target <= mid & mid <= targets[[chrom]]$end.target)
            }) %>%
        filter(in.target==TRUE) %>%
        select(-mid,-in.target) %>%
        mutate(chrom=as.character(chrom)) %>%
        mutate(chrom=ifelse(chrom=='23','X',ifelse(chrom=='24','Y',chrom))) %>%
        ungroup
    }


    # calculate % of cases with CN status
    MutSampleStats <- function(gene.matrix) {
        gene.matrix %>%
        gather(sample,effect,-gene,-chrom,-pos) %>%
        mutate(n.samples = n_distinct(sample)) %>%
        group_by(sample, gene) %>%
        slice(which.min(pos)) %>%
        ungroup %>%
        group_by(gene) %>%
        arrange(gene) %>%
        # mutation status sums
        mutate(mut.sum  = sum(effect == 1, na.rm=TRUE)) %>%
        # CN status percentages
        mutate(mut.pct = mut.sum/n.samples) %>%
        # over/under sums for Fisher's test
        mutate(mut.present = sum(effect == 1, na.rm=TRUE)) %>%
        mutate(mut.absent = sum(effect == 0, na.rm=TRUE)) %>%
        # over/under percentages for gain plotting
        mutate(mut.present.pct = sum(effect == 1, na.rm=TRUE)/n.samples) %>%
        mutate(mut.absent.pct = 1 - mut.present.pct) %>%
        slice(which.min(pos)) %>%
        ungroup %>%
        select(-effect, -sample) %>%
        unique
    }


    # calculate % of cases with CN status
    CNSampleStats <- function(gene.matrix, threshold) {
        gene.matrix %>%
        select(-matches('band|mid|stop')) %>%
        gather(sample,copy.num,-gene,-chrom,-start,-end) %>%
        do({ if(threshold==TRUE){  # conditionally select threshold calls
                filter(., grepl('threshold', sample))
            } else { . } }) %>%
        mutate(n.samples = n_distinct(sample)) %>%
        group_by(gene) %>% arrange(gene) %>%
        # CN status sums
        mutate(amp.sum  = sum(copy.num == 2, na.rm=TRUE)) %>%
        mutate(gain.sum = sum(copy.num == 1, na.rm=TRUE)) %>%
        mutate(loss.sum = sum(copy.num ==-1, na.rm=TRUE)) %>%
        mutate(del.sum  = sum(copy.num == -2, na.rm=TRUE)) %>%
        # CN status percentages
        mutate(amp.pct = amp.sum /n.samples) %>%
        mutate(gain.pct = gain.sum/n.samples) %>%
        mutate(loss.pct = loss.sum/n.samples) %>%
        mutate(del.pct = del.sum /n.samples) %>%
        # over/under sums for Fisher's test [amplicfications]
        mutate(amp.gt = sum(copy.num > 1, na.rm=TRUE)) %>%
        mutate(amp.gt.eq = sum(copy.num >= 1, na.rm=TRUE)) %>%
        mutate(amp.lt.eq = sum(copy.num <= 1, na.rm=TRUE)) %>%
        mutate(amp.lt = sum(copy.num < 1, na.rm=TRUE)) %>%
        # over/under sums for Fisher's test [gains]
        mutate(gain.gt = sum(copy.num > 0, na.rm=TRUE)) %>%
        mutate(gain.gt.eq = sum(copy.num >= 0, na.rm=TRUE)) %>%
        mutate(gain.lt.eq = sum(copy.num <= 0, na.rm=TRUE)) %>%
        mutate(gain.lt = sum(copy.num < 0, na.rm=TRUE)) %>%
        # over/under percentages for gain plotting
        mutate(gain.gt.pct = sum(copy.num > 0, na.rm=TRUE)/n.samples) %>%
        mutate(gain.lt.pct = sum(copy.num < 0, na.rm=TRUE)/n.samples) %>%
        select(-c(copy.num,sample)) %>%
        unique
    }


    #----------------
    # DATA PROCESSING
    #----------------

    # rename hgnc to gene if needed
    if('hgnc' %in% names(gene.matrix.a)){
        gene.matrix.a %<>% rename(gene=hgnc)
    }
    if('hgnc' %in% names(gene.matrix.b)){
        gene.matrix.b %<>% rename(gene=hgnc)
    }
    # rename stop to end if needed
    if('stop' %in% names(gene.matrix.a)){
        gene.matrix.a %<>% rename(end=stop)
    }
    if('stop' %in% names(gene.matrix.b)){
        gene.matrix.b %<>% rename(end=stop)
    }


    message(blue('- subsetting tables to target regions'))
    if(!is.null(targets.file)){
        # read in target file, select coordinate columns & split
        targets <-
            read.delim(targets.file, sep='\t', header=FALSE) %>%
            tbl_df %>% .[,1:3] %>%
            setNames(c('chrom','start.target','end.target')) %>%
            mutate(chrom=as.numeric(ifelse(chrom=='X',23,ifelse(chrom=='Y',24,chrom)))) %>%
            arrange(chrom,start.target) %>%
            split(.$chrom)

        # subset genes in target file doing this on one subset is sufficient due to intersection step below
        if(nrow(gene.matrix.a) <= nrow(gene.matrix.b)) {
            gene.matrix.a %<>% SubsetTargets(targets)
        } else {
            gene.matrix.b %<>% SubsetTargets(targets)
        }
    }


    # format allosome & filter for overlapping genes
    gene.matrix.a %<>% ChromMod(allosome) %>% filter(gene %in% gene.matrix.b$gene)
    gene.matrix.b %<>% ChromMod(allosome) %>% filter(gene %in% gene.matrix.a$gene)


    # generate table of chromosome breaks
    chrom.n <-
        gene.matrix.a %>%
        group_by(chrom) %>%
        summarise(chrom.n=n()) %>%
        mutate(chrom.n=cumsum(chrom.n))


    # calculate necessary statistics
    message(blue('- calculating sample statistics'))

    if(plot.type=='mutation') {

        if(muts.exact != TRUE) {
            sample.stats <-
                inner_join( MutSampleStats(gene.matrix.a) %>% mutate(pos=1),
                            MutSampleStats(gene.matrix.b) %>% mutate(pos=1), by=c('gene', 'chrom', 'pos'), suffix=c('.x','.y') )
        } else {
            sample.stats <-
                inner_join( MutSampleStats(gene.matrix.a),
                            MutSampleStats(gene.matrix.b), by=c('gene','chrom','pos'), suffix=c('.x','.y') )
        }


        sample.stats %<>%
            mutate(pct.tot=mut.present.pct.x+mut.present.pct.y) %>%
            arrange(desc(pct.tot), gene) %>%
            mutate(gene=factor(gene, levels=unique(gene)))

        sample.stats <-
            sample.stats %>%
            #arrange(chrom, pos) %>%
            mutate(gene=factor(gene, levels=unique(gene))) %>%
            rowwise %>%
            # call mutations Fisher's function
            mutate(mut.fisher = fisher.test( data.frame( a = c(mut.present.x, mut.absent.x),
                                                         b = c(mut.present.y, mut.absent.y) ) )$p.value ) %>%
            # p adjustments
            mutate(mut.fisher.adj = p.adjust(mut.fisher,  'BH')) %>%
            # 0.05 cutoffs
            mutate(mut.fisher.adj.cut  = ifelse(mut.fisher.adj < 0.05, mut.fisher.adj, as.numeric(NA))) %>%
            # log
            mutate(mut.fisher.adj.cut.log = log10(mut.fisher.adj.cut))


        # tables for plotting
        sample.stats.mut.a <-
            sample.stats %>%
            select(gene, present=mut.present.pct.x)

        sample.stats.mut.b <-
            sample.stats %>%
            select(gene, present=mut.present.pct.y)

        sample.stats.mut.fishers <-
            sample.stats %>%
            select(gene, present=mut.fisher.adj.cut.log) %>%
            mutate(present=ifelse(is.na(present), 0, present))

        # column selection & arrangement
        if(muts.exact != TRUE) {
            sample.stats %<>%
                select(gene, n.samples.x, n.samples.y, mut.present.pct.x, mut.present.pct.y, mut.absent.pct.x, mut.absent.pct.y,
                mut.fisher, mut.fisher.adj, mut.fisher.adj.cut, mut.fisher.adj.cut.log)
        } else {
            sample.stats %<>%
                select(gene, chrom, pos, n.samples.x, n.samples.y, mut.present.pct.x, mut.present.pct.y, mut.absent.pct.x, mut.absent.pct.y,
                mut.fisher, mut.fisher.adj, mut.fisher.adj.cut, mut.fisher.adj.cut.log)
        }


        # write data summary
        message(blue('- writing data summary'))
        file.name <- str_c("summary/fishers_mut/",FixName(plot.title.main),ifelse(suffix=='now', format(Sys.time(), '.%Y-%m-%d-%H-%M-%S'), ifelse(suffix=='', '', str_c('.',suffix))))
        write_tsv(sample.stats, str_c(file.name,'.txt'))


        #---------------------
        # STATS FUNCTION CALLS
        #---------------------

        message(blue('- building plots'))

        options(device="pdf")

        # call functions & add to plot list
        gg.list <- list( MutPrevalence(sample.stats.mut.a, plot.title=str_c(plot.title.a, " Prevalence [Mutations]"), gene.names=gene.names, gene.order=gene.order),
                         MutPrevalence(sample.stats.mut.b, plot.title=str_c(plot.title.b, " Prevalence [Mutations]"), gene.names=gene.names, gene.order=gene.order),
                         MutFishers(sample.stats.mut.fishers, plot.title=str_c(plot.title.a, ' x ', plot.title.b, " Fisher's Exact [Mutations]"), gene.names=gene.names, gene.order=gene.order) )

    } else {

        sample.stats <-
            inner_join( CNSampleStats(gene.matrix.a, threshold.a), CNSampleStats(gene.matrix.b, threshold.b) %>% select(-chrom, -start, -end), by='gene' ) %>%
            arrange(chrom, start) %>%
            ungroup %>%
            mutate(gene=factor(gene,levels=unique(gene))) %>%
            rowwise %>%
            # amplifications (+)
            mutate(amp.pos.fisher  = fisher.test( data.frame( a = c(amp.gt.x,  amp.lt.eq.x),
                                                              b = c(amp.gt.y,  amp.lt.eq.y) ) )$p.value ) %>%
            # gains (+)
            mutate(gain.pos.fisher = fisher.test( data.frame( a = c(gain.gt.x, gain.lt.eq.x),
                                                              b = c(gain.gt.y, gain.lt.eq.y) ) )$p.value ) %>%
            # amplifications (-)
            mutate(amp.neg.fisher  = fisher.test( data.frame( a = c(amp.lt.x,  amp.gt.eq.x),
                                                              b = c(amp.lt.y,  amp.gt.eq.y) ) )$p.value ) %>%
            # gains (-)
            mutate(gain.neg.fisher = fisher.test( data.frame( a = c(gain.lt.x, gain.gt.eq.x),
                                                              b = c(gain.lt.y, gain.gt.eq.y) ) )$p.value ) %>%
            # p adjustments
            mutate(amp.pos.fisher.adj  = p.adjust(amp.pos.fisher,  'BH')) %>%
            mutate(gain.pos.fisher.adj = p.adjust(gain.pos.fisher, 'BH')) %>%
            mutate(amp.neg.fisher.adj  = p.adjust(amp.neg.fisher,  'BH')) %>%
            mutate(gain.neg.fisher.adj = p.adjust(gain.neg.fisher, 'BH')) %>%
            # 0.05 cutoffs
            mutate(amp.pos.fisher.adj.cut  = ifelse(amp.pos.fisher.adj  < 0.05, amp.pos.fisher.adj,  as.numeric(NA))) %>%
            mutate(gain.pos.fisher.adj.cut = ifelse(gain.pos.fisher.adj < 0.05, gain.pos.fisher.adj, as.numeric(NA))) %>%
            mutate(amp.neg.fisher.adj.cut  = ifelse(amp.neg.fisher.adj  < 0.05, amp.neg.fisher.adj,  as.numeric(NA))) %>%
            mutate(gain.neg.fisher.adj.cut = ifelse(gain.neg.fisher.adj < 0.05, gain.neg.fisher.adj, as.numeric(NA))) %>%
            # log
            mutate(amp.pos.fisher.adj.cut.log  = log10(amp.pos.fisher.adj.cut)) %>%
            mutate(gain.pos.fisher.adj.cut.log = log10(gain.pos.fisher.adj.cut)) %>%
            mutate(amp.neg.fisher.adj.cut.log  = log10(amp.neg.fisher.adj.cut)) %>%
            mutate(gain.neg.fisher.adj.cut.log = log10(gain.neg.fisher.adj.cut))


        # tables for plotting
        sample.stats.gain.a <-
            sample.stats %>%
            select(gene, above=gain.gt.pct.a, below=gain.lt.pct.a)

        sample.stats.gain.b <-
            sample.stats %>%
            select(gene, above=gain.gt.pct.b, below=gain.lt.pct.b)

        sample.stats.amp.a <-
            sample.stats %>%
            select(gene, above=amp.pct.a)

        sample.stats.amp.b <-
            sample.stats %>%
            select(gene, above=amp.pct.b)
     
        sample.stats.gain.fishers <-
            sample.stats %>%
            select(gene, above=gain.pos.fisher.adj.cut.log, below=gain.neg.fisher.adj.cut.log) %>%
            mutate(above=ifelse(is.na(above),0,above)) %>%
            mutate(below=ifelse(is.na(below),0,below))

        sample.stats.amp.fishers <-
            sample.stats %>%
            select(gene, above=amp.pos.fisher.adj.cut.log) %>%
            mutate(above=ifelse(is.na(above),0,above))

            # column selection & arrangement
            sample.stats %<>%
            select(gene,chrom,start,end,n.samples.a,n.samples.b,amp.pct.a,amp.pct.b,gain.pct.a,gain.pct.b,loss.pct.a,loss.pct.b,del.pct.a,del.pct.b,
            gain.gt.pct.a,gain.lt.pct.a,gain.gt.pct.b,gain.lt.pct.b,
            amp.neg.fisher,amp.neg.fisher.adj,amp.neg.fisher.adj.cut,amp.neg.fisher.adj.cut.log,
            amp.pos.fisher,amp.pos.fisher.adj,amp.pos.fisher.adj.cut,amp.pos.fisher.adj.cut.log,
            gain.neg.fisher,gain.neg.fisher.adj,gain.neg.fisher.adj.cut,gain.neg.fisher.adj.cut.log,
            gain.pos.fisher,gain.pos.fisher.adj,gain.pos.fisher.adj.cut,gain.pos.fisher.adj.cut.log)


        # write data summary
        message(blue('- writing data summary'))
        file.name <- str_c("summary/fishers_cn/",FixName(plot.title.main),ifelse(suffix=='now', format(Sys.time(), '.%Y-%m-%d-%H-%M-%S'), ifelse(suffix=='', '', str_c('.',suffix))))
        write_tsv(sample.stats, str_c(file.name,'.txt'))


        #---------------------
        # STATS FUNCTION CALLS
        #---------------------

        message(blue('- building plots'))

        options(device="pdf")

        # call functions & add to plot list
        gg.list <- list( GainPrevalence(sample.stats.gain.a, plot.title=str_c(plot.title.a, " Prevalence [Gains]"), gene.names=gene.names, gene.order=gene.order),
                         GainPrevalence(sample.stats.gain.b, plot.title=str_c(plot.title.b, " Prevalence [Gains]"), gene.names=gene.names, gene.order=gene.order),
                         GainFishers(sample.stats.gain.fishers, plot.title=str_c(plot.title.a, ' x ', plot.title.b, " Fisher's Exact [Gains]"), gene.names=gene.names, gene.order=gene.order),
                         AmpPrevalence(sample.stats.amp.a, plot.title=str_c(plot.title.a, " Prevalence [Amplifications]"), gene.names=gene.names, gene.order=gene.order),
                         AmpPrevalence(sample.stats.amp.b, plot.title=str_c(plot.title.b, " Prevalence [Amplifications]"), gene.names=gene.names, gene.order=gene.order),
                         AmpFishers(sample.stats.amp.fishers, plot.title=str_c(plot.title.a, ' x ', plot.title.b, " Fisher's Exact [Amplifications]"), gene.names=gene.names, gene.order=gene.order) )
    }

    #---------
    # PLOTTING
    #---------

    # plot to pdf
    suppressWarnings(ggsave(str_c(file.name,'.pdf'), do.call(arrangeGrob, c(gg.list, ncol=1)), device='pdf', width=12, height=12))

    message(green('[ done ]'))

    return(sample.stats)

}
hxrts/pascal documentation built on May 17, 2019, 9:15 p.m.