R/cum.plot.R

#' @title Cumulative frequency plot
#' @author Peter Utnes \email{utnesp@gmail.com}
#' @param file.or.object Provide a .txt file or object where row.names are ensembl_gene_identifiers
#' @param legend.position Position of legend within plot. Defaults to c(0.8, 0.7) 
#' @param incl.mito.genes Include mitochondrial genes in plot. Defaults to FALSE.
#' @param cum.freq.out.file.txt If specified, writes out the cumulative frequencies to a file.
#' @param file.pdf  Where to store cum.plot. Defaults to getwd() and to filename "CPM.cumfreq.pdf".
#' @param file.html Where to store cum.plot. Defaults to getwd() and to filename "CPM.cumfreq.html".
#' @param color.numbers.to.use Specify colors to use for individual biotypes from the color palette brewer.pal.info[brewer.pal.info$category == 'qual',]. 
#' @param biotype.cutoff How many biotypes to represent in the plot. Defaults to 10.
#' @export
#' @import biomaRt
#' @import RColorBrewer
#' @import easybiomart
#' @import ggplot2
#' @import ggthemes
#' @import plotly
#' @import htmlwidgets
#' @import splitstackshape
#' @examples See https://github.com/utnesp/cumulative.plot.R

cum.plot <- function(file.or.object, legend.position = c(0.8, 0.7), incl.mito.genes = F, cum.freq.out.file.txt = "", file.pdf = "", file.html = "", color.numbers.to.use = "", biotype.cutoff = "", prior.count = 0.25, log.transform = F, create.html = T,  axis.text.size = 10, theme.text.size = 10, legend.text.size = 10, save.tiff = F) {

    if (grepl(".txt", file.or.object) == TRUE) {
        t <- read.delim(file.or.object, header = T)
    } else {
        t <- data.frame(file.or.object)
    }
    #if ((class(file.or.object) == "matrix") == TRUE) t <- data.frame(file.or.object)
    #if (grepl(".txt", file.or.object) == FALSE) 
    
    
    if (log.transform == T) t <- log2(t+prior.count)
    
     t$CPM <- rowMeans(t)
    t$ensembl_gene_id <- row.names(t)
    t <- t[grep("ENSG", t$ensembl_gene_id), ]
    x <- ensg2ext_name_biotype(t$ensembl_gene_id)
    t <- merge(x, t, by = "ensembl_gene_id")
    x <- ensg2chr_start_end(t$ensembl_gene_id)
    t <- merge(x, t, by = "ensembl_gene_id")
    t$length <- abs(t$end_position) - abs(t$start_position)
    m <- table(t$gene_biotype)
    m <- data.frame(m)
    m <- m[order(-m$Freq), ]
    print(m)
    
    if (biotype.cutoff == "") {
        print("You have not set cutoff for which biotypes to include")
        print("Automatically using max ten biotypes")
        if (nrow(m) > 10) m <- m[1:10, ]
    } else {
        m <- m[m$Freq > biotype.cutoff, ]
    }
    
    print(m)
    t$gene_biotype <- ifelse(t$gene_biotype %in% m$Var1, t$gene_biotype, "other")
    
    qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
    col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
    
    if (color.numbers.to.use == "") {
        print("Selecting colors automatically")
        colors <- col_vector[c(1,2,3,5,6,8,10, 14, 15, 17, 21, 22:60)]
        colors <- colors[1:(nrow(m)+1)] 
        pie(rep(1,nrow(m)+1), col=colors)
    } else {
        par(mfrow = c(2,1))
        print("You may choose from these colors:")
        pie(rep(1,length(col_vector)), col=col_vector)
        print("Selecting colors manually")
        colors <- col_vector[colors] 
        pie(rep(1,nrow(m)), col=colors)
        par(mfrow = c(1,1))
    }
    
    t$CPM_ensg <- paste(t$CPM, t$ensembl_gene_id, t$length, sep = "_")
    t <- melt(t, "gene_biotype", "CPM_ensg")
    t <- cSplit(t, "value", "_") 
    
    m <- table(t$gene_biotype)
    m <- data.frame(m)
    m <- m[order(-m$Freq), ]
    m$col.nr <- row.names(m)
    m$col <- factor(colors)
    colnames(m) <- c("gene_biotype", "bio_tot", "col.nr", "col")
    
    t <- merge(t, m, by = "gene_biotype")
    t$col.nr <- NULL; t$variable <- NULL
    colnames(t) <- c('gene_biotype','CPM','ensembl_gene_id','GeneLength','bio_tot','col')
    
    k <- aggregate(CPM ~ gene_biotype, t, sum)
    t <- merge(t, k, by = "gene_biotype", all = T)
    colnames(t) <- c('gene_biotype','CPM','ensembl_gene_id','GeneLength','bio_tot','col','sum')
    t$freq <- t$CPM / t$sum
    
    x <- ensg2ext_name(t$ensembl_gene_id)
    t <- merge(x, t, by = "ensembl_gene_id")
    t <- with(t, t[order(gene_biotype, freq), ])
    f <- setDT(t)
    
    # Does not work inside function, using dplyr insted
    # e <- data.table:::f[, .(CumulativeFrequency) := .(cumsum(freq)), by=.(gene_biotype)]
    # e <- data.frame(e)
    
    e <- as.data.frame(f %>% 
        group_by(gene_biotype) %>% 
        mutate(CumulativeFrequency=cumsum(freq)))
    
    
    e$gene_biotype <- paste(e$gene_biotype, " (", e$bio_tot, ")", sep = "")
    
    
    if (incl.mito.genes == T) {
        e.sub <- e
    } else {
        e.sub <- e[grep("MT-", e$external_gene_name, invert = T), ]
    }
    
    shapes = c(15, 16, 17, 0, 18, 3, 5, 2, 8, 24, 1, 25, 4, 7, 8, 9, 10, 26:30)
    
    s <- data.frame(gene_biotype = e.sub$gene_biotype[!duplicated(e.sub$gene_biotype)], shape = shapes[1:length(colors)] , color = colors)
    # e.sub$shape <- NULL; e.sub$color <- NULL
    e.sub <- merge(e.sub, s, by = "gene_biotype", all = T)
    
    # head(e.sub[grep("protein_coding", e.sub$gene_biotype), ])
    
    if (cum.freq.out.file.txt != "") {
        write.table(e.sub, cum.freq.out.file.txt, sep = "\t", quote = F, row.names = F)
    }
    
    if (file.pdf == "") file.pdf <- paste(getwd(), "/CPM.cumfreq.pdf", sep = "")
    
    # p1 <- ggplot(e.sub, aes(CPM, CumulativeFrequency)) + scale_shape_identity(name = "Biotype") + geom_point(aes(color = factor(gene_biotype), shape = shape, text = paste("GENE: ", paste(external_gene_name, "(", ensembl_gene_id, ")", sep = ""), "LENGTH:", GeneLength)), size = 0.5) + theme_classic() + labs(x = "log2(CPM)", y = "Cumulative Frequency")
    # p1 <- ggplot(e.sub, aes(CPM, CumulativeFrequency)) + geom_point(aes(color = factor(gene_biotype), shape = factor(shape), text = paste("GENE: ", paste(external_gene_name, "(", ensembl_gene_id, ")", sep = ""), "LENGTH:", GeneLength)), size = 1) + theme_classic() + theme(legend.title = element_text(colour = "white")) + labs(x = "CPM") + guides(shape=FALSE) + labs(x = "log2(CPM)", y = "Cumulative Frequency")
    
    p1 <- ggplot(e.sub, aes(CPM, CumulativeFrequency)) + geom_point(aes(shape = gene_biotype, color = gene_biotype, text = paste("GENE: ", paste(external_gene_name, "(", ensembl_gene_id, ")", sep = ""), "LENGTH:", GeneLength)), size = 1) + theme_classic() + labs(x = "log2(CPM)", y = "Cumulative Relative Frequency", color='Biotype') +
        scale_colour_manual(name = "Biotype",
                            labels = s$gene_biotype,
                            values = as.character(s$color))  +
        scale_shape_manual(name = "Biotype",
                           labels = s$gene_biotype,
                           values = s$shape) + 
        guides(shape=guide_legend(override.aes=list(size=5))) +
        theme(text = element_text(size = theme.text.size), axis.text = element_text(size = axis.text.size))
    
        if(legend.position != "") p1 <- p1 + theme(legend.text=element_text(size=legend.text.size), legend.position = legend.position)
        if(legend.position == "") p1 <- p1 + theme(legend.text=element_text(size=legend.text.size), legend.justification = c(1,1), legend.position = c(1,1)) 
    
    p2 <- ggplot(e.sub, aes(x = factor(1), fill = factor(gene_biotype))) + geom_bar(width = 1, show.legend = FALSE) + 
        scale_fill_manual(name = "Biotype",
                          labels = s$gene_biotype,
                          values = as.character(s$color))
    p2 <- p2 + coord_polar(theta = "y") + theme_classic() + labs(x="", y = "") + theme(axis.ticks = element_blank(), axis.line = element_blank(), axis.text = element_blank()) 
    
    print(p1)
    ggsave(file.pdf, device = "pdf", width = 178, height = 163, units = "mm")
    
    if(save.tiff == T) ggsave(gsub(".pdf", ".tiff", file.pdf), device = "tiff", width = 178, height = 163, units = "mm")
    print(p2)
    proportions.file = paste(gsub(".pdf", "", file.pdf), "_proportions.pdf", sep = "")
    ggsave(proportions.file, device = "pdf", width = 15, height = 15, units = "cm")
    if(save.tiff == T) ggsave(gsub(".pdf", ".tiff", proportions.file), device = "tiff", width = 5, height = 5, units = "cm")
    openPDF(file.pdf)
    system(paste("open", dirname(file.pdf)))
    
    if (create.html == T) {
        p1 <- ggplotly(p1 + theme(text = element_text(size = 10), axis.text = element_text(size = 10), legend.text = element_text(size = 10)))
        if (file.html == "") file.html <- paste(getwd(), "/CPM.cumfreq.html", sep = "")
        htmlwidgets::saveWidget(p1, file.html, selfcontained = T, libdir = NULL)
        system(paste("open", file.html))
    }
}
utnesp/cumulative.plot.R documentation built on May 15, 2019, 4:19 p.m.