R/MD_visualize_zones.R

Defines functions MD.Genome.plots MD.Chromosome.heatmap MD.Zone.Gene.path.plots get_legend data_summary

# Package: GenomicOZone
# MD_visualize_zones.R
#   - Generate visualizations
# Created:
#   Hua Zhong
#   Dec 25, 2018


#' @importFrom stats median mad
#' @importFrom plyr ddply
data_summary <- function(data, varname, groupnames){
  summary_func <- function(x, col){
    c(Median = median(x[[col]], na.rm=TRUE),
      Mad = mad(x[[col]], na.rm=TRUE),
      Min = min(x[[col]], na.rm=TRUE),
      Max = max(x[[col]], na.rm=TRUE))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  return(data_sum)
}


#' @import ggplot2
get_legend<-function(p){
  tmp <- ggplot_gtable(ggplot_build(p))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}


#' @import ggplot2
#' @importFrom grDevices dev.off pdf
#' @importFrom gridExtra grid.arrange
#' @importFrom utils setTxtProgressBar tail txtProgressBar
MD.Zone.Gene.path.plots <- function(GOZ.ds, plot.file, alpha = 0.05, min.effect.size = 0.8, log.exp = TRUE,
                                    plot.all.zones = FALSE){ #cancer.genes = NULL
  colData <- GOZ.ds$input.data$colData
  Zone.GRanges <- GOZ.ds$runtime.var$zone.GRanges
  Gene.GRanges <- GOZ.ds$runtime.var$data.GRanges
  Gene.GRanges.names <- if(!is.null(Gene.GRanges$external_gene_name)){
    Gene.GRanges$external_gene_name
  }else if(!is.null(Gene.GRanges$Gene.name)){
    Gene.GRanges$Gene.name
  }else{
    names(Gene.GRanges)
  }

  data.mat <- NULL
  p.val.specific <- NULL
  effect.size.specific <- NULL

  if(!is.null(Zone.GRanges$p.value.adj)){
    X.value.var <- GOZ.ds$runtime.var$design.treatment.var
    X.value.unique <- GOZ.ds$runtime.var$design.treatment.vals
    Condition.label <- if(paste(X.value.var, ".label", sep='') %in% colnames(colData)){
      paste(X.value.var, ".label", sep='')
    }else{
      X.value.var
    }
    sample.rep.pick <- lapply(X.value.unique, function(y){
      return(which(colData[,X.value.var] == y))
    })
    names(sample.rep.pick) <- X.value.unique

    data.mat <- GOZ.ds$input.data$data[names(GOZ.ds$runtime.var$data.GRanges),]

    p.val.specific <- Zone.GRanges$p.value.adj
    effect.size.specific <- Zone.GRanges$effect.size
    names(p.val.specific) <- names(Zone.GRanges)
    names(effect.size.specific) <- names(Zone.GRanges)
  }else{
    stop("Gene pattern not ploted, because less than 2 differential conditions specified in colData!")
  }

  # effect.size.specific.threshold <- floor(length(effect.size.specific) * effect.size.rate)
  # if(effect.size.specific.threshold < 1){
  #   effect.size.specific.threshold <- 1
  # }
  # if(effect.size.specific.threshold > length(effect.size.specific)){
  #   effect.size.specific.threshold <- length(effect.size.specific)
  # }

  effect.size.specific.threshold <- min.effect.size
      #sort(effect.size.specific, na.last = TRUE, decreasing = TRUE)[effect.size.specific.threshold]

  zones.all <- names(Zone.GRanges)[order(effect.size.specific, na.last = TRUE, decreasing = TRUE)]
  zone.signif <- !is.na(p.val.specific) & !is.na(effect.size.specific) &
    p.val.specific <= alpha &
    effect.size.specific >= effect.size.specific.threshold
  zone.signif <- zone.signif[zones.all]
  zone.color <- sapply(zone.signif, function(x){
    if(x){
      return("#F8766D")
    }else{
      return("gray")
    }
  })

  plots.all <- vector("list", length(zones.all))
  names(plots.all) <- zones.all

  if(!plot.all.zones && sum(zone.signif) == 0){
    message("No significant differential expressed zones!")
    return(NULL)
  }

  zones.all.process <- if(plot.all.zones) seq_len(length(zones.all)) else seq_len(length(zones.all))[zone.signif]
  #pb <- txtProgressBar(min = 1, max = length(zones.all.process), style = 3)
  for (pb.i in zones.all.process) {
    #setTxtProgressBar(pb, pb.i)

    zone <- zones.all[pb.i]

    #print(paste("[", which(zones.all == zone), "/", length(zones.all), "]: ", zone, sep=''))

    gene.pick <- Gene.GRanges$zone == zone
    data.mat.sub <- data.mat[names(Gene.GRanges)[gene.pick] ,, drop = FALSE]
    rownames(data.mat.sub) <- Gene.GRanges.names[gene.pick]

    plots.all.sub <- vector("list", 3)

    # if(!is.null(cancer.genes)){
    #   rownames(data.mat.sub)[rownames(data.mat.sub) %in% cancer.genes] <- paste(rownames(data.mat.sub)[rownames(data.mat.sub) %in% cancer.genes], '*', sep='')
    # }

    data.mat.sub.exp <- if(log.exp) log10(data.mat.sub + 1) else data.mat.sub
    data.mat.sub.rank <- do.call("rbind", Obtain.gene.rank(data.mat.sub))
    rownames(data.mat.sub.exp) <- rownames(data.mat.sub)
    colnames(data.mat.sub.exp) <- colnames(data.mat.sub)
    rownames(data.mat.sub.rank) <- rownames(data.mat.sub)
    colnames(data.mat.sub.rank) <- colnames(data.mat.sub)

    zone.gene.exp.df <- data.frame(Y.value = c(data.mat.sub.exp),
                                   X.value = factor(rep(colData[colnames(data.mat.sub.exp), Condition.label],
                                                        each = nrow(data.mat.sub.exp)), levels = X.value.unique),
                                   Gene = rep(rownames(data.mat.sub.exp), ncol(data.mat.sub.exp)))

    zone.gene.exp.df.sum <- data_summary(zone.gene.exp.df, varname="Y.value", groupnames=c("X.value", "Gene"))
    zone.gene.exp.df.sum.order <- zone.gene.exp.df.sum[zone.gene.exp.df.sum$X.value == tail(levels(zone.gene.exp.df.sum$X.value), 1),]
    zone.gene.exp.df.sum.order <- zone.gene.exp.df.sum.order[order(zone.gene.exp.df.sum.order$Median, decreasing = TRUE),]
    zone.gene.exp.df.sum <- zone.gene.exp.df.sum[order(match(zone.gene.exp.df.sum$Gene, unique(zone.gene.exp.df.sum.order$Gene))),]
    zone.gene.exp.df.sum$Gene <- factor(as.character(zone.gene.exp.df.sum$Gene), levels = as.character(unique(zone.gene.exp.df.sum$Gene)))
    gene.labels <- as.character(levels(zone.gene.exp.df.sum$Gene))

    zone.gene.rank.df <- data.frame(Y.value = c(data.mat.sub.rank),
                                    X.value = factor(rep(colData[colnames(data.mat.sub.rank), Condition.label],
                                                         each = nrow(data.mat.sub.rank)), levels = X.value.unique),
                                    Gene = rep(rownames(data.mat.sub.rank), ncol(data.mat.sub.rank)))

    legend.angle <- if(length(X.value.unique) >= 3) 30 else 0
    legend.angle.adjust <- if(length(X.value.unique) >= 3) 1 else 0.5
    col.num <- max(c(1, ceiling(length(unique(zone.gene.exp.df.sum$Gene)) / 23)))
    legend.font.size <- if(col.num <= 2) 8 else 6
    #if(length(unique(zone.gene.exp.df.sum$Gene)) > 26) 2 else 1

    p <- ggplot(data = zone.gene.exp.df.sum, aes_string(x = "X.value", y = "Median", color = "Gene", group = "Gene")) +
      geom_hline(yintercept=0) +
      geom_errorbar(aes_string(ymin = "Min", ymax = "Max"), width=.1) +
      geom_line() +
      # expand_limits(x = c(1, length(levels(zone.gene.exp.df$X.value)))) +
      ggtitle(paste(zone, ": ", start(Zone.GRanges[zone]), " - ", end(Zone.GRanges[zone]), '\n',
                    "Adjusted p-value: ", signif(p.val.specific[zone], digits = 3), '\n',
                    "Effect size: ", signif(effect.size.specific[zone], digits = 3), sep="")) +
      labs(x = GOZ.ds$runtime.var$design.treatment.var, y = "Log10(exp+1)") +
      theme(plot.title = element_text(hjust = 0.5, color = zone.color[pb.i]),
            #axis.text.x = element_text(angle = 90, hjust = 1),
            axis.title.x=element_blank(),
            axis.text.x=element_blank(),
            #panel.grid.major = element_blank(),
            #panel.grid.minor = element_blank(),
            legend.title = element_blank(),
            legend.position = "right", #c(-Inf,Inf),
            legend.key = element_rect(size = 0.5),
            legend.key.size = unit(0.5, 'lines'),
            plot.margin = unit(c(1,0,0,1), "lines"),
            legend.margin=margin(t = 0, unit='cm'),
            legend.text=element_text(size=legend.font.size)
      ) +
      guides(color=guide_legend(ncol=col.num, byrow=TRUE))

    p.legend <- get_legend(p)
    p <- p + theme(legend.position="none")

    p.box <- ggplot(data = zone.gene.rank.df, aes_string(x = "X.value", y = "Y.value", group = "X.value")) +
      geom_hline(yintercept=0) +
      geom_violin(fill = "#00BFC4", trim=TRUE) +
      geom_boxplot(width = 0.15, fill="white") + #color = "#00BFC4",
      stat_summary(data = zone.gene.rank.df, aes_string(x = "X.value", y = "Y.value", group = "1"),
                   fun.y = "median", geom = "line", color = "#F8766D") +
      # expand_limits(x = c(1, length(levels(zone.gene.rank.df$X.value)))) +
      # ggtitle(paste(zone, ": ", start(Zone.GRanges[zone]), " - ", end(Zone.GRanges[zone]), " (", length(gene.labels), " genes)", '\n',
      #               "Adjusted p-value: ", signif(p.val.specific[zone], digits = 3), sep="")) +
      labs(x = GOZ.ds$runtime.var$design.treatment.var, y = "Gene rank") +
      theme(plot.title = element_blank(),
            axis.title.x=element_blank(),
            axis.text.x = element_text(size = 13, angle = legend.angle, hjust = legend.angle.adjust),
            #panel.grid.major = element_blank(),
            #panel.grid.minor = element_blank(),
            legend.title=element_blank(),
            legend.position="none",
            plot.margin = unit(c(1,0,1,1), "lines")
      )

    plots.all.sub[[1]] <- p
    plots.all.sub[[2]] <- p.box
    plots.all.sub[[3]] <- p.legend

    plots.all[[zone]] <- plots.all.sub
  }

  if(plot.all.zones){
    plot.file.tmp <- if(substr(plot.file, nchar(plot.file)-3, nchar(plot.file)) == ".pdf"){
      paste(substr(plot.file, 1, nchar(plot.file)-4), "_all_zones.pdf", sep='')
    }else{
      paste(plot.file, "_all_zones.pdf", sep='')
    }
    pdf(plot.file.tmp, width = length(X.value.unique)/2+4, height = 4.5)
    #pb <- txtProgressBar(min = 0, max = length(plots.all), style = 3)
    for (i in seq_len(length(plots.all))) {
      #setTxtProgressBar(pb, i)
      if(length(X.value.unique) >= 3){
        grid.arrange(grobs = plots.all[[i]],
                     layout_matrix = rbind(c(1,3), c(2,3)),
                     widths = c(2.7, 1), heights = c(4.5,3.5))
      }else{
        grid.arrange(grobs = plots.all[[i]],
                     layout_matrix = rbind(c(1,3), c(2,3)),
                     widths = c(2.7, 1), heights = c(5,3))
      }
    }
    dev.off()
  }

  plots.all.signif <- plots.all[names(zone.signif[zone.signif])]
  if(length(plots.all.signif) > 0){
    pdf(plot.file, width = length(X.value.unique)/2+4, height = 4.5)
    #pb <- txtProgressBar(min = 0, max = length(plots.all.signif), style = 3)
    for (i in seq_len(length(plots.all.signif))) {
      #setTxtProgressBar(pb, i)
      if(length(X.value.unique) >= 3){
        grid.arrange(grobs = plots.all.signif[[i]],
                     layout_matrix = rbind(c(1,3), c(2,3)),
                     widths = c(2.7, 1), heights = c(4.5,3.5))
      }else{
        grid.arrange(grobs = plots.all.signif[[i]],
                     layout_matrix = rbind(c(1,3), c(2,3)),
                     widths = c(2.7, 1), heights = c(5,3))
      }

    }
    dev.off()
  }else{
    message("No significant differential expressed zones!")
  }
}


#' @import ggplot2
#' @importFrom GenomeInfoDb seqlevels seqlengths
#' @importFrom grDevices dev.off pdf
#' @importFrom utils setTxtProgressBar txtProgressBar
MD.Chromosome.heatmap <- function(GOZ.ds, plot.file, alpha = 0.05, min.effect.size = 0.8,
                                  plot.width = NULL, plot.height = NULL){
  colData <- GOZ.ds$input.data$colData
  rownames(colData) <- colData[,1]
  Genes.GRanges <- GOZ.ds$runtime.var$data.GRanges
  Zone.GRanges <- GOZ.ds$runtime.var$zone.GRanges
  Exp <- GOZ.ds$input.data$data
  effect.size.threshold <- min.effect.size
        #sort(Zone.GRanges$effect.size, na.last = TRUE, decreasing = TRUE)[floor(length(Zone.GRanges) * effect.size.rate)]

  Gene.npt.all.zero <- rownames(Exp)[apply(Exp, 1, function(x){!all(x == 0)})]

  plots.all <- vector("list", length(seqlevels(Zone.GRanges)))
  names(plots.all) <- seqlevels(Zone.GRanges)

  for (chr in seqlevels(Zone.GRanges)) {
    Zone.GRanges.chr <- Zone.GRanges[seqnames(Zone.GRanges) == chr]
    Genes.GRanges.chr <- Genes.GRanges[seqnames(Genes.GRanges) == chr]
    Zone.GRanges.chr.signif <- Zone.GRanges.chr[Zone.GRanges.chr$p.value.adj <= alpha &
                                                  Zone.GRanges.chr$effect.size >= effect.size.threshold]

    Gene.pick <- names(Genes.GRanges.chr)[names(Genes.GRanges.chr) %in% Gene.npt.all.zero]
    Genes.GRanges.chr <- Genes.GRanges.chr[Gene.pick]

    Rank.mat.chr <- GOZ.ds$runtime.var$zone.stat$Zone.stat.norm.per.sample[names(Zone.GRanges.chr),, drop = FALSE]

    chr.plot.df <- data.frame(Zone = factor(rep(names(Zone.GRanges.chr), ncol(Rank.mat.chr)), levels = names(Zone.GRanges.chr)),
                              #Gene = rep(names(Genes.GRanges.chr), ncol(Rank.mat.chr)),
                              Exp = c(Rank.mat.chr),
                              Sample = rep(colnames(Rank.mat.chr), each = nrow(Rank.mat.chr)),
                              Condition = factor(rep(colData[colnames(Rank.mat.chr), GOZ.ds$runtime.var$design.treatment.var],
                                                    each = nrow(Rank.mat.chr)),
                                                levels = levels(colData[,GOZ.ds$runtime.var$design.treatment.var])))

    p <- ggplot() +
      geom_tile(data = chr.plot.df, aes_string(x = "Zone", y = "Sample", fill = "Exp"), color = "white") +
      facet_grid(Condition ~ ., scales = "free_y", space = "free_y", switch="y") +
      scale_fill_gradient2(low = "blue", mid = "gray94", high = "red", limits=c(-1, 1), breaks=seq(-1,1,by=0.5)) +
      labs(title = paste(chr, " significant differentially expressed zones"))

    if(length(Zone.GRanges.chr.signif) > 0){
      Signif.rec.df <- data.frame(x.min = as.numeric(sapply(names(Zone.GRanges.chr.signif), function(x){which(names(Zone.GRanges.chr) == x)})) - 0.5,
                                  x.max = as.numeric(sapply(names(Zone.GRanges.chr.signif), function(x){which(names(Zone.GRanges.chr) == x)})) + 0.5,
                                  y.min = -Inf,
                                  y.max = Inf)
      p <- p +
        geom_rect(data = Signif.rec.df, aes_string(xmin="x.min", xmax="x.max", ymin="y.min", ymax="y.max"),
                  color = "limegreen", fill = NA, size = 1, linetype = 1)
    }

    p <- p +
      theme(#text = element_text(size=20),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.title.x=element_blank(),
        axis.text.x = element_text(size = 9, angle = 90, vjust = 0.5), #size = 10,
        axis.text.y = element_blank(),
        # panel.grid.major.y = element_line(colour = "darkgray", size = 0.1, linetype = "dashed"),
        # panel.grid.minor.y = element_line(colour = "darkgray", size = 0.1, linetype = "dashed"),
        # panel.grid.major.x = element_blank(),
        # panel.grid.minor.x = element_blank(),
        # axis.text.x=element_blank(),
        axis.ticks.y = element_blank(),
        legend.title = element_blank(),
        #legend.position = "top",
        legend.position = "right", #c(0,1),
        # legend.justification = c(0, 0),
        # legend.direction = "horizontal",
        # legend.margin = margin(0,0,0,0),
        # legend.box.margin = margin(0,0,0,0),
        # panel.background = element_rect(fill = "white", colour = "black")
        strip.text.y = element_text(angle = 180)
      )

    plots.all[[chr]] <- p
  }

  pdf(plot.file, width = if(!is.null(plot.width)) plot.width else 15,
      height = if(!is.null(plot.width)) plot.height else 6)
  for(chr in names(plots.all)){
    print(plots.all[[chr]])
  }
  dev.off()
}


#' @import ggplot2
#' @importFrom ggbio autoplot
#' @importFrom grDevices dev.off pdf
#' @importFrom utils setTxtProgressBar txtProgressBar
MD.Genome.plots <- function(GOZ.ds, plot.file, alpha = 0.05, min.effect.size = 0.8, plot.width = NULL, plot.height = NULL){
  Zone.GRanges <- GOZ.ds$runtime.var$zone.GRanges
  Zone.GRanges.sub <- NULL

  if(all(is.na(Zone.GRanges$effect.size))){
    Zone.GRanges.sub <- Zone.GRanges[!is.na(Zone.GRanges$p.value.adj) &
                                       Zone.GRanges$p.value.adj <= alpha]
  }else{
    effect.size.threshold <- min.effect.size
          #sort(Zone.GRanges$effect.size, na.last = TRUE, decreasing = TRUE)[floor(length(Zone.GRanges) * effect.size.rate)]
    Zone.GRanges.sub <- Zone.GRanges[!is.na(Zone.GRanges$p.value.adj) &
                                       Zone.GRanges$p.value.adj <= alpha &
                                       Zone.GRanges$effect.size >= effect.size.threshold]
  }

  if(length(Zone.GRanges.sub) > 0){
    p <- autoplot(object = Zone.GRanges.sub, layout = "karyogram", fill = "#F8766D", color = "black") +
      labs(title=paste("Genome overview"), sep="") +
      theme(plot.title = element_text(size = 20, hjust = 0.5),
            legend.title = element_blank(),
            #legend.position = "right",
            text = element_text(size=20),
            legend.position = c(0,1),
            legend.justification = c(0, 0),
            legend.direction = "horizontal",
            legend.margin = margin(0,0,0,0),
            legend.box.margin = margin(0,0,0,0))


    pdf(plot.file,
        width = if(!is.null(plot.width)) plot.width else 15,
        height = if(!is.null(plot.width)) plot.height else 10)
    print(p)
    dev.off()
  }else{
    message("No significant differential expressed zones!")
  }
  return(NULL)
}
zh9118/GenomicOZone documentation built on March 11, 2021, 8:11 p.m.