bioshmods: What the Package Does (One Line, Title Case)

Documented in disco_color_scale disco_score plot_disco plot_gene plot_ly_pca

#' Map disco scores to colors
#'
#' Map disco scores to colors
#' @param x disco score
#' @param lower,upper lower and upper boundaries of the score
#' @param int how many colors in the palette
#' @param alpha transparency (string with two hexadecimal digits)
#' @return character vector of colors corresponding to x
#' @importFrom methods is
#' @export
disco_color_scale <- function(x, lower=-100, upper=100, int=255, alpha="66") {
  x[ is.na(x) ] <- 0
  x[ x > upper ] <- upper
  x[ x < lower ] <- lower

  pal <- colorRampPalette(c("blue", "grey", "red"))(int)
  pal <- paste0(pal, alpha)

  int <- findInterval(x, seq(lower, upper, length.out=int + 1), all.inside=TRUE)
  pal[int]
}






#' Create a disco plot
#'
#' Make a concordance / discordance plots for two contrasts
#'
#' A concordance / discordance plot is simply a log~2~ FC vs log~2~ FC
#' comparison between two contrasts. 
#' Disco score is a heuristic score 
#' reflecting the strength of similarity or dissimilarity between two
#' log~2~ FC values weighted by the corresponding p-values and given by the
#' formula log2FC.x * log2FC.y * (-log10(pval.x) - log10(pval.y)).
#'
#' To manually select which labels to show, use a truncated `annot` object
#' – remove all rows you don't want to show, and set the show_top_labels
#' parameter to `Inf`.
#' @param contrast1,contrast2 data frames with rownames corresponding to
#'        IDs (they don't need to be in the same order) and columns `log2FoldChange`
#'        and `pvalue`.
#' @param lower,upper lower and upper boundaries for coloring of the score
#' @param show_top_labels sort the genes by descending absolute disco score and show top N labels
#' @param annot1,annot2 annotation data frames for the two contrasts
#' @param top_labels_both should top labels from both negative and positive
#'        disco scores be shown, or only for the absolute top, whether only negative
#'        or only positive or both?
#' @param alpha transparency
#' @param disco result of `disco_score`; if provided, it will be used to
#'        avoid unnecessary computations
#' @param label_col name of the column in `annot` which should be used for
#'        labels
#' @param by column by which the contrast data frames should be merged
#'       (passed to `merge`). Default: merge by row names
#' @param primary_id the name which should be assigned to the identifier
#'        column which results from the merge
#' @param label_sel identifiers of labels to be shown on the plot
#' @examples
#' ## Generate example data
#' c1 <- data.frame(log2FoldChange=rnorm(5000, sd=2))
#' c1$pvalue <- pnorm(abs(c1$log2FoldChange), sd=2, lower.tail=FALSE)
#' c1$PrimaryID <- paste0("ID", 1:nrow(c1))
#'
#' c2 <- data.frame(log2FoldChange=c1$log2FoldChange + rnorm(5000, sd=3))
#' c2$pvalue <- pnorm(abs(c2$log2FoldChange), sd=2, lower.tail=FALSE)
#' c2$PrimaryID <- paste0("ID", 1:nrow(c2))
#'
#' ## Example disco plot
#' plot_disco(c1, c2)
#' @return a ggplot object (plot)
#' @import ggplot2 ggrepel
#' @importFrom stats cor
#' @importFrom shiny icon
#' @export
plot_disco <- function(contrast1, contrast2, lower=-100, upper=100,
  show_top_labels=0, top_labels_both=TRUE, annot1=NULL, annot2=NULL, 
  alpha=.5, disco=NULL, by=0,
  primary_id="PrimaryID", label_col="SYMBOL", label_sel=NULL) {

  if(is.null(disco)) {
    cc <- disco_score(contrast1, contrast2, by=by, primary_id=primary_id)
  } else {
    cc <- disco
  }

  cc$col <- disco_color_scale(cc$disco, lower=lower, upper=upper)
  cc <- cc[ order(-abs(cc$disco)), ]

# if(show_top_labels > 0) {
#   cc$label <- ""
#   if(!is.null(annot) && all(c(primary_id, label_col) %in% colnames(annot))) {
#     cc$label <- as.character(annot[[label_col]])[ match(cc[[primary_id]], annot[[primary_id]]) ]
#     sel <- is.na(cc$label)
#     cc$label[sel] <- cc[[primary_id]][sel]
#     cc$label[is.na(cc$label)] <- ""
#   } else if(is.null(cc$label <- cc[[primary_id]])) {
#     cc$label <- ""
#   }
#   if(top_labels_both) {
#     o1 <- which(cc$disco > 0)[ 1:show_top_labels ]
#     o2 <- which(cc$disco < 0)[ 1:show_top_labels ]
#     cc$label[ ! 1:nrow(cc) %in% c(o1, o2) ] <- ""
#
#   } else {
#     cc$label[ 1:nrow(cc) > show_top_labels ] <- ""
#   }
#
#   cc$label[is.na(cc$label)] <- ""
# }

  cc <- cc %>% filter(!is.na(.data$log2FoldChange.x) & !is.na(.data$log2FoldChange.y) & !is.na(.data$disco)) %>%
    mutate(disco=ifelse(.data$disco > upper, upper, ifelse(.data$disco < lower, lower, .data$disco))) %>%
    arrange(abs(.data$disco))

  lab_df <- NULL

  if(!is.null(label_sel) && length(label_sel) > 0L && primary_id %in% colnames(cc)) {

    lab_df <- data.frame(cc[[primary_id]])
    colnames(lab_df)  <- primary_id
    lab_df[["label"]] <- NA

    ## function that looks for specified labels in the label_col column of
    ## data frame df1, matching it with lab_df by primary_id
    source_labels <- function(lab_df, df1, primary_id, label_col, label_sel) {
      if(!label_col %in% colnames(df1)) {
        return(lab_df)
      }

      sel <- grepl(label_sel, df1[[label_col]]) &
             df1[[primary_id]] %in% lab_df[[primary_id]]

      if(sum(sel) == 0L) { return(lab_df) }

      m <- match(df1[[primary_id]][sel], lab_df[[primary_id]])
      lab_df[m, "label"] <- df1[[label_col]][sel]
      return(lab_df)
    }

    lab_df <- source_labels(lab_df, contrast1, primary_id, label_col, label_sel)
    lab_df <- source_labels(lab_df, contrast2, primary_id, label_col, label_sel)
    lab_df <- source_labels(lab_df, annot1, primary_id, label_col, label_sel)
    lab_df <- source_labels(lab_df, annot2, primary_id, label_col, label_sel)
    lab_df <- lab_df[ !is.na(lab_df$label),, drop=FALSE ]

    lab_df <- merge(cc, lab_df, by=primary_id)
  }



  g <- ggplot(cc, aes_string(x="log2FoldChange.x", y="log2FoldChange.y")) +
    geom_point(aes(color=.data$disco), alpha=alpha) + 
    scale_color_gradient2(low="blue", mid="grey", high="red") + 
    theme(legend.position="none") +
    geom_hline(aes(yintercept=0), color="grey") +
    geom_vline(aes(xintercept=0), color="grey") +
    geom_abline(aes(slope=1, intercept=0), color="grey")

  g <- g + ggtitle(
      sprintf("Correlation between log2FC\nr=%.2f rho=%.2f",
        cor(cc$log2FoldChange.x, cc$log2FoldChange.y, use="p"),
        cor(cc$log2FoldChange.x, cc$log2FoldChange.y, method="s", use="p")))

# if(show_top_labels > 0) {
#   g <- g + geom_label_repel(aes(label=.data$label, color=.data$disco,
#     force_pull=1.5, max.overlaps=Inf))
# }

  if(!is.null(label_sel) && length(label_sel) > 0L) {
    g <- g + geom_label(data=lab_df, aes(label=.data$label, color=.data$disco))
  }

  return(g)
}


#' Calculate the disco score
#'
#' Merge two contrasts and calculate the disco score
#'
#' Disco score is a heuristic score 
#' reflecting the strength of similarityor dissimilarity between two
#' log~2~ FC values weighted by the corresponding p-values and given by the
#' formula log2FC.x * log2FC.y * (-log10(pval.x) - log10(pval.y)).
#' @param contrast1,contrast2 data frames with rownames corresponding to
#'        IDs (they don't need to be in the same order) and columns `log2FoldChange`
#'        and `pvalue`.
#' @param minp minimum p-value
#' @param by column by which the contrast data frames should be merged
#'       (passed to `merge`). Default: merge by row names
#' @param primary_id Primary identifier
#' @param annot1,annot2 annotation data frames corresponding to contrast1
#' and contrast2, respectively
#' @return a merged data frame containing column "disco.score"
#' @importFrom methods as
#' @export
disco_score <- function(contrast1, contrast2, primary_id="PrimaryID", annot1=NULL, annot2=NULL, minp=1e-16, by=0) {
  stopifnot(primary_id %in% colnames(contrast1) && primary_id %in% colnames(contrast2))

  ## we need to keep these two columns for displaying the gene table
  contrast1[["__primary_id_1"]] <- contrast1[[primary_id]]
  contrast2[["__primary_id_2"]] <- contrast2[[primary_id]]

  ## special case when by has two elements
  ## this is tedious, but needs be.

  if(length(by) > 1) {
    if(!by[1] %in% colnames(contrast1)) {
      if(is.null(annot1)) {
        stop(sprintf("Column %s not in contrast1 data frame and annotation data frame 1is NULL.", by[1]))
      }
      if(!by[1] %in% colnames(annot1)) {
        stop(sprintf("Column %s not in annotation data frame 1.", by[1]))
      }

      m <- match(contrast1[[primary_id]], annot1[[primary_id]])
      contrast1[[by[1]]] <- annot1[[by[1]]][m]
    }

    if(!by[2] %in% colnames(contrast2)) {
      if(is.null(annot2)) {
        stop(sprintf("Column %s not in contrast2 data frame and annotation data frame 2 is NULL.", by[2]))
      }
      if(!by[2] %in% colnames(annot2)) {
        stop(sprintf("Column %s not in annotation data frame 2.", by[2]))
      }

      m <- match(contrast2[[primary_id]], annot2[[primary_id]])
      contrast2[[by[2]]] <- annot2[[by[2]]][m]
    }

    contrast1 <- contrast1[ !is.na(contrast1[[by[1]]]), , drop=FALSE ]
    contrast2 <- contrast2[ !is.na(contrast2[[by[2]]]), , drop=FALSE ]
    cc <- merge(as(contrast1, "data.frame"), as(contrast2, "data.frame"), by.x=by[1], by.y=by[2])
  } else {
    cc <- merge(as(contrast1, "data.frame"), as(contrast2, "data.frame"), by=by)
  }

  if(nrow(cc) == 0L) {
    stop(sprintf("No common identifiers for column(s) '%s' in these contrasts.",
                 paste(by, collapse=",")))
  }
  #colnames(cc)[1] <- "PrimaryID"
  #rownames(cc) <- cc[,1]
  cc$disco <- with(cc, log2FoldChange.x * log2FoldChange.y * (-log10(pvalue.x + minp) -log10(pvalue.y  + minp)))
  cc$disco[ is.na(cc$disco) ] <- 0
  return(cc)
}



#' Create a PCA plot with plotly
#'
#' Generate a PCA 3D or 2D plot with plotly
#' @param mtx PCA matrix (typically `prcomp(...)$x`)
#' @param covariate_data data frame with covariates to display (character, factor or continuous)
#' @param threeD if FALSE, a 2D plot will be produced
#' @param cov_default default covariate names (must be names of columns of `covariate_data`)
#' @return the plotly object
#' @import purrr tibble dplyr RColorBrewer
#' @importFrom plotly plot_ly add_trace layout plotly_build
#' @importFrom grDevices colorRampPalette palette
#' @export
plot_ly_pca <- function(mtx, covariate_data, threeD=TRUE, cov_default=NULL) {

	numplots <- floor(ncol(mtx)/2)
	numplots <- min(numplots, 6)

	df <- data.frame(
		label=rownames(covariate_data),
		mtx[, paste0("PC", 1:(2*numplots))],
		stringsAsFactors=FALSE
	)

	## selecting covariates to use: not all unique and also not only one value
	sel <- apply(covariate_data, 2, function(x) {
  	.u <- unique(x)
  	length(.u) > 1 && length(.u) < nrow(covariate_data)
	})

	covariates <- union(cov_default, colnames(covariate_data)[sel])
	covariate_data <- covariate_data[ , covariates, drop=FALSE ]

	## we are sure that the order is the same
	df <- cbind(covariate_data, df)

	## which covariates are numerical?
	cov_numeric <- covariates[ map_lgl(covariates, ~ is.numeric(df[[.]])) ]

  ## make sure the remaining covariates are factors
  df <- df %>% mutate_at(setdiff(covariates, cov_numeric), factor)

  ## make the PCA data more terse
  df <- df %>% mutate_at(vars(starts_with("PC")), signif, digits=4)

  ## auto select default covariates if cov_default is missing
  if(is.null(cov_default)) {
    if(length(covariates) > 0) {
      cov_default <- covariates[1]
    }
  }

  plotly_ids <- paste0("ID", 1:nrow(df))

## First, assign palettes to factor covariates
## We need to set up coloring manually, because plotly is stupid
## use palettes from RColorBrewer, preferring the colorblind ones
  palettes <- brewer.pal.info %>% rownames_to_column("palette") %>% filter(.data$category == "qual") %>% arrange(-.data$colorblind)
  palette_names <- palettes[["palette"]]

  covariates_fac <- covariates[ sapply(covariates, function(cov) is.factor(df[[cov]])) ]
  if(nrow(palettes) < length(covariates_fac)) {
    palette_names <- rep(palette_names, ceiling(length(covariates_fac)/nrow(palettes)))
  }

  covariates_pal_names <- set_names(palette_names[1:length(covariates_fac)], covariates_fac)

## get colors for each of the factor covariates
  covariates_col <- lapply(set_names(covariates_fac), function(cov) {
    pal.name <- covariates_pal_names[cov]
    n.col <- length(levels(df[[cov]]))
    max.col <- subset(palettes, palette == pal.name)$maxcolors[1]
    .pal <- brewer.pal(n=min(max.col, max(3, n.col)), name=pal.name)
    if(max.col < n.col) {
      warning(sprintf("covariate: %s more levels (%d) than colors (%d) in palette (%s)",
        cov, n.col, max.col, pal.name))
      .pal <- rep(.pal, ceiling(n.col / max.col))
    }
    names(.pal) <- levels(df[[cov]])

    set_names(.pal[ as.character(df[[cov]]) ], plotly_ids)
  })

  cont_pal <- colorRampPalette(c("#440154FF", "#21908CFF", "#FDE725FF"))(32)
  covariates_col_cont <- lapply(set_names(setdiff(covariates, covariates_fac)), function(cov) {
    x <- df[[cov]]
    col_i <- seq(min(x, na.rm=TRUE), max(x, na.rm=TRUE), length.out=33)
    i <- findInterval(x, col_i, all.inside=TRUE)
    set_names(cont_pal[i], plotly_ids)
  })

  covariates_col <- c(covariates_col, covariates_col_cont)


## Same for symbols
  symbols <- c("circle", "square", "diamond", "cross", "x", "triangle",
               "pentagon", "hexagram", "star", "diamond", "hourglass",
               "bowtie", "asterisk", "hash")

  cov_symbols <- lapply(set_names(covariates), function(cov) {
    lev <- levels(df[[cov]])
    .s <- symbols
    if(length(lev) > length(.s)) {
      # expand symbol list if necessary. Moot, because if there are more
      # than three symbols it is pointless anyway
      .s <- rep(.s, ceiling(length(lev) / length(.s)))
    }

    .s <- .s[1:length(lev)]
    names(.s) <- lev
    return(.s)
  })

  plotly_ids <- paste0("ID", 1:nrow(df))

## create hovertext as a combination of covariates
  hovertext <- apply(map_dfc(set_names(covariates), ~ sprintf("%s: %s", ., df[[.]])), 1, paste, collapse="\n")
  hovertext <- paste(hovertext, sprintf("plotlyID: %s", plotly_ids), sep="\n")

## set up default colors and symbols

  def_symbol <- def_symbols <- def_color <- def_colors <- NULL
  if(length(cov_default) > 0) {
    def_color <- df[[ cov_default[1] ]]
    if(is.factor(def_color)) {
      levs <- levels(def_color)
      matches <- match(levs, df[[ cov_default[1] ]])
      def_colors <- covariates_col[[cov_default[1]]][ matches ]
      names(def_colors) <- levs
    } else {
      def_colors <- covariates_col[[cov_default[1]]]
    }
  }



  if(length(cov_default) > 1) {
    def_symbol <- df[[ cov_default[2] ]]
    def_symbols <- cov_symbols[[ cov_default[2] ]]
  } 

  if(threeD) {
    numplots <- floor(numplots * 2/3)
  }

## create the initial plotly object. We need it to know how plotly cuts up the data into segments
  plotly.args <- list(data=df, type="scatter", mode="markers",
               x=df$PC1, y=df$PC2,
               hovertext=hovertext,
               ids=plotly_ids,
               marker=list(size=10),
               visible=TRUE,
               color= def_color, colors= def_colors,
               symbol=def_symbol, symbols=def_symbols)

  if(threeD) {
    plotly.args$z <- df$PC3
    plotly.args$type <- "scatter3d"
    plotly.args$marker <- list(size=5)
  }

  p <- do.call("plot_ly", plotly.args)


## if there are more than two components to be shown, we need to add the data
  if(numplots > 1) {
    if(threeD) {
      for(i in 2:numplots) {
        pc1 <- paste0("PC", i * 3 - 2)
        pc2 <- paste0("PC", i * 3 - 1)
        pc3 <- paste0("PC", i * 3)
        p <- p %>% add_trace(data=df, type="scatter3d", mode="markers", x=df[[pc1]], y=df[[pc2]], z=df[[pc3]], visible=FALSE)
      }
    } else {
      for(i in 2:numplots) {
        pc1 <- paste0("PC", i * 2 - 1)
        pc2 <- paste0("PC", i * 2)
        p <- p %>% add_trace(data=df, type="scatter", mode="markers", x=df[[pc1]], y=df[[pc2]], visible=FALSE)
      }
    }
  }


## here we build the plotly object and dissect it to figure out
## how plotly cut up the data. Note that plotly is efficient and that if
## factor levels overlap between factors (i.e. factors are colinear) then
## there will be fewer blocks than simply length(levels(f1)) * length(levels(f2))
## Also, we need the IDs in each block to be able to match the correct values
  p.obj <- plotly_build(p)$x$data
  nblocks <- length(p.obj) / numplots
  p.obj <- p.obj[ 1:nblocks ]
  p.ids <- map(p.obj, ~ .$ids)

## here we build the menu for updating symbol style
  sym_update <- lapply(covariates_fac, function(cov) {
    sym_pal <- cov_symbols[[cov]]

    ## map levels to symbol
    syms <- lapply(p.ids, function(ids) {
      ord <- match(ids, paste0("ID", 1:nrow(df)))
      var <- df[[cov]][ ord ]
      sym_pal[ as.character(var) ]
    })

    list(
          method="restyle",
          label=sprintf("Symbol by %s", cov),
          args=list(list(
              marker.symbol=syms
              ))
        )
  })

## set up update menus for colors
  col_update <- lapply(covariates, function(cov) {
    cols <- lapply(p.ids, function(ids) {
      covariates_col[[cov]][ids]
    })
    list(
          method="restyle",
          label=sprintf("Color by %s", cov),
          args=list(list(
              marker.color=cols
              ))
        )
  })

## prepare the buttons to change the coordinates
  if(threeD) {
    pc_update <- lapply(1:numplots, function(i) {
      pc1 <- paste0("PC", i * 3 - 2)
      pc2 <- paste0("PC", i * 3 - 1)
      pc3 <- paste0("PC", i * 3)
      list(
        method="update",
        label=sprintf("PC%d/%d/%d", i * 3 - 2, i * 3 - 1, i * 3),
        args=list(
          list(visible=rep((1:numplots) == i, each=nblocks)),
          list(scene=list(
              xaxis=list(title=pc1),
              yaxis=list(title=pc2),
              zaxis=list(title=pc3)
            ))
        ))
    })
  } else {
    pc_update <- lapply(1:numplots, function(i) {
      pc1 <- paste0("PC", i * 2 - 1)
      pc2 <- paste0("PC", i * 2)
      list(
        method="restyle",
        label=paste(pc1, "vs", pc2),
        args=list(
          list(visible=rep((1:numplots) == i, each=nblocks))
       ))
    })
  }


  menus <- list(
    list(
      type="buttons",
      direction="right",
      y=1.1,
      yanchor="top",
      x=0.1,
      xanchor="left",
      buttons=pc_update
    ),
    list(
      yanchor="bottom",
      y=0,
      x=-.1,
      xanchor="right",
      direction="up",
      buttons=col_update
    ),
    list(
      yanchor="bottom",
      y=.1,
      xanchor="right",
      x=-.1,
      direction="up",
      buttons=sym_update
    )
  )

  scene <- list(
    xaxis=list(title="PC1"),
    yaxis=list(title="PC2")
  )

  if(threeD) {
    scene$zaxis <- list(title="PC3")
  }

  p <- p %>% layout(updatemenus=menus,
    scene=scene,
    legend=list(
      xanchor="right",
      x=-.1
    ))

  return(p)

}


#' Show gene expression in relation to a covariate
#'
#' Show gene expression in relation to a covariate
#'
#' @param id PrimaryID of the gene (usually ENSEMBL ID)
#' @param xCovar the x covariate – column name from the covariate table
#' @param xCovar the y covariate – column name from the covariate table
#' @param expressionLabel - what should be the label for the gene expression covariate
#' @param exprs gene expression matrix to show on the y axis; rownames must
#'        be PrimaryIDs. If NULL, the rld object from the pipeline is used.
#' @param annot_symb_col name of the column in the annot data frame which should be added to the title of the plot.
#' @param annot_id_col name of the column in the annot data frame which corresponds to the rownames of the expression matrix. 
#' @param annot annotation data frame (as returned by the get_annot()
#'        function). If empty, it will be loaded.
#' @param covar the covariate data frame containing the column `xCovar`
#' @param groupBy name of the covariate column by which to group and connect by lines the data points 
#' @param symbolBy name of the covariate column by which to select point symbols
#' @param colorBy name of the covariate column by which to color the data
#' @param trellisBy name of the covariate column for use in a trellis (multipanel) plot
#' @return a ggplot2 object
#' @import ggplot2 
#' @export
plot_gene <- function(id, xCovar, exprs, covar, annot=NULL, 
                               annot_id_col="PrimaryID",
                               annot_symb_col="SYMBOL",
                               expressionLabel="Expression",
                               yCovar=expressionLabel, 
                               groupBy = NA, colorBy = NA, symbolBy = NA,
                               trellisBy=NA) {
  if(is.null(exprs)) { stop("No expression matrix provided") }
  if(is.null(covar)) { stop("No covariate data frame provided") }
  if(is.null(exprs[id, ])) { stop("No expression data for gene ", id) }
  if(nrow(covar) != length(exprs[id, ])) { stop("Expression matrix and covariate data frame have different number of columns/rows") }

  df <- data.frame(covar, Expression=exprs[id, ])
  colnames(df)[ncol(df)] <- expressionLabel
  #print(df)
  #print(colnames(df))

  if(!is.null(annot)) {
    title <- sprintf("%s (%s)", id, annot[ match(id, annot[[annot_id_col]]), ][[annot_symb_col]])
  } else {
    title <- id
  }

  if(!is.na(colorBy)) {
    if(!is.na(groupBy)) {
      g <- ggplot(df, aes(x=.data[[xCovar]], y=.data[[yCovar]], group=.data[[groupBy]], color=.data[[colorBy]]))
    } else {
      g <- ggplot(df, aes(x=.data[[xCovar]], y=.data[[yCovar]], color=.data[[colorBy]]))
    }
  } else {
    if(!is.na(groupBy)) {
      g <- ggplot(df, aes(x=.data[[xCovar]], y=.data[[yCovar]], group=.data[[groupBy]]))
    } else {
      g <- ggplot(df, aes(x=.data[[xCovar]], y=.data[[yCovar]]))
    }
  }

  if(!is.numeric(df[[xCovar]]) && is.na(groupBy)) {
    g <- g + geom_boxplot() + geom_jitter(size=3, alpha=.5, width=.1)
  } else {
    if(!is.na(symbolBy)) {
      g <- g + geom_point(aes(shape=.data[[symbolBy]], size=3))
    } else {
      g <- g + geom_point(size=3)
    }
  }

  if(!is.na(groupBy)) {
    g <- g + geom_line()
  }

  if(!is.na(trellisBy)) {
    g <- g + facet_wrap(trellisBy)
  }


  g  <- g + ggtitle(title)

  return(g)
}