chipmine: Analyze ChIP-seq Data

Documented in get_polII_expressions get_polII_signal import_peaks_as_df

##################################################################################
## get the polII expression values for list of samples
#' PolII signal list from multiple samples
#'
#' @param genesDf dataframe with geneId column
#' @param exptInfo dataframe with experiment information for polII samples
#' @param log2 Logical. Whether to conver polII signal to log2. Default: FALSE
#'
#' @return dataframe with polII expression and isExpressed columns for samples of interest
#' @export
#'
#' @examples NA
get_polII_expressions <- function(genesDf, exptInfo, log2 = FALSE){

  if(is.null(exptInfo$polIIExpFile)){
    warning("Expression data not found...")
    return(genesDf)
  }

  for(i in 1:nrow(exptInfo)){

    if(toupper(exptInfo$IP_tag[i]) == "POLII" & !is.na(exptInfo$polIIExpFile[i])){

      df <- suppressMessages(readr::read_tsv(file = exptInfo$polIIExpFile[i]))

      if(log2){
        df[[exptInfo$sampleId[i]]] <- log2(pmax(df[[exptInfo$sampleId[i]]], 1))
      }

      genesDf <- dplyr::left_join(x = genesDf, y = df, by = c("geneId" = "geneId"))

    }
  }

  return(genesDf)
}

##################################################################################




##################################################################################
#' Read polII binding signal data for a sample
#'
#' This function reads the polII binding data for single sample and add it
#' to the provided dataframe. It also convert the polII signal values to log2
#' which can be used for plotting polII signal heatmap
#'
#' @param file polII binding signal file
#' @param title Title of the polII sample
#' @param clusterData A dataframe to which polII binding signal column should be added.
#' This dataframe should have 'geneId' column
#'
#' @return A list with three elements:
#' \itemize{
#' \item log2_mat: a matrix with log2(polII binding signal) values for heatmap generation
#' \item clusterDf: dataframe with polII binding signal columns
#' \item quantiles: polII binding signal quantile values generated by quantile() function
#' }
#' @export
#'
#' @examples NA
get_polII_signal <- function(file, title, clusterData){

  polII_df <- suppressMessages(readr::read_tsv(file = file))

  clusterData <- clusterData %>% dplyr::left_join(y = polII_df, by = c("geneId" = "geneId"))

  polII_mat <- as.matrix(clusterData[[title]])
  rownames(polII_mat) <- clusterData$geneId

  polII_log2_mat <- log2(pmax(polII_mat, 1))

  polII_q <- quantile(polII_log2_mat, c(seq(0, 0.9, by = 0.1), 0.95, 0.99, 0.992, 0.995, 0.999, 1), na.rm = T)
  print(paste("Quantiles for polII data", file, sep = ": "), quote = F)
  print(polII_q)

  # return: log2(polII expression) matrix, clusterData
  return(list(
    "log2_mat" = polII_log2_mat,
    "clusterDf" = clusterData,
    "quantiles" = polII_q
  ))
}

##################################################################################



##################################################################################

#' Read peak file as dataframe with selected columns
#'
#' @param file Peak file
#' @param peakFormat one of \code{c("narrowPeak", "broadPeak", "bed")}
#' @param peakCols Column to select in dataframe. Column names should be from this list:
#' \code{c("peakChr", "peakStart", "peakEnd", "peakId", "peakScore", "peakStrand",
#' "peakEnrichment", "peakPval", "peakQval", "peakSummit")}.
#' Default: \code{c("peakId", "peakEnrichment", "peakPval")}
#' @param sampleId sampleId to be used as suffix in column names
#' @param column_suffix Logical: whether to add sampleId suffix to column name. Default: TRUE
#'
#' @return A dataframe with columns specified
#' @export
#'
#' @examples NA
import_peaks_as_df <- function(
  file, peakFormat = "narrowPeak",
  peakCols = c("peakId", "peakEnrichment", "peakPval"),
  sampleId = NULL, column_suffix = TRUE){

  peakFormat <- match.arg(arg = peakFormat, choices = c("narrowPeak", "broadPeak", "bed"))

  colNames <- c("peakChr", "peakStart", "peakEnd", "peakId", "peakScore", "peakStrand",
                "peakEnrichment", "peakPval", "peakQval", "peakSummit")


  if(peakFormat == "narrowPeak"){
    peakCols <- match.arg(arg = peakCols, choices = colNames, several.ok = TRUE)

  } else if(peakFormat == "broadPeak"){
    peakCols <- match.arg(arg = peakCols, choices = colNames[1:9], several.ok = TRUE)
    colNames <- colNames[1:9]

  } else{
    peakCols <- match.arg(arg = peakCols, choices = colNames[1:6], several.ok = TRUE)
    colNames <- colNames[1:6]
  }

  peaksDf <- suppressMessages(readr::read_tsv(file = file, col_names = colNames))

  if(nrow(peaksDf) == 0){
    return(NULL)
  }

  peaksDf <- dplyr::select(peaksDf, !!!peakCols)

  if(column_suffix && !is.null(sampleId)){
    renameCols <- peakCols
    names(renameCols) <- paste(peakCols, sampleId, sep = ".")
    peaksDf <- dplyr::rename(peaksDf, !!! renameCols)
  }

  return(peaksDf)
}

##################################################################################