R/PREBS.R

Defines functions calc_prebs perform_rpa perform_rma check_probe_table probe_table_expressions sum_duplicates fr count_overlaps_from_bam read_bam_and_count_overlaps granges_from_cdf check_input is.installed cdf_package_name

Documented in calc_prebs

# Authors: Karolis Uziela, Antti Honkela

########################################### Libraries #################################################

#library(GenomicAlignments)
#library(GenomicRanges)
#library(affy)
#library(parallel)

######################################## Function definitions #########################################

#' PREBS package
#'
#' The prebs package aims at making RNA-sequencing (RNA-seq)
#' data more comparable to microarray data. The comparability is achieved
#' by summarizing sequencing-based expressions of probe regions using
#' standard microarray summarization algorithms (RPA or RMA). The pipeline takes mapped reads
#' in BAM format as an input and produces either gene expressions or
#' original microarray probe set expressions as an output.
#'
#' The package has only one public function: \code{calc_prebs}. 
#' Type help(calc_prebs) for more information on the usage.
#' @importFrom GenomicAlignments readGAlignments readGAlignmentPairs
#' @importFrom stats optim
#' @importFrom parallel parLapply
#' @importFrom methods setClass setMethod
#' @importFrom affy xy2indices
#' @importFrom RPA rpa
#' @importFrom Biobase exprs
#' @importFrom Biobase assayData
#' @importFrom GenomeInfoDb seqlengths
#' @importClassesFrom GenomicRanges GRanges
#' @importClassesFrom IRanges IRanges
#' @importClassesFrom S4Vectors Rle
#' @importClassesFrom Biobase ExpressionSet
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors Rle
#' @importFrom Biobase ExpressionSet
#' @importMethodsFrom IRanges countOverlaps
#' @importFrom affy getCdfInfo
#' @docType package
#' @name prebs
NULL

## Take "_mapping.txt" filename and return the name of respective package ("HGU133Plus2_Hs_ENSG_mapping.txt" -> "hgu133plus2hsensgcdf")
cdf_package_name <- function(mapping_txt_file) {
  my_cdf <- basename(mapping_txt_file)
  my_cdf <- sub("_mapping.txt", "", my_cdf)
  my_cdf <- gsub("_", "", my_cdf)
  my_cdf <- tolower(my_cdf)
  my_cdf <- paste(my_cdf, "cdf", sep="")
  message(paste("Inferred name for CDF package: ", basename(mapping_txt_file), " -> ", my_cdf, "\n", sep=""))
  return(my_cdf)
}

# Check if a package is installed
is.installed <- function(mypkg) {
  is.element(mypkg, installed.packages()[,1]) 
}

# Check if the input files exist and if the CDF package is installed
check_input <- function(bam_files, probe_mapping_file, cdf_name) {
  if (! is.installed(cdf_name)) {
    stop(paste("CDF package `", cdf_name, "` is not installed. Please, install the package before running `calc_prebs`\n", sep=""))
  }
  if (! file.exists(probe_mapping_file) ) {
    stop(paste("Probe mapping file `", probe_mapping_file, "` does not exist.\n", sep=""))
  }
  for (i in 1:length(bam_files)) {
    if (! file.exists(bam_files[i]) ) {
      stop(paste("BAM file `", bam_files[i], "` does not exist. If you are running the code from examples, please, install `prebsdata` package first.\n", sep=""))
    }
  }
}

## Read CDF probe mapping file and create GRanges object from it
granges_from_cdf <- function(probe_mapping_file, CDF_NAME) {
  old.scipen <- getOption("scipen")
  options("scipen" = 100) # Avoid exponential number expressions in probe_id

  # Read probe mapping file
  probes <- read.table(probe_mapping_file, sep = "\t", header=TRUE)

  # create GRanges object
  PROBE_SEQ_LENGTH = 25
  probe_id <- xy2indices(probes$Probe.X, probes$Probe.Y, cdf=CDF_NAME)
  chromosome <- probes$Chr
  range_start <- probes$Chr.From
  range_end <- range_start + PROBE_SEQ_LENGTH - 1
  strand_id <- probes$Chr.Strand
  my_ranges <- GRanges(seqnames=Rle(chromosome), ranges = IRanges(range_start, range_end, names=probe_id), strand = Rle(strand_id))
  
  options("scipen" = old.scipen)  
  
  return(my_ranges)
}

## Read a single BAM file and count overlaps with probe regions
read_bam_and_count_overlaps <- function(bam_file, probe_ranges, paired_ended_reads, ignore_strand) {
  #library(GenomicAlignments)
  #library(GenomicRanges)
  if (paired_ended_reads) {
    bam_aligns <- readGAlignmentPairs(bam_file)
  } else {
    bam_aligns <- readGAlignments(bam_file)
  }

  if (! ("X" %in% names(seqlengths(bam_aligns)))) {
    stop("Unrecognized chromosome names. You are probably using a bam file whose reads were mapped to UCSC genome. Currently only bam files with reads mapped to ENSEMBL genome are supported. ENSEMBL reference genomes can be downloaded from ENSEMBL FTP server.")
  }  

  counts <- countOverlaps(probe_ranges, bam_aligns, ignore.strand=ignore_strand) 

  rm(bam_aligns)
  gc()
  print(paste("Finished:",basename(bam_file)))
  return(counts)
}

## Read all BAM files and calculate read overlaps with probe regions
count_overlaps_from_bam <- function(bam_files, probe_mapping_file, CDF_NAME, CLUSTER, paired_ended_reads, ignore_strand) {

  my_ranges <- granges_from_cdf(probe_mapping_file, CDF_NAME)
  
  if (is.null(CLUSTER)) {
    counts <- lapply(bam_files, read_bam_and_count_overlaps, probe_ranges=my_ranges, paired_ended_reads=paired_ended_reads, ignore_strand = ignore_strand)
  } else {
    counts <- parLapply(CLUSTER, bam_files, read_bam_and_count_overlaps, probe_ranges=my_ranges, paired_ended_reads=paired_ended_reads, ignore_strand = ignore_strand)
  }
  probe_table <- do.call(cbind, counts)
  colnames(probe_table) <- basename(bam_files)
  rownames(probe_table) <- names(my_ranges)
  
  return(probe_table)
}

## This function is used for optimization of Bayesian prior
fr <- function(x,k) {
  alpha <- x[1]
  beta <- x[2]
  -sum(alpha*log(beta) - (alpha+k)*log(beta+1) + lgamma(alpha+k) - lgamma(k+1) - lgamma(alpha))
}

## This function sums rows with duplicate ids. Rows shouldn't have any duplicate ids in when using Custom CDF files.
## However, when using manufacturer's CDF files, duplicate row ids may arise bacause bowtie might map probe sequence to several
## different locations. In that case, we have to sum probe region counts from all of these locations.
sum_duplicates <- function(probe_table) {
  if (sum(duplicated(rownames(probe_table))) > 0) {
    message("Note: Some probe IDs contain duplicates. If you are using manufacturer's CDF then you can ignore this message.\n")
    
    unique_row_names <- unique(rownames(probe_table))
    unique_probe_table <- matrix(data=0, nrow=length(unique_row_names), ncol=ncol(probe_table))
    rownames(unique_probe_table) <- unique_row_names
    colnames(unique_probe_table) <- colnames(probe_table)

    rn <- rownames(probe_table)
    last_id <- ""
    last_row <- NULL
    j <- 1

    for (i in 1:nrow(probe_table)) {
      new_row <- probe_table[i,]
      new_id <- rn[i]
      if (new_id == last_id) {
        last_row = last_row + new_row
      } else {
        if (!is.null(last_row)) {
          unique_probe_table[j,] <- last_row
          j = j + 1
        }
        last_row <- new_row
        last_id <- new_id
      }
    }
    unique_probe_table[j,] <- last_row
    
    return(unique_probe_table)
  } else {
    return(probe_table)
  }
}

## Convert raw counts to expression values using statistical model
probe_table_expressions <- function(probe_table) {
  # Combine all probe level expressions into a single vector (k) and find optimal Alpha and Beta parameters for the vector 
  k <- c(probe_table)
  opt <- optim(c(0.2,0.03), fr, NULL, k, method="L-BFGS-B", lower=0.0001, upper=10)
  ALPHA <- opt$par[1]
  BETA <- opt$par[2]
  
  # Print alpha and beta
  message("Estimated values for Bayesian prior:\n")
  message(paste("Alpha=", ALPHA, "\n", sep=""))
  message(paste("Beta=", BETA, "\n", sep=""))

  # Update probe-level expressions taking prior into account
  probe_table <- (probe_table + ALPHA) / (1 + BETA)
  return(probe_table)
}

# Check if all probe sequence ids are present in probe_table. 
# When working with Custom CDF it should always be the case, as long as you are using the latest version cdf package and _mapping.txt file
# When working with Manufacturer's CDF, some probe IDs might be missing in _mapping.txt file
# because the file is manually created and some probe sequences fail to be mapped to genome.
check_probe_table <- function(probe_table, all_probes_vec) {
  vec_names <- all_probes_vec[,1]
  pr_names <- rownames(probe_table)
  missing_n <- sum(!is.element(vec_names, pr_names))
  if (missing_n > 0) {
    message(paste("Note: ", missing_n, " probe sequences are missing in _mapping.txt file. If you are using Custom CDF, the make sure that the _mapping.txt file is from the same Custom CDF version as the package (see the Vignette for details). If you are using manufacturer's CDF then you can ignore this message.\n", sep=""))
    missing_probes <- vec_names[!is.element(vec_names, pr_names)]
    dummy_matrix <- matrix(data=0, nrow=missing_n, ncol=ncol(probe_table))
    for (i in 1:ncol(dummy_matrix)) {
      dummy_matrix[,i] <- min(probe_table[,i])
    }
    rownames(dummy_matrix) <- missing_probes
    probe_table <- rbind(probe_table, dummy_matrix)
  }
  return(probe_table)
}

# Create an object for extracting information about perfect match probe ids
setClass("CDFName", representation(cdfName="character"), prototype=(cdfName=''))

setMethod("initialize", "CDFName", function(.Object, name) {
  .Object@cdfName <- name
  .Object
})

setMethod("cdfName", "CDFName", function(object) object@cdfName)

## Extract necessary information from CDF files and perform RMA algorithm
perform_rma <- function(probe_table, CDF_NAME, output_eset) {
  # Get CDF environment containing information about perfect match probe ids
  cdn <- new("CDFName", CDF_NAME)
  cdf_env <- getCdfInfo(cdn)

  # Map probe identifiers in probe_table to the ones in CDF files
  all_sets <- grep('AFFX', ls(cdf_env), value=TRUE, invert=TRUE)

  # Extract pm probes for each gene
  all_probes <- lapply(mget(all_sets, cdf_env),
                       function(x) as.matrix(as.character(x[,1])))

  # Create names for probes
  for (i in seq_along(all_probes)) {
    row.names(all_probes[[i]]) <- paste(names(all_probes)[i],
                                        seq_along(all_probes[[i]]), sep='')
  }
  all_probes_vec <- do.call(rbind, all_probes)
  
  # Check if all probe ids are present in probe_table. Some might be missing for manually created _mapping.txt files
  probe_table <- check_probe_table(probe_table, all_probes_vec)

  # Select pm probes in probe_table
  probe_table_mapped <- probe_table[all_probes_vec,,drop=FALSE]
  rownames(probe_table_mapped) <- rownames(all_probes_vec)

  # Create pNList and ngenes input variables
  pNList <- sub("(.*_at)[0-9]+","\\1",rownames(probe_table_mapped))
  pNList <- split(0:(length(pNList) - 1), pNList)
  ngenes <- length(pNList)

  # Set RMA parameters
  verbose = TRUE; normalize = TRUE; background = FALSE; bgversion = 2

  # Apply RMA
  prebs_values <- .Call("rma_c_complete_copy", probe_table_mapped, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE = "affy")

  # Save the results
  coln <- colnames(probe_table)
  coln[duplicated(coln)] <- paste(coln[duplicated(coln)],1:length(coln[duplicated(coln)]), sep="") # rename duplicated columns
  colnames(prebs_values) <- coln

  if (output_eset) {
    rownames(prebs_values) <- substr(rownames(prebs_values), 1, nchar(rownames(prebs_values))-3) # remove trailing _at from identifiers
    prebs_values <- new("ExpressionSet", annotation = CDF_NAME, exprs = prebs_values)
  } else {
    prebs_values <- as.data.frame(prebs_values)
    prebs_values$ID <- substr(rownames(prebs_values), 1, nchar(rownames(prebs_values))-3) 
    rownames(prebs_values) <- 1:nrow(prebs_values)
  }

  return(prebs_values)
}

perform_rpa <- function(probe_table, CDF_NAME, output_eset) {
  # Get CDF env to find out what is the maximum probe index
  cdn <- new("CDFName", CDF_NAME)
  cdf_env <- getCdfInfo(cdn)  
  max_ind <- max(sapply(mget(ls(cdf_env), cdf_env), max))

  # Create a full matrix of probe expressions instead of sparse matrix
  min_value <- min(probe_table)
  sample_n <- ncol(probe_table)
  probe_table_full <- matrix(data=min_value,nrow=max_ind, ncol=sample_n)
  rownames(probe_table_full) <- 1:max_ind
  colnames(probe_table_full) <- paste("sample", 1:ncol(probe_table_full), sep="") # Avoid duplicate column names in RPA input
  probe_table_full[rownames(probe_table),] <- probe_table

  # Convert the matrix of probe table expressions to AffyBatch object
  eset <- ExpressionSet(assayData=probe_table_full, annotation = CDF_NAME)
  ad <- assayData(eset)
  abatch <- new("AffyBatch", cdfName = CDF_NAME, assayData=ad)
  
  # Apply RPA
  prebs_values <- rpa(abatch, bg.method="none", cdf=CDF_NAME)
  prebs_values <- exprs(prebs_values)
  prebs_values <- prebs_values[substr(rownames(prebs_values),1,4) != "AFFX",] # Remove AFFX probes

  # Save the results
  coln <- colnames(probe_table)
  coln[duplicated(coln)] <- paste(coln[duplicated(coln)],1:length(coln[duplicated(coln)]), sep="") # rename duplicated columns
  colnames(prebs_values) <- coln
  
  if (output_eset) {
    rownames(prebs_values) <- substr(rownames(prebs_values), 1, nchar(rownames(prebs_values))-3) # remove trailing _at from identifiers
    prebs_values <- new("ExpressionSet", annotation = CDF_NAME, exprs = prebs_values)
  } else {
    prebs_values <- as.data.frame(prebs_values)
    prebs_values$ID <- substr(rownames(prebs_values), 1, nchar(rownames(prebs_values))-3)
    rownames(prebs_values) <- 1:nrow(prebs_values)
  }

  return(prebs_values)
}
    
#' @title Calculate PREBS values
#'
#' @description
#' \code{calc_prebs} calculates PREBS values for given set of BAM files.
#'
#' @details
#' \code{calc_prebs} is the main function of \code{prebs} package that implements the whole
#' pipeline. The function takes mapped reads in BAM format and probe sequence
#' mappings as an input. 
#' 
#' \code{calc_prebs} can run in two modes: \code{rpa} and \code{rma}. RMA is the classical 
#' microarray summarization algorithm developed by R. A. Irizarry et al. (2003), while RPA is a newer algorithm that was developed by
#' L. Lahti et al. (2011). The default mode is \code{rpa}. NOTE: before \code{prebs} version 1.7.1 only RMA mode was available. 
#'
#' The output format depends on \code{output_eset} option. If \code{output_eset=TRUE} then
#' \code{calc_prebs} returns ExpressionSet object  (ExpressionSet object is defined in
#' \code{affy} package). Otherwise, it returns a data frame containing PREBS values.
#'
#' For running \code{calc_prebs} with custom CDF, the custom CDF package has to be 
#' downloaded and installed from Custom CDF website:   
#' http://brainarray.mbni.med.umich.edu/CustomCDF
#'
#' For running \code{calc_prebs} with manufacturer's CDF, the manufacturer's CDF package 
#' can be installed from Bioconductor, for example:
#' BiocManager::install("GenomicRanges"); 
#' BiocManager::install("hgu133plus2cdf")
#'
#' For a detailed input specification, please refer to the \code{prebs} vignette.
#'
#' @param bam_files A vector containing .bam files.
#' @param probe_mapping_file A file containing probe mappings in the genome. 
#' @param cdf_name A name of CDF package to use in RMA algorithm. If cdf_name=NULL, the package name is inferred from the name of probe_mapping_file ("HGU133Plus2_Hs_ENSG_mapping.txt" -> "hgu133plus2hsensgcdf")
#' @param sum.method Microarray summarization method to be used. Can be either \code{rpa} or \code{rma}. The default mode is \code{rpa}.
#' @param cluster A cluster object created using "makeCluster" function from "parellel" package. If cluster=NULL, no parallelization is used.
#' @param output_eset If set to TRUE, the output of \code{calc_prebs} will be ExpressionSet object. Otherwise, the output will be a data frame.
#' @param paired_ended_reads Set it to TRUE if your data contains paired-ended reads. Otherwise, the two read mates will be treated as independent units.
#' @param ignore_strand If set to TRUE, then the strand is ignored while counting read overlaps with probe regions. If you use strand-specific RNA-seq protocol, set to FALSE, otherwise set it to TRUE.
#' @return ExpressionSet object or a data frame containing PREBS values
#' @export
#' @examples 
#' if (require(prebsdata)) {
#'   # Get full paths to data files in \code{prebsdata} package
#'   bam_file1 <- system.file(file.path("sample_bam_files", "input1.bam"), package="prebsdata")
#'   bam_file2 <- system.file(file.path("sample_bam_files", "input2.bam"), package="prebsdata")
#'   bam_files <- c(bam_file1, bam_file2)
#'   custom_cdf_mapping1 <- system.file(file.path("custom-cdf", "HGU133Plus2_Hs_ENSG_mapping.txt"), 
#'                                      package="prebsdata")
#'   custom_cdf_mapping2 <- system.file(file.path("custom-cdf", "HGU133A2_Hs_ENSG_mapping.txt"), 
#'                                      package="prebsdata")
#'   manufacturer_cdf_mapping <- system.file(file.path("manufacturer-cdf", "HGU133Plus2_mapping.txt"), 
#'                                           package="prebsdata")
#'   if (interactive()) {
#'     # Run PREBS using custom CDF without parallelization ("rpa" mode)
#'     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1)
#'     head(exprs(prebs_values))
#'
#'     # Run PREBS using custom CDF without parallelization ("rma" mode)
#'     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, sum.method="rma")
#'     head(exprs(prebs_values))
#'
#'     # Run PREBS using custom CDF with parallelization
#'     library(parallel)
#'     N_CORES = 2
#'     CLUSTER <- makeCluster(N_CORES)
#'     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, cluster=CLUSTER)
#'     stopCluster(CLUSTER)
#'     
#'     # Run PREBS using another custom CDF
#'     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping2)
#'     
#'     # Run PREBS and return data frame instead of ExpressionSet object
#'     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, output_eset=FALSE)
#'     head(prebs_values)
#'   }
#'
#'   # Run PREBS using Manufacturer's CDF (outputs probe set expressions)
#'   prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping)
#'   head(exprs(prebs_values))
#'
#'   # Same as above, but state CDF package name explicitly
#'   prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping, cdf_name="hgu133plus2cdf")
#' }

calc_prebs <- function(bam_files, probe_mapping_file, cdf_name = NULL, cluster = NULL, output_eset=TRUE, paired_ended_reads=FALSE, ignore_strand=TRUE, sum.method="rpa") {
  if (is.null(cdf_name)) {
    cdf_name <- cdf_package_name(probe_mapping_file) # Get CDF package name from the filename of cdf mapping file
  }
  check_input(bam_files, probe_mapping_file, cdf_name)

  probe_table <- count_overlaps_from_bam(bam_files, probe_mapping_file, cdf_name, cluster, paired_ended_reads, ignore_strand) # Read bam files and calculate read overlaps with probe regions
  
  probe_table <- sum_duplicates(probe_table) # Sum duplicate row ids. Required only for manufacturer's CDF

  probe_table <- probe_table_expressions(probe_table) # Convert raw probe region counts to probe region expressions using statistical model
  
  if (sum.method == "rma") {
    prebs_values <- perform_rma(probe_table, cdf_name, output_eset) # Perform RMA on probe region expressions
  } else if (sum.method == "rpa") {
    prebs_values <- perform_rpa(probe_table, cdf_name, output_eset)
  } else {
    stop("Invalid sum.method parameter. Has to be either 'rma' or 'rpa'")
  }

  return(prebs_values)
}

Try the prebs package in your browser

Any scripts or data that you put into this service are public.

prebs documentation built on Nov. 8, 2020, 5:40 p.m.