# R/normalize_isobaric.R

# Defines functions isonorm normalize_isobaric

# Documented in normalize_isobaric

#' Examine and Apply Isobaric Normalization
#'
#' Examine reference pool samples and apply normalization of study samples to
#' their corresponding reference pool sample
#'
#' @param omicsData an object of the class 'isobaricpepData'
#' @param apply_norm logical, indicates whether normalization should be applied
#'   to omicsData$e_data
#' @param exp_cname character string specifying the name of the column
#'   containing the experiment/plate information in \code{f_data}
#' @param channel_cname optional character string specifying the name of the
#'   column containing the instrument channel a sample was run on in
#'   \code{f_data}. This argument is optional. See Details for how to specify
#'   information regarding reference pool samples. If using this argument, the
#'   'refpool_channel' argument must also be specified; in this case,
#'   'refpool_cname' and 'refpool_notation' should not be specified.
#' @param refpool_channel optional character string specifying which channel
#'   contains the reference pool sample. Only used when this is the same
#'   from experiment to experiment. This argument is optional. See Details for
#'   how to specify information regarding reference pool samples. If using this
#'   argument, the 'channel_cname' argument must also be specified; in this
#'   case, 'refpool_cname' and 'refpool_notation' should not be specified.
#' @param refpool_cname optional character string specifying the name of the
#'   column containing information about which samples are reference samples in
#'   \code{f_data}. This argument is optional. See Details for how to specify
#'   information regarding reference pool samples. If using this argument, the
#'   'refpool_notation' argument must also be specified; in this case,
#'   'channel_cname' and 'refpool_channel' should not be specified.
#' @param refpool_notation optional character string specifying the value in the
#'   'refpool_cname' column which denotes that a sample is a reference sample.
#'   This argument is optional. See Details for how to specify information
#'   regarding reference pool samples. If using this argument, the
#'   'refpool_cname' argument must also be specified; in this case,
#'   'channel_cname' and 'refpool_channel' should not be specified.
#'
#' @return If apply_norm = TRUE, an object of class 'isobaricpepData', normalized to reference pool,
#'  and with the attribute 'isobaric_info' updated to include information about
#'  the reference pool samples and the normalization procedure.  Otherwise an
#'  object of class 'isobaricnormRes' containing similar information about the 
#'  normalization process
#'
#' @details There are two ways to specify the information needed for identifying
#'   reference samples which should be used for normalization: \enumerate{ \item
#'   specify \code{channel_cname} and \code{refpool_channel}. This should be
#'   used when the reference sample for each experiment/plate was always located
#'   in the same channel. Here \code{channel_cname} gives the column name for
#'   the column in \code{f_data} which gives information about which channel
#'   each sample was run on, and \code{refpool_channel} is a character string
#'   specifying the value in \code{channel_cname} that corresponds to the
#'   reference sample channel. \item specify \code{refpool_cname} and
#'   \code{refpool_notation}. This should be used when the reference sample is
#'   not in a consistent channel across experiments/plates. Here,
#'   \code{refpool_cname} gives the name of the column in \code{f_data} which
#'   indicates whether a sample is a reference or not, and
#'   \code{refpool_notation} is a character string giving the value used to
#'   denote a reference sample in that column. } In both cases you must specify
#'   \code{exp_cname} which gives the column name for the column in
#'   \code{f_data} containing information about which experiment/plate a sample
#'   was run on.
#'
#' @examplesIf identical(tolower(Sys.getenv("NOT_CRAN")), "true") & requireNamespace("pmartRdata", quietly = TRUE)
#' library(pmartRdata)
#'
#' myiso <- edata_transform(isobaric_object, "log2")
#'
#' # Don't apply the normalization quite yet;
#' # can use summary() and plot() to view reference pool samples
#' myiso_refpools <- normalize_isobaric(
#'   omicsData = myiso, exp_cname = "Plex",
#'   apply_norm = FALSE,
#'   refpool_cname = "Virus",
#'   refpool_notation = "Pool"
#' )
#' summary(myiso_refpools)
#'
#' # Now apply the normalization;
#' # can use plot() to view the study samples after reference pool normalization
#' myiso_norm <- normalize_isobaric(
#'   omicsData = myiso, exp_cname = "Plex",
#'   apply_norm = TRUE,
#'   refpool_cname = "Virus",
#'   refpool_notation = "Pool"
#' )
#'
#' @export
#'
normalize_isobaric <- function(omicsData, exp_cname = NULL, apply_norm = FALSE,
                               channel_cname = NULL, refpool_channel = NULL,
                               refpool_cname = NULL, refpool_notation = NULL) {
  # Validate inputs ------------------------------------------------------------

  # omicsData must be an isobaric peptide object.
  if (!inherits(omicsData, "isobaricpepData")) {
    stop("omicsData must be of the class 'isobaricpepData'")
  }

  # Refuse to normalize twice. isTRUE() is used so that an absent flag (NULL)
  # is treated as "not normalized" instead of handing if() a zero-length value.
  if (isTRUE(attr(omicsData, "isobaric_info")$norm_info$is_normalized)) {
    stop(paste("omicsData is already normalized with respect to the isobaric",
      "labels, per attributes assigned upon data object creation",
      sep = " "
    ))
  }

  # The normalization subtracts the reference sample, which is only a ratio on
  # a log scale.
  if (!(get_data_scale(omicsData) %in% c("log2", "log10", "log"))) {
    stop("omicsData$e_data must be log transformed")
  }

  # exp_cname defaults to NULL but is required. Report that explicitly rather
  # than letting `NULL %in% ...` produce a zero-length condition.
  if (is.null(exp_cname)) {
    stop(paste("exp_cname must be specified; it names the column in f_data",
      "containing the experiment/plate information.",
      sep = " "
    ))
  }
  if (length(exp_cname) != 1L ||
    !(exp_cname %in% names(omicsData$f_data))) {
    stop(paste("Experiment column", exp_cname, "is not found in f_data.",
      sep = " "
    ))
  }

  # apply_norm must be a single, non-missing logical because it drives if().
  if (!is.logical(apply_norm)) stop("apply_norm must be of class 'logical'")
  if (length(apply_norm) != 1L || is.na(apply_norm)) {
    stop("apply_norm must be a single TRUE or FALSE")
  }

  # channel_cname, when supplied, must name a column of f_data.
  if (!is.null(channel_cname)) {
    if (!(channel_cname %in% names(omicsData$f_data))) {
      stop(paste("Channel column", channel_cname,
        "is not found in f_data. See details of as.isobaricpepData",
        "for specifying column names.",
        sep = " "
      ))
    }
  }

  # refpool_cname, when supplied, must name a column of f_data.
  if (!is.null(refpool_cname)) {
    if (!(refpool_cname %in% names(omicsData$f_data))) {
      stop(paste("Reference pool column", refpool_cname,
        "is not found in f_data. See details of as.isobaricpepData",
        "for specifying column names.",
        sep = " "
      ))
    }
  }

  # Exactly one of the two ways of identifying reference samples must be used:
  #   possibility 1: refpool_cname + refpool_notation
  #   possibility 2: channel_cname + refpool_channel
  poss1 <- !is.null(refpool_cname) && !is.null(refpool_notation)
  poss2 <- !is.null(refpool_channel) && !is.null(channel_cname)
  if ((poss1 + poss2) != 1) {
    stop(paste("Reference samples information was not correctly specified.",
      "See Details and Examples for more information.",
      sep = " "
    ))
  }

  # Collect column names and coerce identifiers to character --------------------

  # Name of the peptide ID column in e_data and the sample column in f_data.
  edata_cname <- get_edata_cname(omicsData)
  fdata_cname <- get_fdata_cname(omicsData)

  # All inputs that identify reference samples are character strings, so the
  # f_data columns they are compared against are stored as character too.
  omicsData$f_data[, fdata_cname] <- as.character(
    omicsData$f_data[, fdata_cname]
  )

  # Resolve which f_data column / value marks a reference sample ----------------

  # ref_arg is only used to word the error messages after the argument the
  # user actually supplied.
  if (poss1) {
    ref_col <- refpool_cname
    ref_name <- refpool_notation
    ref_arg <- "refpool_notation"
  } else {
    ref_col <- channel_cname
    ref_name <- refpool_channel
    ref_arg <- "refpool_channel"
  }

  if (!is.character(ref_name) || length(ref_name) != 1L) {
    stop(paste(ref_arg, "must be a character string", sep = " "))
  }

  omicsData$f_data[, ref_col] <- as.character(omicsData$f_data[, ref_col])

  # Every experiment/plate must contain the reference value.
  ref_by_exp <- split(
    omicsData$f_data[, ref_col],
    omicsData$f_data[, exp_cname]
  )
  has_ref <- vapply(ref_by_exp, function(x) ref_name %in% x, logical(1))
  if (!all(has_ref)) {
    stop(paste(ref_arg, "=", ref_name,
      "is not in every experiment. See Details and Examples for",
      "more information.",
      sep = " "
    ))
  }

  # Carry out normalization/create isobaricnormRes object ----------------------

  if (apply_norm) {
    # Normalize each study sample to the reference sample of its experiment.
    # isonorm() also removes the reference samples from e_data and f_data.
    omicsData <- isonorm(
      omicsData = omicsData,
      exp_cname = exp_cname,
      ref_col = ref_col,
      ref_name = ref_name
    )

    # Record how the normalization was specified.
    attr(omicsData, "isobaric_info") <- set_isobaric_info(
      exp_cname = exp_cname,
      channel_cname = channel_cname,
      refpool_channel = refpool_channel,
      refpool_cname = refpool_cname,
      refpool_notation = refpool_notation,
      norm_info = list(),
      isobaric_norm = TRUE
    )

    # Recompute data_info because the reference samples are no longer part of
    # e_data and f_data.
    attr(omicsData, "data_info") <- set_data_info(
      e_data = omicsData$e_data,
      edata_cname = get_edata_cname(omicsData),
      data_scale_orig = get_data_scale_orig(omicsData),
      data_scale = get_data_scale(omicsData),
      data_types = get_data_info(omicsData)$data_types,
      norm_info = get_data_info(omicsData)$norm_info,
      is_normalized = get_data_info(omicsData)$norm_info$is_normalized,
      batch_info = get_data_info(omicsData)$batch_info,
      is_bc = get_data_info(omicsData)$batch_info$is_bc
    )

    return(omicsData)
  }

  # apply_norm is FALSE: return only the reference pool samples so they can be
  # inspected before normalizing.

  # %in% never yields NA, so missing values in the reference column cannot
  # leak NA entries into the selected sample names.
  is_ref <- omicsData$f_data[, ref_col] %in% ref_name
  rfrnc_nms <- omicsData$f_data[is_ref, fdata_cname]

  # Columns of e_data (ID column plus reference samples) and rows of f_data
  # belonging to the reference samples. drop = FALSE keeps data frames intact.
  keep_cols <- names(omicsData$e_data) %in% c(edata_cname, rfrnc_nms)
  keep_rows <- omicsData$f_data[, fdata_cname] %in% rfrnc_nms

  result <- list(
    e_data = omicsData$e_data[, keep_cols, drop = FALSE],
    f_data = omicsData$f_data[keep_rows, , drop = FALSE]
  )
  class(result) <- "isobaricnormRes"

  # Attach the column names and the reference pool specification.
  attr(result, "cnames") <- list(
    edata_cname = edata_cname,
    fdata_cname = fdata_cname
  )
  attr(result, "isobaric_info") <- set_isobaric_info(
    exp_cname = exp_cname,
    channel_cname = channel_cname,
    refpool_channel = refpool_channel,
    refpool_cname = refpool_cname,
    refpool_notation = refpool_notation,
    norm_info = list(),
    isobaric_norm = FALSE
  )

  result
}

# Carries out the normalization for isobaric data. Within each experiment each
# sample is normalized to the corresponding reference sample.
#
# @param omicsData An isobaricpepData object.
#
# @param exp_cname A character string specifying the name of the column
#   containing the experiment/plate information in \code{f_data}.
#
# @param ref_col A character string indicating which column in f_data contains
#   the information identifying reference samples.
#
# @param ref_name A character string identifying the reference sample.
#
# @return An isobaricpepData object. The data in e_data has been normalized to
#   the reference samples and all columns corresponding to the reference samples
#   have been removed. The rows in f_data corresponding to the reference samples
#   have also been removed.
#
# Stops with an informative error when an experiment does not contain exactly
# one reference sample, or when that sample is not a unique column of e_data.
# Rows of f_data with a missing experiment value are not assigned to any
# experiment and are left untouched.
#
isonorm <- function(omicsData, exp_cname,
                    ref_col, ref_name) {
  # Name of the sample column in f_data.
  f_name <- get_fdata_cname(omicsData)

  # Sample names as character (removes factors) and the experiment of each
  # sample, both in f_data row order.
  smpl_nms <- as.character(omicsData$f_data[, f_name])
  exp_vals <- omicsData$f_data[, exp_cname]

  # Flag the reference samples. %in% never returns NA, so a missing value in
  # the reference column simply counts as "not a reference" instead of
  # injecting NA sample names further down.
  is_ref <- omicsData$f_data[, ref_col] %in% ref_name

  # All experiment levels that are actually present.
  xprmnt <- unique(exp_vals[!is.na(exp_vals)])

  # Reference sample name for each experiment, filled in by the loop.
  ref <- character(length(xprmnt))

  # Column names of e_data; they are not changed inside the loop.
  e_nms <- names(omicsData$e_data)

  for (e in seq_along(xprmnt)) {
    in_exp <- exp_vals %in% xprmnt[[e]]

    # The single reference sample of the eth experiment.
    ref_e <- smpl_nms[in_exp & is_ref]
    if (length(ref_e) != 1L) {
      stop(paste("Experiment", as.character(xprmnt[[e]]),
        "must contain exactly one reference sample, but",
        length(ref_e), "were found.",
        sep = " "
      ))
    }
    ref[[e]] <- ref_e

    # Column index of the reference sample in e_data.
    idx_ref <- which(e_nms %in% ref_e)
    if (length(idx_ref) != 1L) {
      stop(paste("Reference sample", ref_e,
        "must match exactly one column of e_data.",
        sep = " "
      ))
    }

    # Column indices of the remaining (study) samples of the eth experiment.
    idx_no_ref <- setdiff(which(e_nms %in% smpl_nms[in_exp]), idx_ref)

    # Dividing by the reference on the original scale is a subtraction on the
    # log scale. Skip experiments that hold nothing but the reference sample.
    if (length(idx_no_ref) > 0L) {
      ref_vals <- omicsData$e_data[[idx_ref]]
      omicsData$e_data[idx_no_ref] <- lapply(
        omicsData$e_data[idx_no_ref],
        function(smpl) smpl - ref_vals
      )
    }
  }

  # Remove the reference samples from e_data (columns) and f_data (rows).
  # Logical masks with drop = FALSE are used so that an empty match cannot
  # wipe out every column and the results always remain data frames.
  omicsData$e_data <- omicsData$e_data[, !(e_nms %in% ref), drop = FALSE]
  omicsData$f_data <- omicsData$f_data[!(smpl_nms %in% ref), , drop = FALSE]

  omicsData
}
# pmartR/pmartR documentation built on March 4, 2024, 8:32 a.m.