R/pentropy.R

# All code that compute or plot positional entropy
# Shoud later be split into separate files?

#' Given data at a single point of a single patient, compute the entropy at
#' each position
#'
#' Defaults to Shannon entropy. If other entropies are required, contact the
#' package maintainer. (Trivial expansion)
#' 
#' @param seq_data The filtered sequence data whose positional entropy
#' scores must be computed.
#' @export

compute_pentropy <- function(seq_data){
  consensus <- consensusMatrix(seq_data)
  letter_frequencies <- apply(consensus, 1, sum)
  letters_that_occur <- names(letter_frequencies[letter_frequencies>0])
  consensus <- consensus[attr(consensus, 'dimnames')[[1]] %in% letters_that_occur, ]

  pentropy <- data.frame(pos = 1:ncol(consensus), 
                         variable = 'entropy',
                         value = apply(consensus, 2, entropy))
  return(pentropy)
}

#' Given data at a single point of a single patient, compute the entropy at
#' each position
#'
#' Defaults to Shannon entropy. If other entropies are required, contact the
#' package maintainer. (Trivial expansion)
#'
#' TODO: Not so trivial, expand this function to not only work over time, but
#' over arbitrary splits.
#' 
#' @param seq_data The unfiltered sequence data whose positional entropy
#' scores must be computed.
#' @export

compute_all_pentropy <- function(seq_data){
  all_pos_data <- data.frame(time_point = character(0),
                             pos = integer(0),
                             variable = character(0),
                             value = numeric(0))
  time_points <- get_unique_points_of('time', seq_data, indx = 2)
  for (time_point in time_points){
    seq_at_time_data <- get_data_of('timepoint', seq_data, time_point)
    pentropy <- compute_pentropy(seq_at_time_data)
    all_pos_data <- rbind(all_pos_data,
                          cbind(data.frame(time_point = time_point),
                                pentropy))
  }
  return(all_pos_data)
}

#' Aggregates the data over the points (usually time points) computing various
#' metrics of the values.
#'
#' @param all_pos_data The position data for all the points
#' @param variable A character vector of the names of the variables to compute
#' metrics on.
#' @export

aggregate_over_points <- function(all_pos_data, variables = 'entropy'){
  aggregated_metrics <- data.frame(pos = numeric(0),
                                   variable = character(0),
                                   value = numeric(0))
  for (the_variable in variables){
    variable_data <- subset(all_pos_data, variable == the_variable)
    tmp <- ddply(variable_data,
                 .(pos),
                 summarize,
                 n = length(value),
                 max = max(value),
                 min = min(value),
                 mean = mean(value),
                 median = median(value),
                 sd = sd(value))
    tmp <- melt(tmp, id.vars = 'pos')
    tmp$variable <- paste0(tmp$variable, '_', the_variable)
    aggregated_metrics <- rbind(aggregated_metrics, tmp)
  }
  return(aggregated_metrics)
}
philliplab/toolmania documentation built on May 25, 2019, 5:06 a.m.