# R/data_quality.R

# Functions that analyze the data and report numbers that indicate the quality
# or nature of the data, but do not answer scientific questions.

#' Reports on the number of sequences in the input data at each point (usually
#' time)
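#' @param name A dummy variable that can be filled in to aid code readability
#' @param seq_data The sequence data to count; its names are the FASTA headers
#' @param sep The separator used in the FASTA headers (passed to
#' \code{strsplit}, so it is interpreted as a regular expression)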
#' @param indx The index in the FASTA header of the information to separate the
#' points on.
#' @export

count_sequences_per_point_of <- function(name = NULL, seq_data, sep = '_',
                                         indx = 1){
  # Tally how many sequences belong to each point, where the point label is
  # field number `indx` of each sequence name after splitting on `sep`.
  #
  # name     : unused; exists only so calls read more naturally.
  # seq_data : object whose names() are the headers to split.
  # sep      : separator handed to strsplit(), i.e. treated as a regular
  #            expression.
  # indx     : position of the point label within the split header.
  #
  # Returns a data.frame with one row per distinct label and the columns
  # `point` (the label) and `n_sequences` (numeric count). Rows follow the
  # ordering produced by table().

  # No sequences means nothing to count: return a zero-row frame that still
  # has both columns, so downstream code can rely on the shape.
  if (length(seq_data) == 0) {
    return(data.frame(point = character(0), n_sequences = numeric(0)))
  }

  headers <- names(seq_data)
  if (is.null(headers)) {
    stop("seq_data has no names, so there are no headers to split",
         call. = FALSE)
  }

  fields <- strsplit(headers, sep)

  # Fail with an explanatory message rather than a bare
  # "subscript out of bounds" when a header lacks the requested field.
  if (any(lengths(fields) < indx)) {
    stop("Some headers have fewer than ", indx,
         " fields when split on '", sep, "'", call. = FALSE)
  }

  # vapply guarantees exactly one character label per header.
  labels <- vapply(fields, `[[`, character(1), indx)
  tally <- table(labels)
  data.frame(point = names(tally), n_sequences = as.numeric(tally))
}

#' Plot the number of sequences per point by which the data was split (usually
#' time)
#'
#' @param name A dummy variable that can be filled in to aid code readability
#' @param split_data Counts by split as produced by
#' \code{\link{count_sequences_per_point_of}}
#' @export

plot_sequences_per_point_of <- function(name = NULL, split_data){
  # Draw one bar per point whose height is that point's sequence count.
  # `name` is unused; `split_data` must provide the columns `point` and
  # `n_sequences`. The plot object is returned, not printed.

  # x axis: point label; bar height: number of sequences at that point.
  bar_mapping <- aes(x = point, y = n_sequences)

  # Turn the x-axis labels by 90 degrees so they are written vertically.
  rotated_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

  # stat = 'identity' plots the counts as given instead of re-tallying rows.
  ggplot(split_data, bar_mapping) +
    geom_bar(stat = 'identity') +
    rotated_labels
}
# philliplab/toolmania documentation built on May 25, 2019, 5:06 a.m.