R/filter.R

#' Apply a custom fold-change filter to an aggregated data frame.
#'
#' This function allows you to filter an aggregated master result to only contain genes with fold-changes less than or greater than
#' a specified threshold for a given metric.  Several metrics are provided for flexibility of selection.
#' The use of metrics in this function makes it possible to ask questions such as, "which genes have a mean fold-change of at least 2?",
#' "which genes have a fold change of less than 5?", or "which genes have fold-change with a variance across all conditions of at least 10?"
#' @param master_df An aggregated master DE dataframe as produced using create_master_res().
#' @param metric Metric to be used for filtering. All metrics are gene-wise calculated.  That is, for each gene in the aggregate data set,
#' a metric is calculated that is then used to filter the data. Valid metrics are: "max", "min", "mean", "variance", "sd".
#' The "max" metric is the gene-wise maximum fold-change value. The "min" metric is the gene-wise minimum fold-change value.
#' The "mean" metric is the gene-wise fold-change mean.  The "variance" metric is the gene-wise variance. The "sd" metric is the
#' gene-wise standard deviation.
#' @param threshold The cutoff value to use for filtering. Numeric.
#' @param operator Operator to apply to the filtering process.  Options are: "less", "greater".
#' @param absolute Use absolute values when applying filters to filter from both negative and positive ends of the fold change distribution.
#' Boolean. Default=TRUE.
#' @return This function returns a data frame that has been filtered by the specified criteria.
#' @keywords filter aggregate subset sort fold-change
#' @seealso \code{\link{create_master_res}}
#' @export
#' @examples
#' \dontrun{
#'
#' #Prepare a result list.
#' res.day1 <- results(dds, contrast=c("Condition_Time", "day1_disease", "day1_control"))
#' res.day2 <- results(dds, contrast=c("Condition_Time", "day2_disease", "day2_control"))
#' res.day3 <- results(dds, contrast=c("Condition_Time", "day3_disease", "day3_control"))
#' myResList <- list(res.day1, res.day2, res.day3)
#'
#' #Prepare an aggregate master data frame.
#' aggregate_df <- create_master_res(res_list=myResList, filename="master_DE.txt")
#'
#' /*
#'  * Filter the aggregate data to contain only genes whose gene-wise maximum
#'  * fold-change is greater than 10. This will return genes whose gene-wise maximum
#'  * is > 10.
#'  */
#' de_filtered <- de_filter(master_df=aggregate_df, metric="max", threshold=10,
#'                           operator="greater", absolute=FALSE)
#'
#'
#' /*
#'  * Filter the aggregate data to contain only genes whose gene-wise minimum
#'  * fold-change is greater than an absoulte value of 4.  This will return genes
#'  * where the gene-wise minimum is < -4 and > +4.
#'  */
#' de_filtered <- de_filter(master_df=aggregate_df, metric="min", threshold=4,
#'                           operator="greater", absolute=TRUE)
#'
#'
#' /*
#'  * Filter the aggregate data to contain only genes whose gene-wise standard
#'  * deviation is less than an absoulte value of 2.  This will return genes
#'  * where the gene-wise standard deviation is > -2 and < +2.
#'  */
#' de_filtered <- de_filter(master_df=aggregate_df, metric="sd", threshold=2,
#'                           operator="less", absolute=TRUE)
#'
#' }
de_filter <- function(master_df, metric, threshold, operator, absolute=TRUE)
{
  valid_metrics = c("max", "min", "mean", "variance", "sd")
  valid_ops     = c("greater", "less")

  if(typeof(master_df) != "list" || length(master_df) <= 0)
  {
    stop("de_filter() requires a dataframe with at least one column as generated by create_master_res() as input.")
    return(-1)
  }
  if(!(metric %in% valid_metrics))
  {
    stop("de_filter(): invalid metric provided.  Possbile options are:\"max\", \"min\", \"mean\", \"variance\", \"sd\".")
    return(-1)
  }
  if(typeof(absolute) != "logical")
  {
    stop("absolute must be a logical value. Please enter TRUE or FALSE.")
    return(-1)
  }
  if(!is.numeric(threshold))
  {
    stop("de_filter(): Invalid threshold value provided.  Threshold value must be numeric.")
    return(-1)
  }
  if(!(operator %in% valid_ops))
  {
    stop("de_filter(): invalid operator provided.  Possible values are: \"greater\" or \"less\".")
    return(-1)
  }

  #Prepare our variable names for dynamic selection.
  if(operator=="greater") { operator = ">" }
  if(operator=="less")    { operator = "<" }
  if(metric=="max")       { target_column = "maxCol"  }
  if(metric=="min")       { target_column = "minCol"  }
  if(metric=="mean")      { target_column = "meanCol" }
  if(metric=="variance")  { target_column = "varCol"  }
  if(metric=="sd")        { target_column = "sdCol"   }

  #Calculate metrics for the current data set.
  maxCol  <- apply(master_df, 1, max)
  minCol  <- apply(master_df, 1, min)
  meanCol <- apply(master_df, 1, mean)
  varCol  <- apply(master_df, 1, var)
  sdCol   <- apply(master_df, 1, sd)


  #Do the actual filtering.
  filterString = paste("master_df[",target_column,operator,threshold,",]", sep="")
  if(absolute)
  {
    filterString = paste("master_df[abs(",target_column,")",operator,threshold,",]", sep="")
  }
  filtered_df <- eval(parse(text = filterString))

  numFiltered = nrow(master_df) - nrow(filtered_df)

  print(paste("Filtered ", numFiltered, " genes.  ", nrow(filtered_df), " genes remain.", sep=""))
  return(filtered_df)
}


#' Select a subset of count and target data based on metadata annotation.
#'
#' This function, given count data and target metadata, will return a subset of both count and target data including
#' only data with values corresponding to selection parameters. For example, you can select count and target data for
#' only timepoint "day1".  Multiple valid keep selections can also be used.
#' @param counts Count data as prepared with prep_counts().
#' @param targets Target data as prepared with prep_targets().
#' @param target_count_id_map Column in targets file whose values corresponds with count data column ids.
#' @param target_keep_col Column in target data to select values by.  For instance, to select "day1" data
#' from the "timepoint" column, this argument should be "timepoint".
#' @param target_keep_val Value from target_keep_col to keep in the subset data. For instance, to select "day1" data
#' from the "timepoint" column, this argument should be "day1". Multiple values can be selected by providing a concatenated
#' set of values.  I.E. To keep day1 and day2 data, this argument should be, c("day1","day2").
#' @return This function returns count and target data based on the target_keep_col and target_keep_val parameters.
#' Counts will be returned as the [[1]] index and target data will be returned as the [[2]] index.
#' @keywords filter subset
#' @seealso \code{\link{prep_counts}}, \code{\link{prep_targets}}, \code{\link{exclude_data_subset}}
#' @export
#' @examples
#' \dontrun{
#'
#' myCounts <- prep_counts(count_input="master_count_data.txt", delim="t")
#' myTargets <- prep_targets(target_input="master_count_data.txt", delim="t")
#'
#' #Get data for only "timepoint" at "day1".
#' data_subset <- keep_data_subset(counts=myCounts, targets=myTargets,
#'                                   target_count_id_map="sample_id",
#'                                   target_keep_col="timepoint",
#'                                   target_keep_val="day1")
#'
#' #Count data is stored in the first index returned by the function.
#' subset_counts <- data_subset[[1]]
#'
#' #Target data is stored in the second index returned by the function.
#' subset_targets <- data_subset[[2]]
#'
#'
#' #Get data for only "timepoint" at "day1" and "day2".
#' data_subset <- keep_data_subset(counts=myCounts, targets=myTargets,
#'                                   target_count_id_map="sample_id",
#'                                   target_keep_col="timepoint",
#'                                   target_keep_val=c("day1","day2"))
#'
#' #Count data is stored in the first index returned by the function.
#' subset_counts <- data_subset[[1]]
#'
#' #Target data is stored in the second index returned by the function.
#' subset_targets <- data_subset[[2]]
#'
#' }
keep_data_subset <- function(counts, targets, target_count_id_map, target_keep_col, target_keep_val)
{
  if(typeof(counts) != "list")
  {
    stop("keep_data_subset(): Type mismatch. Requires count data to be of type 'list'. ")
    return(-1)
  }
  if(typeof(targets) != "list")
  {
    stop("keep_data_subset(): Type mismatch. Requires target data to be of type 'list'. ")
    return(-1)
  }
  if(typeof(target_count_id_map) != "character")
  {
    stop("keep_data_subset(): Type mismatch. Requires target_count_id_map to be of type 'character'. ")
    return(-1)
  }
  if(typeof(target_keep_col) != "character")
  {
    stop("keep_data_subset(): Type mismatch. Requires target_keep_col to be of type 'character'. ")
    return(-1)
  }
  if(typeof(target_keep_val) != "character")
  {
    stop("keep_data_subset(): Type mismatch. Requires target_keep_val to be of type 'character'. ")
    return(-1)
  }
  if(!(target_keep_col %in% colnames(targets)))
  {
    print("keep_data_subset(): target_keep_col not present in target column set.")
    print("Possible options are:")
    print(colnames(targets))
    stop("Please provide a valid column name.")
    return(-1)
  }
  sub1 <- paste("levels(targets$", target_keep_col, ")", sep="")
  keep_levels <- as.character(eval(parse(text = sub1)))
  for (i in 1:length(target_keep_val))
  {
    if(!(target_keep_val[i] %in% keep_levels))
    {
      print("keep_data_subset(): target_keep_val not present in target_keep_col.")
      print("Possible options are:")
      print(keep_levels)
      print("Invalid selection:")
      print(target_keep_val[i])
      stop("Please provide a valid value.")
      return(-1)
    }
  }
  sub1 <- paste("targets$", target_count_id_map, sep="")
  target_ids <- as.character(eval(parse(text = sub1)))
  if(!(all(colnames(counts) == target_ids)))
  {
    print("keep_data_subset(): count data column names do not match targets id mapping column.")
    print("Count data column ids:")
    print(colnames(counts))
    print("Target data column ids:")
    print(target_ids)
    stop("Please provide corresponding count and target data.")
    return(-1)
  }

  subset_ids = list()

  #Select the rows from targets with provided keep values that correspond to correct count colnames.
  if(length(target_keep_val) == 1)
  {
    sub1 <- paste("targets$", target_count_id_map, "[targets$", target_keep_col, "== \"", target_keep_val, "\"]", sep="")

    subset_ids <- as.character(eval(parse(text = sub1)))
  }
  #If they are selecting multiple values, build up the proper selection criteria.
  else
  {
    sub1 <- paste("targets$", target_count_id_map, "[", sep="")
    sub2 = ""
    for(i in 1:(length(target_keep_val)-1))
    {
      sub2 <- paste(sub2, "targets$", target_keep_col, "== '", target_keep_val[i], "' |", sep="")
    }
    sub2 <- paste(sub2,"targets$", target_keep_col, "== '", target_keep_val[length(target_keep_val)], "'", sep="")
    sub3 <- paste(sub1,sub2,"]",sep="")

    subset_ids <- as.character(eval(parse(text = sub3)))
  }

  #Select the correct subset of counts.
  count_subset <- counts[subset_ids]

  #Select the correct subset of targets.
  sub1 <- paste("targets[is.element(targets$", target_count_id_map, ", subset_ids),]", sep="")
  targets_subset <- eval(parse(text = sub1))

  #Adjust target rownames.
  sub1 <- paste("targets_subset$", target_count_id_map, sep="")
  target_subset_rownames <- as.character(eval(parse(text = sub1)))
  rownames(targets_subset) <- target_subset_rownames

  #Sanity check, only return data as long as targets and counts properly correspond.
  if(!(all(colnames(count_subset) == rownames(targets_subset))))
  {
    print(ncol(count_subset))
    print(nrow(targets_subset))
    print(colnames(count_subset) == rownames(targets_subset))
    stop("keep_data_subset(): Mismatch error.  Subset target rows and count columns do not correspond.")
    return(-1)
  }

  assign("countDat", count_subset, envir=.DEVis_env)
  assign("tgt_dat", targets_subset, envir=.DEVis_env)
  return(list(count_subset,targets_subset))

}



#' Select a subset of count and target data based on metadata annotation.
#'
#' This function, given count data and target metadata, will return a subset of both count and target data excluding
#' data with values corresponding to selection parameters. For example, you can select count and target data to exclude
#' only timepoint "day1".  Multiple valid keep selections can also be used.
#' @param counts Count data as prepared with prep_counts().
#' @param targets Target data as prepared with prep_targets().
#' @param target_count_id_map Column in targets file whose values corresponds with count data column ids.
#' @param target_exclude_col Column in target data to select values by.  For instance, to exclude "day1" data
#' from the "timepoint" column, this argument should be "timepoint".
#' @param target_exclude_val Value from target_exclude_col to remove in the subset data. For instance, to select "day1" data
#' from the "timepoint" column, this argument should be "day1". Multiple values can be selected by providing a concatenated
#' set of values.  I.E. To exclude day1 and day2 data, this argument should be, c("day1","day2").
#' @return This function returns count and target data based on the target_exclude_col and target_exclude_val parameters.
#' Counts will be returned as the [[1]] index and target data will be returned as the [[2]] index.
#' @keywords filter subset
#' @seealso \code{\link{prep_counts}}, \code{\link{prep_targets}}, \code{\link{keep_data_subset}}
#' @export
#' @examples
#' \dontrun{
#'
#' myCounts <- prep_counts(count_input="master_count_data.txt", delim="t")
#' myTargets <- prep_targets(target_input="master_count_data.txt", delim="t")
#'
#' #Get data for only "timepoint" at "day1".
#' data_subset <- exclude_data_subset(counts=myCounts, targets=myTargets,
#'                                   target_count_id_map="sample_id",
#'                                   target_exclude_col="timepoint",
#'                                   target_exclude_val="day1")
#'
#' #Count data is stored in the first index returned by the function.
#' subset_counts <- data_subset[[1]]
#'
#' #Target data is stored in the second index returned by the function.
#' subset_targets <- data_subset[[2]]
#'
#'
#' #Get data for only "timepoint" at "day1" and "day2".
#' data_subset <- exclude_data_subset(counts=myCounts, targets=myTargets,
#'                                   target_count_id_map="sample_id",
#'                                   target_exclude_col="timepoint",
#'                                   target_exclude_val=c("day1","day2"))
#'
#' #Count data is stored in the first index returned by the function.
#' subset_counts <- data_subset[[1]]
#'
#' #Target data is stored in the second index returned by the function.
#' subset_targets <- data_subset[[2]]
#'
#' }
exclude_data_subset <- function(counts, targets, target_count_id_map, target_exclude_col, target_exclude_val)
{
  if(typeof(counts) != "list")
  {
    stop("exclude_data_subset(): Type mismatch. Requires count data to be of type 'list'. ")
    return(-1)
  }
  if(typeof(targets) != "list")
  {
    stop("exclude_data_subset(): Type mismatch. Requires target data to be of type 'list'. ")
    return(-1)
  }
  if(typeof(target_count_id_map) != "character")
  {
    stop("exclude_data_subset(): Type mismatch. Requires target_count_id_map to be of type 'character'. ")
    return(-1)
  }
  if(typeof(target_exclude_col) != "character")
  {
    stop("exclude_data_subset(): Type mismatch. Requires target_exclude_col to be of type 'character'. ")
    return(-1)
  }
  if(typeof(target_exclude_val) != "character")
  {
    stop("exclude_data_subset(): Type mismatch. Requires target_exclude_val to be of type 'character'. ")
    return(-1)
  }
  if(!(target_exclude_col %in% colnames(targets)))
  {
    print("exclude_data_subset(): target_exclude_col not present in target column set.")
    print("Possible options are:")
    print(colnames(targets))
    stop("Please provide a valid column name.")
    return(-1)
  }
  sub1 <- paste("targets$", target_exclude_col, sep="")
  exclude_levels <- as.character(eval(parse(text = sub1)))
  for (i in 1:length(target_exclude_val))
  {
    if(!(target_exclude_val[i] %in% exclude_levels))
    {
      print("exclude_data_subset(): target_exclude_val not present in target_exclude_col.")
      print("Possible options are:")
      print(exclude_levels)
      print("Invalid selection:")
      print(target_exclude_val[i])
      stop("Please provide a valid value.")
      return(-1)
    }
  }
  sub1 <- paste("targets$", target_count_id_map, sep="")
  target_ids <- as.character(eval(parse(text = sub1)))
  if(!(all(colnames(counts) == target_ids)))
  {
    print("exclude_data_subset(): count data column names do not match targets id mapping column.")
    print("Count data column ids:")
    print(colnames(counts))
    print("Target data column ids:")
    print(target_ids)
    stop("Please provide corresponding count and target data.")
    return(-1)
  }

  subset_ids = list()

  #Select the rows from targets with provided exclude values that correspond to correct count colnames.
  if(length(target_exclude_val) == 1)
  {
    sub1 <- paste("targets$", target_count_id_map, "[targets$", target_exclude_col, "!= \"", target_exclude_val, "\"]", sep="")

    subset_ids <- as.character(eval(parse(text = sub1)))
  }
  #If they are selecting multiple values, build up the proper selection criteria.
  else
  {
    sub1 <- paste("targets$", target_count_id_map, "[", sep="")
    sub2 = ""
    for(i in 1:(length(target_exclude_val)-1))
    {
      sub2 <- paste(sub2, "targets$", target_exclude_col, "!= '", target_exclude_val[i], "' &", sep="")
    }
    sub2 <- paste(sub2,"targets$", target_exclude_col, "!= '", target_exclude_val[length(target_exclude_val)], "'", sep="")
    sub3 <- paste(sub1,sub2,"]",sep="")

    subset_ids <- as.character(eval(parse(text = sub3)))
  }

  #Select the correct subset of counts.
  count_subset <- counts[subset_ids]

  #Select the correct subset of targets.
  sub1 <- paste("targets[is.element(targets$", target_count_id_map, ", subset_ids),]", sep="")
  targets_subset <- eval(parse(text = sub1))

  #Adjust target rownames.
  sub1 <- paste("targets_subset$", target_count_id_map, sep="")
  target_subset_rownames <- as.character(eval(parse(text = sub1)))
  rownames(targets_subset) <- target_subset_rownames

  #Sanity check, only return data as long as targets and counts properly correspond.
  if(!(all(colnames(count_subset) == rownames(targets_subset))))
  {
    print(ncol(count_subset))
    print(nrow(targets_subset))
    print(colnames(count_subset) == rownames(targets_subset))
    stop("exclude_data_subset(): Mismatch error.  Subset target rows and count columns do not correspond.")
    return(-1)
  }

  assign("countDat", count_subset, envir=.DEVis_env)
  assign("tgt_dat", targets_subset, envir=.DEVis_env)

  return(list(count_subset,targets_subset))

}

Try the DEVis package in your browser

Any scripts or data that you put into this service are public.

DEVis documentation built on May 2, 2019, 3:18 p.m.