MicroVis: Microbiome Analysis and Visualization

Documented in clearFeatureFilt filterLowAbun filterLowPrev filterLowRelAbun filterLowTotAbun filterLowVar filterNAs findSigFisher getFtStats runFeatureFilter selectFeatures unselectFeatures

#' @title Feature Filter
#'
#' @description This function is one of the processing functions called by
#'     processDataset() and identifies and removes features based on parameters
#'     set by the filter___ feature filtering functions
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param temp This parameter has no use in this function and can be removed
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with features filtered out based on
#'     parameters set by the filter___ feature filtering functions
#'
runFeatureFilter <- function(dataset=NULL, temp=F, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(!is.null(dataset$data$proc$rarefied)) return(dataset)

  if(is.null(dataset$data$proc$unranked)) dataset <- runNormalization(dataset, silent = T)

  if(is.null(dataset$data$proc$filter_rank)) dataset$data$proc$filter_rank <- getLowestRank(dataset)

  filtering <- dataset$data$proc$filtering

  # If there is something in the dataset's filtering history besides
  #   "filterlist" and "ftstats", proceed with filtering
  if(length(names(filtering)[!(names(filtering) %in% c('filterlist','ftstats'))])) {
    if(!silent) cat('\n\n|~~~~~~~~~~~~~  FILTERING FEATURES  ~~~~~~~~~~~~~|\n')

    filter_rank <- dataset$data$proc$filter_rank

    ft_data <- getFtStats(dataset,justStats = F)
    abd_temp <- ft_data$proc$unranked
    if(dataset$features=='taxa') abd_temp <- agglomTaxa(ft_data, abd_temp,
                                                        from_rank='asv',
                                                        to_rank=filter_rank)

    nfts <- nrow(ft_data$proc$filtering$ftstats)

    filterlist <- list()
    ftstats <- ft_data$proc$filtering$ftstats

    ### Identify Low Prevalence ###
    #-----------------------------#
    if(!is.null(filtering$prevalence_proportion)) {
      low_prevalence <- ftstats$Feature[ftstats$Prevalence_Proportion<filtering$prevalence_proportion]

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_prevalence),' features present in < ',
                             filtering$prevalence_proportion*100,'% of samples'))

      filterlist$low_prevalence <- low_prevalence

    } else if(!is.null(filtering$top_prevalence)) {
      low_prevalence <- (ftstats %>% dplyr::slice_min(Prevalence,
                                                      n=(nfts-filtering$top_prevalence)))[['Feature']]

      if(!silent) cat(paste0('\n  Identified top ',
                             filtering$top_prevalence,' features by prevalence'))

      filterlist$low_prevalence <- low_prevalence

    } else if(!is.null(filtering$min_prevalence)) {
      low_prevalence <- ftstats$Feature[ftstats$Prevalence<filtering$min_prevalence]

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_prevalence),' features with < ',
                             filtering$min_prevalence,' prevalence'))

      filterlist$low_prevalence <- low_prevalence
    }
    ### Identify Low Relative Abundance ###
    #-------------------------------------#
    if(!is.null(filtering$top_relabun)) {
      low_relabun <- (ftstats %>% dplyr::slice_min(Mean_Relative_Abundance,
                                                   n=(nfts-filtering$top_relabun)))[['Feature']]

      if(!silent) cat(paste0('\n  Identified top ',
                             filtering$top_relabun,' features by relative abundance'))

      filterlist$low_relabun <- low_relabun

    } else if(!is.null(filtering$min_relabun)) {
      low_relabun <- ftstats[ftstats$Mean_Relative_Abundance<filtering$min_relabun,]$Feature

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_relabun),' features with < ',
                             filtering$min_relabun,' relative abundance'))

      filterlist$low_relabun <- low_relabun
    }
    ### Identify Low Total Abundance ###
    #----------------------------------#
    if(!is.null(filtering$top_totabun)) {
      low_totabun <- (ftstats %>% dplyr::slice_min(Total_Abundance,
                                                   n=(nfts-filtering$top_totabun)))[['Feature']]

      if(!silent) cat(paste0('\n  Identified top ',
                             filtering$top_totabun,' features by total abundance'))

      filterlist$low_totabun <- low_totabun

    } else if(!is.null(filtering$min_totabun)) {
      low_totabun <- ftstats[ftstats$Total_Abundance<filtering$min_totabun,]$Feature

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_totabun),' features with < ',
                             filtering$min_totabun,' total abundance'))

      filterlist$low_totabun <- low_totabun
    }
    ### Identify Low Variance ###
    #---------------------------#
    if(!is.null(filtering$top_var)) {
      low_var <- (ftstats %>% dplyr::slice_min(StDev, n=(nfts-filtering$top_var)))[['Feature']]

      if(!silent) cat(paste0('\n  Identified the top ',
                             filtering$top_var,' features by standard deviation'))

      filterlist$low_var <- low_var

    } else if(!is.null(filtering$low_var_percentile)) {
      low_sd <- dplyr::slice_min(data.frame(sd=unique(ftstats$StDev)),
                                 sd, n=floor(filtering$low_var_percentile*nrow(ftstats)/100))$sd

      low_var <- ftstats[ftstats$StDev %in% low_sd,]$Feature

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_var),' features with variance in the bottom ',
                             filtering$low_var_percentile,' percentile by standard deviation'))

      filterlist$low_variance <- low_var
    }
    ### Identify Low Abundance ###
    #----------------------------#
    if(!is.null(filtering$low_abun$min_abun) & !is.null(filtering$low_abun$min_prop)) {
      ftstats$LowAbunProp=apply(abd_temp, 2,
                                function(x) sum(x<filtering$low_abun$min_abun)/nrow(abd_temp))

      low_abun <- ftstats[ftstats$LowAbunProp>(1-filtering$low_abun$min_prop/100),]$Feature

      if(get('keepSigFisher',envir = mvEnv)) {
        low_abun <- low_abun[!(low_abun %in% findSigFisher(dataset,low_abun,silent=silent))]
      }

      if(!silent) cat(paste0('\n  Identified ',
                             length(low_abun),' features with < ',
                             filtering$low_abun$min_abun,' abundance in >',
                             filtering$low_abun$min_prop,'% of samples'))

      filterlist$low_abun <- low_abun
    }

    taxaranks <- c('species','genus','family','order','class','phylum','single_rank','pathways')
    lowest_rank <- getLowestRank(dataset)

    ### Identify Taxa with NAs ###
    #----------------------------#
    # If keepNA exists and is false, identify NAs at the active rank
    taxa_names_tab <- ft_data$taxa_names
    active_rank <- ft_data$proc$active_rank
    if(!is.null(filtering$NAfilter$ranks)) {
      if(!is.null(taxa_names_tab)) if(ncol(taxa_names_tab)>1) for(rank in filtering$NAfilter$ranks) {
          filterlist$NAs <- unique(taxa_names_tab[grep(paste0('_',rank),
                                                       taxa_names_tab[[rank]]),][[lowest_rank]])
      }

      if(!silent) cat(paste0('\n  Identified ',
                             length(filterlist$NAs),' features without assigned ',
                             paste0(filtering$NAfilter$ranks,collapse = ', ')))
    } else filterlist$NAs <- NULL

    # Reload the normalized ft_data and abundance table
    ft_data <- dataset$data
    abd_temp <- ft_data$proc$unranked

    if(length(unlist(filterlist))) {
      ### Filter all Identified Features ###
      #------------------------------------#
      # First, temporarily separate the NA list from the nested filterlist as
      #   NAs will be dealt with separately
      unklist <- filterlist$NAs
      filterlist$NAs <- NULL

      # Replace the taxa names with all ASV numbers corresponding to them
      if(!is.null(taxa_names_tab)) {
        filterlist.ids <- TaxatoASV(ft_data,
                                    unique(unlist(filterlist)),
                                    filter_rank)
        unk.ids <- TaxatoASV(ft_data,
                             unique(unklist),
                             filter_rank)
      } else {
        filterlist.ids <- unique(unlist(filterlist))
        unk.ids <- unique(unklist)
      }

      # Put the NA list back in filterlist since we won't be using filterlist
      #   for the rest of this
      filterlist$NAs <- unklist

      # Remove any ASVs that are not in the current unranked abundance table either
      #   because they were excluded prior to processing or during normalization.
      #   They are unnecessary to have in filterlist.ids or unk.ids and they will
      #   cause issues
      filterlist.ids <- filterlist.ids[filterlist.ids %in% colnames(abd_temp)]
      unk.ids <- unk.ids[unk.ids %in% colnames(abd_temp)]

      # Make a new abundance table of just the features that made it through
      #   the filter
      abd_filtered <- abd_temp[!(colnames(abd_temp) %in% c(filterlist.ids,unk.ids))]

      # Now, pool all the filter list features into 'Other' and add it to the table
      abd_filtered$Other <- rowSums(abd_temp[filterlist.ids])
      abd_filtered$Unknown <- rowSums(abd_temp[unk.ids])
    } else abd_filtered <- abd_temp

    ft_data$proc$unranked <- abd_filtered

    # Record the filter list in the dataset's history
    ft_data$proc$filtering$filterlist <- filterlist

    num_removed <- ncol(abd_temp) - ncol(abd_filtered) + any(colnames(abd_filtered) %in% c('Other','Unknown'))

    if(!silent) cat('\n\n>>> Removed',num_removed,'features based on filtering parameters <<<\n')
  } else if(length(dataset$data$proc$selected)) {
    ft_data <- dataset$data
    abd_temp <- ft_data$proc$unranked
    selected <- ft_data$proc$selected

    if(dataset$features=='taxa') {
      if(is.null(names(selected))) selected.ids <- TaxatoASV(dataset$data, selected, 'single_rank')
      else selected.ids <- unique(unlist(lapply(getRanks(dataset),
                                                function(rank) {
                                                  if(length(selected[[rank]])) {
                                                    TaxatoASV(dataset$data,
                                                              selected[[rank]],
                                                              rank)
                                                  }
                                                })))
    } else selected.ids <- unique(unlist(selected))

    abd_selected <- abd_temp[selected.ids]
    # Now, pool all the filter list features into 'Other' and add it to the table
    abd_selected$Other <- rowSums(abd_temp[!(colnames(abd_temp) %in% selected.ids)])

    ft_data$proc$unranked <- abd_selected

    num_selected <- length(selected.ids)

    if(!silent) cat('\n\n>>> Selected',num_selected,'features in total <<<\n')
  } else {
    ft_data <- dataset$data
    if(!silent) cat('\n~~~ No feature filtering performed ~~~\n')
  }

  ft_data <- makeRankTabs(ft_data)
  # No need for the unranked table anymore, as all downstream data manipulation
  #   (i.e. normalization methods) will be performed separately on each ranked
  #   abundance table
  ft_data$proc$unranked <- NULL
  dataset$data <- ft_data

  return(dataset)
}

#' Clear Feature Filtering
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param temp If set to TRUE, it tells processDataset() to NOT update the active
#'     dataset.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with all feature filtering parameters
#'     cleared. This function then calls processDataset() which will call the
#'     3 "run___" functions and therefore add all the features back to the dataset
#'     via runFeatureFilter()
#'
#' @export
#'
clearFeatureFilt <- function(dataset=NULL, temp=F, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  dataset$data$proc$filtering <- NULL

  dataset <- processDataset(dataset, temp=temp, silent=silent)

  return(dataset)
}

#' Filter Low Prevalence Features
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param prev_prop (Optional) Minimum percent of samples that a feature must be
#'     be present in for it to be kept. Overrides top and min_prevalence values
#' @param top (Optional) Top n number of features to be kept based on prevalence.
#'     Specifying a value for this will override the min_prevalence value.
#' @param min_prevalence (Optional) (Default) Prevalence threshold. Features with
#'     prevalence lower than this threshold will be filtered out.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with updated low prevalence filtering
#'     parameters.
#'
#' @export
#'
filterLowPrev <- function(dataset=NULL,
                          top=NULL, min_prevalence=NULL, prev_prop=NULL,
                          silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  filtering.defaults <- get('filtering.defaults',envir = mvDefaults)

  dataset$data$proc$rarefied <- NULL

  nfts <- ncol(dataset$data$orig)
  ft_data <- dataset$data

  if(is.null(top) & is.null(min_prevalence) & is.null(prev_prop)){
    if(!is.null(ft_data$proc$filtering$top_prevalence)) {
      top <- ft_data$proc$filtering$top_prevalence
    } else if(!is.null(ft_data$proc$filtering$min_prevalence)) {
      min_prevalence <- ft_data$proc$filtering$min_prevalence
    } else if(!is.null(ft_data$proc$filtering$prevalence_proportion)) {
      prev_prop <- ft_data$proc$filtering$prevalence_proportion
    } else {
      min_prevalence <- filtering.defaults$min_prevalence
    }
  } else {
    if(!is.null(prev_prop)) {
      if(prev_prop > 0) {
        if(prev_prop > 1) prev_prop <- prev_prop/(10^(floor(log(prev_prop, base=10))+1))
        min_prevalence <- NULL
        top <- NULL
      } else prev_prop <- NULL
    }
    if(!is.null(top)) {
      if(top > 0 & top < nfts) min_prevalence <- NULL
      else top <- NULL
    }
    if(!is.null(min_prevalence)) {
      if(min_prevalence <= 0) min_prevalence <- NULL
    }
  }

  # Load these parameters to the dataset
  ft_data$proc$filtering$prevalence_proportion <- prev_prop
  ft_data$proc$filtering$top_prevalence <- top
  ft_data$proc$filtering$min_prevalence <- min_prevalence

  dataset$data <- ft_data

  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, silent=silent)

  return(dataset)
}

#' Filter Low Relative Abundance Features
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param top (Optional) Top n number of features to be kept based relative
#'     abundance. Specifying a value for this will override the min_relabun
#'     value.
#' @param min_relabun (Optional) (Default) Relative abundance threshold. Features
#'     with prevalence lower than this threshold will be filtered out.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with updated low relative abundance
#'     filtering parameters.
#'
#' @export
#'
filterLowRelAbun <- function(dataset=NULL, top=NULL, min_relabun=NULL, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  filtering.defaults <- get('filtering.defaults',envir = mvDefaults)

  dataset$data$proc$rarefied <- NULL

  nfts <- ncol(dataset$data$orig)
  ft_data <- dataset$data

  if(is.null(top) & is.null(min_relabun)){
    if(!is.null(ft_data$proc$filtering$top_relabun)) {
      top <- ft_data$proc$filtering$top_relabun
    } else if(!is.null(ft_data$proc$filtering$min_relabun)) {
      min_relabun <- ft_data$proc$filtering$min_relabun
    } else {
      min_relabun <- filtering.defaults$min_relabun
    }
  } else {
    if(!is.null(top)) {
      if(top > 0 & top < nfts) min_relabun <- NULL
      else top <- NULL
    }
    if(!is.null(min_relabun)) {
      if(min_relabun <= 0) min_relabun <- NULL
    }
  }

  # Whatever threshold is being set for this run will be stored in the dataset's filtering "history"
  ft_data$proc$filtering$top_relabun <- top
  ft_data$proc$filtering$min_relabun <- min_relabun

  dataset$data <- ft_data

  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, silent=silent)

  return(dataset)
}

#' Filter Low Total Abundance Features
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param top (Optional) Top n number of features to be kept based on total abundance.
#'     Specifying a value for this will override the min_totabun value.
#' @param min_totabun (Optional) (Default) Total abundance threshold. Features
#'     with total abundance lower than this threshold will be filtered out.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with updated low total abundance
#'     filtering parameters.
#'
#' @export
#'
filterLowTotAbun <- function(dataset=NULL, top=NULL, min_totabun=NULL, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  filtering.defaults <- get('filtering.defaults',envir = mvDefaults)

  dataset$data$proc$rarefied <- NULL

  nfts <- ncol(dataset$data$orig)
  ft_data <- dataset$data

  if(is.null(top) & is.null(min_totabun)){
    if(!is.null(ft_data$proc$filtering$top_totabun)) {
      top <- ft_data$proc$filtering$top_totabun
    } else if(!is.null(ft_data$proc$filtering$min_totabun)) {
      min_totabun <- ft_data$proc$filtering$min_totabun
    } else {
      min_totabun <- filtering.defaults$min_totabun
    }
  } else {
    if(!is.null(top)) {
      if(top > 0 & top < nfts) min_totabun <- NULL
      else top <- NULL
    }
    if(!is.null(min_totabun)) {
      if(min_totabun <= 0) min_totabun <- NULL
    }
  }

  # Whatever threshold is being set for this run will be stored in the dataset's filtering "history"
  ft_data$proc$filtering$top_totabun <- top
  ft_data$proc$filtering$min_totabun <- min_totabun

  dataset$data <- ft_data

  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, silent=silent)

  return(dataset)
}

#' Filter Low Variance Features
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param top (Optional) Top n number of features to be kept based on relative
#'     abundance. Specifying a value for this will override the min_relabun value.
#' @param var_type Method for assessing variance of samples. Current choices are
#'     standard deviation ('sd') and interquartile range ('iqr')
#' @param low_var_percentile (Optional) (Default) Variance threshold. Features
#'     with variance (standard deviation) lower than this threshold will be
#'     filtered out.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with updated low variance filtering
#'     parameters.
#'
#' @export
#'
filterLowVar <- function(dataset=NULL, top=NULL, var_type='sd', low_var_percentile=NULL, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  filtering.defaults <- get('filtering.defaults',envir = mvDefaults)

  dataset$data$proc$rarefied <- NULL

  nfts <- ncol(dataset$data$orig)
  ft_data <- dataset$data

  if(is.null(top) & is.null(low_var_percentile)){
    if(!is.null(ft_data$proc$filtering$top_var)) {
      top <- ft_data$proc$filtering$top_var
    } else if(!is.null(ft_data$proc$filtering$low_var_percentile)) {
      low_var_percentile <- ft_data$proc$filtering$low_var_percentile
    } else {
      low_var_percentile <- filtering.defaults$low_var_percentile
    }
  } else {
    if(!is.null(top)) {
      if(top > 0 & top < nfts) low_var_percentile <- NULL
      else top <- NULL
    }
    if(!is.null(low_var_percentile)) {
      if(low_var_percentile <= 0) low_var_percentile <- NULL
    }
  }

  # Whatever threshold is being set for this run will be stored in the
  #   dataset's filtering "history"
  ft_data$proc$filtering$top_var <- top
  ft_data$proc$filtering$low_var_percentile <- low_var_percentile

  dataset$data <- ft_data

  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, silent=silent)

  return(dataset)
}

#' Filter Low Abundance Features
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param min_abun (Optional) (Default = 1) Abundance threshold. Only features
#'     with at least this much abundance in min_proportion percent of samples
#'     will be KEPT.
#' @param min_proportion (Optional) (Default = 20) Low abundance proportion
#'     threshold. Only features with at least min_abun abundance in this percentage
#'     of samples will be KEPT.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with updated low abundance filtering
#'     parameters.
#'
#' @export
#'
filterLowAbun <- function(dataset=NULL, min_abun=NULL, min_proportion=NULL, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  filtering.defaults <- get('filtering.defaults',envir = mvDefaults)

  dataset$data$proc$rarefied <- NULL

  ft_data <- dataset$data

  # Is a new threshold being specified?
  if(is.null(min_abun)) {
    # If not, has this filtering been done previously?
    if(is.null(ft_data$proc$filtering$low_abun$min_abun)) {
      # If not, set the threshold to the default value
      min_abun <- filtering.defaults$low_abun$min_abun
    } else {
      # If this filtering has been done previously, set the threshold to the previously used one
      #   since no threshold was specified during this call of the function
      min_abun <- ft_data$proc$filtering$low_abun$min_abun
    }
  } else if(min_abun==0) {
    # If the threshold is being set to 0, set it to NULL so it disappears
    #   in the dataset's filtering history
    min_abun <- NULL
    min_proportion <- 0
  }

  # Is a new threshold being specified?
  if(is.null(min_proportion)) {
    # If not, has this filtering been done previously?
    if(is.null(ft_data$proc$filtering$low_abun_prop)) {
      # If not, set the threshold to the default value
      min_proportion <- filtering.defaults$low_abun$min_prop
    } else {
      # If this filtering has been done previously, set the threshold to the previously used one
      #   since no threshold was specified during this call of the function
      min_proportion <- ft_data$proc$filtering$low_abun$min_prop
    }
  } else if(min_proportion==0) {
    # If the threshold is being set to 0, set it to NULL so it disappears
    #   in the dataset's filtering history
    min_proportion <- NULL
    min_abun <- NULL
  }

  # Whatever threshold is being set for this run will be stored in the dataset's filtering "history"
  if(is.null(c(min_abun,min_proportion))) ft_data$proc$filtering$low_abun <- NULL
  else ft_data$proc$filtering$low_abun <- list(min_abun=min_abun,
                                               min_prop=min_proportion)

  dataset$data <- ft_data

  # Now process the dataset
  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, silent=silent)

  return(dataset)
}

#' @title Filter Unassigned Taxa
#'
#' @description Filter out taxa that are unassigned at certain ranks
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param keepNAs Logical telling whether or not to keep taxa that aren't assigned
#'     at the ranks specified
#' @param ranks Any taxa that have an unknown assignment at this/these rank(s)
#'     will be filtered out.
#' @param temp If set to TRUE, it tells processDataset() to NOT update the active
#'     dataset.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset (mvdata object) with an updated list of taxa to be
#'     removed because they were unassigned at specified ranks
#' @export
#'
filterNAs <- function(dataset=NULL, keepNAs=F, ranks=NULL, temp=F, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(dataset$data$proc$selected)) {
    message('\nRemoving selected feature list')
    dataset$data$proc$selected <- NULL
  }

  if(dataset$features!='taxa') return(dataset)

  if(keepNAs) dataset$data$proc$filtering$NAfilter <- NULL
  else {
    # If not keeping NAs, then by default remove any taxa in just the highest
    #   rank of the dataset
    dataset$data$proc$filtering$NAfilter$ranks <- colnames(dataset$data$taxa_names)[1]

    ranks <- tolower(ranks)[tolower(ranks) %in% getRanks(dataset)]

    if(!length(ranks)) ranks <- dataset$data$proc$filtering$NAfilter$ranks
    else dataset$data$proc$filtering$NAfilter$ranks <- ranks
  }

  if(!get('.loading',envir = mvEnv)) dataset <- processDataset(dataset, temp=temp, silent=silent)

  return(dataset)
}

#' Select Specific Features
#'
#' @param dataset MicroVis dataset. Defaults to the active dataset
#' @param features Vector of features (at any rank) to select
#' @param ranks (Optional) Specify a rank from which to select features. Searches
#'     through all ranks by default
#' @param temp This parameter has no use in this function and can be removed
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return Dataset with list of selected features that is passed to runFeatureFilter
#' @export
#'
selectFeatures <- function(dataset=NULL, features, ranks=NULL, temp=F, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  if(length(names(dataset$data$proc$filtering)[!(names(dataset$data$proc$filtering)
                                                 %in% c('filterlist','ftstats'))])) {
    if(temp) removefilts <- T
    else removefilts <- ifelse(select.list(c('Yes','No'), title='\nSelecting specific features will undo any filtering. Continue?')=='Yes', yes = T, no = T)

    if(removefilts) dataset$data$proc$filtering <- NULL
    else stop('Cannot select specific features from a dataset that has been filtered')
  }

  if(dataset$features=='taxa') alltaxa <- unique(unname(unlist(dataset$data$taxa_names)))
  else alltaxa <- colnames(dataset$data$orig)

  features <- unique(alltaxa[tolower(alltaxa) %in% tolower(features)])
  if(!length(features)) stop('None of the features were found in "',dataset$name,'"')

  if(dataset$features=='taxa') {
    ft_names <- dataset$data$taxa_names
    if(ncol(ft_names)>1) selected <- sapply(getRanks(dataset),
                                            function(rank) features[features %in% ft_names[[rank]]])
    else selected <- list('single_rank'=features)
  } else selected <- list('functional'=features)

  ranks <- ranks[ranks %in% getRanks(dataset)]
  if(!is.null(ranks)) selected[!(names(selected) %in% ranks)] <- NULL

  dataset$data$proc$selected <- selected

  dataset <- processDataset(dataset, temp=temp, silent=silent)

  return(dataset)
}

#' Undo Feature Selection
#'
#' @param dataset MicroVis dataset. Defaults to the active dataset
#' @param temp This parameter has no use in this function and can be removed
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return MicroVis dataset with the selected feature list cleared
#' @export
#'
unselectFeatures <- function(dataset=NULL, temp=F, silent=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  dataset$data$proc$selected <- NULL

  dataset <- processDataset(dataset, temp=temp, silent=silent)

  return(dataset)
}

#' @title Run Fisher Test on Features
#'
#' @description Run Fisher test on features that are labeled for filtering. This
#'     test is run on presence-absence data and not abundance data.
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param fts List of features to test.
#' @param lowabun_thresh Abundance value above which a feature is considered to
#'     be present in a given sample.
#' @param silent Argument that is ultimately passed onto runSampleFilter(),
#'     runNormalization(), and runFeatureFilter(), telling them not to output
#'     any messages.
#'
#' @return List of features that had a statistically significant Fisher test
#'     when performed across the subsetted groups of the active factor.
#'
#' @export
#'
findSigFisher <- function(dataset, fts, lowabun_thresh=0, silent=F) {
  if(is.null(dataset$data$proc$unranked)) dataset <- runSampleFilter(dataset,temp = T,silent = T)

  abun <- dataset$data$proc$unranked

  if(dataset$features=='taxa') {
    abun <- agglomTaxa(dataset$data, abun, from_rank = 'asv',
                       to_rank=dataset$data$proc$filter_rank)
  }
  active_factor <- dataset$active_factor

  if(!silent) cat(paste0('\n Running Fisher test on low abundance features...'))

  sig_fisher <- c()
  for(ft in fts) {
    abd <- data.frame(sample=rownames(abun),
                      present=as.numeric(abun[[ft]]>lowabun_thresh),
                      absent=as.numeric(abun[[ft]]<=lowabun_thresh))
    df <- merge(dataset$metadata, abd)

    form <- formula(paste('. ~',active_factor))
    xtab <- aggregate(form, df[c(active_factor,'present','absent')], function(x) sum(x))

    if(rstatix::fisher_test(xtab[2:3])$p<0.05) sig_fisher <- c(sig_fisher,ft)
  }
  return(sig_fisher)
}

#' Get Feature Statistics for Filtering
#'
#' @param dataset MicroVis dataset (mvdata object)
#' @param rank Rank at which to get feature summary stats
#' @param raw Use raw counts to calculate feature statistics. Defaults to TRUE
#' @param justStats Whether to just output the a table of feature statistics
#'     instead of all data processing information
#' @param doCLR Calculate average CLR-transformed abundances for each feature
#'
#' @return List containing original abundance table, feature names table, and
#'     a dataframe of feature statistics used for filtering
#'
#' @export
#'
getFtStats <- function(dataset=NULL, rank=NULL, raw=T, justStats=T, doCLR=F) {
  if(is.null(dataset)) dataset <- get('active_dataset',envir = mvEnv)

  # If a pre-filtered, unranked abundance table is not available, run sample
  #   filtering or normalization to get one
  if(is.null(dataset$data$proc$unranked)) {
    if(raw) dataset <- runSampleFilter(dataset,temp = T,silent = T)
    else dataset <- runNormalization(dataset,temp = T,silent = T)
  }

  if(is.null(rank)) rank <- dataset$data$proc$filter_rank

  ft_data <- dataset$data
  abd <- ft_data$proc$unranked

  if(!is.null(ft_data$taxa_names)) colnames(abd) <- ASVtoTaxa(ft_data, colnames(abd), rank)

  # Combine any features with the same name at this rank (used for agglomerating taxa at higher ranks)
  abd <- t(rowsum(t(abd),group = colnames(abd)))

  # Compute prevalence, total abundance, proportional prevalence, and relative abundance of each feature
  ftstats <- data.frame(Prevalence=apply(abd, 2, function(x) sum(x>0,na.rm = T)),
                        Total_Abundance=apply(abd, 2, function(x) sum(x,na.rm = T)),
                        Mean_Abundance=apply(abd, 2, function(x) mean(x,na.rm = T)),
                        Relative_Abundance=apply(apply(abd,1,function(x) x/sum(x,na.rm = T)),
                                                 1,function(x) max(x,na.rm = T)),
                        Mean_Relative_Abundance=apply(apply(abd,1,function(x) x/sum(x,na.rm = T)),
                                                 1,function(x) mean(x,na.rm = T)),
                        StDev=apply(abd, 2, function(x) sd(x,na.rm = T)),
                        IQR=apply(abd, 2, function(x) IQR(x,na.rm = T)))

  # What percentage of all samples is each feature present in
  ftstats$Prevalence_Proportion <- ftstats$Prevalence/nrow(abd)
  ftstats$Feature <- colnames(abd)

  if(doCLR) {
    # Mean CLR-transformed abundance for each feature
    abd.nozeros <- zeroReplace(abd)

    ftstats$Mean_CLR=apply(apply(abd.nozeros, 1, function(x) log(x/exp(mean(log(x))), base=10)),
                           1, function(x) mean(x,na.rm = T))
  }

  ft_data$proc$filtering$ftstats <- ftstats

  if(justStats) return(ftstats)
  else return(ft_data)
}
microresearcher/MicroVis documentation built on Feb. 8, 2024, 10:59 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
microresearcher/MicroVis
Microbiome Analysis and Visualization

R/featureFiltering.R
In microresearcher/MicroVis: Microbiome Analysis and Visualization

Defines functions getFtStats findSigFisher unselectFeatures selectFeatures filterNAs filterLowAbun filterLowVar filterLowTotAbun filterLowRelAbun filterLowPrev clearFeatureFilt runFeatureFilter

Documented in clearFeatureFilt filterLowAbun filterLowPrev filterLowRelAbun filterLowTotAbun filterLowVar filterNAs findSigFisher getFtStats runFeatureFilter selectFeatures unselectFeatures

R Package Documentation

Browse R Packages

We want your feedback!

microresearcher/MicroVis Microbiome Analysis and Visualization

R/featureFiltering.R In microresearcher/MicroVis: Microbiome Analysis and Visualization

Defines functions getFtStats findSigFisher unselectFeatures selectFeatures filterNAs filterLowAbun filterLowVar filterLowTotAbun filterLowRelAbun filterLowPrev clearFeatureFilt runFeatureFilter

Documented in clearFeatureFilt filterLowAbun filterLowPrev filterLowRelAbun filterLowTotAbun filterLowVar filterNAs findSigFisher getFtStats runFeatureFilter selectFeatures unselectFeatures

R Package Documentation

Browse R Packages

We want your feedback!

microresearcher/MicroVis
Microbiome Analysis and Visualization

R/featureFiltering.R
In microresearcher/MicroVis: Microbiome Analysis and Visualization