R/experiment.R

Defines functions experimentAssignmentSubsets .convertChannelsToDesc .getSubsetChannelsFull .getSubsetChannels experimentLabelingStrategy experimentMds experimentDifferentialExpressionAnalysis differentialAbundanceAnalysis experimentCellSubsetMap experimentCellSubsetChannelStatistics experimentCellSubsetCounts .chooseLevel experimentSummary getExperimentSampleFeatures loadExperiment

Documented in differentialAbundanceAnalysis experimentAssignmentSubsets experimentCellSubsetChannelStatistics experimentCellSubsetCounts experimentCellSubsetMap experimentDifferentialExpressionAnalysis experimentLabelingStrategy experimentMds experimentSummary getExperimentSampleFeatures loadExperiment

# experiment.R
# Functions for interacting with Astrolabe experiments.

#' Load Astrolabe experiment.
#'
#' Load an Astrolabe experiment which can then be used for other orloj
#' functions. As part of loading the experiment, correct the file paths that
#' were generated by the Astrolabe platform to local file paths.
#'
#' @param experiment_path Path to the Astrolabe experiment.
#' @return Astrolabe experiment.
#' @export
loadExperiment <- function(experiment_path) {
  experiment <- readRDS(file.path(experiment_path, "config.RDS"))
  
  experiment$analysis_path <- file.path(experiment_path, "analysis")
  experiment$reports_path <- file.path(experiment_path, "reports")
  experiment$samples_path <- file.path(experiment_path, "samples")
  
  experiment
}

#' Get experiment sample features.
#'
#' Get the sample features for a given Astrolabe experiment. Feature names are
#' converted from internal Astrolabe IDs to user-readable names.
#'
#' @param experiment An Astrolabe experiment.
#' @return Experiment sample features.
#' @export
getExperimentSampleFeatures <- function(experiment) {
  sample_features <- experiment$sample_features
  features <- experiment$features
  
  # Experiment has no features, return just column with sample IDs.
  if (ncol(features) == 0) return(experiment$samples[, "SampleId"])
  
  features$FeatureId <- paste0("feature_", features$FeatureId)
  m <- match(colnames(sample_features), features$FeatureId)
  col_indices <- !is.na(m)
  colnames(sample_features)[col_indices] <- features$FeatureName[m[col_indices]]
  
  sample_features
}

#' Astrolabe experiment summary.
#'
#' @param experiment An Astrolabe experiment.
#' @export
experimentSummary <- function(experiment) {
  cat(paste0(
    "Astrolabe experiment with ",
    nrow(experiment$samples), " samples and ",
    nrow(experiment$channels), " channels\n\n"
  ))
  cat(paste0(
    "Classification channels:\n",
    paste(experiment$class_channels, collapse = ", "),
    "\n"
  ))
  if (nrow(experiment$features) > 0) {
    cat("\n")
    cat("Sample features:\n")
    for (row_idx in seq(nrow(experiment$features))) {
      row <- experiment$features[row_idx, ]
      feature_id <- paste0("feature_", row$FeatureId)
      feature_values <- unique(experiment$sample_features[[feature_id]])
      cat(paste0(
        row$FeatureName, ": ", paste(feature_values, collapse = ", "), "\n"))
    }
  }
}

.chooseLevel <- function(experiment) {
  # Choose default level for experiment.
  if (is.null(experiment$organism)) {
    return("Assignment")
  } else if (experiment$organism == "profiling_only") {
    return("Profiling")
  } else {
    return("Assignment")
  }
}

#' Experiment cell subset counts.
#'
#' Cell subset counts and frequencies for all of the samples in an experiment.
#'
#' @param experiment An Astrolabe experiment.
#' @param level Cell subset level, such as "Assignment" or "Profiling".
#' @return Cell subset counts for that level.
#' @export
experimentCellSubsetCounts <- function(experiment,
                                       level = .chooseLevel(experiment)) {
  aggregate_statistics_filename <-
    file.path(experiment$analysis_path, "combine_aggregate_statistics.RDS")
  if (!file.exists(aggregate_statistics_filename)) {
    stop(paste0(aggregate_statistics_filename, "not found"))
  }
  aggregate_statistics <- readRDS(aggregate_statistics_filename)
  
  counts <- aggregate_statistics$subset_cell_counts
  if (!(level %in% counts$Parent)) stop("level not found in cell subset counts")
  
  counts <- counts %>%
    dplyr::filter(Parent == level) %>%
    dplyr::select(-Parent)
  counts <- dplyr::left_join(experiment$samples, counts, by = "SampleId")
  
  # Calculate frequencies.
  counts <- counts %>%
    dplyr::group_by(SampleId) %>%
    dplyr::mutate(Freq = N / sum(N)) %>%
    dplyr::ungroup()
  
  counts
}

#' Experiment cell subset channel statistics.
#'
#' Cell subset channel statistics for all of the samples in an experiment.
#'
#' @param experiment An Astrolabe experiment.
#' @param level Cell subset level, such as "Assignment" or "Profiling".
#' @return Cell subset channel statistics for that level.
#' @export
experimentCellSubsetChannelStatistics <- function(experiment,
                                                  level = .chooseLevel(experiment)) {
  aggregate_statistics_filename <-
    file.path(experiment$analysis_path, "combine_aggregate_statistics.RDS")
  if (!file.exists(aggregate_statistics_filename)) {
    stop(paste0(aggregate_statistics_filename, "not found"))
  }
  aggregate_statistics <- readRDS(aggregate_statistics_filename)
  
  stats <- aggregate_statistics$subset_channel_statistics
  if (!(level %in% stats$Parent)) {
    stop("level not found in cell subset channel statistics")
  }
  
  stats <- stats %>%
    dplyr::filter(Parent == level) %>%
    dplyr::select(-Parent)
  stats <- dplyr::left_join(experiment$samples, stats, by = "SampleId")
  
  stats
}

#' Map experiment cell subsets to numerical values.
#'
#' Create a map from experiment cell subsets in a given level to unique
#' numerical values. This map is used when generating Assignment and Profiling
#' columns in exported FCS files.
#'
#' @param experiment An Astrolabe experiment.
#' @param level Cell subset level.
#' @return Map from cell subsets to numerical values.
#' @export
experimentCellSubsetMap <- function(experiment,
                                    level = .chooseLevel(experiment)) {
  # Get cell subsets from aggregate statistics.
  cell_subsets_file_path <-
    file.path(experiment$analysis_path, "experiment_cell_subsets.RDS")
  cell_subsets <- readRDS(cell_subsets_file_path)
  cell_subsets <- unique(cell_subsets[[level]])
  cell_subsets <- c(cell_subsets, "AstrolabeBead", "Debris", "Dead")
  
  data.frame(
    Value = seq(length(cell_subsets)),
    CellSubset = cell_subsets,
    stringsAsFactors = FALSE
  )
}

#' Differential abundance analysis.
#'
#' Load the experiment differential abundance analysis, for a given cell subset
#' level.
#'
#' @param experiment An Astrolabe experiment.
#' @param level Cell subset level, such as "Assignment" or "Profiling".
#' @param convert_ids Whether to convert Astrolabe IDs to feature names.
#' @return Differential abundance analysis list.
#' @export
differentialAbundanceAnalysis <- function(experiment,
                                          level = .chooseLevel(experiment),
                                          convert_ids = TRUE) {
  # Value for NA feature value. Samples with this value should not be included
  # in the analysis.
  feature_na <- "__NA__"
  
  daa_filename <-
    file.path(experiment$analysis_path, "differential_abundance_analysis.RDS")
  if (!file.exists(daa_filename)) {
    stop(paste0(daa_filename, " not found"))
  }
  differential_abundance_analysis <- readRDS(daa_filename)
  
  daa <- differential_abundance_analysis$differential_abundance_analysis
  
  if (!(level %in% names(daa))) return(NULL)
  daa <- daa[[level]]
  
  # Find feature values.
  feature_values <- lapply(nameVector(names(daa)), function(feature_name) {
    if(!is.factor(experiment$sample_features[[feature_name]])) {
      # Reverse compatibility: Past versions of Astrolabe used character for
      # sample feature values.
      values <- experiment$sample_features[[feature_name]]
    } else {
      values <- levels(experiment$sample_features[[feature_name]])
    }
    values <- setdiff(values, feature_na)
  })

  # Convert feature IDs to names.
  if (convert_ids) {
    m <-
      match(as.numeric(gsub("^feature_", "", names(daa))),
            experiment$features$FeatureId)
    names(daa) <- experiment$features$FeatureName[m]
    names(feature_values) <- experiment$features$FeatureName[m]
  }
  
  # Convert tables to tibbles and only keep p-value, FDR, and logFC.
  lapply(nameVector(names(daa)), function(feature_name) {
    if (is.null(daa[[feature_name]])) {
      NULL
    } else {
      tab <- as.data.frame(daa[[feature_name]]) %>%
        tibble::rownames_to_column("CellSubset")
      
      # Calculate max(logFC) for features with multiple values.
      log_fc_cols <- grep("logFC", colnames(tab))
      if (length(log_fc_cols) > 1) {
        log_fc <- tab[, log_fc_cols]
        max_log_fc <- apply(log_fc, 1, function(v) v[which.max(abs(v))])
        tab$logFC <- max_log_fc
      }

      # Reverse compatibility: Median columns were labeled "median_" in past
      # versions of Astrolabe.
      feature_cols_old <- paste0("median_", feature_values[[feature_name]])
      if (all(feature_cols_old %in% colnames(tab))) {
        old_indices <- which(colnames(tab) %in% feature_cols_old)
        colnames(tab)[old_indices] <-
          gsub("median_", "", colnames(tab)[old_indices])
      }
      
      cols <-
        c("CellSubset",
          "logFC",
          "PValue",
          "FDR",
          feature_values[[feature_name]])
      
      tab[, cols]
    }
  })
}

#' Differential expression analysis.
#'
#' Load the experiment differential expression analysis across all levels and
#' all kits.
#'
#' @param experiment An Astrolabe experiment.
#' @param convert_ids Whether to convert Astrolabe IDs to user-supplied values.
#' @return Differential expression analysis list.
#' @export
experimentDifferentialExpressionAnalysis <- function(experiment,
                                                     convert_ids = TRUE) {
  # Import differential expression analysis data.
  dea <-
    readRDS(file.path(
      experiment$analysis_path,
      "differential_abundance_analysis.RDS"))$differential_expression_analysis

  if (is.null(dea)) return(dea)
  if (!convert_ids) return(dea)
  
  # Convert kit IDs to user-supplied values.
  lapply(dea, function(dea_level) {
    m <- match(names(dea_level), experiment$de_analysis_kits$Id)
    names(dea_level) <- experiment$de_analysis_kits$Name[m]
    dea_level
  })
}

#' MDS map.
#'
#' Return the MDS map for the experiment, for a given level.
#'
#' @param experiment An Astrolabe experiment.
#' @param level Cell subset level, such as "Assignment" or "Profiling".
#' @param convert_ids Whether to convert Astrolabe IDs to feature names.
#' @return MDS map.
#' @export
experimentMds <- function(experiment,
                          level = .chooseLevel(experiment),
                          convert_ids = TRUE) {
  mds_filename <-
    file.path(experiment$analysis_path, "calculate_mds.RDS")
  if (!file.exists(mds_filename)) stop(paste0(mds_filename, " not found"))
  mds <- readRDS(mds_filename)
  
  if (!(level %in% names(mds))) stop("cannot find level in MDS map")
  
  mds <- mds[[level]]
  
  # Calculate and add mean frequency over all samples.
  cell_subset_counts <- experimentCellSubsetCounts(experiment, level = level)
  mds_freqs <- cell_subset_counts %>%
    dplyr::group_by(CellSubset) %>%
    dplyr::summarize(Freq = mean(Freq))
  mds <- dplyr::left_join(mds, mds_freqs, by = "CellSubset")
  
  # Calculate and add mean marker intensities over all samples.
  marker_stats <-
    experimentCellSubsetChannelStatistics(experiment, level = level)
  marker_stats <- marker_stats %>%
    dplyr::group_by(CellSubset, ChannelName) %>%
    dplyr::summarize(Median = mean(Median, na.rm = TRUE)) %>%
    reshape2::dcast(CellSubset ~ ChannelName, value.var = "Median")
  mds <- dplyr::left_join(mds, marker_stats, by = "CellSubset")
  
  # Add max(fold change) and -log10(FDR) for each feature.
  daa <-
    differentialAbundanceAnalysis(experiment, level = level,
                                  convert_ids = FALSE)
  for (feature_id in experiment$features$FeatureId) {
    feature_id_long <- paste0("feature_", feature_id)
    feature_name <- feature_id_long
    if (convert_ids) {
      feature_name <-
        subset(experiment$features, FeatureId == feature_id)$FeatureName
    }

    if (is.null(daa[[feature_id_long]])) {
      # If we did not run DAA for a feature, update MDS map with empty values.
      tab <-
        data.frame(
          CellSubset = unique(cell_subset_counts$CellSubset),
          logFC = 0,
          FDR = 1
        )
    } else {
      # Ran DAA, take test results.
      tab <- daa[[feature_id_long]]
      tab <- tab[, c("CellSubset", "logFC", "FDR")]
    }

    colnames(tab) <-
      c("CellSubset",
        paste0(feature_name, "_logFC"),
        paste0(feature_name, "_FDR"))
    mds <- dplyr::left_join(mds, tab, by = "CellSubset")
  }
  
  mds
}

experimentLabelingStrategy <- function(experiment) {
  if (experiment$organism == "profiling_only") return(NULL)
  
  hierarchy <-
    readRDS(file.path(experiment$analysis_path, "hierarchy.RDS"))$hierarchy
  
  # Convert Assignment cell subsets to strings.
  assignment <- setdiff(hierarchy$CellSubset, hierarchy$Parent)
  descs <- lapply(orloj::nameVector(assignment), function(cell_subset) {
    channels <-
      .getSubsetChannelsFull(hierarchy, experiment$class_channels, cell_subset)
    data.frame(
      CellSubset = cell_subset,
      Desc = .convertChannelsToDesc(channels),
      stringsAsFactors = FALSE
    )
  }) %>% dplyr::bind_rows()
  
  # Order according to Compartment order.
  cell_subsets_file_path <-
    file.path(experiment$analysis_path, "experiment_cell_subsets.RDS")
  if (file.exists(cell_subsets_file_path)) {
    # Experiment has Compartment level.
    cell_subsets <- readRDS(cell_subsets_file_path)
    cell_subsets <- unique(cell_subsets[, c("Compartment", "Assignment")])
    cell_subsets <-
      cell_subsets %>%
      dplyr::filter(Assignment %in% descs$CellSubset) %>%
      dplyr::rename(Parent = Compartment, CellSubset = Assignment)
  } else {
    # Reverse compatibility: Experiment does not have Compartment level, use
    # Level_1 instead.
    cell_subsets <-
      lapply(orloj::nameVector(assignment), function(cell_subset) {
        curr_subset <- cell_subset
        while (curr_subset != "Root") {
          curr_subset <- gsub("_unassigned", "", curr_subset)
          parent <- curr_subset
          curr_subset <- hierarchy$Parent[hierarchy$CellSubset == curr_subset]
        }
        
        data.frame(
          Parent = parent,
          CellSubset = cell_subset,
          stringsAsFactors = FALSE
        )
      }) %>% dplyr::bind_rows()
    cell_subsets <- cell_subsets[gtools::mixedorder(cell_subsets$Parent), ]
  }
  
  dplyr::left_join(cell_subsets, descs, by = "CellSubset")
}

.getSubsetChannels <- function(hierarchy, class_channels, cell_subset) {
  # Get data frame with state of each channel for given subset.
  hierarchy <- hierarchy[hierarchy$CellSubset == cell_subset, ]
  channels <-
    intersect(class_channels, names(which(colSums(!is.na(hierarchy)) > 0)))
  
  # Unassigned cell subset with no markers (neither positive nor negative).
  if (length(channels) == 0) {
    return(data.frame(
      CellSubset = cell_subset,
      Channel = "(unassigned)",
      State = NA
    ))
  }
  
  hierarchy <- hierarchy[, channels, drop = FALSE]
  hierarchy$CellSubset <- cell_subset
  ch <-
    reshape2::melt(hierarchy, id.vars = "CellSubset", variable.name = "Channel",
                   value.name = "State")
  ch$Channel <- as.character(ch$Channel)
  ch[gtools::mixedorder(ch$Channel), ]
}

.getSubsetChannelsFull <- function(hierarchy, class_channels, cell_subset) {
  # Get data frame with state of each channel for given subset, spanning the
  # entire hierarchy.
  df <- data.frame()
  while (cell_subset %in% hierarchy$CellSubset) {
    # Reverse compatibility: Before the addition of support for labeling
    # unassigned subsets, parents were allowed to include "_unassigned".
    cell_subset <- gsub("_unassigned", "", cell_subset)
    df <- rbind(.getSubsetChannels(hierarchy, class_channels, cell_subset), df)
    cell_subset <- subset(hierarchy, CellSubset == cell_subset)$Parent
  }
  df
}

.convertChannelsToDesc <- function(channels) {
  # Convert channels data frame to a string.
  channels$StateStr <-
    ifelse(is.na(channels$State), "",
           ifelse(channels$State, "+", "-"))
  # Use factors to maintain subset order.
  channels$CellSubset <-
    factor(channels$CellSubset, levels = unique(channels$CellSubset))
  channels <- channels %>%
    dplyr::group_by(CellSubset) %>%
    dplyr::summarize(Str = paste0(paste0(Channel, StateStr), collapse = " "))
  paste0(channels$Str, collapse = ", ")
}

#' List of Assignment subsets.
#'
#' Return a vector of all Assignment subsets.
#''
#' @param experiment An Astrolabe experiment.
#' @return Character vector of Assignment subsets.
#' @export
experimentAssignmentSubsets <- function(experiment) {
  if (experiment$organism == "profiling_only") return(NULL)
  
  cell_subsets_file_path <-
    file.path(experiment$analysis_path, "experiment_cell_subsets.RDS")
  if (file.exists(cell_subsets_file_path)) {
    # Experiment has Compartment level.
    cell_subsets <- readRDS(cell_subsets_file_path)
    unique(cell_subsets$Assignment)
  } else {
    # Reverse compatibility: Experiment does not have Compartment level, need
    # to check for extend_assignment_unassigned.
    subsets <- experimentLabelingStrategy(experiment)$CellSubset
    
    if (!is.null(experiment$extend_assignment_unassigned) &&
        experiment$extend_assignment_unassigned) {
      hierarchy <-
        readRDS(file.path(experiment$analysis_path, "hierarchy.RDS"))$hierarchy
      
      subsets <- c(subsets, paste0(unique(hierarchy$Parent), "_unassigned"))
      subsets <- gtools::mixedsort(unique(subsets))
    }
    
    subsets
  }
}
astrolabediagnostics/orloj documentation built on May 20, 2021, 2:17 p.m.