R/model_data.R

Defines functions msglm_data prepare_msdata annotate_msdata cluster_msprofiles impute_intensities prepare_expanded_effects

# use experimental design matrices and add
# object-specific object/replicate/batch effects information to model_data
prepare_expanded_effects <- function(model_data, verbose=model_data$model_def$verbose)
{
  model_def <- model_data$model_def

  # FIXME use model_def class information
  is_glmm <- rlang::has_name(model_def, "mixtureXcondition.mtxs") &&
             rlang::has_name(model_def, "mixeffects.df")
  if (is_glmm) {
    if (verbose) message("Detected mixeffects data for GLMM model")
    mixtureXcondition.mtxs <- model_def$mixtureXcondition.mtxs
    mixcoefXeff.mtx <- model_def$mixcoefXeff.mtx
    # "GLMM" mixing model
    if (any(names(mixtureXcondition.mtxs) != rownames(mixcoefXeff.mtx))) {
      stop("Mismatch between mixtureXcondition.mtxs matrix names and mix coefficient names")
    }
    mixeffects_df <- model_def$mixeffects
    if (any(mixeffects_df$mixeffect != colnames(mixcoefXeff.mtx))) {
      stop("Mismatch between mixcoefXeff.mtx matrix colnames and mix effect names")
    }
    for (i in seq_along(mixtureXcondition.mtxs)) {
      if (any(colnames(mixtureXcondition.mtxs[[i]]) != model_data$conditions$condition)) {
        stop("Mismatch between mixtureXcondition.mtxs[[", names(mixtureXcondition.mtxs)[[i]],
             "]] matrix colnames and condition names")
      }
      if (any(rownames(mixtureXcondition.mtxs[[i]]) != model_data$mixtures$mixture)) {
        stop("Mismatch between mixtureXcondition.mtxs[[", names(mixtureXcondition.mtxs)[[i]],
             "]] matrix rownames and mixture names")
      }
    }
    model_data$mixeffects <- mixeffects_df
    model_data$mixcoefs <- tibble(mixcoef = rownames(mixcoefXeff.mtx))
    model_data$mixcoefXeff <- mixcoefXeff.mtx
    model_data$mixtureXcondition <- mixtureXcondition.mtxs
    conditions <- levels(model_data$conditions$condition)
    obj_mixXcond.mtxs <- lapply(model_data$mixtureXcondition, function(mixtureXcondition.mtx){
      res <- extrude_matrix(mixtureXcondition.mtx,
                            model_data$object_mixtures$index_object,
                            model_data$object_mixtures$index_mixture,
                            blockdim = "index_object", rowixdim = "index_object_mixture",
                            coldim = "object_condition_id", corecoldim = "condition_id")
      names(dimnames(res$mtx)) <- c("sact", "iact")
      res$mtx <- res$mtx[, colSums(abs(res$mtx)) != 0.0, drop=FALSE]
      res$df <- dplyr::rename(res$df, condition=eff, objcond_id=objeff)
      res$objconds_df <- dplyr::rename(res$objeff_df, condition=eff,
                                      objcond_id=objeff,
                                      index_object=obj)
      res$objeff_df <- NULL
      return(res)
    }) %>% pivot_wider(id_cols = )
    # collect all used object_condition ids to use as factor levels
    objcond_ids = unique(unlist(lapply(obj_mixXcond.mtxs, function(x) x$objconds_df$objcond_id)))
    # convert condition and objcond_id into factor
    obj_mixXcond.mtxs <- lapply(obj_mixXcond.mtxs, function(x) {
      x$df <- mutate(x$df,
                     condition = factor(condition, levels=conditions),
                     objcond_id = factor(objcond_id, levels=objcond_ids))
      x$objconds_df <- mutate(x$objconds_df,
                             condition = factor(condition, levels=conditions),
                             objcond_id = factor(objcond_id, levels=objcond_ids))
      return(x)
    })
    model_data$object_conditions <- dplyr::distinct(dplyr::bind_rows(lapply(obj_mixXcond.mtxs, function(x) x$objconds_df))) %>%
      dplyr::mutate(index_condition = as.integer(condition)) %>%
      dplyr::mutate(index_object_condition = row_number(),
             is_virtual = FALSE)
    model_data$mixtions <- dplyr::bind_rows(lapply(names(obj_mixXcond.mtxs), function(mixcoef){
      tibble::tibble(mixcoef = mixcoef,
                     objcond_id = factor(colnames(obj_mixXcond.mtxs[[mixcoef]]$mtx), levels=objcond_ids))
    })) %>%
      dplyr::mutate(mixtion_ix = row_number(),
                    object_premix = paste0(mixcoef, " ", objcond_id),
                    mixcoef = factor(mixcoef, levels=rownames(mixcoefXeff.mtx)),
                    mixcoef_ix = replace_na(as.integer(mixcoef), 0L)) %>%
    dplyr::left_join(dplyr::select(model_data$object_conditions, objcond_id, index_object_condition)) %>%
    dplyr::arrange(mixtion_ix)
    model_data$object_mixtureXpremix <- bind_cols(lapply(obj_mixXcond.mtxs, function(x) x$mtx))
    dimnames(model_data$object_mixtureXpremix) <- list(object_mixture = model_data$object_mixtures$object_mixture_id,
                                                       object_premix = model_data$object_premixes$object_premix)
  } else {
    objcond_ids <- unique(model_data$object_conditions$objcond_id)
    objprobe_ids <- unique(model_data$object_msprobes$objprobe_id)
  }

  obj_condXeff_df <- dplyr::full_join(matrix2frame(model_def$conditionXeffect, row_col="condition", col_col = "effect"),
                                     dplyr::select(model_data$objects, index_object, object_id),
                                     by = character()) %>%
    dplyr::inner_join(dplyr::select(model_def$effects, effect, index_effect), by="effect") %>%
    dplyr::inner_join(dplyr::select(model_data$object_conditions, index_object_condition, index_object, condition),
                      by=c("index_object", "condition")) %>%
    dplyr::mutate(object_effect = paste0(effect, '@', object_id))
  model_data$object_effects <- dplyr::select(obj_condXeff_df, index_object, object_id,
                                             index_effect, effect, object_effect) %>%
    dplyr::distinct() %>%
    dplyr::arrange(index_object, index_effect) %>%
    dplyr::mutate(index_object_effect = row_number())
  model_data$object_conditionXeffect <- frame2matrix(obj_condXeff_df, row_col="index_object_condition", col_col="object_effect",
                                          rows=model_data$object_conditions$index_object_condition,
                                          cols=model_data$object_effects$object_effect)

  msprobeXeffect_name <- "msprobeXeffect"
  msprb <- model_data$msentities[['msprobe']]
  msprb_idcol <- msprb
  if (rlang::has_name(model_def, msprobeXeffect_name)) {
    if (verbose) message("Using model_def$", msprobeXeffect_name, " for per-", msprb, " design matrix")
    msprbXeff_mtx <- model_def[[msprobeXeffect_name]]
    msprb_dim <- names(dimnames(msprbXeff_mtx))[[1]]
    if (msprb_dim != msprb_idcol) {
      stop("Name of model_def$", msprobeXeffect_name, " rows dimension (", msprb_dim, ") inconsistent with ", msprb_idcol)
    }
    checkmate::assert_set_equal(rownames(msprbXeff_mtx), model_data$msprobes$msprobe,
                                .var.name = paste0("rownames(model_def$", msprobeXeffect_name, ")"))
    checkmate::assert_set_equal(colnames(msprbXeff_mtx), model_def$effects$effect,
                                .var.name = paste0("colnames(model_def$", msprobeXeffect_name, ")"))

    obj_probeXeff_df <- dplyr::full_join(matrix2frame(msprbXeff_mtx, row_col="msprobe", col_col = "effect"),
                                         dplyr::select(model_data$objects, index_object, object_id),
                                         by = character()) %>%
      dplyr::inner_join(dplyr::select(model_data$object_msprobes, index_object_msprobe, index_object, msprobe),
                        by=c("index_object", "msprobe")) %>%
      dplyr::mutate(object_effect = paste0(effect, '@', object_id))
  } else {
    obj_probeXeff_df <- dplyr::inner_join(obj_condXeff_df,
                                          dplyr::select(model_data$object_msprobes, index_object_condition, index_object_msprobe),
                                          by="index_object_condition")
  }
  model_data$object_msprobeXeffect <- frame2matrix(obj_probeXeff_df, row_col="index_object_msprobe", col_col="object_effect",
                                           rows=model_data$object_msprobes$index_object_msprobe,
                                           cols=model_data$object_effects$object_effect)

  if (rlang::has_name(model_def, "msprobeXbatchEffect")) {
    msprobeXbatchEffect <- model_def$msprobeXbatchEffect
    batch_effects_df <- model_def$batch_effects
    if (verbose) message(nrow(batch_effects_df), " batch effects on ",
                         msprb, " level defined")
    checkmate::assert_set_equal(rownames(msprobeXbatchEffect),
                                model_data$msprobes$msprobe)
  } else {
    if (verbose) message("No batch effects defined")
    msprobeXbatchEffect <- constant_matrix(0, list(msprobe = character(0),
                                                   batch_effect = character(0)))
    batch_effects_df <- tibble::tibble(batch_effect = character(0),
                                       index_batch_effect = integer(0),
                                       is_positive = logical(0))
  }
  obj_probeXbatcheff_df <- dplyr::full_join(matrix2frame(msprobeXbatchEffect, row_col="msprobe", col_col="batch_effect"),
                                            dplyr::select(model_data$objects, index_object, object_id, object_label),
                                            by = character()) %>%
    dplyr::inner_join(dplyr::select(batch_effects_df, batch_effect, index_batch_effect), by="batch_effect") %>%
    dplyr::inner_join(dplyr::select(model_data$object_msprobes, msprobe, index_object, index_object_msprobe),
                      by=c("index_object", "msprobe")) %>%
    dplyr::mutate(object_batch_effect = paste0(batch_effect, '@', object_id))
  model_data$object_batch_effects <- dplyr::select(obj_probeXbatcheff_df,
                                                   object_batch_effect,
                                                   index_batch_effect, batch_effect,
                                                   index_object, object_id) %>%
    dplyr::distinct() %>%
    dplyr::arrange(index_object, index_batch_effect) %>%
    dplyr::mutate(index_object_batch_effect = row_number())
  model_data$object_msprobeXbatch_effect <- frame2matrix(obj_probeXbatcheff_df, row_col="index_object_msprobe", col_col="object_batch_effect",
                                                rows=if (nrow(obj_probeXbatcheff_df) > 0) {
                                                    model_data$object_msprobes$index_object_msprobe
                                                  } else integer(0),
                                                cols=model_data$object_batch_effects$object_batch_effect)
  if (rlang::has_name(model_data, "quantobjects")) {
    if (rlang::has_name(model_def, "mschannelXquantBatchEffect")) {
      mschannelXquantBatchEffect <- model_def$mschannelXquantBatchEffect
      mschan <- model_data$msentities[['mschannel']]
      mschan_idcol <- mschan
      quant_batch_effects_df <- model_def$quant_batch_effects
      if (verbose) message(nrow(quant_batch_effects_df),
                           " quantification batch effects on ", mschan_idcol, " level defined")
    } else {
      if (verbose) message("No quantification-level batch effects defined")
      mschannelXquantBatchEffect <- constant_matrix(0, list(mschannel = character(0),
                                                            quant_batch_effect = character(0)))
      quant_batch_effects_df <- tibble::tibble(quant_batch_effect = character(0),
                                               index_quant_batch_effect = integer(0),
                                               is_positive = logical(0))
    }
    qobj_channelXbatcheff_df <- dplyr::full_join(matrix2frame(mschannelXquantBatchEffect,
                                                            row_col="mschannel", col_col="quant_batch_effect"),
                                               dplyr::select(model_data$quantobjects, index_object, index_quantobject, quantobject_id),
                                               by = character()) %>%
      dplyr::inner_join(dplyr::select(quant_batch_effects_df, quant_batch_effect, index_quant_batch_effect), by="quant_batch_effect") %>%
      dplyr::right_join(dplyr::select(model_data$msdata, mschannel, index_msprobe, index_object, index_quantobject, index_quantobject_mschannel),
                        by=c("index_object", "index_quantobject", "mschannel")) %>%
      # index msdata without batch effect as 0 (needed for ref quant object removal later)
      dplyr::mutate(index_quant_batch_effect = replace_na(index_quant_batch_effect, 0L)) %>%
      dplyr::group_by(index_quantobject) %>%
      # remove batch effects that are present in all msdata of a given quantobject
      # (otherwise it creates redundancy)
      dplyr::group_modify(~{
        nmschans <- n_distinct(.x$mschannel)
        dplyr::group_by(.x, index_quant_batch_effect) %>%
        dplyr::filter(n() < nmschans) %>%
        dplyr::ungroup()
      }) %>%
      # remove reference quantobject for each object (one with the smallest index
      # among those that are detected in all batch_effect groups)
      dplyr::group_by(index_object) %>%
      dplyr::group_modify(~{
        # TODO make igraph dependency optional
        # indentify quant_batch_effect groups that don't share probes between each other
        # (which means that quant_batch_effects should have 1 degree-of-freedom less,
        #  because intensities in these groups are, strictly speaking, not comparable)
        # quant batch effects can share probes if the same biological sample (i.e. probe)
        # was analysed more than once
        probe2batcheff_df <- dplyr::select(.x, index_msprobe, index_quant_batch_effect) %>%
          dplyr::transmute(index_msprobe, index_quant_batch_effect,
                           probe_id = paste0("p", index_msprobe),
                           batcheff_id = paste0("e", index_quant_batch_effect)) %>%
          dplyr::distinct()
        probe2batcheff_graph <- igraph::graph_from_data_frame(
          dplyr::select(probe2batcheff_df, batcheff_id, probe_id), directed = FALSE)
        batcheffs <- as.integer(igraph::components(probe2batcheff_graph)$membership)
        batcheffs_df <- dplyr::select(probe2batcheff_df, batcheff_id, index_quant_batch_effect) %>%
          dplyr::distinct() %>% dplyr::inner_join(
            tibble::as_tibble(batcheffs, rownames = "batcheff_id"),
            by = "batcheff_id") %>%
          dplyr::select(index_quant_batch_effect,
                        index_quant_batch_effect_group = value)
        ngroups <- dplyr::n_distinct(batcheffs_df$index_quant_batch_effect_group)

        if (ngroups > 1L) {
          .x <- dplyr::inner_join(.x, batcheffs_df, by="index_quant_batch_effect")
          # select potential reference objects that are shared by effect groups
          overlap_qobjs_df <- dplyr::group_by(.x, index_quantobject) %>%
            dplyr::summarise(ngroups_qobj = n_distinct(index_quant_batch_effect_group), .groups = "drop") %>%
            dplyr::filter(ngroups_qobj == ngroups)
          if (nrow(overlap_qobjs_df) > 0L) {
            # remove one reference quantobject batch effects from each group to
            # eliminate redundancy
            del_qobj_batch_effs_df <- dplyr::semi_join(.x, overlap_qobjs_df, by ="index_quantobject") %>%
              dplyr::filter(index_quant_batch_effect > 0L) %>%
              dplyr::arrange(index_quant_batch_effect_group, index_quantobject, index_quant_batch_effect) %>%
              dplyr::group_by(index_quant_batch_effect) %>%
              dplyr::filter(row_number() == 1L) %>% dplyr::ungroup()
            .x <- dplyr::anti_join(.x, del_qobj_batch_effs_df,
                                   by =c("index_quant_batch_effect",
                                         "index_quantobject"))
          }
        }
        .x
      }) %>% dplyr::ungroup() %>%
      dplyr::filter(index_quant_batch_effect > 0L) %>%
      dplyr::mutate(quantobject_batch_effect = paste0(quant_batch_effect, '@', quantobject_id)) %>%
      dplyr::arrange(index_quant_batch_effect, index_quantobject_mschannel)
    model_data$quantobject_batch_effects <- dplyr::select(qobj_channelXbatcheff_df,
                                                        quantobject_batch_effect,
                                                        index_quant_batch_effect, quant_batch_effect,
                                                        index_quantobject, quantobject_id) %>%
      dplyr::distinct() %>%
      dplyr::arrange(index_quantobject, index_quant_batch_effect) %>%
      dplyr::mutate(index_quantobject_batch_effect = row_number())
    model_data$quantobject_mschannelXquant_batch_effect <- frame2matrix(
            qobj_channelXbatcheff_df,
            row_col="index_quantobject_mschannel",
            col_col="quantobject_batch_effect",
            rows = if (nrow(qobj_channelXbatcheff_df)>0) {
                    model_data$msdata$index_quantobject_mschannel
                  } else integer(0),
            cols = model_data$quantobject_batch_effects$quantobject_batch_effect)
  } else if (rlang::has_name(model_def, "mschannelXquantBatchEffect")) {
      warning("Object-level model data specifies quant_batch_effects matrix. The inference would ignore it, please specify batch effects at the object-level")
  }

  return(model_data)
}

#' @export
# FIXME more checks/control over the columns of intensities_df/stats_df
impute_intensities <- function(intensities_df, stats_df, log2_mean_offset=-1.8, log2_sd_scale=0.3){
  res <- dplyr::inner_join(intensities_df, stats_df) %>%
    dplyr::mutate(intensity_imputed = if_else(is.na(intensity),
                                              2^(rnorm(n(), mean=log2_intensity.mean + log2_intensity.sd * log2_mean_offset,
                                                       sd=log2_intensity.sd * log2_sd_scale)),
                                              intensity)) %>%
    dplyr::ungroup()
  # don't take stats_df columns along
  dplyr::select(res, one_of(colnames(intensities_df)), intensity_imputed)
}

#' @export
cluster_msprofiles <- function(msdata, mschannel_stats, obj_col, mschannel_col, nclu=4) {
  # create matrix of intensities
  objs.df <- dplyr::select_at(msdata, obj_col) %>%
    dplyr::distinct() %>% dplyr::arrange(!!sym(obj_col)) %>%
    dplyr::mutate(`__index_msobject__` = row_number())
  intensities.df <- tidyr::expand(msdata, !!!rlang::syms(c(obj_col, mschannel_col))) %>%
    dplyr::left_join(dplyr::select(msdata, any_of(c(obj_col, mschannel_col, "intensity"))),
                     by=c(obj_col, mschannel_col)) %>%
    impute_intensities(mschannel_stats) %>%
    dplyr::inner_join(objs.df, by=obj_col) %>%
    dplyr::arrange(`__index_msobject__`, !!sym(mschannel_col))
  # handle trivial cases
  mschannels <- unique(intensities.df[[mschannel_col]])
  if (nrow(objs.df) == 1L || length(mschannels) == 1L) {
    return(tibble(!!obj_col := objs.df[[obj_col]],
                  profile_cluster = 1L,
                  nsimilar_profiles = 1L))
  }
  if (nrow(intensities.df) != length(mschannels)*nrow(objs.df)) {
    stop("Duplicate intensities detected, check the input data")
  }
  obj_stats.df <- group_by(intensities.df, `__index_msobject__`) %>%
    summarise(n_quants = sum(!is.na(intensity))) %>%
    dplyr::ungroup()
  # add a bit of noise to avoid zero variance
  intensities.mtx <- matrix(log2(pmax(if_else(is.finite(intensities.df$intensity_imputed),
                                              intensities.df$intensity_imputed, 0.0), 1)) +
                            rnorm(nrow(intensities.df), sd=0.01),
                            ncol = nrow(objs.df),
                            dimnames = list(mschannel = mschannels, `__index_msobject__` = NULL))
  obj.pca <- stats::prcomp(intensities.mtx, scale.=TRUE)
  # create object feature matrix
  obj.pca_featmtx <- obj.pca$rotation * crossprod(t(rep.int(1, nrow(obj.pca$rotation))),
                                                  summary(obj.pca)$importance[2,])

  res <- tibble(`__index_msobject__` = seq_len(nrow(obj.pca_featmtx)),
                tmp_profile_cluster = stats::cutree(hclust(dist(obj.pca_featmtx), method="single"),
                                                    min(c(nclu, nrow(obj.pca_featmtx), ncol(obj.pca_featmtx))))) %>%
    dplyr::inner_join(objs.df, by="__index_msobject__")
  # assign profile_cluster indices from largest to smallest clusters
  res_clustats <- dplyr::inner_join(res, obj_stats.df, by="__index_msobject__") %>%
    dplyr::group_by(tmp_profile_cluster) %>%
    dplyr::summarise(nsimilar_profiles = n(),
                     n_quants_cluster_total = sum(n_quants, na.rm=TRUE),
                     n_quants_cluster_median = median(n_quants, na.rm=TRUE)) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(desc(n_quants_cluster_total), desc(nsimilar_profiles)) %>%
    dplyr::mutate(profile_cluster = row_number())

  return(inner_join(res, res_clustats, by="tmp_profile_cluster") %>%
           dplyr::select(-tmp_profile_cluster, -`__index_msobject__`))
}

# annotate msdata (full quantobject X msrun Cartesian product)
# as reliable (column name) according to
# the specificity of observing the quant object in the MS experiment groups
# defined by `msprobe$spec_msexp_group`
# or by co-occurrence of distinct quant quantobjects in the given MS experiment group
# defined by `msprobe$cooccur_msexp_group`
annotate_msdata <- function(msdata_df, model_def, verbose = model_def$verbose,
                            specificity_pvalue = 1E-3,
                            specificity_quantobject_group_cols = NULL,
                            quantobject_fdr = 0.01,
                            cooccurrence_pvalue = 1E-3) {
  if (verbose) message("Identifying reliable quantifications")
  msdata_df <- dplyr::mutate(msdata_df, is_observed = !is.na(intensity))

  spec_quantobj_group_cols <- c('object_id', specificity_quantobject_group_cols %||%
                                       intersect(colnames(msdata_df),
                                                 c('quantobject_id', 'object_id'))) %>% unique()
  spec_id_cols <- c("spec_msexp_group", spec_quantobj_group_cols)

  # calculate probabilities that object_msprobes of an object are specific to the given object_condition
  spec_stats_df <- dplyr::group_by(msdata_df, spec_msexp_group, !!!syms(spec_quantobj_group_cols)) %>%
    dplyr::summarise(nms_observed = sum(is_observed), nms_missed = sum(!is_observed),
                     .groups = "drop") %>%
    dplyr::group_by_at(spec_quantobj_group_cols) %>%
    dplyr::mutate(nms_observed_total = sum(nms_observed), nms_missed_total = sum(nms_missed)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(is_reliable = phyper(nms_observed-1, nms_observed_total, nms_missed_total,
                                       nms_observed + nms_missed, lower.tail=FALSE) <= specificity_pvalue)

  msdata_df <- dplyr::left_join(msdata_df,
      dplyr::select_at(spec_stats_df, c(spec_id_cols, 'is_reliable')),
      by = spec_id_cols)
  if (rlang::has_name(msdata_df, 'quantobject_id')) {
    # calculate probabilities that all quantitations of quantobjects in a given object_msprobe are false discoveries
    cooccur_stats_df <- dplyr::group_by(msdata_df, object_id, cooccur_msexp_group) %>%
      dplyr::summarise(nqobj_observed = n_distinct(quantobject_id[is_observed]),
                       .groups = "drop") %>%
      dplyr::inner_join(dplyr::group_by(msdata_df, object_id) %>%
                        dplyr::summarise(nqobj_total = n_distinct(quantobject_id),
                                         .groups = "drop"), by="object_id") %>%
      dplyr::mutate(is_cooccurring = pbinom(nqobj_observed - 1L, nqobj_total,
                                            quantobject_fdr, lower.tail=FALSE) <= cooccurrence_pvalue)

    msdata_df <- dplyr::left_join(msdata_df,
        dplyr::select(cooccur_stats_df, object_id, cooccur_msexp_group, is_cooccurring),
        by = c("object_id", "cooccur_msexp_group")) %>%
        dplyr::mutate(is_reliable = is_reliable | is_cooccurring,
                      is_cooccurring = NULL)
    if (rlang::has_name(msdata_df, "ident_type")) {
      msdata_df <- dplyr::group_by(msdata_df, quantobject_id, spec_msexp_group) %>%
        dplyr::mutate(is_reliable = is_reliable | any(ident_type %in% c("MULTI-MSMS"))) %>%
        dplyr::ungroup()
    }
  }
  msdata_df <- dplyr::mutate(msdata_df, is_reliable = !is.na(intensity) & is_reliable)

  return(msdata_df)
}

# select relevant msdata and quantobjects from msdata_df
# and define model_data$quantobjects and model_data$object_msprobes
prepare_msdata <- function(model_data, msdata, verbose = model_data$model_def$verbose,
                           max_quantobjects = 20L,
                           specificity_quantobject_group_cols = NULL,
                           eager_msprotocols = FALSE,
                           ...) {
  model_def <- model_data$model_def
  msprb <- msdata$msentities[['msprobe']]
  msprb_idcol <- msprb
  mschan <- msdata$msentities[['mschannel']]
  mschan_idcol <- mschan
  obj <- msdata$msentities[['object']]
  obj_idcol <- paste0(obj, "_id")
  quantobj <- msdata$msentities[['quantobject']]
  quantobj_idcol <- paste0(quantobj, "_id")
  intensities_dfname <- paste0(quantobj, "_intensities")
  if (!rlang::has_name(msdata, intensities_dfname)) {
    stop("No intensities (", intensities_dfname, " data frame) found in msdata")
  }

  if (obj == quantobj) {
    # obj is quanted directly
    intensities_df <- dplyr::select(msdata[[intensities_dfname]],
                                    object_id=!!sym(obj_idcol), mschannel=!!sym(mschan_idcol),
                                    intensity, any_of("ident_type"))
    msdata_df <- dplyr::left_join(model_data$object_msprobes,
                                  dplyr::select(model_data$mschannels, index_mschannel, mschannel, index_msprobe,
                                                index_msprotocol, any_of(c("msfraction", "msprotocol"))),
                                  by = "index_msprobe") %>%
                 dplyr::left_join(intensities_df, by = c("object_id", "mschannel")) %>%
        annotate_msdata(model_def) %>%
        dplyr::arrange(index_object, index_mschannel)
  } else {
    # quant specific quantobjects of object
    obj2quantobj_df <- msdata[[paste0(obj, "2", quantobj)]]
    qobjs_df <- dplyr::inner_join(obj2quantobj_df,
                                  dplyr::select(model_data$objects, !!sym(obj_idcol), object_id, index_object),
                                  by=obj_idcol) %>%
      dplyr::filter(is_specific) %>%
      dplyr::inner_join(dplyr::select(msdata[[paste0(quantobj, "s")]], !!sym(quantobj_idcol),
                                      any_of(c("msfraction", "charge"))),
                        by=quantobj_idcol) %>%
      dplyr::mutate(quantobject_id = !!sym(quantobj_idcol))
    if (nrow(qobjs_df) == 0L) stop("No specific ", quantobj, "s found for ",
                                   obj_idcol, "=", model_data$object_id)
    if (verbose) message(nrow(qobjs_df), " specific ", quantobj, "(s) found")
    intensities_df <- dplyr::select(msdata[[intensities_dfname]],
                                    quantobject_id=!!sym(quantobj_idcol), mschannel=!!sym(mschan_idcol),
                                    intensity, any_of("ident_type"))
    msdata_df <- dplyr::inner_join(dplyr::select(model_data$object_msprobes, index_msprobe, index_object_msprobe, index_object, object_id),
                                   dplyr::select(qobjs_df, index_object, quantobject_id, any_of("msfraction")), by="index_object") %>%
        dplyr::inner_join(dplyr::select(model_data$mschannels, index_mschannel, mschannel, index_msprobe, index_msprotocol, any_of(c("msfraction", "msprotocol"))),
                          by = c("index_msprobe", intersect("msfraction", colnames(qobjs_df)))) %>%
        dplyr::inner_join(dplyr::select(model_data$msprobe, index_msprobe, msprobe,
                                        spec_msexp_group, cooccur_msexp_group,
                                        any_of(specificity_quantobject_group_cols)),
                          by=c("index_msprobe")) %>%
        dplyr::left_join(intensities_df, by=c("quantobject_id", "mschannel"))
    if (all(is.na(msdata_df$intensity))) stop("No quantifications for ", nrow(qobjs_df), " specific ",
                                              quantobj, "(s) of ", obj_idcol, "=", model_data$object_id)
    if (!eager_msprotocols && n_distinct(msdata_df$index_msprotocol) > 1L) {
      # removing missing quantitations if a given quantobject is completely missing in a given msprotocol
      msdata_df <- dplyr::group_by(msdata_df, quantobject_id, index_msprotocol) %>%
        dplyr::filter(any(!is.na(intensity))) %>%
        dplyr::ungroup()
    }
    nqobj_channels <- nrow(dplyr::distinct(msdata_df, quantobject_id, mschannel))
    if (nrow(msdata_df) != nqobj_channels) {
      if (verbose) warning(nrow(msdata_df) - nqobj_channels, " of ", nrow(msdata_df),
                           " ", quantobj, " MS intensities are duplicate, summing duplicate entries")
      msdata_df <- dplyr::group_by(msdata_df, dplyr::across(!any_of(c("intensity", "ident_type")))) %>%
        dplyr::summarise(dplyr::across(intensity, ~sum(.x, na.rm = TRUE)),
                         dplyr::across(any_of("ident_type"), ~min(.x, na.rm = TRUE)),
                         .groups="drop") %>%
        dplyr::mutate(intensity = if_else(intensity != 0, intensity, NA_real_))
    }
    msdata_df <- annotate_msdata(msdata_df, model_def, verbose=verbose,
                                 specificity_quantobject_group_cols = specificity_quantobject_group_cols,
                                 ...)

    # arrange pepmodstates by object, by profile cluster and by the number of quantitations
    quantobject_group_size <- model_def$quantobject_group_size %||% (max_quantobjects %/% 2)
    qobj_info_cols <- intersect(colnames(qobjs_df), c("pepmod_id", "msfraction", "charge"))

    qobj_stats_df <- msdata_df %>%
      dplyr::group_by(index_object, quantobject_id) %>%
      dplyr::summarise(n_quants = sum(!is.na(intensity)),
                       intensity_med = median(intensity, na.rm=TRUE),
                       .groups = "drop") %>%
      dplyr::inner_join(
        dplyr::group_by(msdata_df, index_object) %>%
        dplyr::group_modify(~ cluster_msprofiles(.x, dplyr::rename(msdata[[paste0(mschan, "_", quantobj, "_stats")]],
                                                                   mschannel = !!sym(mschan_idcol)),
                                                 obj_col='quantobject_id', mschannel_col="mschannel")) %>%
        dplyr::ungroup(),
        by = c("quantobject_id", "index_object")) %>%
      dplyr::left_join(dplyr::select(qobjs_df, object_id, quantobject_id, is_specific,
                                     any_of(quantobj_idcol), !!!syms(qobj_info_cols)),
                       by = 'quantobject_id') %>%
      dplyr::arrange(index_object, profile_cluster,
                     desc(is_specific), desc(n_quants), desc(intensity_med),
                     !!!syms(qobj_info_cols)) %>%
      dplyr::group_by(index_object, profile_cluster) %>%
      dplyr::mutate(index_quantobject_group = row_number() %/% quantobject_group_size, # put objects within cluster into groups of 20
                    index_quantobject_local = row_number() %% quantobject_group_size) %>%
      dplyr::ungroup()
    # take the first group of 10 objects from each cluster, then continue with the second group etc
    model_data$quantobjects <- qobj_stats_df %>%
      dplyr::arrange(index_object, index_quantobject_group, profile_cluster, index_quantobject_local) %>%
      dplyr::mutate(index_quantobject = row_number()) %>%
      dplyr::filter(index_quantobject <= max_quantobjects) # remove less abundant quantobjects of rich objects
    if (verbose) message(nrow(model_data$quantobjects), " ", quantobj, "(s) from ",
                         n_distinct(model_data$quantobjects$profile_cluster), " cluster(s) selected")
    msdata_df <- dplyr::inner_join(msdata_df,
                                   dplyr::select(model_data$quantobjects, index_object, index_quantobject, quantobject_id),
                                   by=c("index_object", "quantobject_id")) %>%
      dplyr::arrange(index_object, index_object_msprobe, index_quantobject) %>%
      dplyr::mutate(index_quantobject_mschannel = row_number())
  #} else {
  #   stop("Unsupported combination of object=", obj,
  #       " and quantobject=", quantobj)
  }
  # separately index quantifications and missing data
  model_data$msdata <- mutate(msdata_df,
                              index_qdata = if_else(is_observed, cumsum(is_observed), NA_integer_),
                              index_mdata = if_else(!is_observed, cumsum(!is_observed), NA_integer_))
  message(nrow(model_data$msdata), " ",
          if (rlang::has_name(model_data$msdata, 'index_quantobject_mschannel'))
            quantobj else obj, '-in-msprobe(s) of ',
          n_distinct(model_data$msdata$index_object), ' ', obj, '(s)',
          if (rlang::has_name(model_data$msdata, 'index_quantobject'))
            paste0(' with ', n_distinct(model_data$msdata$index_quantobject), ' ',
                   quantobj, '(s)') else '', ': ',
          sum(model_data$msdata$is_observed), " quantitation(s) (",
          sum(model_data$msdata$is_reliable), " reliable), ",
          sum(!model_data$msdata$is_observed), " missed")
  return(model_data)
}

#' Prepare *MSGLM* model input data for specified objects.
#'
#' TODO
#'
#' @details
#' Since some MS quantitations could be false positives resulting from
#' incorrect MS1 feature matching, co-elution etc, one part of data preparation
#' carried by `msglm_data()` is the identification of reliable quantitations.
#' It does that by checking the patterns of how each *quantobject* was quantified.
#' Non-random patterns provide additional evidence that these quantification
#' are reliable:
#'   * it uses `specificity_msexp_group_cols` to define the *specificity groups*
#'     of MS channels and checks whether the quantifications of any given *quantobject*
#'     are specific to some of these groups.
#'   * it uses `cooccurrence_msexp_group_cols` to define the *cooccurrence groups*
#'     of MS channels. If more *quantobjects* of the same *objet* are simultaneously
#'     quantified in a given cooccurrence group than expected by chance
#'     (given `quantobject_fdr` rate of false identifications), such quantifications
#'     are regarded as reliable.
#' While both reliable and non-reliable quantifications are used to fit the model,
#' reliable quantifications give MSGLM an additional hint about the abundance of
#' MS *object* in corresponding MS *probes*.
#'
#' @param model_def *msglm_model* object with MSGLM model definition
#' @param msdata *msglm_data_collection* object with all MS data
#' @param object_ids vector of *model objects* IDs to analyze
#' @param mschannel_shift_col which column of `msdata$mschannel_shifts` data frame
#'        contains the log2 shifts for intensity normalization, defaults to
#'        `total_mschannel_shift`
#' @param specificity_msexp_group_cols the columns of `msdata$mschannels` data
#'        frame that define the *specificity groups* of MS channels
#' @param cooccurrence_msexp_group_cols the columns of `msdata$mschannels` data
#'        frame that define the *cooccurence groups* of MS channels
#' @param eager_msprotocols if FALSE, missing data entries are created only for
#'        the probes of MS protocols, where given quantobjects were identified.
#'        If TRUE, missing data entries are created for all MS probes regardless
#'        of the MS protocol
#' @param verbose generate verbose diagnostic output
#'
#' @return object of *msglm_model_data* class
#' @export
#'
#' @examples
#' @seealso [msglm_model()], [import_msglm_data()], [to_standata()]
msglm_data <- function(model_def, msdata, object_ids,
                       mschannel_shift_col = paste0("total_", msdata$msentities[['mschannel']], "_shift"),
                       max_quantobjects = 20L,
                       specificity_msexp_group_cols = msdata$msentities[['condition']],
                       cooccurrence_msexp_group_cols = msdata$msentities[['msprobe']],
                       eager_msprotocols = FALSE,
                       verbose = model_def$verbose,
                       ...) {
  checkmate::assert_class(model_def, "msglm_model")
  checkmate::assert_class(msdata, "msglm_data_collection")
  model_data <- list(model_def = model_def, object_id = object_ids,
                     msentities = msdata$msentities)
  obj <- msdata$msentities[['object']]
  obj_idcol <- paste0(obj, "_id")

  quantobj <- msdata$msentities[['quantobject']]

  if (!rlang::has_name(msdata, paste0(obj, "s"))) {
    stop("No model object (", obj, ") information found in MS data")
  }
  objs_df <- msdata$objects
  obj_cols <- msdata$msentities_extra_columns$object
  # FIXME remove this if
  if (is.null(obj_cols)) {
    if (verbose) warning("msdata$msentities_extra_columns$object not found, resorting to temorary guess")
    obj_cols <- (c(intersect(c("majority_protein_acs", "protein_acs",
                     "gene_names", "protein_names"), colnames(objs_df)),
         str_subset(colnames(objs_df), "^is_")) %>% unique())
    if (verbose) {
      message("Model object (", obj, ") columns to use: ", paste0(obj_cols, collapse=", "))
    }
  }
  model_data$objects <- dplyr::filter(objs_df, object_id %in% object_ids) %>%
      dplyr::select_at(c("object_id", "object_label", obj_idcol,
                         obj_cols) %>% unique()) %>%
      dplyr::arrange(object_id) %>%
      dplyr::mutate(index_object = row_number())
  missing_obj_ids <- setdiff(object_ids, unique(objs_df$object_id))
  if (length(missing_obj_ids) > 0) {
    stop("Objects not found: ", paste0(missing_obj_ids, ", "))
  }

  msprb <- msdata$msentities[['msprobe']]
  if (is.na(msprb)) stop("No msprobe entity specified in msdata")
  mschan <- msdata$msentities[['mschannel']]
  if (is.na(mschan)) stop("No mschannel entity specified in msdata")
  condition <- msdata$msentities[['condition']]
  if (is.na(condition)) stop("No condition entity specified in msdata")
  msprb_idcol <- msprb
  mschan_idcol <- mschan
  mschan_parent = if_else(msprb == mschan, condition, msprb)

  msrun <- msdata$msentities[['msrun']]
  msexp <- msdata$msentities[['msexperiment']]

  msprbs_df <- msdata[[paste0(msprb,'s')]]
  if (verbose) message("Using msdata$", msprb, "s for MS probes information")
  checkmate::assert_data_frame(msprbs_df)
  msprb_cols <- c(msprobe = msprb_idcol,
                  condition = condition,
                  mstag = msdata$msentities[['mstag']])
  msprbs_orig_df <- msdata[[paste0(msprb,'s')]]
  checkmate::assert_names(colnames(msprbs_orig_df),
                          must.include = Filter(Negate(is.na), msprb_cols))
  msprbs_df <- dplyr::select_at(msprbs_orig_df, Filter(Negate(is.na), msprb_cols))
  checkmate::assert_character(as.character(msprbs_df$msprobe), unique=TRUE, any.missing=FALSE)
  checkmate::assert_subset(as.character(msprbs_df$condition), model_def$conditions$condition)
  checkmate::assert_set_equal(unique(as.character(msprbs_df$condition)),
                              dplyr::filter(model_def$conditions, !is_virtual)$condition)
  if (!is.null(specificity_msexp_group_cols)) {
    msprbs_df <- dplyr::bind_cols(msprbs_df, dplyr::transmute(msprbs_orig_df,
                                  spec_msexp_group = factor(paste(!!!syms(specificity_msexp_group_cols)))))
  } else {
    msprbs_df <- dplyr::mutate(msprbs_df, spec_msexp_group = NA_integer_)
  }
  if (!is.null(cooccurrence_msexp_group_cols)) {
    msprbs_df <- dplyr::bind_cols(msprbs_df, dplyr::transmute(msprbs_orig_df,
                                  cooccur_msexp_group = factor(paste(!!!syms(cooccurrence_msexp_group_cols)))))
  } else {
    msprbs_df <- dplyr::mutate(msprbs_df, cooccur_msexp_group = NA_integer_)
  }
  # fill missing columns with NA
  for (col in setdiff(names(msprb_cols), colnames(msprbs_df))) {
    if (verbose) message("Adding empty ", col, " column to MS probe information")
    msprbs_df[[col]] <- NA_character_
  }

  if (mschan != msprb) {
    if (!rlang::has_name(msdata, paste0(mschan, "s"))) {
      stop("MS channels data msdata$", mschan, " not found")
    }
    if (verbose) message("Using msdata$", mschan, "s for MS channels information")
    mschans_df <- msdata[[paste0(mschan,"s")]]
    checkmate::assert_data_frame(mschans_df)
    mschan_cols <- c(mschannel = mschan_idcol, msprobe = msprb_idcol,
                     msrun = msdata$msentities[['msrun']],
                     msfraction = msdata$msentities[['msfraction']],
                     mstag = msdata$msentities[['mstag']],
                     msprotocol = msdata$msentities[['msprotocol']])
    checkmate::assert_names(colnames(mschans_df),
                            must.include = Filter(Negate(is.na), mschan_cols))
    mschans_df <- dplyr::select_at(mschans_df, Filter(Negate(is.na), mschan_cols))
    checkmate::assert_character(as.character(mschans_df$mschannel), unique=TRUE, any.missing=FALSE)
    checkmate::assert_set_equal(unique(as.character(mschans_df$msprobe)),
                                as.character(msprbs_df$msprobe))
    # fill missing columns with NA
    for (col in setdiff(names(mschan_cols), colnames(mschans_df))) {
      if (verbose) message("Adding empty ", col, " column to MS channel information")
      mschans_df[[col]] <- NA_character_
    }
  } else {
    mschans_df <- dplyr::mutate(msprbs_df, mschannel = msprobe)
  }
  # add normalization shifts to msprobes
  mschan_shifts_dfname <- NA_character_
  for (kv in list(c("mschannel", mschan), c("msprobe", msprb), c("msrun", msrun), c("msexperiment", msexp))) {
    msentity <- kv[[1]]
    msentity_name <- kv[[2]]
    if (!is.na(msentity_name) && rlang::has_name(msdata, paste0(msentity_name, "_shifts"))) {
      mschan_shifts_dfname <- paste0(msentity_name, "_shifts")
      mschan_shifts_idcol <- msentity_name
      mschan_shift_refcol <- msentity
      break
    }
  }
  if (is.na(mschan_shifts_dfname)) {
    stop("Cannot find MS experiments normalization information ",
         "(msdata$", mschan, "_shifts or msdata$", msdata$msentities[['msrun']], "_shifts etc)")
  }
  if (verbose) message("Using msdata$", mschan_shifts_dfname, "$", mschannel_shift_col, " for MS experiments normalization")
  mschan_shifts_df <- msdata[[mschan_shifts_dfname]]
  checkmate::assert_data_frame(mschan_shifts_df, .var.name = paste0("msdata$", mschan_shifts_dfname))
  checkmate::assert_names(names(mschan_shifts_df), must.include = c(mschan_shifts_idcol, mschannel_shift_col))
  checkmate::assert_set_equal(as.character(mschan_shifts_df[[mschan_shifts_idcol]]),
                              as.character(mschans_df[[mschan_shift_refcol]]))
  mschan_shifts_df = dplyr::select_at(mschan_shifts_df,
                                      c(mschan_shifts_idcol, mschannel_shift=mschannel_shift_col))
  checkmate::assert_numeric(mschan_shifts_df$mschannel_shift, any.missing = FALSE, finite=TRUE, lower=-20, upper=20)
  mschans_df <- dplyr::inner_join(mschans_df, mschan_shifts_df,
                                  by=rlang::set_names(mschan_shifts_idcol, mschan_shift_refcol))
  msprbs_df <- msprbs_df %>%
    dplyr::inner_join(dplyr::select(model_def$conditions, condition, index_condition, any_of("mstag")),
                      by=intersect(c('condition', 'mstag'), colnames(model_def$conditions))) %>%
    dplyr::arrange(dplyr::across(c(index_condition, any_of(c("mstag", "msexperiment")), msprobe)))
  if (rlang::has_name(model_def, "msprobeXeffect")) {
    msprb_dim <- names(dimnames(model_def$msprobeXeffect))
    if (msprb_dim[[1]] != msprb_idcol) {
      stop("MS experiments object (", msprb_idcol, ") does not match",
           " the name of model_def$msprobeXeffect rows dimension (",
           msprb_dim[[1]], ")")
    }
    msprobes_order <- rownames(model_def$msprobeXeffect)
  } else {
    msprobes_order <- NULL
  }
  # fix the order of msprobes
  msprbs_df <- ensure_primary_index_column(
        msprbs_df, 'index_msprobe',
        id_col="msprobe", ids_ordered=msprobes_order,
        create=TRUE, paste0("msdata$", msprb, "s"))
  # fix the order of mschannels
  mschan_import_cols <- c("msprobe", "index_msprobe", "index_condition")
  if (msprb != mschan) {
    mschan_import_cols <- append(mschan_import_cols, c("condition"))
  }
  mschans_df <- dplyr::inner_join(mschans_df, dplyr::select_at(msprbs_df, mschan_import_cols),
                                  by='msprobe') %>%
      dplyr::arrange(dplyr::across(c(index_msprobe, any_of(c("mstag", "msfraction", "msrun")))))
  model_data$msprobes <- msprbs_df
  model_data$mschannels <- dplyr::mutate(mschans_df,
      index_msrun = match(msrun, unique(msrun)),
      index_mschannel = row_number(),
      index_msprotocol = if (rlang::has_name(mschans_df, "msprotocol")) {
          match(msprotocol, unique(msprotocol))
        } else {
          1L
        }
    )

  # all objects X all conditions (including virtual)
  # FIXME rename objectXcondition
  model_data$object_conditions <- dplyr::full_join(dplyr::select(model_data$objects, object_id, index_object),
                                              dplyr::select(model_def$conditions, index_condition, is_virtual, condition),
                                              by=character()) %>%
    dplyr::mutate(objcond_id = paste(condition, object_id)) %>%
    dplyr::arrange(index_condition, index_object) %>%
    dplyr::mutate(index_object_condition = row_number())

  # all objects X all MS channels (only those with actual experiments)
  # FIXME rename objectXmschannel
  model_data$object_msprobes <- dplyr::inner_join(model_data$object_conditions, model_data$msprobes,
                                               by = c("index_condition", "condition")) %>%
    dplyr::arrange(index_object, index_object_condition, index_msprobe) %>%
    dplyr::mutate(index_object_msprobe = row_number(),
                  objprobe_id = paste0(object_id, '_', msprobe))

  model_data$quantobject_mscalib <- msdata[[paste0(quantobj, "_mscalib")]]
  model_data$quantobject_labu_min <- msdata[[paste0(quantobj, '_labu_min')]]
  logbase <- logintensityBase(model_data$quantobject_mscalib, silent=TRUE)
  if (logbase != 2) {
    warning("msdata$", quantobj, "_mscalib logintensityBase=", logbase,
            " converting mscalib model and log-intensities to log2-based ones")
    model_data$quantobject_mscalib <- convert_logintensityBase(model_data$quantobject_mscalib, new_base=2)
    #k <- log(logbase) / log(2)
    #model_data$quantobj_labu_min <- model_data$quantobj_labu_min * k
  }

  model_data <- prepare_msdata(model_data, msdata, max_quantobjects=max_quantobjects,
                               eager_msprotocols=eager_msprotocols,
                               verbose=verbose, ...)
  model_data <- prepare_expanded_effects(model_data, verbose=verbose)

  return (structure(model_data, class="msglm_model_data"))
}
innatelab/msglm documentation built on March 23, 2023, 7:17 a.m.