# R/imputes.R

# Defines functions impute_lls impute_svd impute_ppca impute_bpca impute_nipals impute_rf impute_knn impute_lod impute_min impute_median impute_mean impute_user_value impute_global_lowest

# Documented in impute_bpca impute_global_lowest impute_knn impute_lls impute_lod impute_mean impute_median impute_min impute_nipals impute_ppca impute_rf impute_svd impute_user_value

#' Impute missing values by replacing them with the lowest observed intensity (global)
#'
#' @description
#' Replace missing intensity values (`NA`) with the lowest observed intensity.
#' If no intensity has been observed at all, there is nothing to impute from
#' and the missing values are left as `NA`.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_global_lowest()
impute_global_lowest <- function(data) {
  # min() of an empty set is Inf (and raises a warning). Fall back to NA so
  # that missing values are never replaced by Inf.
  lowest_observed <- function(x) {
    if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)
  }

  # The replacement is computed in place; no helper column is added, so an
  # existing column of `data` cannot be overwritten or dropped by accident.
  data %>%
    dplyr::mutate(Intensity = dplyr::if_else(
      is.na(.data$Intensity),
      lowest_observed(.data$Intensity),
      .data$Intensity
    ))
}



#' Impute missing values by replacing them with a user-provided value
#'
#' @description
#' Every missing intensity value (`NA`) is replaced with the value supplied by
#' the user (e.g., 1). Observed intensities are returned unchanged.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param value Numeric that replaces missing values
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_user_value(value = 1)
impute_user_value <- function(data, value) {
  dplyr::mutate(
    data,
    Intensity = dplyr::case_when(
      # observed intensities are kept as they are ...
      !is.na(.data$Intensity) ~ .data$Intensity,
      # ... everything else (i.e., NA) receives the user-provided value
      .default = .env$value
    )
  )
}

#' Impute missing values by replacing them with the Feature mean
#'
#' @description
#' Replace missing intensity values (`NA`) with the Feature mean of non-`NA` values. For example, if a Feature has the measured intensities `NA, 1, NA, 3, 2` in samples 1-5,
#' the intensities after `impute_mean()` would be `2, 1, 2, 3, 2`.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_mean()
impute_mean <- function(data) {
  # The mean is calculated per Feature (UID). The replacement is computed in
  # place; no helper column is added, so an existing column of `data` cannot
  # be overwritten or dropped by accident.
  data %>%
    dplyr::group_by(.data$UID) %>%
    dplyr::mutate(Intensity = dplyr::if_else(
      is.na(.data$Intensity),
      mean(.data$Intensity, na.rm = TRUE),
      .data$Intensity
    )) %>%
    dplyr::ungroup()
}

#' Impute missing values by replacing them with the Feature median
#'
#' @description
#' Replace missing intensity values (`NA`) with the Feature median of non-`NA` values. For example, if a Feature has the measured intensities `NA, 1, NA, 3, 2` in samples 1-5,
#' the intensities after `impute_median()` would be `2, 1, 2, 3, 2`.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_median()
impute_median <- function(data) {
  # The median is calculated per Feature (UID). The replacement is computed in
  # place; no helper column is added, so an existing column of `data` cannot
  # be overwritten or dropped by accident.
  data %>%
    dplyr::group_by(.data$UID) %>%
    dplyr::mutate(Intensity = dplyr::if_else(
      is.na(.data$Intensity),
      stats::median(.data$Intensity, na.rm = TRUE),
      .data$Intensity
    )) %>%
    dplyr::ungroup()
}


#' Impute missing values by replacing them with the Feature minimum
#'
#' @description
#' Replace missing intensity values (`NA`) with the Feature minimum of non-`NA` values.
#' A Feature without any observed intensity has no minimum; its missing values
#' are left as `NA`.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_min()
impute_min <- function(data) {
  # min() of an empty set is Inf (and raises a warning). Fall back to NA so
  # that a Feature consisting only of NA is not filled with Inf.
  feature_minimum <- function(x) {
    if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)
  }

  # The minimum is calculated per Feature (UID). No helper column is added, so
  # an existing column of `data` cannot be overwritten or dropped by accident.
  data %>%
    dplyr::group_by(.data$UID) %>%
    dplyr::mutate(Intensity = dplyr::if_else(
      is.na(.data$Intensity),
      feature_minimum(.data$Intensity),
      .data$Intensity
    )) %>%
    dplyr::ungroup()
}

#' Impute missing values by replacing them with the Feature 'Limit of Detection'
#'
#' @description
#' Replace missing intensity values (`NA`) by what is assumed to be the detector limit of detection (LoD).
#' It is estimated by dividing the Feature minimum by the provided denominator, usually 5. See the References section for more information.
#' A Feature without any observed intensity has no minimum; its missing values
#' are left as `NA`.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param div_by A numeric value that specifies by which number the Feature minimum will be divided
#'
#' @return A tibble with imputed missing values.
#' @references \href{https://omicsforum.ca/t/how-to-deal-with-missing-values/75}{LoD on OmicsForum}
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_lod()
impute_lod <- function(data, div_by = 5) {
  # https://omicsforum.ca/t/how-to-deal-with-missing-values/75

  # min() of an empty set is Inf (and raises a warning). Fall back to NA so
  # that a Feature consisting only of NA is not filled with Inf.
  feature_minimum <- function(x) {
    if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)
  }

  # The LoD is estimated per Feature (UID). No helper column is added, so an
  # existing column of `data` cannot be overwritten or dropped by accident.
  data %>%
    dplyr::group_by(.data$UID) %>%
    dplyr::mutate(Intensity = dplyr::if_else(
      is.na(.data$Intensity),
      feature_minimum(.data$Intensity) / .env$div_by,
      .data$Intensity
    )) %>%
    dplyr::ungroup()
}

#' Impute missing values using nearest neighbor averaging
#'
#' @description
#' Basically a wrapper function around `impute::`\code{\link[impute]{impute.knn}}. Imputes missing values using the k-nearest neighbors (kNN) algorithm.
#'
#'
#' Note that the function ln-transforms the data prior to imputation and transforms it back to the original scale afterwards. **Please do not do it manually prior to calling `impute_knn()`!**
#' See References for more information.
#'
#'
#' \strong{Important Note}
#'
#' `impute_knn()` depends on the `impute` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_knn()` is called without the `impute` package installed, you should be asked if you want to install `pak` and `impute`.
#' If you want to use `impute_knn()` you have to install those. In case you run into trouble with the automatic installation, please install `impute` manually. See
#' \href{https://bioconductor.org/packages/release/bioc/html/impute.html}{impute: Imputation for microarray data} for instructions on manual installation.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param quietly `TRUE` or `FALSE`. Should messages and warnings from \code{\link[impute]{impute.knn}} be printed to the console?
#' @param ... Additional parameters passed to \code{\link[impute]{impute.knn}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item Robert Tibshirani, Trevor Hastie, \strong{2017}, DOI 10.18129/B9.BIOC.IMPUTE.
#' \item J. Khan, J. S. Wei, M. Ringnér, L. H. Saal, M. Ladanyi, F. Westermann, F. Berthold, M. Schwab, C. R. Antonescu, C. Peterson, P. S. Meltzer, \emph{Nat Med} \strong{2001}, \emph{7}, 673–679, DOI 10.1038/89044.
#' }
#' @examples
#' toy_metaboscape %>%
#'   impute_knn()
impute_knn <- function(data, quietly = TRUE, ...) {
  # impute is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("impute")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("impute")
  }

  # remember the column order of the input so it can be restored at the end
  data_colnames <- colnames(data)

  # remember the order in which the samples first appear
  sample_order <- unique(dplyr::pull(data, "Sample"))

  # everything except the intensities is joined back on after imputation
  data_meta <- dplyr::select(data, -"Intensity")

  # ln-transform the intensities and spread them to a wide layout:
  # one row per UID, one column per Sample
  data_wide <- data %>%
    dplyr::select("UID", "Intensity", "Sample") %>%
    dplyr::mutate(Intensity = log(.data$Intensity)) %>%
    tidyr::spread(key = "Sample", value = "Intensity")

  # the UIDs are kept separately so the matrix only holds intensities
  uids <- dplyr::select(data_wide, "UID")
  data_wide <- dplyr::select(data_wide, -"UID")

  # column names are stored so they can be re-applied after imputation
  wide_colnames <- colnames(data_wide)

  intensity_matrix <- as.matrix(data_wide)

  # isTRUE() also copes with NA or non-scalar input instead of failing in `if`
  if (isTRUE(quietly)) {
    # knn_impute_quiet() is defined elsewhere; its return value is read
    # through `$warnings` and `$result` here
    knn_out <- knn_impute_quiet(intensity_matrix, ...)
    # warnings will be printed anyways; several warnings are separated by a
    # line break instead of being glued together
    if (length(knn_out$warnings) > 0) {
      warning(paste(knn_out$warnings, collapse = "\n"))
    }
    knn_out <- knn_out$result
  } else {
    # with_preserve_seed() is used to preserve the random seed
    knn_out <- withr::with_preserve_seed(impute::impute.knn(intensity_matrix, ...))
  }

  data_imputed <- tidyr::as_tibble(knn_out$data)

  # restore column names and the original sample order
  colnames(data_imputed) <- wide_colnames
  data_imputed <- data_imputed[sample_order]

  data <- data_imputed %>%
    cbind(uids) %>%
    tidyr::gather(-"UID", key = "Sample", value = "Intensity") %>%
    # reverse ln-transformation
    dplyr::mutate(Intensity = exp(.data$Intensity)) %>%
    dplyr::left_join(data_meta, by = c("UID", "Sample"))

  # bring columns to correct order
  data <- data[data_colnames]

  tidyr::as_tibble(data)
}

#' Impute missing values using random forest
#'
#' @description
#' Basically a wrapper function around `missForest::`\code{\link[missForest]{missForest}}. Imputes missing values using the random forest algorithm.
#'
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param random_seed A seed for the random number generator. Can be an integer or `NULL` (in case no particular seed should be used) but for reproducibility reasons it is **strongly advised** to provide an integer.
#' @param ... Additional parameters passed to \code{\link[missForest]{missForest}}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item \CRANpkg{missForest} on CRAN
#' \item D. J. Stekhoven, P. Bühlmann, \emph{Bioinformatics} \strong{2012}, \emph{28}, 112–118, DOI 10.1093/bioinformatics/btr597.
#' }
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_rf()
impute_rf <- function(data, random_seed = 1L, ...) {
  # remember the column order of the input so it can be restored at the end
  data_colnames <- colnames(data)

  # everything except the intensities; the imputed values are joined onto
  # this table, which also keeps the rows in their original order
  data_meta <- dplyr::select(data, -"Intensity")

  # wide layout: one row per Sample, one column per UID
  data_wide <- data %>%
    dplyr::select("UID", "Intensity", "Sample") %>%
    tidyr::spread(key = "UID", value = "Intensity") %>%
    as.data.frame()

  # the sample names are kept separately so that only intensities are
  # handed to missForest
  sample_names <- dplyr::select(data_wide, "Sample")
  feature_table <- dplyr::select(data_wide, -"Sample")

  if (!is.null(random_seed)) {
    # reproducible run with the requested seed
    forest_out <- withr::with_seed(seed = random_seed, missForest::missForest(feature_table, ...))
  } else {
    # with_preserve_seed might not be necessary
    forest_out <- withr::with_preserve_seed(missForest::missForest(feature_table, ...))
  }

  # `ximp` holds the imputed table; bring it back to the long layout.
  # The UIDs became column names (character) in the wide layout and are
  # converted back to integer so they can be matched in the join below.
  data_imputed <- forest_out$ximp %>%
    cbind(sample_names) %>%
    tidyr::gather(-"Sample", key = "UID", value = "Intensity") %>%
    dplyr::mutate(UID = as.integer(.data$UID))

  data_imputed <- dplyr::left_join(data_meta, data_imputed, by = c("UID", "Sample")) %>%
    tidyr::as_tibble()

  # bring columns to correct order
  data_imputed[data_colnames]
}

#' Impute missing values using NIPALS PCA
#'
#' @description
#' One of several PCA-based imputation methods. Basically a wrapper around `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "nipals")`.
#' For a detailed discussion, see the `vignette("pcaMethods")` and `vignette("missingValues", "pcaMethods")` as well as the References section.
#'
#' \emph{Important Note}
#'
#' `impute_nipals()` depends on the `pcaMethods` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_nipals()` is called without the `pcaMethods` package installed, you should be asked if you want to install `pak` and `pcaMethods`.
#' If you want to use `impute_nipals()` you have to install those. In case you run into trouble with the automatic installation, please install `pcaMethods` manually. See
#' \href{https://www.bioconductor.org/packages/release/bioc/html/pcaMethods.html}{pcaMethods – a Bioconductor package providing PCA methods for incomplete data} for instructions on manual installation.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param n_pcs The number of PCs to calculate.
#' @param center Should `data` be mean centered? See \code{\link[pcaMethods]{prep}} for details.
#' @param scale Should `data` be scaled? See \code{\link[pcaMethods]{prep}} for details.
#' @param direction Either `1` or `2`. `1` runs a PCA on a matrix with samples in columns and features in rows and `2` runs a PCA on a matrix with features in columns and samples in rows.
#' Both are valid according to this \href{https://github.com/hredestig/pcaMethods/issues/25}{discussion on GitHub} but give \strong{different results}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item H. R. Wolfram Stacklies, \strong{2017}, DOI 10.18129/B9.BIOC.PCAMETHODS.
#' \item W. Stacklies, H. Redestig, M. Scholz, D. Walther, J. Selbig, \emph{Bioinformatics} \strong{2007}, \emph{23}, 1164–1167, DOI 10.1093/bioinformatics/btm069.
#' }
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_nipals()
impute_nipals <- function(data, n_pcs = 2, center = TRUE, scale = "none", direction = 2) {
  # pcaMethods is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("pcaMethods")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("pcaMethods")
  }

  # internal_prep_pca_imputes() is defined elsewhere; its `data` element is
  # what is handed to pcaMethods::pca()
  data_list <- internal_prep_pca_imputes(data = data, direction = direction)

  pca_result <- pcaMethods::pca(data_list$data, nPcs = n_pcs, method = "nipals", center = center, scale = scale)

  # extract the completed observations from the PCA result
  data_list$data <- pcaMethods::completeObs(pca_result)

  internal_clean_pca_results(data_list = data_list, direction = direction)
}

#' Impute missing values using Bayesian PCA
#'
#'
#' @description
#' One of several PCA-based imputation methods. Basically a wrapper around `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "bpca")`.
#' For a detailed discussion, see the `vignette("pcaMethods")` and `vignette("missingValues", "pcaMethods")` as well as the References section.
#'
#' \strong{Important Note}
#'
#' `impute_bpca()` depends on the `pcaMethods` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_bpca()` is called without the `pcaMethods` package installed, you should be asked if you want to install `pak` and `pcaMethods`.
#' If you want to use `impute_bpca()` you have to install those. In case you run into trouble with the automatic installation, please install `pcaMethods` manually. See
#' \href{https://www.bioconductor.org/packages/release/bioc/html/pcaMethods.html}{pcaMethods – a Bioconductor package providing PCA methods for incomplete data}.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param n_pcs The number of PCs to calculate.
#' @param center Should `data` be mean centered? See \code{\link[pcaMethods]{prep}} for details.
#' @param scale Should `data` be scaled? See \code{\link[pcaMethods]{prep}} for details.
#' @param direction Either `1` or `2`. `1` runs a PCA on a matrix with samples in columns and features in rows and `2` runs a PCA on a matrix with features in columns and samples in rows.
#' Both are valid according to this \href{https://github.com/hredestig/pcaMethods/issues/25}{discussion on GitHub} but give \strong{different results}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item H. R. Wolfram Stacklies, \strong{2017}, DOI 10.18129/B9.BIOC.PCAMETHODS.
#' \item W. Stacklies, H. Redestig, M. Scholz, D. Walther, J. Selbig, \emph{Bioinformatics} \strong{2007}, \emph{23}, 1164–1167, DOI 10.1093/bioinformatics/btm069.
#' }
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_bpca()
impute_bpca <- function(data, n_pcs = 2, center = TRUE, scale = "none", direction = 2) {
  # pcaMethods is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("pcaMethods")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("pcaMethods")
  }

  # internal_prep_pca_imputes() is defined elsewhere; its `data` element is
  # what is handed to pcaMethods::pca()
  data_list <- internal_prep_pca_imputes(data = data, direction = direction)

  # header for whatever pcaMethods::pca() prints while it runs
  rlang::inform(message = "`impute_bpca` output:\n\n")
  pca_result <- pcaMethods::pca(data_list$data, nPcs = n_pcs, method = "bpca", center = center, scale = scale)

  # extract the completed observations from the PCA result
  data_list$data <- pcaMethods::completeObs(pca_result)

  internal_clean_pca_results(data_list = data_list, direction = direction)
}

#' Impute missing values using Probabilistic PCA
#'
#'
#' @description
#' One of several PCA-based imputation methods. Basically a wrapper around `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "ppca")`.
#' For a detailed discussion, see the `vignette("pcaMethods")` and `vignette("missingValues", "pcaMethods")` as well as the References section.
#'
#' In the underlying function (`pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "ppca")`), the order of columns has an influence on the outcome. Therefore, calling `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "ppca")`
#' on a matrix and calling `metamorphr::impute_ppca()` on a tidy tibble might give different results, even though they contain the same data. That is because under the hood,
#' the tibble is transformed to a matrix prior to calling `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "ppca")` and you have limited influence on the column order of the
#' resulting matrix.
#'
#' \emph{Important Note}
#'
#' `impute_ppca()` depends on the `pcaMethods` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_ppca()` is called without the `pcaMethods` package installed, you should be asked if you want to install `pak` and `pcaMethods`.
#' If you want to use `impute_ppca()` you have to install those. In case you run into trouble with the automatic installation, please install `pcaMethods` manually. See
#' \href{https://www.bioconductor.org/packages/release/bioc/html/pcaMethods.html}{pcaMethods – a Bioconductor package providing PCA methods for incomplete data} for instructions on manual installation.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param n_pcs The number of PCs to calculate.
#' @param center Should `data` be mean centered? See \code{\link[pcaMethods]{prep}} for details.
#' @param scale Should `data` be scaled? See \code{\link[pcaMethods]{prep}} for details.
#' @param direction Either `1` or `2`. `1` runs a PCA on a matrix with samples in columns and features in rows and `2` runs a PCA on a matrix with features in columns and samples in rows.
#' Both are valid according to this \href{https://github.com/hredestig/pcaMethods/issues/25}{discussion on GitHub} but give \strong{different results}.
#' @param random_seed An integer used as seed for the random number generator.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item H. R. Wolfram Stacklies, \strong{2017}, DOI 10.18129/B9.BIOC.PCAMETHODS.
#' \item W. Stacklies, H. Redestig, M. Scholz, D. Walther, J. Selbig, \emph{Bioinformatics} \strong{2007}, \emph{23}, 1164–1167, DOI 10.1093/bioinformatics/btm069.
#' }
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_ppca()
impute_ppca <- function(data, n_pcs = 2, center = TRUE, scale = "none", direction = 2, random_seed = 1L) {
  # pcaMethods is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("pcaMethods")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("pcaMethods")
  }

  # internal_prep_pca_imputes() is defined elsewhere; its `data` element is
  # what is handed to pcaMethods::pca()
  data_list <- internal_prep_pca_imputes(data = data, direction = direction)

  # The seed is handed to pcaMethods via its `seed` argument. The call is
  # wrapped in with_preserve_seed() so that the caller's random number
  # generator state is restored afterwards.
  pca_result <- withr::with_preserve_seed(
    pcaMethods::pca(data_list$data, nPcs = n_pcs, method = "ppca", seed = random_seed, center = center, scale = scale)
  )

  # extract the completed observations from the PCA result
  data_list$data <- pcaMethods::completeObs(pca_result)

  internal_clean_pca_results(data_list = data_list, direction = direction)
}

#' Impute missing values using Singular Value Decomposition (SVD)
#'
#'
#' @description
#' Basically a wrapper around `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "svdImpute")`.
#' For a detailed discussion, see the `vignette("pcaMethods")` and `vignette("missingValues", "pcaMethods")` as well as the References section.
#'
#' \emph{Important Note}
#'
#' `impute_svd()` depends on the `pcaMethods` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_svd()` is called without the `pcaMethods` package installed, you should be asked if you want to install `pak` and `pcaMethods`.
#' If you want to use `impute_svd()` you have to install those. In case you run into trouble with the automatic installation, please install `pcaMethods` manually. See
#' \href{https://www.bioconductor.org/packages/release/bioc/html/pcaMethods.html}{pcaMethods – a Bioconductor package providing PCA methods for incomplete data} for instructions on manual installation.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param n_pcs The number of PCs to calculate.
#' @param center Should `data` be mean centered? See \code{\link[pcaMethods]{prep}} for details.
#' @param scale Should `data` be scaled? See \code{\link[pcaMethods]{prep}} for details.
#' @param direction Either `1` or `2`. `1` runs `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "svdImpute")` on a matrix with samples in columns and features in rows and `2` runs `pcaMethods::`\code{\link[pcaMethods]{pca}}`(method = "svdImpute")` on a matrix with features in columns and samples in rows.
#' Both are valid according to this \href{https://github.com/hredestig/pcaMethods/issues/25}{discussion on GitHub} but give \strong{different results}.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item H. R. Wolfram Stacklies, \strong{2017}, DOI 10.18129/B9.BIOC.PCAMETHODS.
#' \item W. Stacklies, H. Redestig, M. Scholz, D. Walther, J. Selbig, \emph{Bioinformatics} \strong{2007}, \emph{23}, 1164–1167, DOI 10.1093/bioinformatics/btm069.
#' \item O. Troyanskaya, M. Cantor, G. Sherlock, P. Brown, T. Hastie, R. Tibshirani, D. Botstein, R. B. Altman, \emph{Bioinformatics} \strong{2001}, \emph{17}, 520–525, DOI 10.1093/bioinformatics/17.6.520.
#' }
#'
#' @examples
#' toy_metaboscape %>%
#'   impute_svd()
impute_svd <- function(data, n_pcs = 2, center = TRUE, scale = "none", direction = 2) {
  # pcaMethods is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("pcaMethods")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("pcaMethods")
  }

  # internal_prep_pca_imputes() is defined elsewhere; its `data` element is
  # what is handed to pcaMethods::pca()
  data_list <- internal_prep_pca_imputes(data = data, direction = direction)

  pca_result <- pcaMethods::pca(data_list$data, nPcs = n_pcs, method = "svdImpute", verbose = FALSE, center = center, scale = scale)

  # extract the completed observations from the PCA result
  data_list$data <- pcaMethods::completeObs(pca_result)

  internal_clean_pca_results(data_list = data_list, direction = direction)
}

#' Impute missing values using Local Least Squares (LLS)
#'
#' @description
#' Basically a wrapper around `pcaMethods::`\code{\link[pcaMethods]{llsImpute}}.
#' For a detailed discussion, see the `vignette("pcaMethods")` and `vignette("missingValues", "pcaMethods")` as well as the References section.
#'
#' \emph{Important Note}
#'
#' `impute_lls()` depends on the `pcaMethods` package from Bioconductor. If `metamorphr` was installed via `install.packages()`, dependencies from Bioconductor were not
#' automatically installed. When `impute_lls()` is called without the `pcaMethods` package installed, you should be asked if you want to install `pak` and `pcaMethods`.
#' If you want to use `impute_lls()` you have to install those. In case you run into trouble with the automatic installation, please install `pcaMethods` manually. See
#' \href{https://www.bioconductor.org/packages/release/bioc/html/pcaMethods.html}{pcaMethods – a Bioconductor package providing PCA methods for incomplete data} for instructions on manual installation.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param correlation The method used to calculate correlations between features. One of `"pearson"`, `"spearman"` or `"kendall"`. See \code{\link[stats]{cor}}.
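#' @param complete_genes `TRUE` or `FALSE`. Passed on unchanged as the `allVariables` argument of \code{\link[pcaMethods]{llsImpute}}. According to the `pcaMethods` documentation, `FALSE` uses only complete features for the regression and `TRUE` considers all features, including those that contain missing values.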
#' @param center Should `data` be mean centered? See \code{\link[pcaMethods]{prep}} for details.
#' @param cluster_size The number of similar features used for regression.
#'
#' @return A tibble with imputed missing values.
#' @export
#'
#' @references \itemize{
#' \item H. R. Wolfram Stacklies, \strong{2017}, DOI 10.18129/B9.BIOC.PCAMETHODS.
#' \item W. Stacklies, H. Redestig, M. Scholz, D. Walther, J. Selbig, \emph{Bioinformatics} \strong{2007}, \emph{23}, 1164–1167, DOI 10.1093/bioinformatics/btm069.
#' }
#'
#' @examples
#' # The cluster size must be reduced because
#' # the data set is too small for the default (10)
#'
#' toy_metaboscape %>%
#'   impute_lls(complete_genes = TRUE, cluster_size = 5)
impute_lls <- function(data, correlation = "pearson", complete_genes = FALSE, center = FALSE, cluster_size = 10) {
  # pcaMethods is a Bioconductor package, so it is not installed together with
  # metamorphr when metamorphr is installed via install.packages().
  # Check that it is available; if it is not, also check for pak first.
  if (!is_installed_wrapper("pcaMethods")) {
    if (!is_installed_wrapper("pak")) {
      check_installed_wrapper("pak")
    }
    check_installed_wrapper("pcaMethods")
  }

  # internal_prep_pca_imputes() is defined elsewhere; its `data` element is
  # what is handed to pcaMethods::llsImpute(). The direction is fixed to 2.
  data_list <- internal_prep_pca_imputes(data = data, direction = 2)

  # NOTE(review): `complete_genes` is forwarded as-is to `allVariables`.
  # Per the pcaMethods documentation, allVariables = FALSE restricts the
  # regression to complete features and TRUE uses all of them, so the name
  # `complete_genes` reads inverted -- confirm the intended meaning before
  # changing this mapping, as callers may rely on the current behaviour.
  lls_result <- pcaMethods::llsImpute(
    data_list$data,
    verbose = FALSE,
    correlation = correlation,
    center = center,
    allVariables = complete_genes,
    k = cluster_size
  )

  # extract the completed observations from the result
  data_list$data <- pcaMethods::completeObs(lls_result)

  internal_clean_pca_results(data_list = data_list, direction = 2)
}

# Try the metamorphr package in your browser

# Any scripts or data that you put into this service are public.

# metamorphr documentation built on June 10, 2026, 5:07 p.m.