hetset: Identification of Heterogeneous Subsets in Data

# hetset-forward stepwise selection ===========================================

scan_hetset <- function(H, level = "univariate", min_size = 2, max_size = 10,
                        rel_imp = 0, em_steps = 2) {
  if (is.null(H@metadata$slf)) {
    H@metadata$slf <- NA
  }
  if (is.null(H$prt)) {
    H$prt <- NA
  }
  if (sum(!is.na(H@metadata$slf)) == 0) {
    if (mean(is.na(H$prt)) == 1) {
      if (level == "univariate") {
        H <- .init_meth_A_univ(H)
      } else {
        H <- .init_meth_A_biv(H)
      }
    } else {
      if (level == "univariate") {
        H <- .init_meth_B(H, scan_depth = 1)
      } else {
        H <- .init_meth_B(H, scan_depth = 2)
      }
    }
  } else if (mean(is.na(H$prt)) == 1) {
    H <- .init_meth_C(H)
  } else {
    H <- estimate_densities(H)
    H <- calculate_dist(H)
  }
  H <- reassign_samples(H)

  iter_max <- 10^2
  iter_no <- 1
  cat(paste0(":",H@metadata$slf))
  while ((sum(!is.na(H@metadata$slf)) < max_size) & (iter_no < iter_max)) {
    H_b <- H
    H_b <- reassign_samples(H_b)
    # distance to be beaten with additional feature:
    ref_het <- calculate_dist(estimate_densities(H_b))@metadata$sqHell
    # initialize current best choice
    H_b@metadata$sqHell <- 0
    # initialize candidate set
    candidates <- H@NAMES[!(H@NAMES %in% H@metadata$slf)]
    for (i_c in candidates) {
      H_c <- H
      H_c@metadata$slf <- c(H@metadata$slf, i_c)
      H_c <- estimate_densities(H_c)
      i.em <- 0
      while (i.em < em_steps) {
        H_c <- reassign_samples(H_c)
        H_c <- estimate_densities(H_c)
        i.em <- i.em + 1
      }
      H_c <- calculate_dist(H_c)
      if ((H_c@metadata$sqHell > H_b@metadata$sqHell) &
        (min(c(H_c@metadata$prp.A, 1 - H_c@metadata$prp.A)) > 0.05)) {
        H_b <- H_c
        cat(".")
      }
      rm(H_c)
    }
    rm(candidates)
    if ((sum(!is.na(H@metadata$slf)) < min_size)){
      cat(paste0("+",H_b@metadata$slf[length(H_b@metadata$slf)]))
      H <- H_b
      H <- reassign_samples(H)
    } else {
      if (H_b@metadata$sqHell > ref_het) {
        cat(paste0("+",H_b@metadata$slf[length(H_b@metadata$slf)]))
        H <- H_b
        H <- reassign_samples(H)
      } else {
        break
      }

      separation <- evaluate_set(H = H,verbose = F)
      low_imp <- separation[which.min(separation$rel.importance),]
      if(low_imp$rel.importance < rel_imp){
        ohne <- low_imp$Feature
        cat(paste0("-",ohne))
        H <- subset_hetset(H = H,remove.features = ohne)
        iter_no <- Inf
      }
      iter_no <- iter_no + 1
    }
  }
  if (sum(!is.na(H@metadata$slf))) {
    H@metadata$color_type <- "subpop"
  }

  H
}

# Initialize Scenario A -------------------------------------------------------
.init_meth_A_univ <- function(H) {
  cat(" no partitioning, no set ... initializing ... univariate scan ...  \n")
  H_b <- H
  H_b@metadata$sqHell <- 0
  H_c <- H
  for (i_c in H@NAMES) {
    H_c@metadata$slf <- i_c
    # fit normal mixture model using mclust
    # add noise for robustness of mclust
    mc_fit_data <- (assays(H_c[H_c@metadata$slf, ])[[1]]
    + 0.0025 * sd(assays(H_c[H_c@metadata$slf, ])[[1]], na.rm = T)
        * rnorm(n = length(assays(H_c[H_c@metadata$slf, ])[[1]])))
    mc_fit <- mclust::Mclust(data = mc_fit_data, G = 2, modelNames = "V",
                             verbose = F)
    # calculate
    H_c$prt <- factor(c("A", "B")[mc_fit$classification], levels = c("A", "B"))
    H_c@metadata$prm.A$mean <- mc_fit$parameters$mean[1]
    H_c@metadata$prm.A$cov <- mc_fit$parameters$variance$sigmasq[1]
    H_c@metadata$prm.B$mean <- mc_fit$parameters$mean[2]
    H_c@metadata$prm.B$cov <- mc_fit$parameters$variance$sigmasq[2]
    H_c@metadata$prp.A <- sum(H_c$prt == "A") / sum(!is.na(H_c$prt))
    H_c <- calculate_dist(H_c)
    if ((H_c@metadata$sqHell > H_b@metadata$sqHell) &
      (min(c(H_c@metadata$prp.A, 1 - H_c@metadata$prp.A)) > 0.05)) {
      H_c@metadata$prm.full$mean <- mean(assays(H[i_c, ])[[1]], na.rm = T)
      H_c@metadata$prm.full$cov <- var(t(assays(H[i_c, ])[[1]]), na.rm = T)
      H_b <- H_c
    }
  }
  H <- H_b
  H
}

.init_meth_A_biv <- function(H) {
  cat(" no partitioning, no set ... initializing ... bivariate scan ...  \n")
  H_b <- H
  H_b@metadata$sqHell <- 0
  H_c <- H
  feature_pairs <- t(combn(x = H@NAMES, m = 2))
  N_combn <- dim(feature_pairs)[1]
  for (i_c in 1:N_combn) {
    pair <- feature_pairs[i_c, ]
    H_c@metadata$slf <- pair
    # fit normal mixture model using mclust
    # add noise for robustness of mclust
    mc_fit_data <- (t(assays(H_c[H_c@metadata$slf, ])[[1]])
    + 0.001 * sd(assays(H_c[H_c@metadata$slf, ])[[1]], na.rm = T)
        * rnorm(n = prod(dim(assays(H_c[H_c@metadata$slf, ])[[1]]))))
    mc_fit <- mclust::Mclust(data = mc_fit_data, G = 2, modelNames = "VVV",
                             verbose = F)
    # calculate
    H_c$prt <- factor(c("A", "B")[mc_fit$classification], levels = c("A", "B"))
    H_c@metadata$prm.A$mean <- mc_fit$parameters$mean[, 1]
    H_c@metadata$prm.A$cov <- mc_fit$parameters$variance$sigma[, , 1]
    H_c@metadata$prm.B$mean <- mc_fit$parameters$mean[, 2]
    H_c@metadata$prm.B$cov <- mc_fit$parameters$variance$sigma[, , 2]
    H_c@metadata$prp.A <- sum(H_c$prt == "A") / sum(!is.na(H_c$prt))
    H_c <- calculate_dist(H_c)
    if ((H_c@metadata$sqHell > H_b@metadata$sqHell) &
        (min(c(H_c@metadata$prp.A, 1 - H_c@metadata$prp.A)) > 0.05)) {
      H_c@metadata$prm.full$mean <- rowMeans(assays(H[pair, ])[[1]], na.rm = T)
      H_c@metadata$prm.full$cov <- cov(t(assays(H[pair, ])[[1]]),
                                       use = "complete.obs")
      H_b <- H_c
    }
  }
  H <- H_b
  H
}

# Initialize Scenario B -------------------------------------------------------
.init_meth_B <- function(H, scan_depth = 2) {
  cat(" partitioning, no set ... initializing ...   \n")
  H_b <- H
  H_c <- H
  rm(H)
  H_b@metadata$sqHell <- 0
  feature_pairs <- t(combn(x = H_b@NAMES, m = scan_depth))
  N_combn <- dim(feature_pairs)[1]
  for (i_c in 1:N_combn) {
    pair <- feature_pairs[i_c, ]
    H_c@metadata$slf <- pair
    # calculate
    H_c <- estimate_densities(H_c)
    H_c <- calculate_dist(H_c)
    if (H_c@metadata$sqHell > H_b@metadata$sqHell) {
      H_b <- H_c
    }
  }
  H <- H_b
  H
}

# Initialize Scenario C -------------------------------------------------------
.init_meth_C <- function(H) {
  cat(" no partitioning, set of intital features ... initializing ...   \n")
  if (sum(!is.na(H@metadata$slf)) == 1) {
    # fit normal mixture model using mclust
    # add noise for robustness of mclust
    mc_fit_data <- (assays(H[H@metadata$slf, ])[[1]]
    + 0.001 * sd(assays(H[H@metadata$slf, ])[[1]], na.rm = T)
        * rnorm(n = length(assays(H[H@metadata$slf, ])[[1]])))
    mc_fit <- mclust::Mclust(
      data = mc_fit_data, G = 2, modelNames = "V",
      verbose = F
    )
    H@metadata$prm.A$mean <- mc_fit$parameters$mean[1]
    H@metadata$prm.A$cov <- mc_fit$parameters$variance$sigmasq[1]
    H@metadata$prm.B$mean <- mc_fit$parameters$mean[2]
    H@metadata$prm.B$cov <- mc_fit$parameters$variance$sigmasq[2]
    H@metadata$prm.full$mean <- mean(mc_fit_data, na.rm = T)
    H@metadata$prm.full$cov <- var(mc_fit_data, na.rm = T)
  } else {
    # fit normal mixture model using mclust
    # add noise for robustness of mclust
    mc_fit_data <- (t(assays(H[H@metadata$slf, ])[[1]])
    + 0.001 * sd(assays(H[H@metadata$slf, ])[[1]], na.rm = T)
        * rnorm(n = prod(dim(assays(H[H@metadata$slf, ])[[1]]))))
    mc_fit <- mclust::Mclust(
      data = mc_fit_data, G = 2, modelNames = "VVV",
      verbose = F
    )
    H@metadata$prm.A$mean <- mc_fit$parameters$mean[, 1]
    H@metadata$prm.A$cov <- mc_fit$parameters$variance$sigma[, , 1]
    H@metadata$prm.B$mean <- mc_fit$parameters$mean[, 2]
    H@metadata$prm.B$cov <- mc_fit$parameters$variance$sigma[, , 2]
    H@metadata$prm.full$mean <- colMeans(mc_fit_data, na.rm = T)
    H@metadata$prm.full$cov <- cov(mc_fit_data, use = "complete.obs")
  }
  H$prt <- factor(c("A", "B")[mc_fit$classification], levels = c("A", "B"))
  H@metadata$prp.A <- sum(H$prt == "A") / sum(!is.na(H$prt))
  H <- calculate_dist(H)
  H
}

ZytoHMGU/hetset documentation built on June 6, 2019, 2:16 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ZytoHMGU/hetset
Identification of Heterogeneous Subsets in Data

R/hetset_FSS.R
In ZytoHMGU/hetset: Identification of Heterogeneous Subsets in Data

R Package Documentation

Browse R Packages

We want your feedback!

ZytoHMGU/hetset Identification of Heterogeneous Subsets in Data

R/hetset_FSS.R In ZytoHMGU/hetset: Identification of Heterogeneous Subsets in Data

R Package Documentation

Browse R Packages

We want your feedback!

ZytoHMGU/hetset
Identification of Heterogeneous Subsets in Data

R/hetset_FSS.R
In ZytoHMGU/hetset: Identification of Heterogeneous Subsets in Data