R/best_practice_helper.R

Defines functions get_expo_corr_stat get_similarity_stats get_3d_array_stats get_stat_samps get_stat_sigs tf_signature normalize_solution process_solution

# Extraction helpers ------------------------------------------------------

process_solution <- function(slist, catalogue_matrix,
                             report_integer_exposure = FALSE,
                             only_core_stats = FALSE) {
  on.exit(invisible(gc()), add = TRUE)
  send_info("Normalizing solutions to get Signature and Exposure.")
  out <- purrr::map(slist, .f = normalize_solution) %>%
    setNames(paste0("Run", seq_along(slist)))
  sn <- ncol(out[[1]]$Signature)
  send_info("Start processing ", sn, " signatures.")

  # If there are hyper-mutant records, collapse the samples into true samples
  if (any(grepl("_\\[hyper\\]_", colnames(catalogue_matrix)))) {
    send_info("Hyper-mutant records detected, revert them to true samples.")
    catalogue_matrix <- collapse_hyper_records(catalogue_matrix)
    purrr::map(seq_along(out), function(i) {
      out[[i]]$Exposure <<- collapse_hyper_records(out[[i]]$Exposure)
    })
    purrr::map(seq_along(out), function(i) {
      out[[i]]$H <<- collapse_hyper_records(out[[i]]$H)
    })
    send_success("Reverted.")
  }

  # If just one run, skip the following steps
  if (length(out) >= 2) {
    send_info("Running clustering with match algorithm for signature assignment in different NMF runs.")
    run_pairs <- combn(names(out), 2, simplify = FALSE)
    run_pairs_name <- purrr::map_chr(run_pairs, ~ paste(., collapse = "_"))
    send_success("Run pairs obtained.")
    # Get similarity distance
    pair_dist <- purrr::map(
      run_pairs,
      ~ get_cosine_distance(out[[.[1]]]$Signature, out[[.[2]]]$Signature)
    ) %>%
      setNames(run_pairs_name)
    send_success("Run pairwise similarity distance calculated.")
    pair_dist_mean <- purrr::map_dbl(pair_dist, mean) %>%
      setNames(run_pairs_name)
    match_list <- purrr::map2(pair_dist, run_pairs, get_matches)
    send_success("Match list obtained.")

    # Order the result by mean distance
    res_orders <- order(pair_dist_mean)
    run_pairs <- run_pairs[res_orders]
    pair_dist <- pair_dist[res_orders]
    pair_dist_mean <- pair_dist_mean[res_orders]
    match_list <- match_list[res_orders]
    send_success("Match list ordered by distance.")

    # Do clustering with match
    clusters <- clustering_with_match(match_list, n = length(out))
    send_success("Clustering done.")

    # 按 match cluster 排序
    # 样本排序也确保对齐
    send_info("Ordering data based on the clustering result.")
    samp_order <- colnames(out$Run1$Exposure)
    out <- purrr::map2(clusters, out, function(x, y, samp_order) {
      y$Signature <- y$Signature[, x, drop = FALSE]
      y$Exposure <- y$Exposure[x, samp_order, drop = FALSE]
      y
    }, samp_order = samp_order)
    send_success("Done.")
  } else {
    run_pairs <- NA
    pair_dist <- NA
    pair_dist_mean <- NA
    match_list <- NA
    clusters <- data.table::data.table(
      Run1 = paste0("S", seq_len(ncol(out$Run1$Signature)))
    )
  }

  send_info("Calculating stats for signatures and samples.")
  # 生成统计量
  # Remember the new order above
  # 分为 signature 和 样本两种,取每个度量的最大、最小值、平均值以及 SD
  # 重构相似性,轮廓系数,
  # 聚类平均相似距离, 平均错误,Exposure 相关性等
  send_info("Getting signature stats.")
  stat_sigs <- get_stat_sigs(out)
  send_info("Getting sample stats.")
  stat_samps <- get_stat_samps(out, mat = catalogue_matrix, only_core_stats = only_core_stats)
  send_success("Done.")

  send_info("Outputing extraction result and corresponding stats.")

  Signature <- stat_sigs$signature
  Exposure <- stat_samps$exposure
  stats_signature <- stat_sigs$stats
  stats_sample <- stat_samps$stats

  # Order by contribution and set signature names
  new_order <- order(colSums(Signature$signature_mean), decreasing = TRUE)

  Signature <- lapply(Signature, function(s) {
    s <- s[, new_order, drop = FALSE]
    rownames(s) <- rownames(catalogue_matrix)
    colnames(s) <- paste0("Sig", seq_along(new_order))
    s
  })

  Exposure <- lapply(Exposure, function(e) {
    e <- e[new_order, , drop = FALSE]
    colnames(e) <- colnames(catalogue_matrix)
    rownames(e) <- paste0("Sig", seq_along(new_order))
    e
  })

  # Update the order for stats_signature and set signature names
  stats_signature <- stats_signature[new_order, ]
  stats_signature$signature <- paste0("Sig", seq_along(new_order))

  # Merge the stats and generate measures for selecting signature number
  stats <- data.frame(
    signature_number = nrow(stats_signature),
    silhouette = mean(stats_signature$silhouette),
    sample_cosine_distance = mean(stats_sample$cosine_distance_mean),
    L1_error = mean(stats_sample$L1_mean),
    L2_error = mean(stats_sample$L2_mean),
    exposure_positive_correlation = mean(stats_signature$expo_pos_cor_mean),
    signature_similarity_within_cluster = mean(stats_signature$similarity_mean),
    signature_similarity_across_cluster = mean(stats_signature$cross_similarity_mean),
    silhouette_sample = if (isTRUE(only_core_stats)) NA else mean(stats_sample$silhouette),
    # 不同 runs 同一样本看作一个 cluster
    # 展示的是样本间的区分度
    stringsAsFactors = FALSE
  )
  send_success("Data.frame for stats generated.")

  # Generate Signature object
  object <- tf_signature(
    Signature$signature_mean,
    Exposure$exposure_mean,
    used_runs = length(out),
    catalogue_matrix = if (report_integer_exposure) catalogue_matrix else NULL
  )
  send_success("Signature object generated.")

  list(
    object = object,
    stats = stats,
    stats_signature = stats_signature,
    stats_sample = stats_sample,
    signature = Signature,
    exposure = Exposure
  )
}

normalize_solution <- function(solution) {
  # solution is a NMF fit result
  on.exit(invisible(gc()), add = TRUE)

  out <- c(helper_scale_nmf_matrix(solution$W, solution$H, solution$K, handle_cn = FALSE),
    list(W = solution$W, H = solution$H),
    KLD = solution$KLD
  )
  colnames(out$Signature) <- colnames(out$W) <- rownames(out$Exposure) <- rownames(out$H) <- paste0("S", seq_len(solution$K))
  out
}

# Transform data into Signature object
tf_signature <- function(s, e, used_runs, catalogue_matrix = NULL) {
  on.exit(invisible(gc()), add = TRUE)

  # If total_records is not NULL
  # generate integer counts based on resampling
  s.norm <- apply(s, 2, function(x) x / sum(x, na.rm = TRUE))
  e.norm <- apply(e, 2, function(x) x / sum(x, na.rm = TRUE))
  # When only one signature
  if (!is.matrix(e.norm)) {
    e.norm <- matrix(e.norm,
      nrow = 1,
      dimnames = list(rownames(e), names(e.norm))
    )
  }

  if (!is.null(catalogue_matrix)) {
    set.seed(123, kind = "L'Ecuyer-CMRG")
    s2 <- purrr::map2(
      as.data.frame(s),
      round((colSums(s) / sum(colSums(s))) * sum(catalogue_matrix)),
      simulate_catalogue
    ) %>%
      dplyr::as_tibble() %>%
      as.matrix()
    rownames(s2) <- rownames(s)
    colnames(s2) <- colnames(s)
    s <- s2

    if (nrow(e) < 2) {
      e2 <- matrix(colSums(catalogue_matrix), nrow = 1)
    } else {
      set.seed(123, kind = "L'Ecuyer-CMRG")
      e2 <- purrr::map2(
        as.data.frame(e),
        colSums(catalogue_matrix),
        simulate_catalogue
      ) %>%
        dplyr::as_tibble() %>%
        as.matrix()
    }
    rownames(e2) <- rownames(e)
    colnames(e2) <- colnames(e)
    e <- e2
  }

  obj <- list(
    Signature = s,
    Signature.norm = s.norm,
    Exposure = e,
    Exposure.norm = e.norm,
    K = nrow(e)
    # Raw = list()
  )

  class(obj) <- "Signature"
  attr(obj, "used_runs") <- used_runs
  attr(obj, "method") <- "brunet"
  attr(obj, "call_method") <- "NMF with best practice"

  obj
}

get_stat_sigs <- function(runs) {
  on.exit(invisible(gc()), add = TRUE)

  send_info("\t getting exposure array.")
  sig_list <- purrr::map(runs, "Signature")
  dm <- dim(sig_list[[1]])
  l <- length(sig_list)
  sig_array <- array(unlist(sig_list), dim = c(dm, l))
  rm(sig_list)
  invisible(gc())
  # signatures
  sig <- data.frame(
    signature_number = rep(dm[2], dm[2]),
    signature = seq_len(dm[2]) # Rename it outside the function
  )

  send_info("\t summarizing signatures from different runs.")
  s <- get_3d_array_stats(
    sig_array,
    c(
      "signature_mean", "signature_sd",
      "signature_min", "signature_max"
    )
  )

  send_info("\t summarizing sample profile similarity.")
  # 每个 signature 看作一个类,看一个类中不同 runs 结果的相似性
  sim <- get_similarity_stats(
    sig_array,
    n = dm[2],
    ns = c(
      "similarity_mean", "similarity_sd",
      "similarity_min", "similarity_max"
    )
  )

  # cluster silhouette
  send_info("\t summarizing signature-wise profile similarity.")
  cross_sim <- get_similarity_stats(
    sig_array,
    n = dm[2],
    ns = c(
      "cross_similarity_mean", "cross_similarity_sd",
      "cross_similarity_min", "cross_similarity_max"
    ),
    type = "between-cluster"
  )
  b <- 1 - cross_sim$cross_similarity_mean
  a <- 1 - sim$similarity_mean
  sil_width <- data.frame(
    silhouette = (b - a) / pmax(a, b)
  )
  # The minimum KLD in all kept runs
  KLD <- data.frame(
    KLD = rep(min(purrr::map_dbl(runs, "KLD")), length(a))
  )

  # Get exposure correlation between different signatures
  expo_list <- purrr::map(runs, "Exposure")
  dm <- dim(expo_list[[1]])
  expo_array <- array(unlist(expo_list), dim = c(dm, l))
  rm(sig_array, expo_list, runs)
  invisible(gc())
  expo_cor <- get_expo_corr_stat(expo_array)

  list(
    signature = s,
    stats = cbind(sig, KLD, sil_width, expo_cor, sim, cross_sim)
  )
}

get_stat_samps <- function(runs, mat, only_core_stats = FALSE) {
  on.exit(invisible(gc()), add = TRUE)

  send_info("\t getting exposure array.")
  expo_list <- purrr::map(runs, "Exposure")
  dm <- dim(expo_list[[1]]) # the second value of dm indicates how many samples
  l <- length(expo_list)
  expo_array <- array(unlist(expo_list), dim = c(dm, l))
  rm(expo_list)
  invisible(gc())
  # exposures
  send_info("\t summarizing exposures from different runs.")
  e <- get_3d_array_stats(
    expo_array,
    c(
      "exposure_mean", "exposure_sd",
      "exposure_min", "exposure_max"
    )
  )
  # get catalog array
  send_info("\t getting catalog array.")
  W_list <- purrr::map(runs, "W")
  H_list <- purrr::map(runs, "H")
  catalog_list <- purrr::map2(W_list, H_list, ~ .x %*% .y)
  dm2 <- dim(catalog_list[[1]])
  catalog_array <- array(unlist(catalog_list), dim = c(dm2, l))
  rm(runs, expo_array, W_list, H_list, catalog_list)
  invisible(gc())

  if (isFALSE(only_core_stats)) {
    send_info("\t summarizing sample profile similarity.")
    sim <- get_similarity_stats(
      catalog_array,
      n = dm2[2],
      ns = c(
        "similarity_mean", "similarity_sd",
        "similarity_min", "similarity_max"
      )
    )

    # cluster silhouette
    send_info("\t summarizing sample-wise profile similarity.")
    cross_sim <- get_similarity_stats(
      catalog_array,
      n = dm2[2],
      ns = c(
        "cross_similarity_mean", "cross_similarity_sd",
        "cross_similarity_min", "cross_similarity_max"
      ),
      type = "between-cluster"
    )
    b <- 1 - cross_sim$cross_similarity_mean
    a <- 1 - sim$similarity_mean
    sil_width <- data.frame(
      silhouette = (b - a) / pmax(a, b)
    )
  }

  samp <- data.frame(
    signature_number = rep(dm[1], ncol(mat)),
    sample = colnames(mat)
  )

  # 重构 error:包括 重构相似度
  send_info("\t summarizing sample profile reconstruction stats.")
  error <- get_error_stats(
    catalog_array,
    mat
  )

  list(
    exposure = e,
    stats = if (only_core_stats) cbind(samp, error) else cbind(samp, sil_width, error)
  )
}

get_3d_array_stats <- function(x, ns = NULL) {
  on.exit(invisible(gc()), add = TRUE)

  r <- list(
    mean = apply(x, c(1, 2), mean),
    sd = apply(x, c(1, 2), sd),
    min = apply(x, c(1, 2), min),
    max = apply(x, c(1, 2), max)
  )
  if (!is.null(ns)) {
    names(r) <- ns
  }
  r
}

get_similarity_stats <- function(x,
                                 n,
                                 ns = NULL,
                                 type = "within-cluster") {
  on.exit(invisible(gc()), add = TRUE)

  if (type == "within-cluster") {
    d <- lapply(seq_len(n), function(i) {
      x2 <- x[, i, ]
      if (is.null(dim(x2))) {
        x2 <- matrix(x2, ncol = 1)
      }
      mat <- cosineMatrix(x2, x2)
      if (ncol(mat) > 1) {
        mat[upper.tri(mat)]
      } else {
        mat
      }
    })
  } else if (type == "between-cluster") {
    # 找出最近的组(平均相似性最高)
    d <- lapply(seq_len(n), function(i) {
      if (dim(x)[2] >= 2) {
        x1 <- x[, i, ]
        x2 <- x[, -i, , drop = FALSE]

        if (is.null(dim(x1))) {
          x1 <- matrix(x1, ncol = 1)
        }
        if (dim(x2)[3] == 1) {
          dx2 <- dim(x2)
          dim(x2) <- c(dx2[1], dx2[2] * dx2[3])
          cosineMatrix(x1, x2)
        } else {
          # x2 是 3 维,且第 3 维不止 1
          d2 <- dim(x2)[2]
          sim_list <- lapply(seq_len(d2), function(k) {
            mat <- cosineMatrix(x1, x2[, k, ])
            if (ncol(mat) > 1) {
              mat[upper.tri(mat)]
            } else {
              mat
            }
          })
        }
        # dim(x2) <- c(dim(x1)[1], prod(dim(x2)) / dim(x1)[1])
        sim_list[[which.max(sapply(sim_list, mean))]]
      } else {
        NA
      }
    })
  }
  r <- data.frame(
    mean = sapply(d, mean),
    sd = sapply(d, sd),
    min = sapply(d, min),
    max = sapply(d, max),
    stringsAsFactors = FALSE
  )
  if (!is.null(ns)) {
    colnames(r) <- ns
  }
  r
}

# Quantify the exposure correlation between different signatures with
# Pearson coefficient
get_expo_corr_stat <- function(x) {
  on.exit(invisible(gc()), add = TRUE)

  n <- dim(x)[1] # n signatures
  r <- dim(x)[3] # r runs

  if (n > 1) {
    d <- lapply(seq_len(n), function(i) {
      x1 <- x[i, , , drop = FALSE]
      x2 <- x[-i, , , drop = FALSE]

      rows <- dim(x2)[1]
      corr <- vector("numeric", rows * r)
      # calculate exposure corr in each run
      j <- 1L
      for (row in seq_len(rows)) {
        for (rn in seq_len(r)) {
          corr[j] <- stats::cor(
            x1[1, , rn],
            x2[row, , rn],
            method = "spearman"
          )
          j <- j + 1L
        }
      }

      # 仅关注正相关
      corr <- corr[corr > 0]
      # 乘以一个数量权重
      if (length(corr) == 0) {
        NA
      } else {
        corr <- corr * (length(corr) / (rows * r))
        corr
      }
    })
  } else {
    d <- NA
  }

  r <- data.frame(
    expo_pos_cor_mean = sapply(d, mean),
    expo_pos_cor_sd = sapply(d, sd),
    expo_pos_cor_min = sapply(d, min),
    expo_pos_cor_max = sapply(d, max)
  )
  purrr::map_df(r, ~ ifelse(is.na(.), 0, .)) %>%
    as.data.frame()
}

# The difference between reconstructed catalogs and the original catalog
# 用相似性距离、L1、L2范数
get_error_stats <- function(x, mat) {
  on.exit(invisible(gc()), add = TRUE)

  n <- dim(mat)[2] # n samples
  r <- dim(x)[3] # r runs

  # similarity distance
  d <- lapply(seq_len(n), function(i) {
    x2 <- x[, i, ]
    if (is.null(dim(x2))) {
      x2 <- matrix(x2, ncol = 1)
    }
    1 - cosineMatrix(x2, mat[, i, drop = FALSE])
  })
  # L1 and L2
  # Target at each column (i.e. sample)
  l1 <- lapply(seq_len(n), function(i) {
    colSums(abs(x[, i, ] - mat[, rep(i, r), drop = FALSE]))
  })
  l2 <- lapply(seq_len(n), function(i) {
    sqrt(colSums((x[, i, ] - mat[, rep(i, r), drop = FALSE])^2))
  })

  r <- data.frame(
    cosine_distance_mean = sapply(d, mean),
    cosine_distance_sd = sapply(d, sd),
    cosine_distance_min = sapply(d, min),
    cosine_distance_max = sapply(d, max),
    L1_mean = sapply(l1, mean),
    L1_sd = sapply(l1, sd),
    L1_min = sapply(l1, min),
    L1_max = sapply(l1, max),
    L2_mean = sapply(l2, mean),
    L2_sd = sapply(l2, sd),
    L2_min = sapply(l2, min),
    L2_max = sapply(l2, max),
    stringsAsFactors = FALSE
  )
  r
}

# Column representing signatures
get_cosine_distance <- function(x, y) {
  # rows for x and cols for y
  out <- 1 - cosineMatrix(x, y)
  rownames(out) <- paste0("S", seq_len(ncol(x)))
  colnames(out) <- paste0("S", seq_len(ncol(y)))
  out
}

# test_mat <- matrix(
#   c(0.6, 0.1, 0.5, 0.3, 0.1, 0.7, 0.2, 0.4, 0.9),
#   nrow = 3
# )
# rownames(test_mat) <- colnames(test_mat) <- c("S1", "S2", "S3")
get_matches <- function(mat, runs) {
  # Find the column index of the second run responds to the first run
  match_index <- apply(mat, 1, which.min) %>% as.integer()
  index_uniq <- unique(match_index)
  if (length(match_index) != length(index_uniq)) {
    # Some signatures in the first run is matched by more than once
    # Solve it with lpSolve package
    index_uniq <- apply(lpSolve::lp.assign(mat)$solution, 1, which.max)
  }
  out <- data.table::data.table(
    v1 = rownames(mat),
    v2 = colnames(mat)[index_uniq]
  )
  colnames(out) <- runs
  out
}

# My implementation of clustering with match algorithm proposed
# by Nature Cancer paper by following description in supplementary material
# Steps:
# 1. 得到不同 runs 的 signature 结果列表,进行编号
# 2. 一对一配对计算相似性,并得到距离矩阵
# 3. 每个距离矩阵计算最小平均距离
# 4. 按平均距离对配对 run 进行排序,得到排序好的列表
# 5. 初始化排序列表(步骤4的子集),按顺序利用算法逐步合并
clustering_with_match <- function(match_list, n) {
  on.exit(invisible(gc()), add = TRUE)
  # Input: a match list ordered by mean distance
  # This function implements the core step:
  #   collapse the match list into one data.table step by step
  len <- length(match_list)

  if (len >= 2) {
    reduceG <- function(G) {
      # Reduce elements of G if at least two elements
      # contain common column names
      # G >= 2 elements here
      if (length(G) < 2) {
        return(G)
      }
      cnames <- purrr::map(G, colnames)
      check_list <- combn(seq_along(cnames), 2, simplify = FALSE)
      common <- purrr::map(check_list, ~ intersect(cnames[[.[1]]], cnames[[.[2]]]))

      # Index to reduce
      ri <- purrr::map_lgl(common, ~ length(.) != 0)
      if (any(ri)) {
        purrr::map2(check_list[ri], common[ri], .f = function(x, y) {
          if (!is.na(G[x[1]]) & !is.na(G[x[2]])) {
            # Update global G in reduceG
            G[[min(x)]] <<- merge(G[[x[1]]], G[[x[2]]], by = y)
            # to make sure the data is removed and the index
            # is kept to avoid "subscript out of bounds" error
            G[[max(x)]] <<- NA
          }
        })
        # Remove elements set to NA
        G <- G[!is.na(G)]
        return(reduceG(G))
      } else {
        return(G)
      }
    }

    updateG <- function(G, m) {
      # Add a match m to the set G
      # Check if m can be merged into G
      # if not, add m as a new element of G
      # if yes, merge the m and go further
      # check if the G can be reduced
      flag <- FALSE
      for (i in seq_along(G)) {
        nm <- colnames(G[[i]])
        byi <- colnames(m) %in% nm
        if (any(byi)) {
          # Can be merged
          G[[i]] <- merge(G[[i]], m, by = colnames(m)[byi])
          flag <- TRUE
          break()
        }
      }
      if (isFALSE(flag)) {
        # Set as a new member of G
        G[[length(G) + 1]] <- m
      } else if (length(G) > 1) {
        # Go further check if G can be reduced
        # 这是个递归操作
        G <- reduceG(G)
      }

      return(G)
    }

    G <- match_list[1]
    pair_list <- purrr::map(match_list, colnames)

    # Loop for integration
    # 找到另外 n - 2 个需要合并的 match
    # 边查找,边合并
    for (i in seq_along(pair_list)) {
      if (i == 1L) next()
      if (any(purrr::map_lgl(G, ~ all(pair_list[[i]] %in% colnames(.))))) {
        # Do nothing because the result has been included
        next()
      } else {
        # Include the result
        G <- updateG(G, match_list[[i]])
      }
      if (length(G) == 1L & ncol(G[[1]]) == n) break()
    }
    G <- G[[1]]
    # Check the G set
    if (any(ncol(G) != n, nrow(G) != nrow(match_list[[1]]))) {
      stop("Clustering failed! There are some bugs in code the developer not found, please report it!")
    }

    G[, paste0("Run", seq_len(ncol(G))), with = FALSE]
  } else {
    match_list[[1]]
  }
}

rank_solutions <- function(stats) {
  # 同时迭代 signature number, 指标 data.frame 和对应的排序方法
  # stats 必须按 signature_number 从小到大 order,不然一些定量会有问题
  # 如果 rank 得分相同,后面应该选择 signature 数目较大的
  stats <- stats[order(stats$signature_number), ]

  measures <- c(
    "silhouette", "sample_cosine_distance", "L2_error",
    "exposure_positive_correlation", "signature_similarity_within_cluster"
  )

  types <- c("diff", "increase", "increase", "increase", "diff")

  rk <- purrr::map2_df(
    .x = stats[, measures],
    .y = types,
    .f = function(x, y) {
      if (y == "increase") {
        # Smaller is better
        100 - (rank(x) - 1) * 5
      } else if (y == "decrease") {
        # Larger is better
        100 - (length(x) - rank(x, na.last = FALSE)) * 5
      } else {
        # diff type
        df <- c(diff(x), 0)
        100 - (rank(df) - 1) * 5
      }
    }
  )

  measure2 <- c("silhouette", "L2_error")
  send_info(
    "Calculating integrated rank score based on the measures: ",
    paste(measure2, collapse = ", ")
  )
  weights <- c(0.4, 0.6)
  send_info(
    "Corresponding weights for obtaining the aggregated score are: ",
    paste(weights, collapse = ", ")
  )
  rk$aggregated_score <- as.numeric(as.matrix(rk)[, measure2] %*% weights)
  rk <- cbind(data.frame(signature_number = stats$signature_number), rk)

  rk
}

# Activity helpers --------------------------------------------------------

# 构建一个 signature by sample 逻辑矩阵,
# 先指示某个 signature 是否在某个样本中存在
# 根据已知的 exposure 进行初始化,设定<0.05 相对贡献不存在
# 0 表示不存在 signature,1 表示存在,2 表示全局 signature
# 一开始的矩阵只可能是一个1和2构成的矩阵,后续的操作会进行填0操作
construct_sig_exist_matrix <- function(expo, sample_class = NULL, cutoff = 0.05) {
  out <- expo
  if (is.null(sample_class)) {
    # 没有群组标签(即1组),那么所有 signature 都有可能
    out[, ] <- 1L
  } else {
    # 只有 1 组标签也是如此
    grps <- unique(sample_class)
    if (length(grps) == 1L) {
      out[, ] <- 1L
    } else {
      out <- ifelse(out > cutoff, 1L, 0L) # 先初始化

      # Check sample_class
      sample_class <- sample_class[colnames(expo)]

      if (length(sample_class) != ncol(expo)) {
        stop("Not all samples are assigned to a class (subtype).")
      }

      # 用 group 标签覆盖样本标签
      lapply(grps, function(grp) {
        idx <- names(sample_class[sample_class == grp])
        ex <- sum(out[, idx] == 1L) > 5
        if (ex) {
          out[, idx] <<- 1L
        }
      })
      out[out == 0L] <- 2L # 还存在 0 的地方标记为全局 signature
    }
  }
  return(out)
}

## Bootstrap approach
optimize_exposure_in_one_sample_bt <- function(catalog,
                                               flag_vector, # a set of 1L or 2L
                                               sample,
                                               sig_matrix,
                                               tmp_dir,
                                               bt_use_prop = FALSE) {
  tmpf_expo <- file.path(
    tmp_dir,
    paste0(sample, "_expo.rds")
  )
  tmpf_sim <- file.path(
    tmp_dir,
    paste0(sample, "_sim.rds")
  )

  if (all(file.exists(tmpf_expo), file.exists(tmpf_sim))) {
    message("ALL result file exists, skipping...")
  } else {
    on.exit(invisible(gc()), add = TRUE)
    force(flag_vector)
    expo2 <- vector("numeric", length(flag_vector))
    flag1 <- which(flag_vector == 1L)

    message("Handling sample: ", sample)

    catalog_mat <- matrix(
      catalog,
      ncol = 1,
      dimnames = list(rownames(sig_matrix), sample)
    )

    out <- sig_fit_bootstrap(
      catalog_mat,
      sig_matrix[, flag1, drop = FALSE],
      n = 100,
      type = "absolute"
    )

    # Use median
    expo <- rowMedians(out$expo, na.rm = TRUE)
    sim <- median(out$cosine, na.rm = TRUE)

    th <- sum(catalog) * 0.01
    reset_idx <- if (bt_use_prop) {
      apply(out$expo, 1, function(x) {
        mean(x < th, na.rm = TRUE) > 0.05
      })
    } else {
      apply(out$expo, 1, function(x) {
        p <- my.t.test.p.value(x, mu = th, alternative = "greater")
        if (is.na(p)) {
          message("NA result detected from t.test, use empirical p value (proportion).")
          p <- mean(x < th, na.rm = TRUE)
        }

        p > 0.05
      })
    }

    if (any(reset_idx)) {
      expo[reset_idx] <- 0
    }

    expo2[flag1] <- expo

    message("Save expo result to temp file ", tmpf_expo)
    saveRDS(expo2, file = tmpf_expo)

    message("Save similarity result to temp file ", tmpf_sim)
    saveRDS(sim, file = tmpf_sim)
  }

  NULL
}


## Stepwise approach
optimize_exposure_in_one_sample <- function(catalog,
                                            flag_vector, # a set of 1L or 2L
                                            sample,
                                            sig_matrix,
                                            tmp_dir) {
  tmpf_expo <- file.path(
    tmp_dir,
    paste0(sample, "_expo.rds")
  )
  tmpf_sim <- file.path(
    tmp_dir,
    paste0(sample, "_sim.rds")
  )

  if (all(file.exists(tmpf_expo), file.exists(tmpf_sim))) {
    message("ALL result file exists, skipping...")
  } else {
    on.exit(invisible(gc()), add = TRUE)
    force(flag_vector)
    flag1_bk <- flag1 <- which(flag_vector == 1L)
    flag2 <- which(flag_vector == 2L)

    expo <- vector("numeric", length(flag_vector))
    catalog_mat <- matrix(
      catalog,
      ncol = 1,
      dimnames = list(rownames(sig_matrix), sample)
    )

    # 先处理得到一个 baseline similarity 值
    message("Processing sample: ", sample)
    message("\t getting baseline similarity.")
    baseline <- .get_one_catalog_similarity(
      catalog_mat, flag1, sig_matrix,
      return_all = TRUE
    )
    message("\t\t", baseline$sim, " based on ", length(flag1), " signatures.")

    # 先处理标记 1,如果相似性降低小于 0.01,移除
    message("\t getting updated similarity by removing one signature in batch.")
    sim_rm <- purrr::map_dbl(
      flag1,
      ~ .get_one_catalog_similarity(catalog_mat, setdiff(flag1, .), sig_matrix)
    )

    rm_id <- sim_rm - baseline$sim >= -0.01 ## 会不会出现全都可以扔掉?待观测

    if (any(rm_id)) {
      flag1 <- flag1[!rm_id]

      if (length(flag1) > 0) {
        baseline <- .get_one_catalog_similarity(
          catalog_mat, flag1, sig_matrix,
          return_all = TRUE
        )
        message("\t\t", baseline$sim, " with ", length(flag1), " signatures left.")
      }
    } else {
      message("\t no signature need to be removed.")
    }

    # 然后再处理可能的标记 2,如果相似性增加大于 0.05,则加入
    if (length(flag2) > 0) {
      message("\t getting updated similarity by adding one global signature in batch.")
      sim_add <- purrr::map_dbl(
        flag2,
        ~ .get_one_catalog_similarity(catalog_mat, c(flag1, .), sig_matrix)
      )

      add_id <- sim_add - baseline$sim > 0.05

      if (any(add_id)) {
        flag1 <- sort(c(flag1, flag2[add_id]))
        baseline <- .get_one_catalog_similarity(
          catalog_mat, flag1, sig_matrix,
          return_all = TRUE
        )
        message("\t\t", baseline$sim, " with ", length(flag1), " signatures left.")
      } else {
        message("\t no global signature need to be added.")
      }
    }

    # Output the result
    if (length(flag1) == 0L) {
      message("\t no signatures left in the whole procedure seems due to low similarity, just report baseline exposure.")
      flag1 <- flag1_bk
    }
    expo[flag1] <- baseline$expo[, 1]

    message("Save expo result to temp file ", tmpf_expo)
    saveRDS(expo, file = tmpf_expo)

    message("Save similarity result to temp file ", tmpf_sim)
    saveRDS(baseline$sim, file = tmpf_sim)
  }

  NULL
}

.get_one_catalog_similarity <- function(catalog_mat, flag, sig_matrix,
                                        return_all = FALSE) {
  expo <- suppressMessages(
    sig_fit(
      catalogue_matrix = catalog_mat,
      sig = sig_matrix[, flag, drop = FALSE],
      method = "QP",
      type = "absolute",
      return_class = "matrix"
    )
  )
  rec_catalog <- sig_matrix[, flag, drop = FALSE] %*% expo
  if (return_all) {
    list(
      sim = cosineMatrix(rec_catalog, catalog_mat)[1],
      expo = expo
    )
  } else {
    cosineMatrix(rec_catalog, catalog_mat)[1]
  }
}

Try the sigminer package in your browser

Any scripts or data that you put into this service are public.

sigminer documentation built on Aug. 21, 2023, 9:08 a.m.