R/hk_find.R

#' A housekeeping OTUs finding function
#'
#' This function allows you to find housekeeping OTUs and calculate size factors for normalization.
#' @param OTU_table pool for OTUs which will be tested for differential abundance in form of OTUs x Samples
#' @param nOTUs_hkpool number of OTUs in the pool from which to search housekeeping OTUs
#' @param quant quantile of distance distribution as the network threshold
#' @keywords housekeeping OTUs
#' @keywords size factor
#' @keywords normalization
#' @export
#' @examples
hk_find <- function(OTU_table, nOTUs_hkpool=100, quant = 0.03){
  # input:
  # OTU table: pool for OTUs which will be tested for differential abundance (OTUs x Samples)
  # nOTUs_hkpool: number of OTUs in the pool from which to search housekeeping OTUs
  # quant: quantile of distance distribution as the network threshold
  # output:
  # housekeeping OTUs' IDs (rownames of OTU table)
  # size factor used for later on normalization
  samobs = apply(OTU_table, 1, function(x) sum(x != 0))
  sums = apply(OTU_table, 1, sum)
  otudf = data.frame(prev = samobs, sums = sums)
  otudf = otudf[order(-otudf$prev, -otudf$sums),]
  hk_pool = OTU_table[rownames(otudf)[1:nOTUs_hkpool],]
  OTU_ID = rownames(hk_pool)
  # create symmetric distance matrix between selected OTUs
  ratio_var = matrix(0, nrow = nOTUs_hkpool, ncol = nOTUs_hkpool)
  for (i in 1:nOTUs_hkpool){
    for (j in 1:nOTUs_hkpool){
      mul = (hk_pool[i,]*hk_pool[j,])
      ind = unname(unlist(mul)) != 0
      ratio_var[i,j] = var(unlist(log(hk_pool[i,][ind]/hk_pool[j,][ind])))
    }
  }
  ratio_var[lower.tri(ratio_var, diag = TRUE)] <- 0
  dist = ratio_var[ratio_var > 0]
  nodes = data.frame(seq(1, nOTUs_hkpool, 1), OTU_ID)
  colnames(nodes) = c("order", "OTU_ID")
  links = matrix(0, nrow = 1, ncol = 3)
  h = quantile(dist, probs = quant)
  order = seq(1, nOTUs_hkpool, 1)
  for (i in 1:nOTUs_hkpool){
    check = ratio_var[i,]
    ind = order[check < h & check > 0]
    if (length(ind) > 0){
      dist = check[ind]
      rep = replicate(length(ind), i)
      record = cbind(rep, ind, dist)
      links = rbind(links, record)
    }
  }
  links = links[-1,]
  net <- graph_from_data_frame(d = links, vertices = nodes, directed = F)
  list_cliques = cliques(net)
  clique_size = sapply(list_cliques, length)

  largest_cliques = largest_cliques(net)
  ## intersect of all largest cliques as housekeeping OTUs
  #if (length(largest_cliques) > 1){
  #  intersect_cliques = largest_cliques[[1]]
  #  for (i in 2:length(largest_cliques)){
  #    intersect_cliques = intersect(intersect_cliques, largest_cliques[[i]])
  #  }
  #}else{
  #  intersect_cliques = largest_cliques[[1]]
  #}
  #intersect_cliques = intersect_cliques[order(intersect_cliques)]

  ## choose the first largest clique as housekeeping OTUs
  intersect_cliques = largest_cliques[[1]]

  hk_ID = OTU_ID[intersect_cliques]
  size_factor = colSums(hk_pool[hk_ID,])
  return(list(hk_ID = hk_ID, size_factor = size_factor))
}
yuanjing-ma/normtestpackage documentation built on May 7, 2019, 2:33 p.m.