R/genome_content.R

#' @title Compute genome content similarity index
#' @description This function calculates genome content similarity index for two genomes.
#'
#' @param gene_sets1 list of vectors of 1s and 0s which represnet presence and absence of gene families from each of the gene sets in first genome.
#' @param gene_sets2 list of vectors of 1s and 0s which represnet presence and absence of gene families from each of the gene sets in second genome.
#' @param threshold fraction of gene families from a gene set which is required to be present in
#'        at least one of the genomes for for the gene set to contribute to genome content
#'        similarity index calculation (default is 0.05).
#' @param size minimal size of the gene set to consider (default is 4).
#' @return Genome content similarity index computed for two genomes encoding \code{gene_sets1}
#'        and \code{gene_sets2} respectively.
#' @references \enumerate{
#'            \item Kamneva OK. Genome composition of microbes predicts their co-occurrence in the environment. In review.
#'          }
#' @seealso \code{\link{similarity_by_set}}, \code{\link{functional_association}}, \code{\link{set_representation}},
#'          \code{\link{reference_gene_sets}}.
#' @examples
#' data(families_Bf)
#' data(families_Er)
#' sets_Bf = set_representation(families = families_Bf, gene_sets = reference_gene_sets)
#' sets_Er = set_representation(families = families_Er, gene_sets = reference_gene_sets)
#' sim_Bf_Er = similarity(gene_sets1 = sets_Bf, gene_sets2 = sets_Er, threshold = 0.05, size = 4)
#' sim_Bf_Er
#' @export
similarity <- function(gene_sets1, gene_sets2, threshold = 0.05, size = 4)
{
  if(class(gene_sets1) != "list" | class(gene_sets2) != "list" | class(threshold) != "numeric" |
     threshold <= 0 | threshold > 1){
    cat("gene_sets1 and gene_sets2 should be lists of vectors of 1s and 0s.\n
        Threshold should be a real value between 0 and 1. See help for more details", "\n", sep="")
  }
  indices = similarity_by_set(gene_sets1, gene_sets2)
  indices = indices[indices[,3] >= size & (indices[,5] > (indices[,3] * threshold) | indices[,4] > (indices[,3] * threshold)),]
  sim_index = mean(indices[, 2])
  return(sim_index)
}

#' @title Compute genome content similarity by gene set
#' @description This function calculates genome content similarity index for two genomes gene set by gene set.
#' @param gene_sets1 list of vectors of 1s and 0s which represnet presence and absence of gene families from each of the gene sets in first genome.
#' @param gene_sets2 list of vectors of 1s and 0s which represnet presence and absence of gene families from each of the gene sets in second genome.
#' @return A data frame with 5 variables: \code{set_ID} gene set identifier, \code{score} genome content similarities computed between
#'        \code{gene_sets1} and \code{gene_sets2} for each individual gene set, \code{set_size} number of gene families in the gene set,
#'        \code{in_genome1} number of gene families from gene set present in first and \code{in_genome2} second genomes.
#' @references \enumerate{
#'            \item Kamneva OK. Genome composition of microbes predicts their co-occurrence in the environment. In review.
#'          }
#' @seealso \code{\link{similarity}}, \code{\link{functional_association}}, \code{\link{set_representation}},
#'          \code{\link{reference_gene_sets}}.
#' @examples
#' data(families_Bf)
#' data(families_Er)
#' sets_Bf = set_representation(families = families_Bf, gene_sets = reference_gene_sets)
#' sets_Er = set_representation(families = families_Er, gene_sets = reference_gene_sets)
#' sim_Bf_Er = similarity_by_set(gene_sets1 = sets_Bf, gene_sets2 = sets_Er)
#' head(sim_Bf_Er)
#' @export
similarity_by_set <- function(gene_sets1, gene_sets2) {
  if(class(gene_sets1) != "list" | class(gene_sets2) != "list"){
    cat("gene_sets1 and gene_sets2 should be lists of vectors of 1s and 0s. See help for more details.", "\n", sep="")
  }
  genes = mapply(cbind, gene_sets1, gene_sets2, SIMPLIFY = FALSE)
  indices = lapply(genes, get_similarity)
  indices = matrix(unlist(indices), ncol = length(indices[[1]]), byrow = TRUE)
  indices = cbind("gene_set" = 1:length(gene_sets1), indices)
  colnames(indices) = c("gene_set", "index", "set_size", "in_genome1", "in_genome2", "in_both")
  indices = as.data.frame(indices)
  return(indices)
}



get_similarity<-function(genes) {
  sim_index = sum(1-abs(genes[,1] - genes[,2]));
  sim_index = sim_index/length(genes[,1])
  in_both = sum((genes[,1] + genes[,2])>0)
  indices = c("index" = sim_index, "set_size" = dim(genes)[1], "in_genome1"= sum(genes[,1]), 
              "in_genome2" = sum(genes[,2]), "in_both" = in_both)
  return(indices)
}
olgakamneva/genomics2ecology documentation built on May 24, 2019, 12:51 p.m.