MetaPoly: What the Package Does (One Line, Title Case)

Documented in PlotPolyCorr PolyCorr

################# POLYCORR ################# 
fit_cor_gene <- function(gene_data, gene, min_samp, samp_vec){
  gene_data = na.omit(gene_data[gene_data$DEPTH > 0,])
  if (length(unique(gene_data$sample)) > min_samp){
  cor_test = cor.test(gene_data$res_m, gene_data$variable)
  p = as.numeric(cor_test$p.value)
  cor = as.numeric(cor_test$estimate)
  mean_coef = mean(gene_data$res_m, na.rm=T)
  low_ci = cor_test$conf.int[1]
  high_ci = cor_test$conf.int[2]
  return(data.frame(gene_id=gene, cor=cor, p=p, mean_coef=mean_coef, low_ci=low_ci, high_ci=high_ci))}
  else{return(data.frame(gene_id=gene, cor=NA, p=NA, mean_coef=NA, low_ci=NA, high_ci=NA))}}

#' PolyCorr
#'
#' This functions computes the gene-wise correlation between the number of SNPs, and a target 
#' variable than can be either continuous, either discrete (handles two treatments). It creates
#' a zero-inflated poisson model of the SNP count, taking the sample, log(sequencing depth), 
#' and the gene length as fixed effects. The idea is to identify genes showing signs of selective
#' sweeps associated with the target variable. There are three inputs:
#'   - data: the polymorphism data created by MetaPoly's "GetGenesData" function
#'   - min_samp: the minimal number of sample having the required depth to run the correlation test.
#'   - samp_vec: a vector that assigns each sample (vector values) to each population (vector
#'     names). The method handles two populations encoded as 0 and 1's.
#'
#' @export
PolyCorr <- function(data, min_samp, samp_vec, p_corr='holm'){
  print('Launching - MetaPoly PolyCorr: a polymorphism-variable correlation tool for metagenomic data')
  t0 = Sys.time()
  
  print(' - Fitting a zero-inflated poisson model on the data...')
  data = as.data.frame(data)
  print(data)
  model_df = na.omit(data[,colnames(data) %in% c('SNP_N', 'DEPTH', 'gene_length', 'sample', 'gene_id')])
  print(nrow(is.na(model_df)))
  model = pscl::zeroinfl(SNP_N ~ log(DEPTH) + gene_length + as.factor(sample) | log(DEPTH), data = model_df)
  print(summary(model))
  model_df$res_m = model$residuals
  model_df$variable = vapply(model_df$sample, function(x) as.numeric(names(samp_vec)[samp_vec == x]), numeric(1))
  print(Sys.time() - t0)
  
  print(' - Computing sample coefficients...')
  coefs = coefficients(model)
  coefs = coefs[startsWith(names(coefs),'count_as.factor(sample)')]
  coefs_df = as.data.frame(coefs)
  rownames(coefs_df) = vapply(rownames(coefs_df), function(x) strsplit(x, ')')[[1]][2], FUN.VALUE = character(1))
  coefs_df$type = vapply(rownames(coefs_df), function(x) as.numeric(names(samp_vec)[samp_vec == x]), numeric(1))
  print(Sys.time() - t0)
  
  print(' - Computing correlations of polymorphism with the variable of interest per gene...')
  corr_df = do.call(rbind, lapply(unique(model_df$gene_id), function(gene) fit_cor_gene(model_df[model_df$gene_id == gene,], gene, min_samp, samp_vec)))
  corr_df$padj = p.adjust(corr_df$p, method = p_corr)
  sign_genes = corr_df[corr_df$padj < 0.05,]
  pos_genes = as.vector(na.omit(sign_genes$gene_id[sign_genes$cor > 0]))
  neg_genes = as.vector(na.omit(sign_genes$gene_id[sign_genes$cor < 0]))
  print(Sys.time() - t0)
  
  print(' - Analysis done!')
  return(list(pi_corr_res = corr_df, pos_genes = pos_genes, neg_genes = neg_genes, coefs = coefs_df))}

#' PlotPolyCorr
#'
#' Plotting function for the PolyCorr function results.
#'
#' @export
PlotPolyCorr <- function(res_df, coefs_df, plots_name, boolean_var = TRUE){
  dir.create(paste0(plots_name,'_res'))
  
  res_df$Significant = 'No'
  res_df$Significant[res_df$padj < 0.05] = 'Yes'
  
  # Plot the distribution of correlations
  ggplot2::ggplot(res_df, ggplot2::aes(x=cor,fill=Significant)) + ggplot2::geom_histogram() + ggplot2::theme_minimal() +
    ggplot2::geom_vline(ggplot2::aes(xintercept=mean(res_df$cor, na.rm = T)), linetype="dashed") + 
    ggplot2::geom_vline(ggplot2::aes(xintercept=0), color='darkgrey', linetype="dashed") + 
    ggplot2::xlab('Correlation coef.') + ggsci::scale_fill_jco() 
  ggplot2::ggsave(paste0(plots_name,'_res','/',plots_name,'_dist.pdf'), width = 4, height = 4)
  
  # Plot the distribution of p-values
  ggplot2::ggplot(res_df, ggplot2::aes(x=p,fill=Significant)) + ggplot2::geom_histogram() + ggplot2::theme_minimal() + 
    ggplot2::xlab('p') + ggsci::scale_fill_jco() 
  ggplot2::ggsave(paste0(plots_name,'_res','/',plots_name,'_p.pdf'), width = 4, height = 4)
  
  # Plot the coefficients
  if (boolean_var == TRUE){
    ggplot2::ggplot(coefs_df, ggplot2::aes(x=as.factor(type),y=coefs,color=as.factor(type))) + ggplot2::geom_boxplot() + ggplot2::theme_minimal() + 
      ggpubr::stat_compare_means() + ggsci::scale_color_jco() + ggplot2::xlab('Variable') + ggplot2::ylab('Sample coefs.') + ggplot2::theme(legend.position = 'none')
    ggplot2::ggsave(paste0(plots_name,'_res','/',plots_name,'_coefficients.pdf'), width = 3, height = 4)}
  
  else {ggplot2::ggplot(coefs_df, ggplot2::aes(x=type,y=coefs)) + ggplot2::geom_point() + ggplot2::theme_minimal() + ggplot2::geom_smooth() +
        ggsci::scale_color_jco() + ggplot2::xlab('Variable') + ggplot2::ylab('Sample coefs.') + ggplot2::theme(legend.position = 'none')
    ggplot2::ggsave(paste0(plots_name,'_res','/',plots_name,'_coefficients.pdf'), width = 3, height = 4)}}

################ ENRICH ########
#' @export
CalcEnrichment <- function(gff, gene_list){
  cog_table = read.delim('data/cog-20.def.tab', sep='\t', header = F)
  for (i in 1:nrow(cog_table)){
    row = cog_table[i,]
    if (length(strsplit(row$V2,'')[[1]]) > 1){
        others = strsplit(row$V2,'')[[1]][2:length(strsplit(row$V2,'')[[1]])]
        cog_table$V2[i] = as.character(strsplit(row$V2,'')[[1]][1])
        for (cat in others){
          mod_row = row
          mod_row$V2 = cat
          cog_table = rbind(cog_table, mod_row)}}}
  cog_functions = c('J'='Translation, ribosomal structure and biogenesis',
                  'A'='RNA processing and modification',
                  'K'='Transcription',
                  'L'='Replication, recombination and repair',
                  'B'='Chromatin structure and dynamics',
                  'D'='Cell cycle control, cell division, chromosome partitioning',
                  'Y'='Nuclear structure',
                  'V'='Defense mechanisms',
                  'T'='Signal transduction mechanisms',
                  'M'='Cell wall/membrane/envelope biogenesis',
                  'N'='Cell motility',
                  'Z'='Cytoskeleton',
                  'W'='Extracellular structures',
                  'U'='Intracellular trafficking, secretion, and vesicular transport',
                  'O'='Posttranslational modification, protein turnover, chaperones',
                  'X'='Mobilome: prophages, transposons',
                  'C'='Energy production and conversion',
                  'G'='Carbohydrate transport and metabolism',
                  'E'='Amino acid transport and metabolism',
                  'F'='Nucleotide transport and metabolism',
                  'H'='Coenzyme transport and metabolism',
                  'I'='Lipid transport and metabolism',
                  'P'='Inorganic ion transport and metabolism',                     
                  'Q'='Secondary metabolites biosynthesis, transport and catabolism',
                  'R'='General function prediction only',
                  'S'='Function unknown')


  gff$gene = vapply(gff$V9, function(x) strsplit(strsplit(x,';')[[1]][1],'ID=')[[1]][2], FUN.VALUE = character(1))
  gff$cog = vapply(gff$V9, function(x) strsplit(strsplit(x,'COG:')[[1]][2],';')[[1]][1], FUN.VALUE = character(1))
  gff$cog_f = vapply(gff$cog, function(x) ifelse(is.na(x), 'NoCOG', substr(cog_func$V2[cog_func$V1 == x],1,1)), FUN.VALUE = character(1))
  
  gff$sign = 'b_No'
  gff$sign[gff$gene %in% gene_list] = 'a_Yes'

  full_con_tab = table(gff$cog_f, gff$sign)
  enrich_df = data.frame()
  for (cog_func in rownames(full_con_tab)){
    if(cog_func != 'NoCOG'){
      func_vals = full_con_tab[rownames(full_con_tab) == cog_func,]
      other_vals =  colSums(full_con_tab[rownames(full_con_tab) != cog_func,])
      cont_table = as.matrix(rbind(func_vals, other_vals))
      fish_test = fisher.test(cont_table, 'greater')
      p = fish_test$p.value
      or = fish_test$estimate
      l_or = fish_test$conf.int[1]
      h_or = fish_test$conf.int[2]
      func_cogs = gff$cog[(gff$cog_f == cog_func) & (gff$gene %in% gene_list)]
      enrich_df = rbind(enrich_df, data.frame(Function=cog_func, p=p, OR=or, low_CI=l_or, high_CI=h_or, cogs=paste(func_cogs, collapse=',')))}}
  enrich_df$padj = p.adjust(enrich_df$p, method = 'holm')
  enrich_df$Function_long = vapply(enrich_df$Function, function(x) as.character(cog_functions[x]), character(1))
  return(enrich_df)}