R/genome_profiling.R

Defines functions read_eggnog_mapper

Documented in read_eggnog_mapper

#' Function for reading in eggnog-mapper annotations and returning tidy subsets of the info
#'
#' Many of the data in the eggnog-mapper annotation table (eg., generated by the LLG pipeline)
#' is encoded as comma-delimited lists within a single column (eg., KEGG pathways).
#' This makes it challenging to "tidy" the table.
#'
#' This function will read in the table and output a tidy table of one part of the table
#' (eg., COG functional categories or KEGG pathways).
#'
#' The function will also provide info on how to obtain metadata for function groupings.
#'
#' @param infile Path to eggnog-annotation table file
#' @param cmd command instead of input file (eg., "gunzip -c INFILE")
#' @param sep table value delimiter
#' @param nrows Number of table rows to read. If Inf, all lines will be read.
#' @param to_keep Which functional grouping to keep (eg., KEGG pathways)?
#' @param column_names The column names to use for the table (use NULL if the input table has column names)
#' @return data.table
#' @export
#' @import tidytable
#' @importFrom glue glue
read_eggnog_mapper = function(infile=NULL, cmd=NULL, sep='\t', nrows=Inf, to_keep = c('COG', 'KEGG pathway', 'CAZy'),
                              column_names = c("query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue",
                                               "seed_ortholog_score", "Predicted_taxonomic_group", "Predicted_protein_name",
                                               "Gene_Ontology_terms", "EC_number", "KEGG_ko", "KEGG_Pathway", "KEGG_Module",
                                               "KEGG_Reaction", "KEGG_rclass", "BRITE", "KEGG_TC", "CAZy", "BiGG_Reaction",
                                               "tax_scope__eggNOG_taxonomic_level_used_for_annotation", "eggNOG_OGs",
                                               "bestOG", "COG_Functional_Category", "eggNOG_free_text_description")){
  X = Fread(infile, cmd=cmd, nrows=nrows, sep=sep)
  if(!is.null(column_names)){
    colnames(X) = column_names
  }

  if(to_keep[1] == 'COG'){
    max_feats = X %>%
      mutate.(n_feats = stringr::str_length(COG_Functional_Category)) %>%
      pull.(n_feats) %>% max
    new_cols = gsub('^', 'X', 1:(max_feats))
    F = '/ebio/abt3_projects2/databases_no-backup/humann2/utility_mapping/COG_cateogories.tsv'
    message(glue::glue('You can get COG metadata at: {F}', F=F))
    COG = X %>%
      select.(query_name, COG_Functional_Category) %>%
      separate.(COG_Functional_Category, into=new_cols, sep='') %>%
      pivot_longer.(cols=c(-query_name), names_to='X', values_to='COG_cat') %>%
      filter.(!is.na(COG_cat)) %>%
      select.(-X) %>%
      rename.(seqid = query_name)
    return(COG)
  } else if(to_keep[1] == 'KEGG pathway'){
    ## Formatting
    KEGG_ptw = X %>%
      select.(query_name, KEGG_Pathway) %>%
      mutate.(KEGG_Pathway = gsub(',map.+', '', KEGG_Pathway))
    ## Max pathways per seqid
    max_feats = KEGG_ptw %>%
      mutate.(n_feats = stringr::str_count(KEGG_Pathway, ',')) %>%
      pull.(n_feats) %>% max
    new_cols = gsub('^', 'X', 1:(max_feats+1))
    ## separating
    F = '/ebio/abt3_projects2/databases_no-backup/humann2/utility_mapping/map_kegg-pwy_name_cat.txt.gz'
    message(glue::glue('You can get pathway metadata at: {F}', F=F))
    KEGG_Pathway = KEGG_ptw %>%
      separate.(KEGG_Pathway, into=new_cols, sep=',') %>%
      pivot_longer.(cols=c(-query_name), names_to='X', values_to='KEGG_pathway') %>%
      select.(-X) %>%
      filter.(!is.na(KEGG_pathway),
                !grepl('^map', KEGG_pathway))
    return(KEGG_Pathway)
  } else if(to_keep[1] == 'CAZy'){
    max_feats = X %>%
      distinct.(CAZy) %>%
      mutate.(n_feats = stringr::str_count(CAZy, ',')) %>%
      pull.(n_feats) %>% max
    new_cols = gsub('^', 'X', 1:(max_feats + 1))
    CAZy_annot = X %>%
      select.(query_name, CAZy) %>%
      separate.(CAZy, into=new_cols, sep=',') %>%
      pivot_longer.(cols=c(-query_name), names_to='X', values_to='CAZy') %>%
      filter.(!is.na(CAZy)) %>%
      select.(-X) %>%
      rename.(seqid = query_name) %>%
      mutate.(CAZy_module = gsub('[0-9]+$', '', CAZy),
              CAZy = gsub('^([^0-9]+)([0-9])$', '\\10\\2', CAZy),
              CAZy = gsub('^([^0-9]+)([0-9][0-9])$', '\\10\\2', CAZy))
    return(CAZy_annot)
  } else {
    stop('to_keep option not recognized')
  }
  return(NULL)
}
leylabmpi/LeyLabRMisc documentation built on Nov. 3, 2022, 3:45 p.m.