#' Function for reading in eggnog-mapper annotations and returning tidy subsets of the info
#'
#' Many of the data in the eggnog-mapper annotation table (eg., generated by the LLG pipeline)
#' is encoded as comma-delimited lists within a single column (eg., KEGG pathways).
#' This makes it challenging to "tidy" the table.
#'
#' This function will read in the table and output a tidy table of one part of the table
#' (eg., COG functional categories or KEGG pathways).
#'
#' The function will also provide info on how to obtain metadata for function groupings.
#'
#' @param infile Path to eggnog-annotation table file
#' @param cmd command instead of input file (eg., "gunzip -c INFILE")
#' @param sep table value delimiter
#' @param nrows Number of table rows to read. If Inf, all lines will be read.
#' @param to_keep Which functional grouping to keep (eg., KEGG pathways)?
#' @param column_names The column names to use for the table (use NULL if the input table has column names)
#' @return data.table
#' @export
#' @import tidytable
#' @importFrom glue glue
read_eggnog_mapper = function(infile=NULL, cmd=NULL, sep='\t', nrows=Inf, to_keep = c('COG', 'KEGG pathway', 'CAZy'),
column_names = c("query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue",
"seed_ortholog_score", "Predicted_taxonomic_group", "Predicted_protein_name",
"Gene_Ontology_terms", "EC_number", "KEGG_ko", "KEGG_Pathway", "KEGG_Module",
"KEGG_Reaction", "KEGG_rclass", "BRITE", "KEGG_TC", "CAZy", "BiGG_Reaction",
"tax_scope__eggNOG_taxonomic_level_used_for_annotation", "eggNOG_OGs",
"bestOG", "COG_Functional_Category", "eggNOG_free_text_description")){
X = Fread(infile, cmd=cmd, nrows=nrows, sep=sep)
if(!is.null(column_names)){
colnames(X) = column_names
}
if(to_keep[1] == 'COG'){
max_feats = X %>%
mutate.(n_feats = stringr::str_length(COG_Functional_Category)) %>%
pull.(n_feats) %>% max
new_cols = gsub('^', 'X', 1:(max_feats))
F = '/ebio/abt3_projects2/databases_no-backup/humann2/utility_mapping/COG_cateogories.tsv'
message(glue::glue('You can get COG metadata at: {F}', F=F))
COG = X %>%
select.(query_name, COG_Functional_Category) %>%
separate.(COG_Functional_Category, into=new_cols, sep='') %>%
pivot_longer.(cols=c(-query_name), names_to='X', values_to='COG_cat') %>%
filter.(!is.na(COG_cat)) %>%
select.(-X) %>%
rename.(seqid = query_name)
return(COG)
} else if(to_keep[1] == 'KEGG pathway'){
## Formatting
KEGG_ptw = X %>%
select.(query_name, KEGG_Pathway) %>%
mutate.(KEGG_Pathway = gsub(',map.+', '', KEGG_Pathway))
## Max pathways per seqid
max_feats = KEGG_ptw %>%
mutate.(n_feats = stringr::str_count(KEGG_Pathway, ',')) %>%
pull.(n_feats) %>% max
new_cols = gsub('^', 'X', 1:(max_feats+1))
## separating
F = '/ebio/abt3_projects2/databases_no-backup/humann2/utility_mapping/map_kegg-pwy_name_cat.txt.gz'
message(glue::glue('You can get pathway metadata at: {F}', F=F))
KEGG_Pathway = KEGG_ptw %>%
separate.(KEGG_Pathway, into=new_cols, sep=',') %>%
pivot_longer.(cols=c(-query_name), names_to='X', values_to='KEGG_pathway') %>%
select.(-X) %>%
filter.(!is.na(KEGG_pathway),
!grepl('^map', KEGG_pathway))
return(KEGG_Pathway)
} else if(to_keep[1] == 'CAZy'){
max_feats = X %>%
distinct.(CAZy) %>%
mutate.(n_feats = stringr::str_count(CAZy, ',')) %>%
pull.(n_feats) %>% max
new_cols = gsub('^', 'X', 1:(max_feats + 1))
CAZy_annot = X %>%
select.(query_name, CAZy) %>%
separate.(CAZy, into=new_cols, sep=',') %>%
pivot_longer.(cols=c(-query_name), names_to='X', values_to='CAZy') %>%
filter.(!is.na(CAZy)) %>%
select.(-X) %>%
rename.(seqid = query_name) %>%
mutate.(CAZy_module = gsub('[0-9]+$', '', CAZy),
CAZy = gsub('^([^0-9]+)([0-9])$', '\\10\\2', CAZy),
CAZy = gsub('^([^0-9]+)([0-9][0-9])$', '\\10\\2', CAZy))
return(CAZy_annot)
} else {
stop('to_keep option not recognized')
}
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.