## code to prepare `data` dataset goes here
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(usethis)
source("./R/rule_utils.R")
# 1. Read information about microtrait hmms
hmms = readr::read_delim("./data-raw/microtrait_hmm.txt",
delim = "\t",
col_types = readr::cols(`microtrait_hmm-name` = readr::col_character(),
`microtrait_hmm-dbxref_kegg` = readr::col_character(),
`microtrait_hmm-dbxref_ec` = readr::col_character(),
`microtrait_hmm-dbxref_TC` = readr::col_character(),
`microtrait_hmm-description` = readr::col_character(),
`microtrait_hmm-notes` = readr::col_character()
)
)
hmms_cazy = hmms %>% dplyr::filter(stringr::str_detect(`microtrait_hmm-name`, "^GH") == TRUE)
message(paste("Read", nrow(hmms), "microtraithmms."))
# 2. Read information about microtrait rules
rules = readr::read_delim("./data-raw/microtrait_rules.txt",
delim = "\t",
col_types = readr::cols(`microtrait_rule-name` = readr::col_factor(ordered = FALSE),
`microtrait_rule-boolean` = readr::col_character()
)
)
message(paste("Read", nrow(rules), "rules."))
# unwrap rules
rules = unwrap.rules(rules)
# find which hmms appear in the rules
hmm_names_fromrules = unique(unlist(lapply(rules %>% pull(`microtrait_rule-booleanunwrapped`), FUN=rule2parts)))
hmm_names_fromrules = removequotes(hmm_names_fromrules)
hmm_names_fromrules = hmm_names_fromrules %>% tibble::as_tibble() %>%
dplyr::rename(`microtrait_hmm-name` = value)
# This should be empty if properly unwrapped
#hmm_names_fromrules %>%
# anti_join(hmms, by = c("microtrait_hmm-name" = "microtrait_hmm-name"))
hmm_names_fromrules = hmm_names_fromrules %>%
dplyr::left_join(hmms, by = c("microtrait_hmm-name" = "microtrait_hmm-name"))
# 3. read traits
traits = readr::read_delim("./data-raw/microtrait_traits.txt",
delim = "\t", col_names = T,
col_types = readr::cols(`microtrait_trait-name` = readr::col_character(),
`microtrait_trait-displaynameshort` = readr::col_character(),
`microtrait_trait-displaynamelong` = readr::col_character(),
`microtrait_trait-strategy` = readr::col_character(),
`microtrait_trait-type` = readr::col_character(),
`microtrait_trait-granularity` = readr::col_factor(levels = c("1","2","3"), ordered = TRUE),
`microtrait_trait-version` = readr::col_character(),
`microtrait_trait-displayorder` = readr::col_integer()
)
) %>%
dplyr::arrange(`microtrait_trait-strategy`, `microtrait_trait-granularity`, `microtrait_trait-displayorder`)
## split by granularity
traits_granularity_levels = c(1,2,3)
traits_listbygranularity = vector("list", length(traits_granularity_levels))
for(i in 1:length(traits_granularity_levels)) {
traits_atgranularity = traits %>%
dplyr::filter(`microtrait_trait-granularity` == traits_granularity_levels[i]) %>%
dplyr::arrange(`microtrait_trait-displayorder`)
traits_atgranularity = traits_atgranularity %>%
dplyr::mutate(`microtrait_trait-name` = readr::parse_factor(`microtrait_trait-name`,
ordered = T,
levels = traits_atgranularity %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-name`)),
`microtrait_trait-displaynameshort` = readr::parse_factor(`microtrait_trait-displaynameshort`,
ordered = T,
levels = traits_atgranularity %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-displaynameshort`)),
`microtrait_trait-displaynamelong` = readr::parse_factor(`microtrait_trait-displaynamelong`,
ordered = T,
levels = traits_atgranularity %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-displaynamelong`))
)
traits_listbygranularity[[i]] = traits_atgranularity
}
# 4. read substrate to rule
substrate2rule = readr::read_delim("./data-raw/microtrait_substrate2rule.txt",
delim = "\t", col_names = T,
col_types = readr::cols(`microtrait_substrate-name` = readr::col_character(),
`microtrait_substrate-subclass1` = readr::col_character(),
`microtrait_substrate-subclass2` = readr::col_character(),
`microtrait_substrate-subclass3` = readr::col_character(),
`microtrait_trait-name1` = readr::col_character(),
`microtrait_trait-name2` = readr::col_character(),
`microtrait_trait-name3` = readr::col_character()
)
)
# allsubstrates = substrate2rule %>% pull(`microtrait_substrate-name`)
# 5. read rule to trait
rule2trait_temp = readr::read_delim("./data-raw/microtrait_rule2trait.txt",
delim = "\t",
col_names = T,
col_types = readr::cols(`microtrait_rule-name` = readr::col_factor(ordered = FALSE),
`microtrait_rule-type` = readr::col_character(),
`microtrait_rule-substrate` = readr::col_character(),
`microtrait_trait-name1` = readr::col_character(),
`microtrait_trait-name2` = readr::col_character(),
`microtrait_trait-name3` = readr::col_character(),
`microtrait_trait-version` = readr::col_character()
)
)
rule2trait_temp = rule2trait_temp %>%
dplyr::inner_join(rules, by = c("microtrait_rule-name" = "microtrait_rule-name")) %>%
dplyr::select(c("microtrait_rule-name", "microtrait_rule-boolean", "microtrait_rule-type", "microtrait_rule-substrate",
"microtrait_trait-name1", "microtrait_trait-name2", "microtrait_trait-name3",
"microtrait_trait-version")) %>%
dplyr::filter(`microtrait_trait-version` == "production")
## expand count_by_substrate rules
rule2trait_countbysubstrate = rule2trait_temp %>%
dplyr::filter(`microtrait_rule-type` == "count_by_substrate") %>%
tidyr::separate_rows(`microtrait_rule-substrate`, sep = ";") %>%
dplyr::left_join(substrate2rule, by = c("microtrait_rule-substrate" = "microtrait_substrate-name"), suffix = c(".rule2trait", ".substrate2rule")) %>%
dplyr::select(c("microtrait_rule-name", "microtrait_rule-boolean", "microtrait_rule-type", "microtrait_rule-substrate",
"microtrait_trait-name1.substrate2rule", "microtrait_trait-name2.substrate2rule", "microtrait_trait-name3.substrate2rule",
"microtrait_trait-version")) %>%
dplyr::rename(`microtrait_trait-name1` = `microtrait_trait-name1.substrate2rule`,
`microtrait_trait-name2` = `microtrait_trait-name2.substrate2rule`,
`microtrait_trait-name3` = `microtrait_trait-name3.substrate2rule`)
# rule2trait_countbysubstrate_allsubstrates = rule2trait_countbysubstrate %>% pull(`microtrait_rule-substrate`) %>% unique
## count and binary rules
rule2trait_remain = rule2trait_temp %>%
dplyr::filter(`microtrait_rule-type` %in% c("count", "binary"))
rule2trait = dplyr::bind_rows(rule2trait_countbysubstrate, rule2trait_remain)
# check, no NAs in trait-names
# rule2trait%>%pull(`microtrait_trait-name1`)%>%is.na()%>%table()
# rule2trait%>%pull(`microtrait_trait-name2`)%>%is.na()%>%table()
# rule2trait%>%pull(`microtrait_trait-name3`)%>%is.na()%>%table()
rule2trait = rule2trait %>%
dplyr::mutate(`microtrait_trait-name1` = readr::parse_factor(`microtrait_trait-name1`,
ordered = T,
levels = traits_listbygranularity[[1]] %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-name`) %>% as.character()
),
`microtrait_trait-name2` = readr::parse_factor(`microtrait_trait-name2`,
ordered = T,
levels = traits_listbygranularity[[2]] %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-name`) %>% as.character()
),
`microtrait_trait-name3` = readr::parse_factor(`microtrait_trait-name3`,
ordered = T,
levels = traits_listbygranularity[[3]] %>%
dplyr::arrange(`microtrait_trait-displayorder`) %>%
dplyr::pull(`microtrait_trait-name`) %>% as.character()
)
)
# 1,701
# 6. read performance metrics for the microtraithmms
hmm_performance = readr::read_delim("./data-raw/microtrait_hmm-performance.txt",
delim = "\t",
col_names = T,
col_types = readr::cols(`microtrait_hmm-dbxref_kegg` = readr::col_character(),
`microtrait_hmm-dbxref_kegg_score` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_fscore` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_tpr` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_tnr` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_fpr` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_fnr` = readr::col_double(),
`microtrait_hmm-dbxref_kegg_accuracy` = readr::col_double()
)
)
hmms_fromrules_wkegg = hmm_names_fromrules %>%
dplyr::inner_join(hmm_performance, by = c("microtrait_hmm-dbxref_kegg" = "microtrait_hmm-dbxref_kegg"))
hmms_fromrules_wokegg = hmm_names_fromrules %>%
dplyr::inner_join(hmm_performance, by = c("microtrait_hmm-name" = "microtrait_hmm-dbxref_kegg"))
hmms_fromrules_cazy = hmm_names_fromrules %>%
dplyr::inner_join(hmms_cazy, by = c("microtrait_hmm-name" = "microtrait_hmm-name")) %>%
dplyr::select(c("microtrait_hmm-name", "microtrait_hmm-description.x")) %>%
dplyr::rename(`microtrait_hmm-description` = `microtrait_hmm-description.x`) %>%
tibble::add_column(`microtrait_hmm-dbxref_kegg`=NA,
`microtrait_hmm-dbxref_ec`=NA,
`microtrait_hmm-dbxref_TC`=NA,
`microtrait_hmm-notes`=NA,
`microtrait_hmm-dbxref_kegg_score`=NA,
`microtrait_hmm-dbxref_kegg_fscore`=NA,
`microtrait_hmm-dbxref_kegg_tpr`=NA,
`microtrait_hmm-dbxref_kegg_tnr`=NA,
`microtrait_hmm-dbxref_kegg_fpr`=NA,
`microtrait_hmm-dbxref_kegg_fnr`=NA,
`microtrait_hmm-dbxref_kegg_accuracy`=NA)
hmms_fromrules = bind_rows(hmms_fromrules_wkegg,
hmms_fromrules_wokegg,
hmms_fromrules_cazy) %>%
dplyr::mutate(`microtrait_hmm-name` = readr::parse_factor(`microtrait_hmm-name`,
ordered = T))
#
# a=c(hmms_fromrules_wkegg %>% pull(`microtrait_hmm-dbxref_kegg`) %>% unique(),
# hmms_fromrules_wokegg %>% pull(`microtrait_hmm-name`) %>% unique())
# b=hmm_performance %>% pull(`microtrait_hmm-dbxref_kegg`)
# should be empty, otherwise there are genes without performance metrics
# setdiff(a,b)
#hmms_fromrules %>%
# anti_join(hmm_performance, by = c("microtrait_hmm-dbxref_kegg" = "microtrait_hmm-dbxref_kegg"))
write.table(rules,
file = "./data-raw/microtrait_ruleunwrapped.txt",
sep = "\t", quote = F, row.names = F, col.names = F)
write.table(hmms_fromrules %>% as.data.frame,
file = "./data-raw/microtrait_hmmsfromrules.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(hmm_names_fromrules %>% as.data.frame,
file = "./data-raw/microtrait_hmm_names_fromrules.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
# build shell commands for "rename_addtc2hmm.sh"
rename_addtc2hmm_path = "$bindir/rename_addtc2hmm.sh"
hmm_indir = "$rawhmmdir"
hmm_outdir = "$renamedwTCdir"
#
rename_addtc2hmm_commands = hmms_fromrules %>%
dplyr::mutate(`microtrait_hmm-descriptionwithdbxref` = paste0(`microtrait_hmm-description`,
"//dbxref_kegg", `microtrait_hmm-dbxref_kegg`,
"//dbxref_ec", `microtrait_hmm-dbxref_ec`,
"//dbxref_TC", `microtrait_hmm-dbxref_TC`
)) %>%
dplyr::mutate(rename_addtc2hmm_command = paste0(rename_addtc2hmm_path, " ",
hmm_indir, " ",
hmm_outdir, " ",
`microtrait_hmm-dbxref_kegg`, " ",
`microtrait_hmm-name`, " ",
`microtrait_hmm-dbxref_kegg_score`, " ",
"\"", `microtrait_hmm-descriptionwithdbxref`, "\"")
) %>%
pull(`rename_addtc2hmm_command`)
write.table(rename_addtc2hmm_commands,
file = "./data-raw/rename_addtc2hmm_commands.sh", sep = "\t", quote = F, row.names = F, col.names = F)
# 7.Read regression models for prediction of optimum growth temperature
# The models are from Sauer & Wang (2019) available at https://github.com/DavidBSauer/OGT_prediction
model.dir = "./data-raw/ogt_regression_models"
model.files = file.path(model.dir, c("superkingdom-all_species-genomic+ORF+protein.txt", "superkingdom-all_species-genomic+tRNA+rRNA+ORF.txt",
"superkingdom-all_species-genomic+rRNA+ORF+protein.txt", "superkingdom-all_species-genomic+tRNA+rRNA.txt",
"superkingdom-all_species-genomic+rRNA.txt", "superkingdom-all_species-genomic+tRNA.txt",
"superkingdom-all_species-genomic+tRNA+ORF+protein.txt", "superkingdom-all_species-genomic.txt"))
model.allfeatures = readr::read_delim(file.path(model.dir, "allfeatures.txt"), delim = "\t") %>%
dplyr::mutate(feature = factor(feature, levels = feature, ordered = T))
#
models = parallel::mclapply(1:length(model.files),
function(i) {
rank = strsplit(basename(model.files[i]), "-")[[1]][1]
clade = strsplit(basename(model.files[i]), "-")[[1]][2]
model_type = sub(".txt", "", strsplit(basename(model.files[i]), "-")[[1]][3])
firstline = readr::read_lines(model.files[i], n_max = 1)
R2 = sub("#R2=", "", strsplit(firstline, "\\|")[[1]][1])
RMSE = sub("RMSE=", "", strsplit(firstline, "\\|")[[1]][2])
coefficients = readr::read_delim(model.files[i],
col_names = F,
col_type = "cc", # read the coefficients as text
delim = "\t",
comment = "#") %>%
setNames(., c("feature", "value"))
coefficients = model.allfeatures %>%
dplyr::left_join(coefficients, by = c("feature" = "feature")) %>%
dplyr::mutate(value = tidyr::replace_na(value, "0"))
model = cbind(rank,
clade,
model_type,
R2,
RMSE,
coefficients)
model
},
mc.cores = 4)
models = do.call(rbind, models) %>% dplyr::as_tibble()
#usethis::use_data(hmms, overwrite = TRUE)
#usethis::use_data(substrate2rule, overwrite = TRUE)
usethis::use_data(hmms_fromrules, overwrite = TRUE)
usethis::use_data(rules, overwrite = TRUE)
usethis::use_data(rule2trait, overwrite = TRUE)
usethis::use_data(traits_listbygranularity, overwrite = TRUE)
usethis::use_data(models, overwrite = TRUE)
#files = list.files("~/Work/microtrait/code/github/microtrait/inst/extdata/ogt.test/tocompare", full.names = T)
#allcolnames = matrix(nrow = 0, ncol = 2)
#for(i in 2:length(files)) {
# temp = read.table(files[i], sep = "\t", header = T, check.names = F)
# allcolnames = rbind(allcolnames,
# cbind(basename(files[i]), colnames(temp)[3:ncol(temp)])
# )
#}
#write.table(allcolnames,
# file = "/Users/ukaraoz/Work/microtrait/code/github/microtrait/data-raw/regression_models/allfeatures.xls",
# row.names = F, col.names = T, sep = "\t", quote = F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.