# other_files/database_construction_code/database_drugbank.R

# ## Construction script for the DrugBank MS1 database
# ## (drugbankMS1Database5.1.8). Every statement in this section is commented
# ## out.
# ## Setup: change into the drugbank working folder and attach dbparser,
# ## dplyr, ggplot2, XML and MetaDBparse.
# ## presumably tinyTools::setwd_project() moves to the project root before
# ## the relative setwd() below -- TODO confirm.
# tinyTools::setwd_project()
# setwd('all_ms2_database/drugbank')
# library(dbparser)
# library(dplyr)
# library(ggplot2)
# library(XML)
# 
# library(MetaDBparse)
# ## NOTE(review): rm(list = ls()) clears every object in the calling
# ## workspace, and setwd() above changes the session's working directory;
# ## reconsider both if this script is ever re-enabled.
# rm(list = ls())
# # ## Step 1 (disabled, double-commented): stream-parse the DrugBank XML
# # ## export into a data frame with the columns compoundname, baseformula,
# # ## identifier, structure, charge and description, and save it to the file
# # ## "drugbank2". Only the final load() of that file is single-commented.
# # # Description <- NULL
# # ## Make sure the source folder exists, then unzip the archive into it.
# # base.loc <- file.path("database", "drugbank_source")
# # if (!dir.exists(base.loc)) {
# #   dir.create(base.loc)
# # }
# # zip.file <- file.path(base.loc, "drugbank.zip")
# # 
# # utils::unzip(normalizePath(zip.file), exdir = normalizePath(base.loc))
# # input <- file.path(base.loc, "full database.xml")
# # ## Read the first 10 lines and take the second line containing "version"
# # ## to extract the export version string.
# # ## NOTE(review): `version` is not used again in the visible script.
# # header <- readLines(input, n = 10)
# # hasInfo <- grep(x = header, pattern = "version", value = TRUE, perl = TRUE)[2]
# # version <- stringr::str_match(string = hasInfo, pattern = "version=\"(.*)\" exported")[, 2]
# # ## Fetch the DrugBank statistics page and read the "Total Number of Drugs"
# # ## count from its first HTML table; `n` sizes the preallocated data frame
# # ## and the progress bar.
# # ## NOTE(review): ssl.verifypeer = FALSE disables TLS certificate checks.
# # theurl <- RCurl::getURL("https://go.drugbank.com/stats", .opts = list(ssl.verifypeer = FALSE))
# # tables <- XML::readHTMLTable(theurl, header = FALSE)
# # stats <- data.table::as.data.table(tables[[1]])
# # colnames(stats) <- c("Description", "Count")
# # n <- as.numeric(as.character(gsub(x = stats[Description == "Total Number of Drugs"]$Count, pattern = ",", replacement = "")))
# # ## Shared mutable state for the parser callback: the preallocated result
# # ## frame, the progress bar and the running row index.
# # envir <- environment()
# # envir$db.formatted <- data.frame(compoundname = rep(NA, n), baseformula = rep(NA, n), identifier = rep(NA, n), structure = rep(NA, n), charge = rep(NA, n), description = rep(NA, n))
# # envir$pb <- pbapply::startpb(min = 0, max = n)
# # envir$idx <- 0
# # ## Branch handler registered for "drug" nodes in the xmlEventParse() call
# # ## below. Writes one row of currEnvir$db.formatted per node; returns NULL
# # ## early when the node has no usable properties.
# # ##   currNode  - the XML node handed over by the parser.
# # ##   currEnvir - environment holding db.formatted, pb and idx.
# # metabolite <- function(currNode, currEnvir = envir) {
# #   ## Refresh the progress bar on every 10th node.
# #   if (currEnvir$idx %% 10 == 0) {
# #     pbapply::setpb(currEnvir$pb, currEnvir$idx)
# #   }
# #   ## The index is advanced before the early returns below, so a skipped
# #   ## node leaves its preallocated row filled with NA.
# #   currEnvir$idx <- currEnvir$idx + 1
# #   ## Prefer calculated properties; fall back to experimental ones.
# #   properties <- currNode[["calculated-properties"]]
# #   if (is.null(properties)) {
# #     properties <- currNode[["experimental-properties"]]
# #   }
# #   proplist <- XML::xmlToList(properties)
# #   if (length(proplist) == 0) {
# #     return(NULL)
# #   }
# #   ## Positions of the properties whose "kind" is Molecular Formula,
# #   ## SMILES and Physiological Charge respectively.
# #   which.form <- which(sapply(proplist, function(x) {
# #     if ("kind" %in% names(x)) {
# #       res <- x[["kind"]] == "Molecular Formula"
# #     }
# #     else {
# #       res <- FALSE
# #     }
# #     res
# #   }))
# #   which.struc <- which(sapply(proplist, function(x) {
# #     if ("kind" %in% names(x)) {
# #       res <- x[["kind"]] == "SMILES"
# #     }
# #     else {
# #       res <- FALSE
# #     }
# #     res
# #   }))
# #   which.charge <- which(sapply(proplist, function(x) {
# #     if ("kind" %in% names(x)) {
# #       res <- x[["kind"]] == "Physiological Charge"
# #     }
# #     else {
# #       res <- FALSE
# #     }
# #     res
# #   }))
# #   ## Skip the node only when both formula and structure are missing.
# #   ## NOTE(review): the baseformula assignment below indexes
# #   ## proplist[[which.form]] unconditionally; when only SMILES is present
# #   ## which.form is empty -- confirm this case cannot occur.
# #   if (length(which.form) == 0 & length(which.struc) == 0) {
# #     return(NULL)
# #   }
# #   currEnvir$db.formatted[currEnvir$idx, "compoundname"] <- XML::xmlValue(currNode[["name"]])
# #   currEnvir$db.formatted[currEnvir$idx, "identifier"] <- XML::xmlValue(currNode[["drugbank-id"]])
# #   currEnvir$db.formatted[currEnvir$idx, "baseformula"] <- proplist[[which.form]][["value"]]
# #   ## Structure defaults to "" when no SMILES property was found.
# #   currEnvir$db.formatted[currEnvir$idx, "structure"] <- if (length(which.struc) > 0) {
# #     proplist[[which.struc]][["value"]]
# #   }
# #   else {
# #     ""
# #   }
# #   currEnvir$db.formatted[currEnvir$idx, "description"] <- XML::xmlValue(currNode[["description"]])
# #   ## Charge defaults to 0 when no Physiological Charge property was found.
# #   currEnvir$db.formatted[currEnvir$idx, "charge"] <- if (length(which.charge) > 0) {
# #     proplist[[which.charge]][["value"]]
# #   }
# #   else {
# #     0
# #   }
# # }
# # ## Event-parse the XML: "drug" branches go to metabolite(), and
# # ## "drugbank-metabolite-id-value" branches are passed to print().
# # res <- XML::xmlEventParse(file = input, branches = list(drug = metabolite, `drugbank-metabolite-id-value` = print))
# # ## NOTE(review): idx starts at 0 and is incremented before each write, so
# # ## row 1 appears to hold the first parsed drug; dropping it here looks
# # ## like it discards that record -- confirm.
# # envir$db.formatted <- envir$db.formatted[-1, ]
# # drugbank2 = envir$db.formatted
# # save(drugbank2, file = "drugbank2")
# 
# ## Reload the table saved by the disabled step above.
# load("drugbank2")
# 
# 
# # ## Step 2 (disabled, double-commented): parse the DrugBank XML with
# # ## dbparser, build one annotation row per small molecule and save it to
# # ## the file "drugbank". Only the final load() is single-commented.
# # ## parse data from XML and save it to memory
# # read_drugbank_xml_db("full database.xml")
# # 
# # ## load drugs data
# # drugs <- drugs()
# # 
# # ## load drug groups data
# # drug_groups <- drug_groups()
# # 
# # ## load drug targets actions data
# # drug_targets_actions <- targets_actions()
# # save(drugs, file = "drugs")
# # 
# # load("drugs")
# # 
# # ### keep only small molecules
# # ## primary_key becomes Lab.ID (also copied to drugbank.ID); name,
# # ## cas_number and monoisotopic_mass are renamed to Compound.name, CAS.ID
# # ## and mz; the long free-text columns are dropped.
# # general_information = 
# # drugs$general_information %>% 
# #   dplyr::filter(type == "small molecule") %>% 
# #   dplyr::rename(Lab.ID = primary_key) %>% 
# #   dplyr::select(Lab.ID, everything()) %>% 
# #   dplyr::mutate(drugbank.ID = Lab.ID) %>%
# #   dplyr::rename(Compound.name = name,
# #                 CAS.ID = cas_number,
# #                 mz = monoisotopic_mass,) %>%
# #   dplyr::select(-c(description, synthesis_reference, fda_label, msds))
# # 
# # ## Classification rows for the retained drugs, keyed by Lab.ID.
# # drug_classification = 
# # drugs$drug_classification %>% 
# #   dplyr::filter(drugbank_id %in% general_information$Lab.ID) %>% 
# #   dplyr::rename(Lab.ID = drugbank_id) %>% 
# #   dplyr::select(Lab.ID, everything()) %>% 
# #   dplyr::select(-c(description, direct_parent, alternative_parents, substituents))
# # 
# # ## Collapse the synonym table to one row per drug: split by drugbank-id,
# # ## and for drugs with several rows join synonym, language and coder with
# # ## "{}" and keep the first row. plyr is attached here for `.()`.
# # library(plyr)
# # synonyms =
# #   drugs$synonyms %>%
# #   dplyr::filter(`drugbank-id` %in% general_information$Lab.ID) %>%
# #   plyr::dlply(.variables = .(`drugbank-id`)) %>%
# #   purrr::map(function(x) {
# #     if (nrow(x) == 1) {
# #       return(x)
# #     } else{
# #       x$synonym = paste(x$synonym, collapse = "{}")
# #       x$language = paste(x$language, collapse = "{}")
# #       x$coder = paste(x$coder, collapse = "{}")
# #       x[1,,drop = FALSE]
# #     }
# #   }) %>% 
# #   do.call(rbind, .) %>% 
# #   as.data.frame() %>% 
# #   dplyr::rename(Lab.ID = `drugbank-id`) %>% 
# #   dplyr::select(Lab.ID, everything()) %>% 
# #   dplyr::select(-coder)
# # 
# # ## Attach classification and synonyms to the general information.
# # drugbank =
# #   general_information %>%
# #   dplyr::left_join(drug_classification, by = c("Lab.ID")) %>% 
# #   dplyr::left_join(synonyms, by = c("Lab.ID"))
# # 
# # ## Placeholder columns filled with NA, plus the submitter label.
# # ## NOTE(review): "drugbnak" looks like a typo for "drugbank" -- confirm
# # ## before re-running, since the saved table would carry this value.
# # drugbank$RT = NA
# # drugbank$HMDB.ID = NA
# # drugbank$KEGG.ID = NA
# # drugbank$mz.pos = NA
# # drugbank$mz.neg = NA
# # drugbank$Submitter = "drugbnak"
# # 
# # save(drugbank, file = "drugbank")
# ## Reload the table saved by the disabled step above.
# load("drugbank")
# 
# 
# ## Step 3: add the molecular formula to the annotation table and export it.
# ## Reduce drugbank2 to formula + identifier, drop rows without an
# ## identifier, and rename the columns to Formula and Lab.ID (the join key).
# drugbank2 = 
# drugbank2 %>% 
#   dplyr::select(baseformula, identifier) %>% 
#   dplyr::filter(!is.na(identifier)) %>% 
#   dplyr::rename(Lab.ID = identifier, Formula = baseformula)
# 
# ## Left join keeps every drugbank row; Formula is NA where no match exists.
# drugbank = 
# drugbank %>% 
#   dplyr::left_join(drugbank2, by = "Lab.ID")
# 
# ## Put the core columns first, then discard entries without a formula.
# drugbank = 
# drugbank %>% 
#   dplyr::select(Lab.ID, mz, RT, CAS.ID, HMDB.ID, KEGG.ID, Formula, mz.pos, mz.neg, Submitter, everything()) %>% 
#   dplyr::filter(!is.na(Formula))
# 
# ## Coerce mz to numeric before writing the spreadsheet.
# drugbank$mz = as.numeric(drugbank$mz)
# 
# ## Written as "drugbank.xlsx", the file name passed to construct_database()
# ## in the next step.
# openxlsx::write.xlsx(drugbank, file = "drugbank.xlsx", asTable = TRUE)
# 
# ## Step 4: build the database object from "drugbank.xlsx" in the current
# ## directory and save it under the same name as the object.
# ## NOTE(review): construct_database() is not defined or namespaced in this
# ## file; presumably it comes from the metID package -- confirm.
# ## rt = FALSE is passed; the RT column in the table was filled with NA.
# drugbankMS1Database5.1.8 =
#   construct_database(
#     path = ".",
#     version = "5.1.8",
#     metabolite.info.name = "drugbank.xlsx",
#     source = "drugbank",
#     link = "https://go.drugbank.com/",
#     creater = "Xiaotao Shen",
#     email = "shenxt@stanford.edu",
#     rt = FALSE,
#     threads = 3
#   )
# 
# 
# save(drugbankMS1Database5.1.8, file = "drugbankMS1Database5.1.8")
# 
# 
# jaspershen/metID documentation built on July 31, 2022, 11:31 p.m.