inst/doc/TKCat-User-guide.R

## ----setup, message=FALSE, echo=FALSE, include=FALSE, cache=FALSE-------------
# Attach knitr and define the default chunk options used for the whole
# document: everything is shown and nothing is cached.
library(knitr)
knitr::opts_chunk$set(
   include=TRUE, echo=TRUE,
   message=TRUE, warning=TRUE,
   cache=FALSE, cache.lazy=FALSE
)
# Attach the package documented by this guide.
library(TKCat)
# Flag telling whether the optional igraph package can be used (it guards the
# automatic layout of data models further down).
# requireNamespace() is used instead of scanning installed.packages(), which
# reads the description of every installed package and is slow; note that it
# also loads the igraph namespace when the package is present.
igraph_available <- requireNamespace("igraph", quietly=TRUE)

## ---- message=FALSE-----------------------------------------------------------
# Attach readr and locate the HPO subset example directory inside the
# installed ReDaMoR package.
library(readr)
hpo_data_dir <- system.file("examples/HPO-subset", package="ReDaMoR")

## ---- message=FALSE-----------------------------------------------------------
# Read the HPO_hp tab-separated file and display the resulting table.
HPO_hp <- readr::read_tsv(file.path(hpo_data_dir, "HPO_hp.txt"))
HPO_hp

## ---- message=FALSE-----------------------------------------------------------
# Read the HPO_diseases tab-separated file and display the resulting table.
HPO_diseases <- readr::read_tsv(file.path(hpo_data_dir, "HPO_diseases.txt"))
HPO_diseases

## ---- message=FALSE-----------------------------------------------------------
# Read the HPO_diseaseHP tab-separated file and display the resulting table.
HPO_diseaseHP <- readr::read_tsv(file.path(hpo_data_dir, "HPO_diseaseHP.txt"))
HPO_diseaseHP

## ---- out.height="200px"------------------------------------------------------
# Draft a data model from the three tables and plot it.
mhpo_dm <- ReDaMoR::df_to_model(HPO_hp, HPO_diseases, HPO_diseaseHP)
if(igraph_available){
   # The automatic layout is only applied when igraph is available.
   plot(ReDaMoR::auto_layout(mhpo_dm, lengthMultiplier=80))
}else{
   plot(mhpo_dm)
}

## ---- eval=FALSE--------------------------------------------------------------
#  mhpo_dm <- ReDaMoR::model_relational_data(mhpo_dm)

## ---- echo=FALSE--------------------------------------------------------------
# Replace the drafted model: read the data model from the HPO-model.json
# example file of the installed ReDaMoR package and keep only the three tables
# loaded above.
mhpo_dm <- ReDaMoR::read_json_data_model(
   system.file("examples/HPO-model.json", package="ReDaMoR")
)[c("HPO_hp", "HPO_diseases", "HPO_diseaseHP")]

## ---- out.height="200px"------------------------------------------------------
# Plot the model that was just read.
plot(mhpo_dm)

## ---- echo=FALSE, results='hide'----------------------------------------------
# Attempt to build the memoMDB from the tables as read from file.
# try(silent=TRUE) keeps a failure from stopping the script and suppresses the
# error message; mhpo_db is only assigned if the call succeeds.
try(
   mhpo_db <- memoMDB(
      dataTables=list(
         HPO_hp=HPO_hp, HPO_diseases=HPO_diseases, HPO_diseaseHP=HPO_diseaseHP
      ),
      dataModel=mhpo_dm,
      dbInfo=list(name="miniHPO")
   ),
   silent=TRUE
)

## ---- eval=FALSE--------------------------------------------------------------
#  mhpo_db <- memoMDB(
#     dataTables=list(
#        HPO_hp=HPO_hp, HPO_diseases=HPO_diseases, HPO_diseaseHP=HPO_diseaseHP
#     ),
#     dataModel=mhpo_dm,
#     dbInfo=list(name="miniHPO")
#  )

## ---- echo=FALSE, results='asis'----------------------------------------------
# Format the confrontation report as markdown (title "miniHPO", heading level
# 2, unnumbered) and write it to the output.
# NOTE(review): presumably this is the report left by the memoMDB() attempt
# above - confirm.
get_confrontation_report() %>% 
   format_confrontation_report_md(
      title="miniHPO",
      level=2, numbered=FALSE
   ) %>% 
   cat()

## -----------------------------------------------------------------------------
# Convert the columns to the types needed before building the MDB:
# level as integer, both id columns as character.
HPO_hp <- HPO_hp %>% mutate(level=as.integer(level))
HPO_diseases <- HPO_diseases %>% mutate(id=as.character(id))
HPO_diseaseHP <- HPO_diseaseHP %>% mutate(id=as.character(id))
# Build the in-memory MDB named "miniHPO" from the converted tables.
mhpo_db <- memoMDB(
   dbInfo=list(name="miniHPO"),
   dataModel=mhpo_dm,
   dataTables=list(
      HPO_hp=HPO_hp,
      HPO_diseases=HPO_diseases,
      HPO_diseaseHP=HPO_diseaseHP
   )
)

## -----------------------------------------------------------------------------
# Build an MDB directly from the files in hpo_data_dir, with the same name and
# data model as mhpo_db.
f_mhpo_db <- read_fileMDB(
   path=hpo_data_dir,
   dbInfo=list(name="miniHPO"),
   dataModel=mhpo_dm
)

## -----------------------------------------------------------------------------
# Print the size of both objects in Kb.
print(object.size(mhpo_db), units="Kb")
print(object.size(f_mhpo_db), units="Kb")
# Compare the two MDBs and show the result as an HTML table; pageLength is set
# to the number of rows of the comparison (`.` is the compare_MDB() result).
compare_MDB(former=mhpo_db, new=f_mhpo_db) %>% 
   DT::datatable(
      rownames=FALSE,
      width="75%",
      options=list(dom="t", pageLength=nrow(.))
   )

## -----------------------------------------------------------------------------
# Fill in the descriptive metadata of the MDB: title, description and URL.
db_info(mhpo_db)$title <- "Very small extract of the human phenotype ontology"
db_info(mhpo_db)$description <- "For demonstrating ReDaMoR and TKCat capabilities, a very few information from the HPO (human phenotype ontology) has been extracted"
db_info(mhpo_db)$url <- "https://hpo.jax.org/app/"

## -----------------------------------------------------------------------------
# Fill in version, maintainer and timestamp (current time when the script is
# run).
db_info(mhpo_db)$version <- "0.1"
db_info(mhpo_db)$maintainer <- "Patrice Godard"
db_info(mhpo_db)$timestamp <- Sys.time()

## -----------------------------------------------------------------------------
# Display the MDB with its updated metadata.
mhpo_db

## -----------------------------------------------------------------------------
# List the collections available locally.
list_local_collections()

## -----------------------------------------------------------------------------
mhpo_db

## -----------------------------------------------------------------------------
# Show the definition of the local "Condition" collection.
get_local_collection("Condition") %>%
   show_collection_def()

## -----------------------------------------------------------------------------
# Display the HPO_hp table, then register it as a member of the "Condition"
# collection: condition and source are fixed values, the identifier is taken
# from the "id" column.
mhpo_db$HPO_hp
mhpo_db <- mhpo_db %>%
   add_collection_member(
      collection="Condition",
      table="HPO_hp",
      condition=list(value="Phenotype", static=TRUE),
      source=list(value="HP", static=TRUE),
      identifier=list(value="id", static=FALSE)
   )

## -----------------------------------------------------------------------------
# Register HPO_diseases as well: the condition is a fixed value whereas source
# and identifier are taken from the "db" and "id" columns.
mhpo_db <- mhpo_db %>%
   add_collection_member(
      collection="Condition",
      table="HPO_diseases",
      condition=list(value="Disease", static=TRUE),
      source=list(value="db", static=FALSE),
      identifier=list(value="id", static=FALSE)
   )

## -----------------------------------------------------------------------------
# Display the MDB after the two additions.
mhpo_db

## -----------------------------------------------------------------------------
# List the collection members of the MDB.
collection_members(mhpo_db)

## ---- results='hide'----------------------------------------------------------
# Write mhpo_db as a fileMDB, using the session temporary directory as path.
tmpDir <- tempdir()
as_fileMDB(mhpo_db, path=tmpDir, htmlModel=FALSE)

## ---- echo=FALSE--------------------------------------------------------------
# Print the files found under <tmpDir>/miniHPO as a tree: paths are listed
# recursively, prefixed with "miniHPO", converted to a data.tree node, and the
# first column of the resulting tree data frame is written one line per entry.
list.files(
   path=file.path(tmpDir, "miniHPO"),
   recursive=TRUE
) %>% 
   file.path("miniHPO", .) %>%
   data.frame(pathString=.) %>% 
   data.tree::as.Node() %>%
   data.tree::ToDataFrameTree() %>%
   pull(1) %>% 
   cat(sep="\n")

## -----------------------------------------------------------------------------
# Read the MDB back from the miniHPO directory; only the path is given here.
read_fileMDB(file.path(tmpDir, "miniHPO"))

## -----------------------------------------------------------------------------
# Build a fileMDB named "HPO" from the ReDaMoR example files, giving the data
# model as a JSON file path and the database information inline.
file_hpo <- read_fileMDB(
   path=system.file("examples/HPO-subset", package="ReDaMoR"),
   dataModel=system.file("examples/HPO-model.json", package="ReDaMoR"),
   dbInfo=list(
      name="HPO",
      title="Data extracted from the HPO database",
      description=paste(
         "This is a very small subset of the HPO!",
         "Visit the reference URL for more information."
      ),
      url="http://human-phenotype-ontology.github.io/"
   )
)

## -----------------------------------------------------------------------------
# Plot the data model of the MDB.
plot(data_model(file_hpo))

## -----------------------------------------------------------------------------
# Register the phenotype table, then the disease table, as members of the
# "Condition" collection.
file_hpo <- add_collection_member(
   file_hpo,
   collection="Condition", table="HPO_hp",
   condition=list(value="Phenotype", static=TRUE),
   source=list(value="HP", static=TRUE),
   identifier=list(value="id", static=FALSE)
)
file_hpo <- add_collection_member(
   file_hpo,
   collection="Condition", table="HPO_diseases",
   condition=list(value="Disease", static=TRUE),
   source=list(value="db", static=FALSE),
   identifier=list(value="id", static=FALSE)
)

## -----------------------------------------------------------------------------
# Read the ClinVar example fileMDB from the installed TKCat package.
file_clinvar <- read_fileMDB(path=system.file("examples/ClinVar", package="TKCat"))

## -----------------------------------------------------------------------------
file_clinvar

## -----------------------------------------------------------------------------
# Read the CHEMBL example fileMDB from the installed TKCat package.
file_chembl <- read_fileMDB(path=system.file("examples/CHEMBL", package="TKCat"))

## -----------------------------------------------------------------------------
file_chembl

## -----------------------------------------------------------------------------
# Convert the ClinVar fileMDB to a memoMDB and print the size of both objects
# in Kb.
memo_clinvar <- as_memoMDB(file_clinvar)
print(object.size(file_clinvar), units="Kb")
print(object.size(memo_clinvar), units="Kb")

## -----------------------------------------------------------------------------
# Database information of the ClinVar MDB.
db_info(file_clinvar)

## -----------------------------------------------------------------------------
# Plot the data model of the ClinVar MDB.
plot(data_model(file_clinvar))

## -----------------------------------------------------------------------------
# Show the table names, strip the first "ClinVar_" occurrence from each of
# them, and show the names again.
names(file_clinvar)
file_clinvar <- set_names(
   file_clinvar,
   sub("ClinVar_", "", names(file_clinvar))
)
names(file_clinvar)

## -----------------------------------------------------------------------------
# Collection members of the ClinVar MDB.
collection_members(file_clinvar)

## -----------------------------------------------------------------------------
length(file_clinvar)        # Number of tables
lengths(file_clinvar)       # Number of fields per table
count_records(file_clinvar) # Number of records per table

## -----------------------------------------------------------------------------
# Size of the data files, requested with hr=TRUE.
data_file_size(file_clinvar, hr=TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  data_tables(file_clinvar, "traitNames")[[1]]
#  file_clinvar[["traitNames"]]
#  file_clinvar$"traitNames"
#  file_clinvar %>% pull(traitNames)

## ---- echo=FALSE--------------------------------------------------------------
# Evaluated counterpart of the chunk above: extract the traitNames table.
pull(file_clinvar, traitNames)

## -----------------------------------------------------------------------------
# Subset the MDB to its first three tables, then plot the data model obtained
# by combining that subset with tables 1, 5 and 7 of file_hpo.
file_clinvar[1:3]
if(igraph_available){
   # The automatic layout is only applied when igraph is available.
   plot(auto_layout(
      data_model(c(file_clinvar[1:3], file_hpo[c(1,5,7)])),
      force=TRUE
   ))
}else{
   plot(data_model(c(file_clinvar[1:3], file_hpo[c(1,5,7)])))
}

## -----------------------------------------------------------------------------
# Filter the MDB in two steps:
#  1. entrezNames: keep the records whose symbol is PIK3R2 or UGT1A8;
#  2. ReferenceClinVarAssertion: keep the records whose clinicalSignificance
#     matches "pathogen" (case-insensitive). `.` is the result of step 1, so
#     the row indices returned by grep() refer to the already filtered table.
# NOTE(review): the row counts compared below suggest that filtering one table
# also restricts the related tables - confirm against the TKCat documentation.
filtered_clinvar <- file_clinvar %>%
   filter(
      entrezNames = symbol %in% c("PIK3R2", "UGT1A8")
   ) %>% 
   slice(ReferenceClinVarAssertion=grep(
      "pathogen",
      .$ReferenceClinVarAssertion$clinicalSignificance,
      ignore.case=TRUE
   ))
# Put the number of rows of each table before (nrow_ori) and after (nrow_filt)
# filtering side by side.
left_join(
   dims(file_clinvar) %>% select(name, nrow),
   dims(filtered_clinvar) %>% select(name, nrow),
   by="name",
   suffix=c("_ori", "_filt")
)

## -----------------------------------------------------------------------------
# Join the listed tables of the filtered MDB, then display a selection of
# columns from the entrezNames element of the result.
gene_traits <- filtered_clinvar %>% 
   join_mdb_tables(
      "entrezNames", "varEntrez", "variants", "rcvaVariant",
      "ReferenceClinVarAssertion", "rcvaTraits", "traits"
   )
gene_traits$entrezNames %>%
   select(symbol, name, variants.type, variants.name, traitType, traits.name)

## -----------------------------------------------------------------------------
# Display one table from the CHEMBL MDB.
file_chembl$CHEMBL_component_sequence

## -----------------------------------------------------------------------------
# Display one table from the ClinVar MDB.
file_clinvar$entrezNames

## -----------------------------------------------------------------------------
# List the collections available locally.
list_local_collections()

## -----------------------------------------------------------------------------
# Members of the "BE" collection in each of the two MDBs.
collection_members(file_chembl, "BE")
collection_members(file_clinvar, "BE")

## -----------------------------------------------------------------------------
# Collections shared by the filtered ClinVar MDB and the CHEMBL MDB.
get_shared_collections(filtered_clinvar, file_chembl)

## -----------------------------------------------------------------------------
# Merge ClinVar and CHEMBL through their shared "BE" collection members, but
# only when BED::checkBedConn() succeeds and returns TRUE.
# try() captures any error raised by the check (for example when BED is not
# installed), in which case bedCheck is a "try-error" object.
bedCheck <- try(BED::checkBedConn())
# isTRUE() is FALSE for a "try-error" object as well as for NA, NULL or a value
# that is not of length one, so the condition itself can never fail.
if(isTRUE(bedCheck)){
   sel_coll <- get_shared_collections(file_clinvar, file_chembl) %>% 
      filter(collection=="BE")
   filtered_cv_chembl <- merge(
      x=file_clinvar,
      y=file_chembl,
      by=sel_coll,
      dmAutoLayout=igraph_available
   )
}

## -----------------------------------------------------------------------------
# Collections shared by the HPO and ClinVar MDBs.
get_shared_collections(file_hpo, file_clinvar)

## -----------------------------------------------------------------------------
# Keep only the HPO_diseases / traitCref pair and set its collection to NA.
sel_coll <- mutate(
   filter(
      get_shared_collections(file_hpo, file_clinvar),
      table.x=="HPO_diseases", table.y=="traitCref"
   ),
   collection=NA
)
sel_coll

## -----------------------------------------------------------------------------
# Merge the two MDBs according to the selection, plot the merged data model
# and display the HPO_diseases_traitCref table of the result.
hpo_clinvar <- merge(
   x=file_hpo,
   y=file_clinvar,
   by=sel_coll,
   dmAutoLayout=igraph_available
)
plot(data_model(hpo_clinvar))
hpo_clinvar$HPO_diseases_traitCref

## -----------------------------------------------------------------------------
# Gather the HPO and ClinVar MDBs in a TKCat object.
k <- TKCat(file_hpo, file_clinvar)

## -----------------------------------------------------------------------------
list_MDBs(k)                     # List all the MDBs in a TKCat object
get_MDB(k, "HPO")                # Get a specific MDB from the catalog
search_MDB_tables(k, "disease")  # Search tables about "disease"
search_MDB_fields(k, "disease")  # Search fields about "disease"
collection_members(k)            # Get collection members of the different MDBs
c(k, TKCat(file_chembl))         # Merge 2 TKCat objects

## ---- eval=FALSE--------------------------------------------------------------
#  library(TKCat)
#  explore_MDBs(k, download=TRUE)

## ---- echo=FALSE--------------------------------------------------------------
## The following line is to avoid building errors on CRAN: from here on, chunks
## are only evaluated by knitr when the USER environment variable is "pgodard".
knitr::opts_chunk$set(eval=Sys.getenv("USER") %in% c("pgodard"))

## -----------------------------------------------------------------------------
# Connect to a chTKCat instance; k now replaces the TKCat object created
# earlier in the script.
k <- chTKCat(
   host="localhost",                     # default parameter
   port=9111L,                           # default parameter
   drv=ClickHouseHTTP::ClickHouseHTTP(), # default parameter
   user="default",                       # default parameter
   password=""                           # if not provided the
                                         # password is requested interactively 
)

## -----------------------------------------------------------------------------
list_MDBs(k)             # List all the MDBs in the catalog
search_MDB_tables(k, "disease")  # Search tables about "disease"
search_MDB_fields(k, "disease")  # Search fields about "disease"
collection_members(k)  # Get collection members of the different MDBs

## ---- eval=FALSE--------------------------------------------------------------
#  explore_MDBs(k)

## ---- eval=FALSE--------------------------------------------------------------
#  kw <- chTKCat(host="localhost", port=9111L, user="pgodard")
#  create_chMDB(kw, "HPO", public=TRUE)
#  ch_hpo <- as_chMDB(file_hpo, kw)

## -----------------------------------------------------------------------------
# Get the "HPO" MDB from the chTKCat connection.
ch_hpo <- get_MDB(k, "HPO")

## -----------------------------------------------------------------------------
# Run an SQL query on the MDB: diseases whose lower-cased label contains
# "epilep".
get_query(
   ch_hpo,
   query="SELECT * from HPO_diseases WHERE lower(label) LIKE '%epilep%'"
)

## ---- echo=FALSE--------------------------------------------------------------
## Restore chunk evaluation, which was restricted above to avoid building
## errors on CRAN.
knitr::opts_chunk$set(eval=TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  k <- chTKCat(user="pgodard")
#  create_chTKCat_user(
#     k, login="lfrancois", contact=NA, admin=FALSE, provider=TRUE
#  )

## ---- eval=FALSE--------------------------------------------------------------
#  k <- chTKCat(user="pgodard")
#  change_chTKCat_password(k, "lfrancois")
#  ## NOTE(review): no login is given in the call below, unlike the
#  ## create/change/drop calls around it - confirm which user is updated.
#  update_chTKCat_user(k, contact="email", admin=FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  manage_chTKCat_users(k)

## ---- eval=FALSE--------------------------------------------------------------
#  drop_chTKCat_user(k, login="lfrancois")

## ---- eval=FALSE--------------------------------------------------------------
#  create_chMDB(k, "CHEMBL", public=FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  set_chMDB_access(k, "CHEMBL", public=TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  add_chMDB_user(k, "CHEMBL", "lfrancois", admin=TRUE)
#  # remove_chMDB_user(k, "CHEMBL", "lfrancois")
#  list_chMDB_users(k, "CHEMBL")

## ---- eval=FALSE--------------------------------------------------------------
#  lc <- scan_fileMDBs("fileMDB_directory")
#  ## The commented line below allows the exploration of the data models in lc.
#  # explore_MDBs(lc)
#  ## NOTE(review): `toFeed` is not defined anywhere in this script; presumably
#  ## it holds the names of the MDBs of `lc` to upload - confirm and define it
#  ## before running this loop.
#  for(r in toFeed){
#     message(r)
#     lr <- as_memoMDB(lc[[r]])
#     cr <- as_chMDB(lr, k, overwrite=FALSE)
#  }

## ---- eval=FALSE--------------------------------------------------------------
#  empty_chMDB(k, "CHEMBL")

## ---- eval=FALSE--------------------------------------------------------------
#  drop_chMDB(k, "CHEMBL")

## ---- eval=FALSE--------------------------------------------------------------
#  add_chTKCat_collection(k, "BE")
#  list_chTKCat_collections(k)
#  remove_chTKCat_collection(k, "BE")

## ---- echo=FALSE--------------------------------------------------------------
# Plot the DEFAULT_DATA_MODEL object, accessed with `:::` because it is not
# exported by TKCat.
plot(TKCat:::DEFAULT_DATA_MODEL)

## ---- echo=FALSE--------------------------------------------------------------
# Plot the CHMDB_DATA_MODEL object, also internal to TKCat.
plot(TKCat:::CHMDB_DATA_MODEL)

## -----------------------------------------------------------------------------
# List the collections available locally.
list_local_collections()

## ---- eval=FALSE--------------------------------------------------------------
#  get_local_collection("BE")

## ---- echo=FALSE, results='asis'----------------------------------------------
# Write the local "BE" collection wrapped in a fenced json block.
cat(paste('```json', get_local_collection("BE"), '```', sep="\n"))

## -----------------------------------------------------------------------------
# Show the definition of the local "BE" collection.
show_collection_def(get_local_collection("BE"))

## ---- eval=FALSE--------------------------------------------------------------
#  system.file(
#     "examples/CHEMBL/model/Collections/BE-CHEMBL_BE_1.0.json",
#     package="TKCat"
#  ) %>%
#     readLines() %>% paste(collapse="\n")

## ---- echo=FALSE, results='asis'----------------------------------------------
# Write the content of the CHEMBL "BE" collection member file wrapped in a
# fenced json block.
cat(paste(
   '```json',
   paste(
      readLines(system.file(
         "examples/CHEMBL/model/Collections/BE-CHEMBL_BE_1.0.json",
         package="TKCat"
      )),
      collapse="\n"
   ),
   '```',
   sep="\n"
))

## -----------------------------------------------------------------------------
# Validate the same file against the local "BE" collection used as schema.
jsonvalidate::json_validate(
   json=system.file(
      "examples/CHEMBL/model/Collections/BE-CHEMBL_BE_1.0.json",
      package="TKCat"
   ),
   schema=get_local_collection("BE"),
   engine="ajv"
)

## ---- eval=FALSE--------------------------------------------------------------
#  get_collection_mapper("BE")

## ---- echo=FALSE, results='asis'----------------------------------------------
# Write the formatted "BE" collection mapper wrapped in a fenced r block.
cat(paste(
   '```r',
   paste(format(get_collection_mapper("BE")), collapse="\n"),
   '```',
   sep="\n"
))

## -----------------------------------------------------------------------------
# Example of an MDB holding a matrix: 10 rows (g1..g10) by 4 columns (s1..s4)
# of random normal values (no seed is set in this chunk, so values differ
# between runs).
d <- matrix(
   rnorm(40), nrow=10,
   dimnames=list(
      paste0("g", 1:10),
      paste0("s", 1:4)
   )
)
# Draft the model for table "d", then rename its "row", "column" and "value"
# fields to "gene", "sample" and "expression" and document each of them;
# "expression" is additionally declared not nullable.
# NOTE(review): the table name "d" presumably comes from the name of the
# object passed to df_to_model() - confirm.
m <- ReDaMoR::df_to_model(d) %>% 
   ReDaMoR::rename_field("d", "row", "gene") %>%
   update_field("d", "gene", comment="Gene identifier") %>% 
   ReDaMoR::rename_field("d", "column", "sample") %>% 
   update_field("d", "sample", comment="Sample identifier") %>% 
   ReDaMoR::rename_field("d", "value", "expression") %>% 
   update_field(
      "d", "expression", nullable=FALSE, comment="Gene expression value"
   )
# Build the MDB from the matrix and its model, then plot the data model.
md <- memoMDB(list(d=d), m, list(name="Matrix example"))
plot(data_model(md))

## -----------------------------------------------------------------------------
# Example of an MDB storing files as base64 strings: one row per ClickHouse
# configuration file of the installed TKCat package, with the plain file name
# and the result of base64enc::base64encode() applied to the file path.
ch_config_files <- tibble(
   name=c("config.xml", "users.xml"),
   file=c(
      base64enc::base64encode(
         system.file("ClickHouse/config.xml", package="TKCat")
      ),
      base64enc::base64encode(
         system.file("ClickHouse/users.xml", package="TKCat")
      )
   )
)
# Draft the model from the table, then refine both fields.
# "name" holds plain file names ("config.xml", "users.xml"), so it is typed as
# character; only "file" holds base64-encoded values.
m <- df_to_model(ch_config_files) %>% 
   update_field(
      "ch_config_files", "name",
      type="character", comment="Name of the config file",
      nullable=FALSE, unique=TRUE
   ) %>% 
   update_field(
      "ch_config_files", "file",
      type="base64", comment="Config file in base64 format",
      nullable=FALSE
   )
# Build the MDB from the table and its model, then plot the data model.
md <- memoMDB(
   list(ch_config_files=ch_config_files), m, list(name="base64 example")
)
plot(data_model(md))

Try the TKCat package in your browser

Any scripts or data that you put into this service are public.

TKCat documentation built on Feb. 16, 2023, 10:51 p.m.