knitr::opts_knit$set(root.dir = here::here(''))
library(magrittr)  # pipes (%>%, %T>%) are used throughout this document
dir(here::here('data'), full.names = T) %>% file.remove()
rm(list = ls(envir = globalenv(), all = T))
internet <- T; knitr::opts_chunk$set(collapse = T)
I use two kinds of cache in this document (three if you include GPL-html/).
For dataset, I use knitr's cache mechanism. It is fairly simple: you just add the md5sum of the input file to the chunk options, and knitr takes care of the rest for you. But remember to use the data in another chunk, otherwise it won't run.
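A minimal sketch of what that looks like, assuming a chunk named dataset whose cache is keyed on the file's md5sum (the exact chunk header is not shown in this document):

# hypothetical chunk header for the chunk that reads gds_result.txt:
#   {r dataset, cache = TRUE, cache.extra = tools::md5sum('inst/extdata/gds_result.txt')}
# tools::md5sum() returns the file's md5 checksum, so any change to the file
# changes the cache key and forces the cached chunk to be re-run
tools::md5sum('inst/extdata/gds_result.txt')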
gds_result.txt: the copy used here ships with the package as inst/extdata/gds_result.txt. To regenerate it, run the query ("expression profiling by array"[DataSet Type]) AND "homo sapiens"[Organism] in GEO DataSets and save the search summary (the file read by rGEO.data::read_summary() below).
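If you prefer to stay inside R, the same query can also be issued programmatically. A small sketch using the rentrez package, which is only an illustration and not a dependency of this document, counts the matching DataSets:

# count the GEO DataSets matching the query above via the Entrez 'gds' database
# (rentrez is used only for this illustration)
query <- '("expression profiling by array"[DataSet Type]) AND "homo sapiens"[Organism]'
rentrez::entrez_search(db = 'gds', term = query, retmax = 0)$count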
dataset <- rGEO.data::read_summary('inst/extdata/gds_result.txt') %T>% print
usethis::use_data(dataset, overwrite = T)
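As a quick usage example (my own check, not part of the original chunk), you can peek at the result before committing it:

# peek at the freshly built object; dplyr is already used elsewhere in this document
dplyr::glimpse(dataset)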
To download the .tsv files of Platform and Series by hand, click Homo sapiens at https://www.ncbi.nlm.nih.gov/geo/browse/?view=platforms; the chunk below does the same thing programmatically.
# update monthly
geo_tsv <- function(type = c('platforms', 'series'), page = 1, tax_id = '9606') {
    paste0(
        'https://www.ncbi.nlm.nih.gov/geo/browse/?view=', type, '&tax=', tax_id,
        '&mode=tsv&page=', page, '&display=5000'
    )
}

parallel::mcmapply(download.file, geo_tsv('platforms', 1:2), paste0('data-raw/geo-tsv/platform-', 1:2, '.tsv'))
# parallel::mcmapply(download.file, geo_tsv('series', 1:9), paste0('data-raw/geo-tsv/series-', 1:9, '.tsv'))
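Since each request asks for display=5000 rows, every page except the last should contain 5000 entries. A small sanity check I would add after the download (not part of the original workflow):

# rows per downloaded platform page; only the final page should have fewer than 5000
sapply(
    dir('data-raw/geo-tsv', 'platform', full.names = TRUE),
    function(f) nrow(readr::read_tsv(f, col_types = readr::cols(.default = 'c')))
)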
platform <- dir('data-raw/geo-tsv', 'platform', full.names = T) %>%
    lapply(readr::read_tsv, T, libzhuoer::cols_char()) %>%
    dplyr::bind_rows() %>%
    dplyr::filter(Taxonomy == 'Homo sapiens') %T>%
    print

usethis::use_data(platform, overwrite = T)
GPL_html_dir <- 'data-raw/GPL-html'

# remove apparently failed (nearly empty) downloads so they get retried
dir(GPL_html_dir, full = T) %>% {.[file.size(.) < 10]} %>% file.remove()

download_GPL_html <- function() {
    # download the GEO accession page of every platform that is not on disk yet
    rGEO.data::platform$Accession %>%
        {setdiff(., dir(GPL_html_dir) %>% stringr::str_extract('GPL\\d+'))} %>%
        parallel::mclapply(
            . %>% {
                input  <- paste0('https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=', .)
                output <- paste0(GPL_html_dir, '/', ., '.html')
                if (!file.exists(output)) download.file(input, output)
            },
            mc.cores = 16
        )
}

download_GPL_html()
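Downloads can fail silently, so it is worth re-running the chunk above until nothing is left to fetch. A hedged sketch of that check, mirroring the setdiff() inside download_GPL_html():

# accessions that still lack an HTML file; re-run download_GPL_html() until this is empty
setdiff(
    rGEO.data::platform$Accession,
    stringr::str_extract(dir(GPL_html_dir), 'GPL\\d+')
)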
Here I maintain this cache manually. One reason is to keep the data saved in a file in case all the old data get cleaned away. The other is that there are two kinds of difference to guard against: the difference between the old and the new chunk options (whether they key on the input directory's mtime or on the input files themselves), and the difference between the input files and our cache. Although they should be the same, accidents always happen, and we might never understand them.
cache <- readr::read_rds('data-raw/rds/gpl_metas.rds.gz')

# only parse the HTML files that are not in the cache yet
to_process <- setdiff(stringr::str_remove(dir(GPL_html_dir), '\\.html$'), names(cache))

if (length(to_process) > 0) {
    new <- paste0(GPL_html_dir, '/', to_process, '.html') %>%
        parallel::mclapply(rGEO.data::read_gpl_html) %>%
        setNames(to_process)  # set names on the result itself; a %T>% block would drop them
    gpl_metas <- c(new, cache) %T>% readr::write_rds('data-raw/rds/gpl_metas.rds.gz', 'gz')
} else {
    gpl_metas <- cache
}

usethis::use_data(gpl_metas, overwrite = T)
devtools::test()         # test the new data
roxygen2::roxygenize()   # you may also have edited data documentation
system('R CMD INSTALL --no-multiarch --with-keep.source .')
devtools::reload()       # now you can use the new data in current R session
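As a final check of my own, not part of the original instructions, you can list the datasets exported by the freshly installed package:

# list the datasets now shipped by the installed package
data(package = 'rGEO.data')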