# first, helper/internal functions ---------------------------------------------
#' Get the set of approved projects tables from HTML
#' see https://www.w3schools.com/xml/xml_xpath.asp for xpath syntax
#' @noRd
.get_rows_from_html <- function(html_path){
xml2::read_html(html_path) %>%
xml2::xml_find_all("//body//div[@id='approved']//tbody//tr")
}
#' split the project tables by project. Return list of projects.
#' @noRd
.split_by_project <- function(project_tables){
# get a list of xml_children for each row (basically, number of columns), then
# get the length of each of these lists
row_lengths <-
purrr::map_int(
purrr::map(
project_tables, function(x){
xml2::xml_children(x)
}
),
function(x){
length(x)
}
)
# get the row number of each header row
header_row_indexes <-
which(row_lengths %in% 2)
# short circuit if only one project
if (length(header_row_indexes) == 1) {
return(list(project_tables))
}
# initialize empty list of length = number of projects
by_project <- vector("list", length(header_row_indexes))
# loop to divide project_tables into by_project
for (index in seq(1:length(by_project))){
starting_row <- header_row_indexes[index]
if (is.na(header_row_indexes[index + 1])) {
ending_row <- length(project_tables)
} else {
ending_row <- header_row_indexes[index + 1] - 1
}
by_project[[index]] <- project_tables[starting_row:ending_row]
}
return(by_project)
}
#' parse a single project to get tibble with phs, expiration, consent code
#' THIS SHOULD BE REFACTORED
#' @noRd
.parse_project <- function(a_project){
# define my regexes
phs_regex <- paste0(
"(phs", # start capture group with the string "phs"
"\\d{6})", # followed by 6 digits
"\\.v", # followd by the string ".v"
"(\\d+?)", # followed by 1 or more (but as few as possible) digits
"\\.p", # followed yb the string ".p"
"(\\d+?)") # followed by 1 or more (but as few as possible) digits
consent_regex <- paste0(
"^", # at the beginning of the string
"\\\n", # the string "\n"
"(.+)") # followed by 1 or more characters that we want to capture
# run the loop. Note: this is bad b/c uncertain length..
for (index in 2:length(a_project)){
# get a row
a_row <- a_project[index]
# stop if the row doesn't have 5 columns
if (length(xml2::xml_children(a_row)) != 5){
break
}
expiration <- xml2::xml_text(xml2::xml_children(a_row)[4])
string_with_phs <- xml2::xml_text(xml2::xml_children(a_row)[2])
phs <- stringr::str_match(string_with_phs, phs_regex)[[2]]
string_with_consent <-
xml2::xml_text(
xml2::xml_children(
xml2::xml_children(
xml2::xml_children(a_row)[2])
))
#[2])
# note: string_with_consent includes \n, introduced by xml_text(), which
# regex will call end of line...
consent_code <- stringr::str_trim(
stringr::str_match(string_with_consent, consent_regex)[[2]])
# check for results tibble
if (!"result" %in% ls()){
# initialze empty data frame
result <- dplyr::tibble(phs, expiration, consent_code)
} else {
result <- dplyr::bind_rows(
result,
dplyr::tibble(phs, expiration, consent_code)
)
}
}
return(result)
}
#' get a project number by parsing a single project
#' @noRd
.get_number <- function(a_project){
# define my regex
number_regex <- paste0(
"#", # the hash character
"(", # open capture group
"\\d+", # one or more numbers
")", # close capture group
":") # the colon character
# get line with number
header_line <- a_project[[1]]
# stop if the row doesn't have 2 columns
if (length(xml2::xml_children(header_line)) != 2){
stop("header line doesn't have 2 columns as expected")
}
# get target line as text
string_with_number <-
stringr::str_squish(
xml2::xml_text(xml2::xml_children(header_line)[1])
)
# return number obtained with regex
stringr::str_match(string_with_number, number_regex)[[2]]
}
#' parse a list of projects to get a named list of tibbles with phs, expiration,
#' consent code. List item names should be project number.
#' @noRd
.parse_list <- function(project_list){
# first, parse each project
parsed_projects <- purrr::map(project_list, .parse_project)
# next, get list of project numbers for names
project_numbers <- purrr::map(project_list, .get_number)
# assign names to parsed projects
names(parsed_projects) <- project_numbers
return(parsed_projects)
}
# next, functions to export from package----------------------------------------
#' Get phs identifiers from a dbGaP-provided Study Request List
#'
#' @param html_path filepath to the html page downloaded from dbGaP
#' @return a list whose elements are each a list of phs identifiers with one
#' list per projects
#'
#' @examples
#' \dontrun{
#' html_path <-
#' system.file("extdata", "example.html", package = "phsparsr", mustWork = TRUE)
#' ids <- parse_phs(html_path)
#' }
#' @export
parse_phs <- function(html_path){
table_rows <- .get_rows_from_html(html_path)
project_list <- .split_by_project(table_rows)
.parse_list(project_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.