# Paper counts for the query "neural network", one search per decade.
# The trailing "# <number>" comments are the counts recorded in the original
# notes.

# neural network papers from 1970 to 1980. Exact phrase
my_url <- make_search_url(
  query = "neural network",
  from_year = 1970,
  to_year = 1980,
  how = "all"
)
my_url
get_papers_count(my_url)
# 0

# neural network papers from 1980 to 1990. Exact phrase
my_url <- make_search_url(
  query = "neural network",
  from_year = 1980,
  to_year = 1990,
  how = "all"
)
get_papers_count(my_url)
# 17

# neural network papers from 1990 to 2000. Exact phrase
my_url <- make_search_url(
  query = "neural network",
  from_year = 1990,
  to_year = 2000,
  how = "all"
)
get_papers_count(my_url)
# 510
# Onepetro page to dataframe
# neural network papers from 1990 to 2000. Exact phrase
my_url <- make_search_url(
  query = "neural network",
  from_year = 1990,
  to_year = 2000,
  how = "all"
)
onepetro_page_to_dataframe(my_url)

# Onepetro page to dataframe
# neural network papers. Exact phrase (no year range given)
my_url <- make_search_url(
  query = "neural network",
  how = "all"
)
get_papers_count(my_url)
# 2998
onepetro_page_to_dataframe(my_url)
# Using regex to extract the number of papers when the result string has
# more info
my_url <- make_search_url(
  query = "neural network",
  from_year = 1990,
  to_year = 1999,
  how = "all"
)
result <- read_onepetro(my_url)

# Text of every <h2> node; only the first element is parsed below.
search_result <- html_text(html_nodes(result, "h2"))
search_result

# extract the numeric part of the results
# a number, including comma, before " results."
pattern <- "[\\d,]+(?= results.)"
# matched
m <- regexpr(pattern, search_result[1], perl = TRUE)
# remove comma first
as.numeric(gsub(",", "", regmatches(search_result[1], m)))
# 415
# extract results from node: .facet-unit-right
my_url <- make_search_url(
  query = "shale gas",
  how = "all"
)
result <- read_onepetro(my_url)

# Text of every <h2> node; only the first element is parsed below.
search_result <- html_text(html_nodes(result, "h2"))
search_result

# extract the numeric part of the results
# a number, including comma, before " results."
pattern <- "[\\d,]+(?= results.)"
# matched
m <- regexpr(pattern, search_result[1], perl = TRUE)
# remove comma first
as.numeric(gsub(",", "", regmatches(search_result[1], m)))

# Text of every <option> found under the ".facet-unit-right" nodes.
pub_doc <- result %>%
  html_nodes(".facet-unit-right") %>%
  html_nodes("option") %>%
  html_text()
pub_doc
# pub_doc
# Size of the option list and the position(s) of the "All types" entry.
len_list <- length(pub_doc)
len_list
grep(pattern = "All types", x = pub_doc)
#' Return the entries that follow the "All types" marker.
#'
#' @param aList Vector of option labels, searched with `grep()` for the
#'   text "All types".
#' @return The elements of `aList` located after the first "All types"
#'   match; a zero-length vector when the marker is the last element.
#'   Stops with an error when the marker is absent.
get_dctype <- function(aList) {
  ix <- grep("All types", aList)
  if (length(ix) == 0L) {
    stop("no 'All types' entry found in the option list", call. = FALSE)
  }
  # Drop everything up to and including the marker. A negative index is used
  # instead of (ix + 1):length(aList) because that range counts backwards
  # (and returns NA plus the marker) when the marker is the last element.
  aList[-seq_len(ix[1L])]
}

get_dctype(pub_doc)
#' Return the entries between the first element and the "All types" marker.
#'
#' The first element of `aList` is always skipped (presumably a header
#' option for the publisher facet; confirm against the page markup).
#'
#' @param aList Vector of option labels, searched with `grep()` for the
#'   text "All types".
#' @return Elements 2 .. (marker position - 1) of `aList`; a zero-length
#'   vector when nothing sits between them. Stops with an error when the
#'   marker is absent.
get_dc_publisher_facet <- function(aList) {
  ix_stop <- grep("All types", aList)
  if (length(ix_stop) == 0L) {
    stop("no 'All types' entry found in the option list", call. = FALSE)
  }
  # seq_len(n)[-1L] yields 2, 3, ..., n and is empty for n < 2, whereas
  # 2:(ix_stop - 1) counts backwards when the marker is at position 1 or 2.
  aList[seq_len(ix_stop[1L] - 1L)[-1L]]
}

publishers_raw <- get_dc_publisher_facet(pub_doc)
publishers_raw
# Pull out the text found between "{" and "}" in each raw publisher entry.
# The input is `publishers_raw` (built by get_dc_publisher_facet() above);
# the earlier name `publishers` is not defined anywhere in this script.
x <- publishers_raw
pattern <- "(?<=\\{).+(?=\\})"
## Match data from regexpr()
m <- regexpr(pattern, x, perl = TRUE)
regmatches(x, m)
#' Extract the number written between braces in each entry.
#'
#' @param x Character vector whose elements contain a count in the form
#'   "{1,234}".
#' @return Numeric vector with the same length as `x`: the brace content
#'   with commas removed, or `NA` for entries that contain no "{...}" part.
extract_num_papers <- function(x) {
  pattern <- "(?<=\\{).+(?=\\})"
  m <- regexpr(pattern, x, perl = TRUE)
  # regmatches() silently drops entries without a match, which would make
  # the result shorter than `x` and misalign it with the names produced by
  # extract_publishers(). Fill those positions with NA instead.
  has_match <- !is.na(m) & m != -1L
  counts <- rep(NA_real_, length(x))
  counts[has_match] <- as.numeric(gsub(",", "", regmatches(x, m)))
  counts
}

extract_num_papers(publishers_raw)
# Remove, from each raw publisher entry, everything starting at the first
# whitespace character that is followed by "{", a digit, or "}".
x <- publishers_raw
pattern <- "(\\s[{\\d}].+)"
gsub(pattern = pattern, replacement = "", x = x, perl = TRUE)
#' Strip the trailing count from each entry, leaving the label.
#'
#' @param x Character vector of raw facet entries.
#' @return `x` with every match of the pattern removed: a whitespace
#'   character followed by "{", a digit, or "}", plus the rest of the text.
extract_publishers <- function(x) {
  count_suffix <- "(\\s[{\\d}].+)"
  gsub(pattern = count_suffix, replacement = "", x = x, perl = TRUE)
}

extract_publishers(publishers_raw)
#' Build a data frame of publisher names and paper counts.
#'
#' @param x Vector of facet option labels, passed to
#'   `get_dc_publisher_facet()`.
#' @return A data frame with columns `pub_name` and `pub_values`.
publishers.as.dataframe <- function(x) {
  facet_entries <- get_dc_publisher_facet(x)
  data.frame(
    pub_name = extract_publishers(facet_entries),
    pub_values = extract_num_papers(facet_entries),
    stringsAsFactors = FALSE
  )
}

publishers.as.dataframe(pub_doc)
#' Build a data frame of document types and paper counts.
#'
#' @param x Vector of facet option labels, passed to `get_dctype()`.
#' @return A data frame with columns `doctype_name` and `doctype_value`.
doctype.as.dataframe <- function(x) {
  facet_entries <- get_dctype(x)
  data.frame(
    doctype_name = extract_publishers(facet_entries),
    doctype_value = extract_num_papers(facet_entries),
    stringsAsFactors = FALSE
  )
}

doctype.as.dataframe(pub_doc)
source('./R/url.R')

my_url <- make_search_url(
  query = "shale oil",
  how = "all"
)
result <- send_url(my_url)

# Text of every <h2> node; only the first element is parsed below.
search_result <- html_text(html_nodes(result, "h2"))
search_result

# extract the numeric part of the results
# a number, including comma, before " results."
pattern <- "[\\d,]+(?= results.)"
# matched
m <- regexpr(pattern, search_result[1], perl = TRUE)
# remove comma first
as.numeric(gsub(",", "", regmatches(search_result[1], m)))

# Text of every <option> found under the ".facet-unit-right" nodes.
result %>%
  html_nodes(".facet-unit-right") %>%
  html_nodes("option") %>%
  html_text()
# Selector experiments against the facet <option> elements and the
# ".filter-label" nodes of `result`.

result %>%
  html_nodes(".facet-unit-right option .dc_type") %>%
  # html_attr("dc_type") %>%
  html_text()

# Two-step selection: ".facet-unit-right" nodes, then their <option> nodes.
result %>%
  html_nodes(".facet-unit-right") %>%
  html_nodes("option") %>%
  html_text()

# Same idea with a single descendant selector, keeping entries 20 to 28.
result %>%
  html_nodes(".facet-unit-right option") %>%
  html_text() %>%
  .[20:28]

html_text(html_nodes(result, ".filter-label"))

# NOTE(review): "Publisher" is used as an element selector and "value" is
# passed as html_text()'s second argument -- confirm this is intended.
result %>%
  html_nodes(".filter-label") %>%
  html_nodes("Publisher") %>%
  html_text("value")
# html_nodes(result, "class.dc_type")
# Trial-and-error selector probes against `result`; each call is evaluated
# on its own so its value is printed.

result %>% html_nodes(".select2-choice Type")
result %>% html_nodes(".select2-results")
result %>% html_nodes(".s2id_autogen1")
#

# NOTE(review): here the selector string is html_text()'s second argument,
# not a selector -- confirm this is intended.
html_text(result, ".select2-choice Type")
html_text(result, ".select2-results")
html_text(result, ".s2id_autogen1")
#

# Id, class, and element-name variants tried for the type/publisher facets.
result %>% html_nodes("#dc_type")
result %>% html_nodes(".dc_publisher_facet")
result %>% html_nodes("#dc_publisher_facet")
result %>% html_nodes("value.conference-paper")
result %>% html_nodes("option.conference-paper")
result %>% html_nodes("facets-dd chzn-enable ")
result %>% html_nodes("Type")
result %>% html_nodes("#Type")
result %>% html_nodes(".Type")

html_nodes(result, "div.filter-label option value")
# <div> nodes nested under the ".facets-form" nodes.
result %>%
  html_nodes(".facets-form") %>%
  html_nodes("div")
# %>% html_nodes("filter-label")

# NOTE(review): "Publisher:" is passed as html_text()'s second argument --
# confirm this is intended.
result %>%
  html_nodes(".facets-form div") %>%
  html_text("Publisher:")

# obtaining the name of the variable for a class
html_text(html_nodes(result, ".facets-form div.filter-label"))

# obtaining the name of the variable for a class
result %>%
  html_nodes("div") %>%
  html_nodes("div.controls") %>%
  html_text()
# Inspect the <div> structure of `result`.
html_nodes(result, "div")

html_nodes(result, "div.container")

# NOTE(review): html_attrs() is given an extra "class" argument here --
# confirm against the rvest signature.
result %>% html_attrs("class")

html_nodes(result, "div.result-item")

# Look at the eighth result item on its own.
nodes <- html_nodes(result, "div.result-item")
nodes[[8]]

# Last of the "div.facet-unit-right" nodes.
nodes <- html_nodes(result, "div.facet-unit-right")
# nodes[[1]]
l_nodes <- nodes[[length(nodes)]]
l_nodes
# html_attr(x = nodes[[length(nodes)]], "filter-label")
html_attr(l_nodes, "")

# All attributes of each result item.
html_attrs(html_nodes(result, "div.result-item"))

html_nodes(result, "div.filter-label")

# The 64th <div> in the document.
html_nodes(result, "div")[[64]]

# <option> nodes nested under any <div>.
html_nodes(html_nodes(result, "div"), "option")

# Walk every <div>, returning each node unchanged in a list.
div_res <- html_nodes(result, "div")
lapply(div_res, identity)

html_nodes(result, "dc_publisher_facet")

xml_structure(result)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.