# Number of "neural network" papers (exact phrase), one query per period.
# The figure in the comment after each count is the one noted in the script.

# period 1970 to 1980
my_url <- make_search_url(
    query     = "neural network",
    from_year = 1970,
    to_year   = 1980,
    how       = "all"
)
my_url
get_papers_count(my_url)
# 0

# period 1980 to 1990
my_url <- make_search_url(
    query     = "neural network",
    from_year = 1980,
    to_year   = 1990,
    how       = "all"
)
get_papers_count(my_url)
# 17

# period 1990 to 2000
my_url <- make_search_url(
    query     = "neural network",
    from_year = 1990,
    to_year   = 2000,
    how       = "all"
)
get_papers_count(my_url)
# 510
# OnePetro results page converted to a dataframe.

# "neural network" papers from 1990 to 2000 (exact phrase)
my_url <- make_search_url(
    query     = "neural network",
    from_year = 1990,
    to_year   = 2000,
    how       = "all"
)

onepetro_page_to_dataframe(my_url)

# "neural network" papers (exact phrase), no year arguments supplied
my_url <- make_search_url(query = "neural network", how = "all")

get_papers_count(my_url)
# 2998
onepetro_page_to_dataframe(my_url)
# Pull the number of papers out of the results header with a regular
# expression, for the case where the header string carries more information.

my_url <- make_search_url(
    query     = "neural network",
    from_year = 1990,
    to_year   = 1999,
    how       = "all"
)

result <- read_onepetro(my_url)

# text of every <h2> node of the page
search_result <- html_text(html_nodes(result, "h2"))

search_result

# numeric part of the first header string
pattern <- "[\\d,]+(?= results.)"    # digits and commas followed by " results" plus one character
m <- regexpr(pattern, search_result[1], perl = TRUE)
# drop the commas from the matched text before converting it
as.numeric(gsub(",", "", regmatches(search_result[1], m)))
# 415
# Read the option entries found under the node .facet-unit-right
my_url <- make_search_url(query = "shale gas", how = "all")

result <- read_onepetro(my_url)

# text of every <h2> node of the page
search_result <- html_text(html_nodes(result, "h2"))

search_result

# numeric part of the first header string
pattern <- "[\\d,]+(?= results.)"    # digits and commas followed by " results" plus one character
m <- regexpr(pattern, search_result[1], perl = TRUE)
# drop the commas from the matched text before converting it
as.numeric(gsub(",", "", regmatches(search_result[1], m)))

# text of the <option> nodes nested inside .facet-unit-right
pub_doc <- html_text(
    html_nodes(html_nodes(result, ".facet-unit-right"), "option")
)
pub_doc
# pub_doc
len_list <- length(pub_doc)
len_list
# position of the "All types" entry within pub_doc
grep("All types", pub_doc)
# Return the entries of a facet option vector that come after the
# "All types" marker.
#
# aList: vector of option texts; one element must match "All types".
#
# Returns the elements after the first match, in their original order, or an
# empty vector when the marker is the last element. Stops with an error when
# no element matches the marker.
get_dctype <- function(aList) {
    ix <- grep("All types", aList)
    if (length(ix) == 0L) {
        stop("no \"All types\" entry found in the facet list", call. = FALSE)
    }
    # Drop everything up to and including the marker. Negative indexing
    # avoids the backwards sequence that (ix + 1):length(aList) produces
    # when the marker is the last element.
    aList[-seq_len(ix[1L])]
}

# Show the entries of pub_doc that get_dctype() returns.
get_dctype(pub_doc)
# Return the entries of a facet option vector that sit between its first
# element and the "All types" marker.
#
# aList: vector of option texts; one element must match "All types".
#
# Returns elements 2 .. (marker position - 1), or an empty vector when
# nothing lies between them. Element 1 is always skipped (presumably a
# heading/placeholder option -- TODO confirm against the page). Stops with
# an error when no element matches the marker.
get_dc_publisher_facet <- function(aList) {
    ix_stop <- grep("All types", aList)
    if (length(ix_stop) == 0L) {
        stop("no \"All types\" entry found in the facet list", call. = FALSE)
    }
    # seq_len() never counts backwards, unlike 2:(ix_stop - 1), which yields
    # c(2, 1) or c(2, 1, 0) when the marker is at position 2 or 1.
    aList[seq_len(ix_stop[1L] - 1L)[-1L]]
}

# Entries of pub_doc returned by get_dc_publisher_facet()
publishers_raw <- get_dc_publisher_facet(pub_doc)
publishers_raw
# work on the vector just created (the name `publishers` is never defined
# in this script)
x <- publishers_raw
# whatever sits between "{" and "}" in each entry
pattern <- "(?<=\\{).+(?=\\})"

## Match data from regexpr()
m <- regexpr(pattern, x, perl = TRUE)
regmatches(x, m)
# Extract the number written between "{" and "}" in each element of x.
#
# x: character vector of entries such as "label {1,234}".
#
# Returns a numeric vector of the same length as x. Commas are removed
# before conversion. Elements without a "{...}" part (or that are NA) yield
# NA, so the result always lines up with x element by element.
extract_num_papers <- function(x) {
    pattern <- "(?<=\\{).+(?=\\})"
    m <- regexpr(pattern, x, perl = TRUE)
    # regmatches() silently drops non-matching elements; put the matches back
    # at their original positions so the output is not shorter than the input.
    found <- !is.na(m) & m > -1L
    num_papers <- rep(NA_real_, length(x))
    num_papers[found] <- as.numeric(gsub(",", "", regmatches(x, m)))
    num_papers
}

# counts found in the raw publisher entries
extract_num_papers(publishers_raw)

# same entries with the part matched by the pattern removed
x <- publishers_raw
pattern <- "(\\s[{\\d}].+)"
gsub(pattern = pattern, replacement = "", x = x, perl = TRUE)
# Strip the trailing count from each element of x, keeping only its label.
#
# Removes everything from a whitespace character that is directly followed by
# "{", "}" or a digit through the end of the line. Elements with no such
# suffix are returned unchanged.
extract_publishers <- function(x) {
    count_suffix <- "(\\s[{\\d}].+)"
    gsub(pattern = count_suffix, replacement = "", x = x, perl = TRUE)
}

# Show what extract_publishers() returns for the raw publisher entries.
extract_publishers(publishers_raw)
# Build a data frame from the publisher part of a facet option vector.
#
# x: option vector as accepted by get_dc_publisher_facet().
#
# Returns a data.frame with columns pub_name (from extract_publishers()) and
# pub_values (from extract_num_papers()); strings are not turned into factors.
publishers.as.dataframe <- function(x) {
    facet_entries <- get_dc_publisher_facet(x)
    counts <- extract_num_papers(facet_entries)
    data.frame(
        pub_name   = extract_publishers(facet_entries),
        pub_values = counts,
        stringsAsFactors = FALSE
    )
}

# Show the data frame that publishers.as.dataframe() builds from pub_doc.
publishers.as.dataframe(pub_doc)
# Build a data frame from the document-type part of a facet option vector.
#
# x: option vector as accepted by get_dctype().
#
# Returns a data.frame with columns doctype_name (from extract_publishers())
# and doctype_value (from extract_num_papers()); strings are not turned into
# factors.
doctype.as.dataframe <- function(x) {
    facet_entries <- get_dctype(x)
    counts <- extract_num_papers(facet_entries)
    data.frame(
        doctype_name  = extract_publishers(facet_entries),
        doctype_value = counts,
        stringsAsFactors = FALSE
    )
}

# Show the data frame that doctype.as.dataframe() builds from pub_doc.
doctype.as.dataframe(pub_doc)

# load the definitions kept in R/url.R
source('./R/url.R')

my_url <- make_search_url(query = "shale oil", how = "all")

result <- send_url(my_url)

# text of every <h2> node of the page
search_result <- html_text(html_nodes(result, "h2"))

search_result

# numeric part of the first header string
pattern <- "[\\d,]+(?= results.)"    # digits and commas followed by " results" plus one character
m <- regexpr(pattern, search_result[1], perl = TRUE)
# drop the commas from the matched text before converting it
as.numeric(gsub(",", "", regmatches(search_result[1], m)))

# Exploratory probes against the parsed page held in `result`. Every
# expression below is evaluated at top level; none of the values is assigned.

# text of the <option> nodes found under .facet-unit-right
result %>%
html_nodes(".facet-unit-right") %>%
    html_nodes("option") %>%
    html_text()
# narrower selector: .dc_type descendants of those options
result %>%
html_nodes(".facet-unit-right option .dc_type") %>%
    # html_attr("dc_type") %>%
    html_text()
# same query as the first probe of this group
result %>%
html_nodes(".facet-unit-right") %>%
    html_nodes("option") %>%
    html_text()
# single combined selector; keep only elements 20 to 28 of the text vector
result %>%
html_nodes(".facet-unit-right option") %>%
    html_text() %>%
    .[20:28]
# text of the nodes with class filter-label
result %>%
html_nodes(".filter-label") %>%
    html_text()
# NOTE(review): "Publisher" is passed as a selector and "value" as the second
# argument of html_text() -- confirm that this is what was intended
result %>%
html_nodes(".filter-label") %>%
    html_nodes("Publisher") %>%
    html_text("value")
# html_nodes(result, "class.dc_type")
# three selector strings tried with html_nodes()
html_nodes(result, ".select2-choice Type")
html_nodes(result, ".select2-results")
html_nodes(result, ".s2id_autogen1")
# the same three strings handed to html_text() as its second argument
# NOTE(review): confirm html_text() accepts a selector in that position
html_text(result, ".select2-choice Type")
html_text(result, ".select2-results")
html_text(result, ".s2id_autogen1")
# id (#) and class (.) forms of dc_type / dc_publisher_facet
html_nodes(result, "#dc_type")
html_nodes(result, ".dc_publisher_facet")
html_nodes(result, "#dc_publisher_facet")

# element.class style selectors ending in conference-paper
html_nodes(result, "value.conference-paper")
html_nodes(result, "option.conference-paper")

# two space-separated names with no leading "." or "#"
html_nodes(result, "facets-dd chzn-enable ")

# "Type" tried as a bare name, as an id and as a class
html_nodes(result, "Type")
html_nodes(result, "#Type")
html_nodes(result, ".Type")
# descendant chain starting at div.filter-label
result %>%
    html_nodes("div.filter-label option value")
# <div> nodes inside .facets-form
result %>%
    html_nodes(".facets-form") %>%
    html_nodes("div") # %>% html_nodes("filter-label")
# NOTE(review): "Publisher:" is passed as the second argument of html_text()
# -- confirm that this is what was intended
result %>%
    html_nodes(".facets-form div") %>%
    html_text("Publisher:") 
# Further exploratory probes of `result`. Only `nodes`, `l_nodes` and
# `div_res` are assigned; every other expression is evaluated for its value.

# obtaining the name of the variable for a class:
# text of div.filter-label nodes inside .facets-form
result %>%
    html_nodes(".facets-form div.filter-label") %>%
    html_text()
# obtaining the name of the variable for a class:
# text of div.controls nodes nested inside any <div>
result %>%
    html_nodes("div") %>%
    html_nodes("div.controls") %>%
    html_text()
# every <div> node
result %>%
    html_nodes("div") 
# <div> nodes with class container
result %>%
    html_nodes("div.container") 
# NOTE(review): html_attrs() receives an extra "class" argument here but is
# called without one further down -- confirm which form is valid
result %>%
    html_attrs("class") 
# <div> nodes with class result-item
result %>%
    html_nodes("div.result-item")
# keep those nodes and look at the eighth one
nodes <- html_nodes(result, "div.result-item")
nodes[[8]]
# `nodes` is overwritten with the div.facet-unit-right nodes
nodes <- html_nodes(result, "div.facet-unit-right")
# nodes[[1]]
# last node of that set
l_nodes <- nodes[[length(nodes)]]
l_nodes
# html_attr(x = nodes[[length(nodes)]], "filter-label")
# attribute lookup with an empty attribute name
html_attr(l_nodes, "")
# attributes of each div.result-item node
result %>%
    html_nodes("div.result-item") %>%
    html_attrs()
# <div> nodes with class filter-label
result %>%
    html_nodes("div.filter-label")
# the 64th <div> node of the page
result %>%
    html_nodes("div") %>%
    .[[64]]
# <option> nodes nested inside any <div>
result %>%
    html_nodes("div") %>%
    html_nodes("option")
# keep every <div> node for the lapply() below
div_res <- result %>%
    html_nodes("div")

# identity() leaves each element unchanged, so this returns the elements of
# div_res as a list
lapply(div_res, identity)
# dc_publisher_facet tried as a bare name (no "." or "#" prefix)
result %>%
    html_nodes("dc_publisher_facet")
# hand the whole document to xml_structure()
result %>%
    xml_structure()


# f0nzie/petro.One documentation built on May 29, 2019, 12:05 a.m.