library(petro.One)

# Build the OnePetro search URL for the query "mechanistic" and print it.
my_url <- make_search_url(query = "mechanistic", 
                          how = "all")
my_url
# Download and parse the search results page.
page <- xml2::read_html(my_url)
# Internal (non-exported) helper, hence `:::`. Judging by its name it reports
# whether the document-type selector is enabled -- TODO confirm in the package.
petro.One:::is_dctype_enabled(page)
# get_papers_count(my_url)
papers_by_type(my_url)
library(petro.One)

# Same inspection steps for the two-word query "mechanistic performance".
my_url <- make_search_url(query = "mechanistic performance",
                          how = "all")
my_url
# Download and parse the search results page.
page <- xml2::read_html(my_url)
# Internal (non-exported) helper, hence `:::`.
petro.One:::is_dctype_enabled(page)
# get_papers_count(my_url)
papers_by_type(my_url)

# Example: the selector for paper types is DISABLED

library(petro.One)

# Fetch the results page for "mechanistic performance" and parse the
# per-paper source lines into a data frame using read.table().
my_url <- make_search_url(query = "mechanistic performance",
                          how = "all")

page <- xml2::read_html(my_url)
petro.One:::is_dctype_enabled(page)
get_papers_count(my_url)
# Internal helper; returns the raw source text of each result item.
item_source_d <- petro.One:::get_item_source(page)
# item_source
# Remove embedded newlines, then trim surrounding whitespace.
item_source_d <- trimws(gsub("\n", "",item_source_d))
item_source_d
# strsplit(item_source, " ")
# data.frame(item_source = as.character(item_source),
#                                  stringsAsFactors = FALSE)
# papers_by_type(my_url)
# Drop every hyphen so read.table() only sees whitespace-separated fields.
item_source <- gsub("-", "", item_source_d)
# Parse the fields into columns V1, V2, ...
item_source_df <- read.table(text =item_source, stringsAsFactors = FALSE)
if (ncol(item_source_df) > 4) {
    # Five or more columns: join V3 and V4 with a hyphen (lower-cased), move V5
    # into V4 and keep the first four columns.
    # NOTE(review): presumably this handles two-word document types -- confirm.
    item_source_df$V3 <- tolower(paste(item_source_df$V3, item_source_df$V4, sep = "-"))
    item_source_df$V4 <- item_source_df$V5
    item_source_df <- item_source_df[, 1:4]
} else {
    # Four or fewer columns: only lower-case V3.
    item_source_df$V3 <- tolower(item_source_df$V3)
}
item_source_df

# Case: paper-type selector disabled

# Split each source string into paper_id, source, type and year by peeling
# fields off with regular expressions, one step at a time.
x <- item_source_d
# Leading run of non-whitespace characters plus the whitespace that follows it.
pattern <- "^([^\\s]+\\s+)"
m1       <- regexpr(pattern, x, perl = TRUE)
# First field: the matched prefix. regmatches() returns matches only for the
# elements that matched.
paper_id <- trimws(regmatches(x, m1, invert = FALSE))
# Remainder of each string after removing the prefix.
s1r      <- trimws(gsub(pattern, "", x, perl = TRUE))

# Remove hyphens from the remainder before extracting the next field.
x2 <- gsub("-", "", s1r)

# Second field: again the leading non-whitespace token.
pattern <- "^([^\\s]+\\s+)"
m2     <- regexpr(pattern, x2, perl = TRUE)
source <- trimws(regmatches(x2, m2, invert = FALSE))
s2r    <- gsub(pattern, "", x2, perl = TRUE)
x3 <- s2r

# Trailing run of digits is taken as the year; what is left is the type.
pattern <- "\\d+$"
m3   <-  regexpr(pattern, x3, perl = TRUE)
year <- trimws(regmatches(x3, m3, invert = FALSE))
type <- trimws(gsub(pattern, "", x3, perl = TRUE))
type

# Assemble the four fields; this requires all four vectors to have equal length.
data.frame(paper_id, source, type, year)
# Split every element of a character vector around a regular expression.
#
# Arguments:
#   x        character vector to split.
#   pattern  Perl-compatible regular expression (matched with perl = TRUE).
#
# Returns an unnamed list of two character vectors, both the same length as x:
#   [[1]] the first match in each element, trimmed; NA where nothing matched.
#   [[2]] each element with the matches of `pattern` removed, trimmed.
break_by_pattern <- function(x, pattern) {
    m   <- regexpr(pattern, x, perl = TRUE)
    pos <- as.integer(m)                 # drop the match attributes
    hit <- !is.na(pos) & pos > -1L       # elements that actually matched

    # regmatches() returns matches only for the elements that matched, so
    # write them back by position; otherwise `left` would be shorter than
    # `right` and no longer line up with `x`.
    left      <- rep(NA_character_, length(x))
    left[hit] <- trimws(regmatches(x, m))

    right <- trimws(gsub(pattern, "", x, perl = TRUE))
    list(left, right)
}

# Peel the fields off each source string from left to right, splitting once
# per step and reading both halves from the same result.
pattern  <- "^([^\\s]+\\s+)"
pieces   <- break_by_pattern(item_source_d, pattern)
paper_id <- pieces[[1]]
right    <- gsub("-", "", pieces[[2]])

# Second leading token is the source; the rest still holds type and year.
pattern <- "^([^\\s]+\\s+)"
pieces  <- break_by_pattern(right, pattern)
source  <- pieces[[1]]
right   <- pieces[[2]]

# Trailing digits are the year; what precedes them is the type.
pattern <- "\\d+$"
pieces  <- break_by_pattern(right, pattern)
year    <- pieces[[1]]
type    <- pieces[[2]]

df <- data.frame(paper_id = paper_id, source = source, type = type,
                 year = year, stringsAsFactors = FALSE)
df
# Flag the source lines that mention "chapter" (case-insensitive).
grepl("chapter", item_source_d, perl = TRUE, ignore.case = TRUE)

# Example: the selector for paper types is enabled

library(petro.One)

# Fetch the results page for "wellhead" (how = "any") and collect the
# per-paper source lines.
my_url <- make_search_url(query = "wellhead",
                          how = "any")

page <- xml2::read_html(my_url)
petro.One:::is_dctype_enabled(page)
get_papers_count(my_url)
# Internal helper; returns the raw source text of each result item.
item_source_e <- petro.One:::get_item_source(page)
# item_source
# Remove embedded newlines, then trim surrounding whitespace.
item_source_e <- trimws(gsub("\n", "",item_source_e))
item_source_e
# strsplit(item_source, " ")
# data.frame(item_source = as.character(item_source),
#                                  stringsAsFactors = FALSE)
# papers_by_type(my_url)
# Exploratory version of the regex split; the commented-out lines are earlier
# attempts kept for reference.
x <- item_source_e
# Leading run of non-whitespace characters plus the whitespace that follows it.
pattern <- "^([^\\s]+\\s+)"
# pattern <- "^([^\\s]+)|\\d.+$"
# item_source_2 <- gsub("-", "", item_source_e)
# substr(item_source_2, 1, 11)
# gsub("^([^\\s]+)", "\\1", item_source_e, perl = TRUE)
m1 <- regexpr(pattern, x, perl = TRUE)
# s1l: matched prefix (not trimmed here); s1r: remainder after the prefix.
s1l <- regmatches(x, m1, invert = FALSE)
s1r <- gsub(pattern, "", x, perl = TRUE)

# Remove hyphens, then take the next leading token the same way.
x2 <- gsub("-", "", s1r)
m2 <- regexpr(pattern, x2, perl = TRUE)
s2l <- trimws(regmatches(x2, m2, invert = FALSE))
s2r <- gsub(pattern, "", x2, perl = TRUE)
x3 <- s2r

# s3r: trailing run of digits; s3l: the text before it.
pattern <- "\\d+$"
m3 <-  regexpr(pattern, x3, perl = TRUE)
s3r <- regmatches(x3, m3, invert = FALSE)
s3l <- trimws(gsub(pattern, "", x3, perl = TRUE))
s3l
# s1r <- unlist(regmatches(x, m, invert = TRUE))
# s1l
# regmatches(item_source_e, regexpr(pattern, item_source_e))
# strsplit(item_source_2, "(?<=^..)(?=[A-Z])", perl=TRUE)
# item_source_ef <- read.table(text =item_source2, stringsAsFactors = FALSE, sep = " ")
# if (ncol(item_source_ef) > 4) {
#     item_source_ef$V3 <- tolower(paste(item_source_ef$V3, item_source_ef$V4, sep = "-"))
#     item_source_ef$V4 <- item_source_ef$V5
#     item_source_ef <- item_source_ef[, 1:4]
# } else {
#     item_source_ef$V3 <- tolower(item_source_ef$V3)
# }
# item_source_ef
# Split each source string into paper_id, source, type and year by peeling
# fields off with regular expressions, one step at a time.
x <- item_source_e
# Leading run of non-whitespace characters plus the whitespace that follows it.
pattern <- "^([^\\s]+\\s+)"
m1       <- regexpr(pattern, x, perl = TRUE)
# First field: the matched prefix. regmatches() returns matches only for the
# elements that matched.
paper_id <- trimws(regmatches(x, m1, invert = FALSE))
# Remainder of each string after removing the prefix.
s1r      <- trimws(gsub(pattern, "", x, perl = TRUE))

# Remove hyphens from the remainder before extracting the next field.
x2 <- gsub("-", "", s1r)

# Second field: again the leading non-whitespace token.
pattern <- "^([^\\s]+\\s+)"
m2     <- regexpr(pattern, x2, perl = TRUE)
source <- trimws(regmatches(x2, m2, invert = FALSE))
s2r    <- gsub(pattern, "", x2, perl = TRUE)
x3 <- s2r

# Trailing run of digits is taken as the year; what is left is the type.
pattern <- "\\d+$"
m3   <-  regexpr(pattern, x3, perl = TRUE)
year <- trimws(regmatches(x3, m3, invert = FALSE))
type <- trimws(gsub(pattern, "", x3, perl = TRUE))
type

# Assemble the four fields; this requires all four vectors to have equal length.
data.frame(paper_id, source, type, year)
# Split every element of a character vector around a regular expression.
#
# Arguments:
#   x        character vector to split.
#   pattern  Perl-compatible regular expression (matched with perl = TRUE).
#
# Returns an unnamed list of two character vectors, both the same length as x:
#   [[1]] the first match in each element, trimmed; NA where nothing matched.
#   [[2]] each element with the matches of `pattern` removed, trimmed.
break_by_pattern <- function(x, pattern) {
    m   <- regexpr(pattern, x, perl = TRUE)
    pos <- as.integer(m)                 # drop the match attributes
    hit <- !is.na(pos) & pos > -1L       # elements that actually matched

    # regmatches() returns matches only for the elements that matched, so
    # write them back by position; otherwise `left` would be shorter than
    # `right` and no longer line up with `x`.
    left      <- rep(NA_character_, length(x))
    left[hit] <- trimws(regmatches(x, m))

    right <- trimws(gsub(pattern, "", x, perl = TRUE))
    list(left, right)
}

# Peel the fields off each source string from left to right, splitting once
# per step and reading both halves from the same result.
pattern  <- "^([^\\s]+\\s+)"
pieces   <- break_by_pattern(item_source_e, pattern)
paper_id <- pieces[[1]]
right    <- gsub("-", "", pieces[[2]])

# Second leading token is the source; the rest still holds type and year.
pattern <- "^([^\\s]+\\s+)"
pieces  <- break_by_pattern(right, pattern)
source  <- pieces[[1]]
right   <- pieces[[2]]

# Trailing digits are the year; what precedes them is the type.
pattern <- "\\d+$"
pieces  <- break_by_pattern(right, pattern)
year    <- pieces[[1]]
type    <- pieces[[2]]

df <- data.frame(paper_id = paper_id, source = source, type = type,
                 year = year, stringsAsFactors = FALSE)
df
library(dplyr)
library(tibble)

# dplyr::group_by(df, type)
# Count the rows of `df` per `type`; the result has columns `name` (the type)
# and `value` (the number of rows).
df2 <- df %>%
    group_by(type) %>%
    summarize(value = n()) %>%
    rename(name = type)

# as_tibble() replaces the deprecated as.tibble().
as_tibble(df2)
library(petro.One)

# Experiment 1: read_multidoc() on the query "mechanistic".
my_url <- make_search_url(query = "mechanistic", 
                          how = "all")
mecha <- read_multidoc(my_url)
mecha
# Observed failure:
# Error in onepetro_page_to_dataframe(url) : Dataframe sizes different
library(petro.One)

# Experiment 2: summary by document type for "mechanistic model".
my_url <- make_search_url(query = "mechanistic model", 
                          how = "all")

summary_by_doctype(read_onepetro(my_url))
library(petro.One)

# Experiment 3: paper count and read_multidoc() for "mechanistic physics".
my_url <- make_search_url(query = "mechanistic physics", 
                          how = "all")
my_url
get_papers_count(my_url)
mecha <- read_multidoc(my_url)
mecha
# Observed failure:
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0

# The error is related to data frame sizes, not to the URL.

library(petro.One)

# Experiment: paper count and summary by document type for
# "mechanistic correlation".
my_url <- make_search_url(query = "mechanistic correlation", 
                          how = "all")
my_url
get_papers_count(my_url)
summary_by_doctype(read_onepetro(my_url))
# Observed failure:
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0

## The error is related to data frame sizes, not to the URL.
library(petro.One)

# Experiment: summary by document type for "mechanistic theory".
my_url <- make_search_url(query = "mechanistic theory", 
                          how = "all")

summary_by_doctype(read_onepetro(my_url))
library(petro.One)

# Experiment: "mechanistic vertical lift". Inspect the classes of the URL and
# of the object returned by read_onepetro(), then tabulate papers by type.
my_url <- make_search_url(query = "mechanistic vertical lift", 
                          how = "all")
my_url
class(my_url)
get_papers_count(my_url)
op <- read_onepetro(my_url)
class(op)

papers_by_type(my_url)

# Disabled because it fails with:
# summary_by_doctype(op)
# Error in (ix + 1):len_list : argument of length 0
library(petro.One)

# Experiment: "mechanistic tubing".
my_url <- make_search_url(query = "mechanistic tubing", 
                          how = "all")
my_url
get_papers_count(my_url)
papers_by_type(my_url)
# Disabled because it fails with:
# summary_by_doctype(read_onepetro(my_url))
# Error in (ix + 1):len_list : argument of length 0
library(petro.One)

# Experiment: "mechanistic performance". The parsed page is kept in `x`.
my_url <- make_search_url(query = "mechanistic performance", 
                          how = "all")
my_url
get_papers_count(my_url)
x <- xml2::read_html(my_url)
papers_by_type(my_url)
# read_multidoc(my_url)
# Disabled because it fails with:
# summary_by_doctype(x)
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
# Debug output (excerpt):
# [18] "Unconventional Resources Technology Conference {28}"
# [19] "World Petroleum Congress {8}"
# [20] "All types"
# [21] "Chapter {1}"
# [22] "Conference paper {2,626}"
# [23] "General {15}"
# [24] "Journal paper {645}"
# [25] "Media {1}"
# [26] "Presentation {9}"
# [27] "Standard {2}"
# ix: 20
library(petro.One)
library(magrittr)
library(rvest)

# Read the class attribute of the second <select> under ".facet-unit-right"
# and test whether it contains "disabled".
my_url <- make_search_url(query = "mechanistic performance", 
                          how = "any")
page <- xml2::read_html(my_url)
# petro.One:::is_dctype_enabled(page)

pub_doc <- page %>%
    html_nodes(".facet-unit-right") %>%
    html_nodes("select") %>%
    .[2] %>%                     # keep only the second <select> node
    html_attr("class")
print(pub_doc)
grepl("disabled", pub_doc)
# NOTE(review): the statements below repeat the ones above verbatim on the same
# `page`, so they print the same result a second time -- confirm whether a
# different page or selector was intended.
pub_doc <- page %>%
    html_nodes(".facet-unit-right") %>%
    html_nodes("select") %>%
    .[2] %>%
    html_attr("class")
print(pub_doc)
grepl("disabled", pub_doc)
library(petro.One)

# List the <div> nodes of the "mechanistic performance" results page.
my_url <- make_search_url(query = "mechanistic performance",
                          how = "all")
page <- xml2::read_html(my_url)
petro.One:::is_dctype_enabled(page)
# Query the page that was just downloaded (not an `x` left over from an
# earlier chunk), and qualify the xml2 calls so they do not depend on xml2
# being attached at this point.
div <- xml2::xml_find_all(page, ".//div")
xml2::xml_path(div)
xml2::xml_attr(div, "id")
# this query works without error 1746 rows
library(petro.One)

my_url <- make_search_url(query = "IPR",
                          how = "all")

mecha <- read_multidoc(my_url)
mecha
library(xml2)
x <- xml2::read_html(my_url)
summary_by_doctype(x)
xml_children(x)
xml_find_all(x, ".//h2")
# "result-item-source", "facet-unit-right" and "facet-unit-left" are CSS
# classes, not element names, so the XPath has to test the class attribute;
# a bare ".//result-item-source" looks for an element of that name.
xml_find_all(x, ".//*[contains(@class, 'result-item-source')]")
xml_find_all(x, ".//*[contains(@class, 'facet-unit-right')]")
xml_find_all(x, ".//*[contains(@class, 'facet-unit-left')]")
# xml_path(div)
# The same nodes selected with CSS selectors through rvest.
rvest::html_nodes(x, "h2")
rvest::html_nodes(x, ".result-item-source")
rvest::html_nodes(x, ".facet-unit-right")
rvest::html_nodes(x, ".facet-unit-left")

# Queries that fail with an error

library(petro.One)

# Inspect the page structure for a query on which summary_by_doctype() fails.
my_url <- make_search_url(query = "mechanistic performance",
                          how = "all")

y <- xml2::read_html(my_url)
# summary_by_doctype(y)
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
rvest::html_nodes(y, "h2")
rvest::html_nodes(y, ".result-item-source")
# Look at both facet columns; the left one was previously queried twice and
# the right one not at all.
rvest::html_nodes(y, ".facet-unit-right")
rvest::html_nodes(y, ".facet-unit-left")
library(petro.One)

# Inspect the page structure for a query on which summary_by_doctype() fails.
my_url <- make_search_url(query = "mechanistic tubing",
                          how = "all")
my_url
y <- xml2::read_html(my_url)
# summary_by_doctype(y)
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
rvest::html_nodes(y, "h2")
rvest::html_nodes(y, ".result-item-source")
# Look at both facet columns; the left one was previously queried twice and
# the right one not at all.
rvest::html_nodes(y, ".facet-unit-right")
rvest::html_nodes(y, ".facet-unit-left")
library(petro.One)

# Repeat of the "mechanistic tubing" page inspection.
my_url <- make_search_url(query = "mechanistic tubing",
                          how = "all")
my_url
y <- xml2::read_html(my_url)
# summary_by_doctype(y)
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
rvest::html_nodes(y, "h2")
rvest::html_nodes(y, ".result-item-source")
# Look at both facet columns; the left one was previously queried twice and
# the right one not at all.
rvest::html_nodes(y, ".facet-unit-right")
rvest::html_nodes(y, ".facet-unit-left")

# return 0 records but in reality it has 1265
library(petro.One)

# Inspect the page structure for a query on which summary_by_doctype() fails.
my_url <- make_search_url(query = "mechanistic correlation",
                          how = "all")

my_url
get_papers_count(my_url)
y <- xml2::read_html(my_url)
# summary_by_doctype(y)
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
rvest::html_nodes(y, "h2")
rvest::html_nodes(y, ".result-item-source")
# Look at both facet columns; the left one was previously queried twice and
# the right one not at all.
rvest::html_nodes(y, ".facet-unit-right")
rvest::html_nodes(y, ".facet-unit-left")


# Returns 8 records, but in reality the query has 2235.
summary_by_doctype(y)
# Observed failure:
# Error in data.frame(name, value, stringsAsFactors = FALSE) : arguments imply differing number of rows: 2, 0
# URL as read from OnePetro. In this R string "\\%22" is a literal backslash
# followed by "%22" (the percent-encoding of a double quote).
url <- "https://www.onepetro.org/search?q=\\%22mechanistic+correlation\\%22&peer_reviewed=&published_between=&from_year=&to_year="

z <- xml2::read_html(url)
summary_by_doctype(z)
# URL as built by the package function: the query words are wrapped in single
# quotes ("\'" is simply "'" inside a double-quoted R string).
url <- "https://www.onepetro.org/search?q=\'mechanistic+correlation\'&peer_reviewed=&published_between=&from_year=&to_year="

z <- xml2::read_html(url)
summary_by_doctype(z)

# Replacing the double quotes around the query words with single quotes makes
# the query work.


# f0nzie/petro.One documentation built on May 29, 2019, 12:05 a.m.