Nothing
#' @title Reads a OnePetro URL and converts it to a dataframe
#' @description A OnePetro URL with a query is read into an HTML page and
#' converted to a dataframe
#' @param url char a OnePetro type URL
#' @return A tibble assembled from the \code{book_title}, \code{paper_id},
#' \code{dc_type}, \code{authors}, \code{year} and \code{source} values
#' extracted from the page. When the page yields no item ids, a zero-row
#' tibble with the columns \code{book_title}, \code{dc_type}, \code{paper_id},
#' \code{authors}, \code{source} and \code{year} is returned instead.
#' @export
#' @examples
#' \dontrun{
#' # Example 1
#' # Search papers with keyword "smartwell"
#' url_sw <- "https://www.onepetro.org/search?q=smartwell"
#' onepetro_page_to_dataframe(url_sw)
#' # Example 2
#' # Search for exact words "vertical lift performance"
#' url_vlp <- "https://www.onepetro.org/search?q=%22vertical+lift+performance%22"
#' onepetro_page_to_dataframe(url_vlp)
#' }
onepetro_page_to_dataframe <- function(url) {
  webpage <- read_html(url)

  # A page without item ids has no results: hand back a typed, zero-row tibble.
  # NOTE(review): the column order here differs from the populated result
  # below (dc_type/paper_id and source/year are swapped) -- confirm which
  # order callers expect before aligning them.
  data_itemid <- get_data_itemid(webpage)
  if (ncol(data_itemid) == 0) {
    return(tibble::tibble(book_title = character(),
                          dc_type    = character(),
                          paper_id   = character(),
                          authors    = character(),
                          source     = character(),
                          year       = integer()))
  }

  dc_type    <- get_dc_type(webpage)
  book_title <- get_book_title(webpage)
  paper_id   <- get_paper_id(webpage)
  authors    <- get_authors(webpage)
  year       <- get_year(webpage)
  source     <- get_source(webpage)

  # as_tibble() replaces the deprecated as.tibble() alias; the conversion
  # itself is the same.
  tibble::as_tibble(
    cbind(book_title, paper_id, dc_type, authors, year, source)
  )
}
# Report how many papers a OnePetro query matches and how many result pages
# of 1000 rows are needed to hold them.
#
# url: a OnePetro search URL, passed unchanged to get_papers_count().
#
# Returns a list with two elements:
#   papers    - the count returned by get_papers_count()
#   pages1000 - the number of 1000-paper pages (whole number, at least 1)
onepetro_allpages_to_dataframe <- function(url) {
  papers_count <- get_papers_count(url)

  # Round up: a partially filled last page is still a page that has to be
  # fetched (2500 papers need 3 pages, not 2.5).
  num_pages <- if (papers_count > 1000) ceiling(papers_count / 1000) else 1

  info <- list(papers = papers_count, pages1000 = num_pages)

  for (page in seq_len(num_pages)) {
    # Placeholder: per-page retrieval is not implemented yet.
    # webpage <- read_html(url)
  }

  info
}
# Collect the text of every '.result-link' node on a parsed page.
#
# webpage: the parsed page object passed on to html_nodes().
#
# Returns a one-column data frame named `title_data`. Each entry has its
# newlines removed and surrounding whitespace trimmed. The data frame has
# zero rows when no node matches.
#' @importFrom rvest html_nodes html_text
read_titles <- function(webpage) {
  raw_titles <- html_text(html_nodes(webpage, '.result-link'))

  # nothing matched: return the empty skeleton
  if (length(raw_titles) == 0) {
    return(data.frame(title_data = character()))
  }

  # drop embedded newlines first, then strip the leftover padding
  cleaned <- trimws(gsub("\n", "", raw_titles))
  data.frame(title_data = as.character(cleaned), stringsAsFactors = FALSE)
}
# Return the text of every '.result-item-source' node on a parsed page,
# exactly as html_text() yields it (no cleaning is applied here).
#
# webpage: the parsed page object passed on to html_nodes().
#' @importFrom rvest html_nodes html_text
get_item_source <- function(webpage) {
  html_text(html_nodes(webpage, '.result-item-source'))
}
# Build a data frame of source details from the '.result-item-source' text
# returned by get_item_source().
#
# Each item's text is split on "\n". Fields 2 to 5 of the split become
# `paper_id`, `source`, `type` and `year`. Dashes are removed from the year
# before it is converted to integer.
#
# webpage: the parsed page object passed on to get_item_source().
#
# Returns a data frame with columns paper_id, source, type and year. It has
# zero rows when the page has no source items.
#' @importFrom rvest html_nodes html_text
read_sources <- function(webpage) {
  raw_items <- get_item_source(webpage)

  # no items on the page: return the empty skeleton
  if (length(raw_items) == 0) {
    return(data.frame(paper_id = character(),
                      source = character(),
                      type = character(),
                      year = integer()))
  }

  # one row per item, one column per newline-separated field
  pieces <- strsplit(as.character(raw_items), '\n', fixed = TRUE)
  fields <- data.frame(do.call('rbind', pieces), stringsAsFactors = FALSE)

  data.frame(
    paper_id = as.character(fields[, 2]),
    source = as.character(fields[, 3]),
    type = as.character(fields[, 4]),
    # the year field carries a dash that must go before conversion
    year = as.integer(gsub("-", "", as.character(fields[, 5]))),
    stringsAsFactors = FALSE
  )
}
# Collect the text of every node matching
# '.result-item-author:nth-child(1)' on a parsed page.
#
# webpage: the parsed page object passed on to html_nodes().
#
# Returns a one-column data frame named `author1_data`. Each entry has its
# newlines removed and surrounding whitespace trimmed. The data frame has
# zero rows when no node matches.
#' @importFrom rvest html_nodes html_text
read_author <- function(webpage) {
  raw_authors <- html_text(
    html_nodes(webpage, '.result-item-author:nth-child(1)')
  )

  # nothing matched: return the empty skeleton
  if (length(raw_authors) == 0) {
    return(data.frame(author1_data = character()))
  }

  # drop embedded newlines first, then strip the leftover padding
  cleaned <- trimws(gsub("\n", "", raw_authors))
  data.frame(author1_data = as.character(cleaned), stringsAsFactors = FALSE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.