read/write web pages of OnePetro searches

How many papers are in the search

url <- "https://www.onepetro.org/search?q=neural+networks&peer_reviewed=&published_between=&from_year=&to_year=&"
get_papers_count(url)
# Specify the URL of the website to be scraped
url <- "https://www.onepetro.org/search?start=0&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=1000&to_year="

get_papers_count(url)
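
Once the count is known, it can be used to work out how many 1000-row pages are needed. A minimal sketch, assuming get_papers_count() returns the total number of results as a single integer:

# assumption: get_papers_count() returns the total result count as an integer
n_papers <- get_papers_count(url)
rows_per_page <- 1000
n_pages <- ceiling(n_papers / rows_per_page)   # number of 1000-row pages to download
n_pages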

write the web page with the first 1000 rows

# Reading the HTML code from the website
webpage1 <- read_html(url)
xml2::write_html(webpage1, file = "neural_network-s0r1000.html")

read the saved web page

url_page <- "neural_network-s0r1000.html"
onepetro_page_to_dataframe(url_page)
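
The result can be stored and inspected before moving on to the next page. A minimal sketch, assuming onepetro_page_to_dataframe() returns a data frame of paper results:

# assumption: onepetro_page_to_dataframe() returns a data frame
papers_0_1000 <- onepetro_page_to_dataframe(url_page)
dim(papers_0_1000)
head(papers_0_1000)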

next 1000 rows: start at 1001

# Specify the URL of the website to be scraped
url <- "https://www.onepetro.org/search?start=1001&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=1000&to_year="

# Reading the HTML code from the website
webpage2 <- read_html(url)
xml2::write_html(webpage2, file = "neural_network-s1001r1000.html")

read the saved web page: 1001-2000

url_page <- "neural_network-s1001r1000.html"
onepetro_page_to_dataframe(url_page)

next 1000 rows: start at 2001

# Specify the URL of the website to be scraped
url <- "https://www.onepetro.org/search?start=2001&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=1000&to_year="

# Reading the HTML code from the website
webpage3 <- read_html(url)
xml2::write_html(webpage3, file = "neural_network-s2001r1000.html")

read the saved web page: 2001-3000

url_page <- "neural_network-s2001r1000.html"
onepetro_page_to_dataframe(url_page)
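
With all three pages saved, the per-page data frames can be combined into one table. A sketch, assuming each call returns a data frame with the same columns:

# assumption: every page yields a data frame with identical columns
df1 <- onepetro_page_to_dataframe("neural_network-s0r1000.html")
df2 <- onepetro_page_to_dataframe("neural_network-s1001r1000.html")
df3 <- onepetro_page_to_dataframe("neural_network-s2001r1000.html")
all_papers <- rbind(df1, df2, df3)
nrow(all_papers)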
url <- "https://www.onepetro.org/search?q=smartwell&peer_reviewed=&published_between=&from_year=&to_year="

webpage1 <- read_html(url)
webpage1
# Specify the URL of the website to be scraped
url <- "https://www.onepetro.org/search?start=0&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=10&to_year="

#Reading the HTML code from the website
webpage2 <- read_html(url)
webpage2
#Loading the rvest package
library('rvest')

# Specify the URL of the website to be scraped
url <- "https://www.onepetro.org/search?start=0&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=1000&to_year="

#Reading the HTML code from the website
webpage1 <- read_html(url)
webpage1
xml2::write_html(webpage1, file = "1000rows.html")
url_page <- "1000rows.html"
webpage <- xml2::read_html(url_page)
onepetro_page_to_dataframe(url_page)
# THIS DOESN'T WORK: requesting 3000 rows in a single page
# Specify the URL of the website to be scraped
url2 <- "https://www.onepetro.org/search?start=0&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=3000&to_year="

#Reading the HTML code from the website
webpage2 <- read_html(url2)
xml2::write_html(webpage2, file = "0to3000.html")
onepetro_page_to_dataframe("0to3000.html")
# Specify the URL of the website to be scraped
url2 <- "https://www.onepetro.org/search?start=2501&q=neural+networks&from_year=&peer_reviewed=&published_between=&rows=1000&to_year="

#Reading the HTML code from the website
webpage2 <- read_html(url2)
xml2::write_html(webpage2, file = "2501to3500.html")
webpage <- read_html("1501to2500.html")
# read from webpage
onepetro_page_to_dataframe("2501to3500.html")
# append all HTML pages that were scraped separately
acumm_page <- read_html("0to1000.html")
xml2::write_html(acumm_page, file = "accum_page.html")

# file.append("accum_page.html", "0to1000.html")

file.append("accum_page.html", "1001to1500.html")

file.append("accum_page.html", "1501to2500.html")

file.append("accum_page.html", "2501to3500.html")
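
Note that file.append() only concatenates the raw files, so the accumulated file is not a single well-formed HTML document. An alternative sketch, assuming each saved page can be parsed on its own, is to convert every page to a data frame and bind the results:

# sketch: parse each saved page separately and bind the data frames
pages <- c("0to1000.html", "1001to1500.html", "1501to2500.html", "2501to3500.html")
page_dfs <- lapply(pages, onepetro_page_to_dataframe)
all_pages <- do.call(rbind, page_dfs)
nrow(all_pages)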
# webpage <- read_html("accum_page.html")

# webpage <- read_html("1501to2500.html")
# webpage <- read_html("0to1000.html")
# webpage <- read_html("2501to3500.html")
webpage <- read_html("3000_conference.html")


# title
# .result-link

# Use the CSS selector to scrape the paper titles
title_data_html <- html_nodes(webpage, '.result-link')

# Convert the title nodes to text
title_data <- html_text(title_data_html)

# data pre-processing
title_data <- trimws(gsub("\n", "", title_data))


# Let's have a look at the titles
df <- data.frame(title_data, stringsAsFactors = FALSE)
write.csv(df, file = "3000-conf.csv")
# paper id, source, type, year
# .result-item-source

# Use the CSS selector to scrape the paper source metadata
source_data_html <- html_nodes(webpage, '.result-item-source')

# Convert the source nodes to text
source_data <- html_text(source_data_html)

# pre-processing: split at \n
source_data <- data.frame(do.call('rbind', strsplit(as.character(source_data), '\n', fixed = TRUE)),
                          stringsAsFactors = FALSE)
# remove blank columns
source_data <- source_data[, 2:5]
# rename columns
names(source_data) <- c("paper_id", "source", "type", "year")
# remove dash from year
source_data$year <- gsub("-", "", source_data$year)

# Let's have a look at the paper source data
source_data
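
If every result produced both a title and a source row, the two objects can be combined into one data frame. A sketch; the lengths should be checked first:

# sketch: combine titles and source metadata, assuming one source row per title
if (length(title_data) == nrow(source_data)) {
  papers <- cbind(data.frame(title = title_data, stringsAsFactors = FALSE),
                  source_data)
  head(papers)
}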
webpage <- read_html("3000_conference.html")
# author #1
# Use the CSS selector to scrape the first author
author1_data_html <- html_nodes(webpage, '.result-item-author:nth-child(1)')

# Convert the author nodes to text
author1_data <- html_text(author1_data_html)

# data pre-processing
author1_data <- trimws(gsub("\n", "", author1_data))

# Let's have a look at the first authors
data.frame(author1_data)
# author #2
# Use the CSS selector to scrape the second author
author2_data_html <- html_nodes(webpage, '.result-item-author:nth-child(2)')

# Convert the author nodes to text
author2_data <- html_text(author2_data_html)

# data pre-processing
author2_data <- trimws(gsub("\n", "", author2_data))

# Let's have a look at the second authors
data.frame(author2_data)
# author #3
# Use the CSS selector to scrape the third author
author3_data_html <- html_nodes(webpage, '.result-item-author:nth-child(3)')

# Convert the author nodes to text
author3_data <- html_text(author3_data_html)

# data pre-processing
author3_data <- trimws(gsub("\n", "", author3_data))

# Let's have a look at the third authors
data.frame(author3_data)
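
The three author vectors will usually differ in length because not every paper lists a second or third author, so they cannot be bound column-wise directly. A sketch that pads them with NA before combining (pad_to() is a hypothetical helper, not part of the package):

# pad_to() is a hypothetical helper: extends a vector with NA up to length n
pad_to <- function(x, n) { length(x) <- n; x }
n_max <- max(length(author1_data), length(author2_data), length(author3_data))
authors <- data.frame(author1 = pad_to(author1_data, n_max),
                      author2 = pad_to(author2_data, n_max),
                      author3 = pad_to(author3_data, n_max),
                      stringsAsFactors = FALSE)
head(authors)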

