R/papers_to_dataframe.R
In petro.One: Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

#' @title Reads a OnePetro URL and converts it to a dataframe
#' @description A OnePetro URL with a query is read into a HTML page and
#' converted to a dataframe
#' @param url char a OnePetro type URL
#' @return A tibble with the columns \code{book_title}, \code{paper_id},
#' \code{dc_type}, \code{authors}, \code{year} and \code{source}. A
#' zero-row tibble with the same column names is returned when the page
#' yields no item ids.
#' @importFrom rvest read_html
#' @export
#' @examples
#' \dontrun{
#' # Example 1
#' # Search papers with keyword "smartwell"
#' url_sw <- "https://www.onepetro.org/search?q=smartwell"
#' onepetro_page_to_dataframe(url_sw)
#' # Example 2
#' # Search for the exact words "vertical lift performance"
#' url_vlp <- "https://www.onepetro.org/search?q=%22vertical+lift+performance%22"
#' onepetro_page_to_dataframe(url_vlp)
#' }
onepetro_page_to_dataframe <- function(url) {
    # Result returned when the page carries no items.
    # NOTE(review): the column order here (and the integer `year`) differs
    # from the cbind() result built below -- confirm that callers do not
    # rely on column position or on a specific type for `year`.
    empty_df <- tibble::tibble(book_title = character(),
                               dc_type = character(),
                               paper_id = character(),
                               authors = character(),
                               source = character(),
                               year = integer())
    webpage <- read_html(url)

    # An item-id table without columns is treated as "no results".
    data_itemid <- get_data_itemid(webpage)
    if (ncol(data_itemid) == 0) return(empty_df)

    dc_type    <- get_dc_type(webpage)
    book_title <- get_book_title(webpage)
    paper_id   <- get_paper_id(webpage)
    authors    <- get_authors(webpage)
    year       <- get_year(webpage)
    source     <- get_source(webpage)

    # as_tibble() replaces the deprecated as.tibble() alias.
    tibble::as_tibble(
        cbind(book_title, paper_id, dc_type, authors, year, source)
    )
}



# Summarise how many 1000-paper pages are needed to cover a OnePetro query.
#
# url: char, a OnePetro query URL; passed unchanged to get_papers_count().
#
# Returns a list with two elements:
#   papers    - the paper count reported by get_papers_count(url)
#   pages1000 - the whole number of 1000-paper pages needed to cover that
#               count (always at least 1)
onepetro_allpages_to_dataframe <- function(url) {
    papers_count <- get_papers_count(url)

    # Round up so that a trailing partial page is counted as well: 2500
    # papers need 3 pages, not 2.5 (which seq_len() would truncate to 2).
    # Keep a minimum of one page for counts of 1000 or fewer.
    num_pages <- max(1, ceiling(papers_count / 1000))

    info <- list(papers = papers_count, pages1000 = num_pages)

    for (page in seq_len(num_pages)) {
        # TODO: read each results page here; the loop is still a placeholder.
        # webpage <- read_html(url)
    }
    info
}


# Collect the text of every node matching the CSS selector ".result-link"
# into a one-column data frame named `title_data`. Newlines are removed and
# surrounding whitespace is trimmed. A zero-row data frame is returned when
# the selector matches nothing.
#' @importFrom rvest html_nodes html_text
read_titles <- function(webpage) {
    raw_titles <- html_text(html_nodes(webpage, ".result-link"))

    # Nothing matched: hand back the empty frame right away.
    if (length(raw_titles) == 0) {
        return(data.frame(title_data = character()))
    }

    # Strip embedded newlines first, then leading/trailing blanks.
    cleaned <- trimws(gsub("\n", "", raw_titles))
    data.frame(title_data = as.character(cleaned), stringsAsFactors = FALSE)
}


# Return the text of every node matching the CSS selector
# ".result-item-source" on the parsed page, one element per node.
#' @importFrom rvest html_nodes html_text
get_item_source <- function(webpage) {
    html_text(html_nodes(webpage, ".result-item-source"))
}



# Parse the ".result-item-source" text of each result into a data frame.
#
# Each entry returned by get_item_source() is split at newlines; pieces 2 to
# 5 of the split are taken as paper id, source, type and year respectively.
#
# webpage: parsed page, passed unchanged to get_item_source().
#
# Returns a data frame with character columns `paper_id`, `source`, `type`
# and an integer column `year`, one row per entry. An entry with fewer than
# five pieces gets NA in the missing columns. A zero-row data frame with the
# same columns is returned when there are no entries.
#' @importFrom rvest html_nodes html_text
read_sources <- function(webpage) {
    source_data_txt <- get_item_source(webpage)

    if (length(source_data_txt) == 0) {
        # stringsAsFactors = FALSE keeps the empty frame's column types equal
        # to those of the populated frame on every R version.
        return(data.frame(paper_id = character(),
                          source   = character(),
                          type     = character(),
                          year     = integer(),
                          stringsAsFactors = FALSE))
    }

    # One character vector of pieces per entry.
    pieces <- strsplit(as.character(source_data_txt), "\n", fixed = TRUE)

    # Pick the i-th piece of every entry. Indexing past the end of a
    # character vector yields NA, so short entries are padded rather than
    # recycled into the wrong columns (which rbind() on ragged rows does).
    piece_at <- function(i) {
        vapply(pieces, function(p) p[i], character(1))
    }

    data.frame(paper_id = piece_at(2),
               source   = piece_at(3),
               type     = piece_at(4),
               # remove the dash from the year piece before converting
               year     = as.integer(gsub("-", "", piece_at(5))),
               stringsAsFactors = FALSE)
}

# Collect the text of every node matching the CSS selector
# ".result-item-author:nth-child(1)" into a one-column data frame named
# `author1_data` (presumably the first-listed author of each result --
# confirm against the page markup). Newlines are removed and surrounding
# whitespace is trimmed. A zero-row data frame is returned when the selector
# matches nothing.
#' @importFrom rvest html_nodes html_text
read_author <- function(webpage) {
    raw_authors <- html_text(
        html_nodes(webpage, ".result-item-author:nth-child(1)")
    )

    # Nothing matched: hand back the empty frame right away.
    if (length(raw_authors) == 0) {
        return(data.frame(author1_data = character()))
    }

    # Strip embedded newlines first, then leading/trailing blanks.
    cleaned <- trimws(gsub("\n", "", raw_authors))
    data.frame(author1_data = as.character(cleaned), stringsAsFactors = FALSE)
}

Any scripts or data that you put into this service are public.

petro.One documentation built on May 2, 2019, 3:10 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

petro.One
Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

R/papers_to_dataframe.R
In petro.One: Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

Try the petro.One package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

petro.One Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

R/papers_to_dataframe.R In petro.One: Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

Try the petro.One package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

petro.One
Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata

R/papers_to_dataframe.R
In petro.One: Statistics and Text Mining for Oil and Gas Papers from OnePetro Metadata