# R/papers_to_dataframe.R

#' @title Reads a OnePetro URL and converts it to a dataframe
#' @description A OnePetro URL with a query is read into a HTML page and
#' converted to a dataframe
#' @param url char a OnePetro type URL
#' @return A tibble obtained by column-binding the book title, paper id,
#'   document type, authors, year and source extracted from the page. When
#'   \code{get_data_itemid()} yields a zero-column result, a zero-row tibble
#'   with columns \code{book_title}, \code{dc_type}, \code{paper_id},
#'   \code{authors}, \code{source} and \code{year} is returned instead.
#' @export
#' @examples
#' \dontrun{
#' # Example 1
#' # Search papers with keyword "smartwell"
#' url_sw <- "https://www.onepetro.org/search?q=smartwell"
#' onepetro_page_to_dataframe(url_sw)
#' # Example 2
#' # Search for the exact words "vertical lift performance"
#' url_vlp <- "https://www.onepetro.org/search?q=%22vertical+lift+performance%22"
#' onepetro_page_to_dataframe(url_vlp)
#' }
onepetro_page_to_dataframe <- function(url) {
    # Zero-row result handed back when the page holds no items.
    # NOTE(review): column order (and possibly the type of `year`) differs
    # from the populated result below -- confirm callers bind by name.
    empty_df <- tibble::tibble(book_title = character(),
                               dc_type = character(),
                               paper_id = character(),
                               authors = character(),
                               source = character(),
                               year = integer())
    webpage <- read_html(url)

    # no item ids on the page: nothing to extract
    data_itemid <- get_data_itemid(webpage)
    if (ncol(data_itemid) == 0) return(empty_df)

    dc_type    <- get_dc_type(webpage)
    book_title <- get_book_title(webpage)
    paper_id   <- get_paper_id(webpage)
    authors    <- get_authors(webpage)
    year       <- get_year(webpage)
    source     <- get_source(webpage)

    # as_tibble() replaces as.tibble(), which is deprecated in tibble >= 2.0.0
    tibble::as_tibble(
        cbind(book_title, paper_id, dc_type, authors, year, source)
    )
}



# Compute how many 1000-paper pages are needed to cover all the results of a
# OnePetro query.
#
# url: char, a OnePetro type URL; passed as-is to get_papers_count().
#
# Returns a list with `papers` (the count reported by get_papers_count()) and
# `pages1000` (the whole number of 1000-paper pages covering that count,
# never less than 1).
onepetro_allpages_to_dataframe <- function(url) {
    papers_count <- get_papers_count(url)
    if (papers_count > 1000) {
        # round up so a trailing partial page is counted, e.g. 2500 -> 3;
        # plain division gave 2.5, which seq_len() truncated to 2
        num_pages <- ceiling(papers_count / 1000)
    } else {
        num_pages <- 1
    }

    info <- list(papers = papers_count, pages1000 = num_pages)

    for (page in seq_len(num_pages)) {
        # placeholder: per-page retrieval is not implemented yet
        # webpage <- read_html(url)

    }
    info
}


#' @importFrom rvest html_nodes html_text
read_titles <- function(webpage) {
    # text of every node matching the ".result-link" CSS selector
    raw_titles <- html_text(html_nodes(webpage, '.result-link'))

    # nothing matched: hand back a zero-row dataframe
    if (length(raw_titles) == 0) {
        return(data.frame(title_data = character()))
    }

    # drop embedded newlines, then trim surrounding whitespace
    cleaned_titles <- trimws(gsub("\n", "", raw_titles))
    data.frame(title_data = as.character(cleaned_titles),
               stringsAsFactors = FALSE)
}


#' @importFrom rvest html_nodes html_text
get_item_source <- function(webpage) {
    # select the ".result-item-source" nodes and return their text content
    html_text(html_nodes(webpage, '.result-item-source'))
}



#' @importFrom rvest html_nodes html_text
read_sources <- function(webpage) {
    # raw text of the ".result-item-source" entries, as returned by
    # get_item_source()
    raw_items <- get_item_source(webpage)

    # no entries: return a zero-row dataframe with the expected columns
    if (length(raw_items) == 0) {
        return(data.frame(paper_id = character(),
                          source = character(),
                          type = character(),
                          year = integer()))
    }

    # one row per entry, one column per newline-separated piece
    pieces <- strsplit(as.character(raw_items), '\n', fixed=TRUE)
    fields <- data.frame(do.call('rbind', pieces), stringsAsFactors = FALSE)

    # pieces 2 to 5 carry paper id, source, type and year; the year piece has
    # its dashes stripped before conversion to integer
    data.frame(paper_id = as.character(fields[, 2]),
               source   = as.character(fields[, 3]),
               type     = as.character(fields[, 4]),
               year     = as.integer(gsub("-", "", as.character(fields[, 5]))),
               stringsAsFactors = FALSE)
}

#' @importFrom rvest html_nodes html_text
read_author <- function(webpage) {
    # text of the nodes matching ".result-item-author:nth-child(1)"
    first_author_txt <- html_text(
        html_nodes(webpage, '.result-item-author:nth-child(1)')
    )

    # nothing matched: return a zero-row dataframe
    if (length(first_author_txt) == 0) {
        return(data.frame(author1_data = character()))
    }

    # drop embedded newlines, then trim surrounding whitespace
    tidy_names <- trimws(gsub("\n", "", first_author_txt))
    data.frame(author1_data = as.character(tidy_names),
               stringsAsFactors = FALSE)
}

# Try the petro.One package in your browser
#
# Any scripts or data that you put into this service are public.
#
# petro.One documentation built on May 2, 2019, 3:10 p.m.