R Notebook"

library(rvest)
library(dplyr)
library(petro.One)
.url_pres <- "https://www.onepetro.org/search?start=0&q=pressure&from_year=&peer_reviewed=&published_between=&to_year=&rows="

url_100 <- function(url, page_size) {
    paste0(url, page_size)         # join the number of rows to the URL
}

url_pres <- url_100(.url_pres, 100)

make_webpage_of_size <- function(url, page_size) {
    xml2::read_html(url_100(url, 100))
}

url_pres
webpage_100 <- make_webpage_of_size(.url_pres, 100)
webpage_pres <- webpage_100
get_data_itemid <- function(webpage) {
    data_itemid <- html_nodes(webpage, '.result-item') %>%
    html_attr("data-itemid") %>%      # extract data-itemid
    gsub("data-cite-id=", "", .) %>% 
    trimws %>% 
    head(., -1)

    data_itemid <- strsplit(data_itemid, "/")
    data.frame(t(sapply(data_itemid, '[', 1:max(sapply(data_itemid, length)))), 
               stringsAsFactors = FALSE)
}


get_data_itemid(webpage_pres)
url_nn <- make_search_url(query = "neural network",
                          from_year = 1970,
                          to_year   = 1999,
                          how = "all",
                          rows = 100)

webpage_nn <- xml2::read_html(url_nn)
get_data_itemid(webpage_nn)
get_dc_type_raw <- function(webpage) {

    dc_type_0 <- html_nodes(webpage, '.result-item') %>%
        html_attr("data-type") %>% 
        trimws %>%         # remove blanks
        head(., -1)        # discard last row

    data_itemid <- get_data_itemid(webpage)

    if (ncol(data_itemid) > 2) {
        dc_type <- data_itemid %>%
        rename(dc_type_1 = X1, paper_id = X2, dc_type_2 = X3, sup = X4) %>%
        mutate(x1x3 = ifelse(dc_type_1 == dc_type_2, TRUE, FALSE)) %>%
        mutate(dc_type = ifelse(dc_type_2 %in% "SUPPLEMENTARY", "media", dc_type_1)) %>% 
        mutate(dc_type = ifelse(dc_type_1 %in% "book", "chapter", dc_type))
    } else {
        dc_type <- data_itemid %>%
           rename(dc_type = X1, paper_id = X2) 
    }


    # number of columns of result is not a multiple of vector length (arg 154)
    tibble::as.tibble(cbind(dc_type_0, dc_type))
}

dc_type_raw_pres <- get_dc_type_raw(webpage_pres)
dc_type_raw_pres
dc_type_raw_nn <- get_dc_type_raw(webpage_nn)
dc_type_raw_nn
get_dc_type <- function(webpage) {
    dc_type_raw <- get_dc_type_raw(webpage)
    dc_type_raw$dc_type
}

dc_type_pres <- get_dc_type(webpage_pres)
dc_type_pres
dc_type_nn <- get_dc_type(webpage_nn)
dc_type_nn
unique(dc_type_pres)
unique(dc_type_nn)

Get a glipmse of the number of paper by type

dc_type <- data.frame(dc_type = get_dc_type(webpage_pres))

dc_type %>% 
    group_by(dc_type) %>% 
    summarise(count = n())

sum(.Last.value$count)
# page_size, media  chapter other
#  100          1       0     2     100
#  200          6       0           200
#  300          6       0
#  400          6       1           400
#  500          6       1
#  600          7       2   20       600
#  700         12       2
#  800         16       3
# 1000         16       3
dc_type <- data.frame(dc_type = get_dc_type(webpage_nn))

dc_type %>% 
    group_by(dc_type) %>% 
    summarise(count = n())

sum(.Last.value$count)

Titles

Titles are not the same size as dc_type above. Why?

df.titles <- petro.One:::read_titles(webpage_pres)
dim(df.titles)
# 100  97   1
# 600 573   1
# 100  97   1

This will give error because columns are different

df.sources <- petro.One:::read_sources(webpage)
unique(df$type)
# error:
# number of columns of result is not a multiple of vector length (arg 69)

Let's try to get the titles in another way

get_book_title <- function(webpage) {
    html_nodes(webpage, '.result-item') %>%
        html_nodes(".book-title") %>% 
        html_text %>% 
        gsub("\n", "", .) %>% 
        trimws
}

get_book_title(webpage_pres)
get_book_title(webpage_nn)

merge titles and dc_type

It they match, ther shouldn't be any error

dc_type    <- get_dc_type(webpage_pres)
book_title <- get_book_title(webpage_pres)
tibble::as.tibble(cbind(book_title, dc_type))
dc_type    <- get_dc_type(webpage_nn)
book_title <- get_book_title(webpage_nn)
tibble::as.tibble(cbind(book_title, dc_type))

Authors

get_result_item_authors <- function(webpage) {
    html_nodes(webpage, '.result-item') %>%
        # html_nodes(".result-item-authors")
        html_nodes(".highlighted") %>% 
        html_text %>% 
        #gsub("\n", "", .) %>% 
        trimws %>%     
        strsplit("\n") %>% 
        lapply(., trimws) %>% 
        lapply(., function(x) x[!x %in% ""])  # remove a blank from a list

}


get_authors <- function(webpage) {
    #' @param x called by apply function
    join_authors <- function(x, y) {
        nm1 <- if (grepl("[,.]", y[[x]][1])) y[[x]][1] else "NA"
        nm2 <- if (grepl("[,.]", y[[x]][2]) & nm1 != "NA") y[[x]][2] else "NA"
        nm3 <- if (grepl("[,.]", y[[x]][3]) & nm2 != "NA") y[[x]][3] else "NA"
        nm4 <- if (grepl("[,.]", y[[x]][4]) & nm3 != "NA") y[[x]][4] else "NA"
        nm5 <- if (grepl("[,.]", y[[x]][5]) & nm4 != "NA") y[[x]][5] else "NA"
        nm6 <- if (grepl("[,.]", y[[x]][6]) & nm5 != "NA") y[[x]][6] else "NA"
        nm7 <- if (grepl("[,.]", y[[x]][7]) & nm6 != "NA") y[[x]][7] else "NA"
        nm8 <- if (grepl("[,.]", y[[x]][8]) & nm7 != "NA") y[[x]][8] else "NA"
        authors <- paste(nm1, nm2, nm3, nm4, nm5, nm6, nm7, nm8, sep = "|")
        authors <- strsplit(authors, "|", fixed = TRUE)
        authors <- lapply(authors, function(x) x[!x %in% "NA"] )  # remove a blank from a list
        authors <- sapply(authors, paste, collapse = " | ")
        #if (is_author(authors)) authors else "unknown"
    }

    item_authors <- get_result_item_authors(webpage)
    data.frame(authors = sapply(seq_along(item_authors), join_authors, item_authors),
               stringsAsFactors = FALSE)
}

get_authors(webpage_pres)
get_authors(webpage_nn)
dc_type    <- get_dc_type(webpage_pres)
book_title <- get_book_title(webpage_pres)
authors    <- get_authors(webpage_pres)
tibble::as.tibble(cbind(book_title, dc_type, authors))
dc_type    <- get_dc_type(webpage_nn)
book_title <- get_book_title(webpage_nn)
authors    <- get_authors(webpage_nn)
tibble::as.tibble(cbind(book_title, dc_type, authors))

Year

get_year <- function(webpage) {
    f <- function(x) {
        if (!any(sapply(x, function(y) grepl(y, pattern = "- [0-9].", perl = TRUE))))
            NA
        else {
            as.integer(trimws(gsub("-", "",grep(x, pattern = "- [0-9].", perl = TRUE, value = TRUE))))
        }

    }
    item_authors <- get_result_item_authors(webpage)
    data.frame(year = sapply(item_authors, f))
}


get_year(webpage_pres)
get_year(webpage_nn)

Sources

get_source <- function(webpage) {
    f <- function(x) {
        if (!any(sapply(x, function(y) grepl(y, pattern = "^[A-Z]+$", perl = TRUE))))
            NA
        else
            grep(x, pattern = "^[A-Z]+$", perl = TRUE, value = TRUE)
    }
    item_authors <- get_result_item_authors(webpage)
    sapply(item_authors, f)
}

get_source(webpage_pres)
get_source(webpage_nn)

Merge all

dc_type    <- get_dc_type(webpage_pres)
book_title <- get_book_title(webpage_pres)
authors    <- get_authors(webpage_pres)
year       <- get_year(webpage_pres)
source     <- get_source(webpage_pres)

all_pres <- tibble::as.tibble(cbind(book_title, dc_type, authors, year, source))
all_pres
dc_type    <- get_dc_type(webpage_nn)
book_title <- get_book_title(webpage_nn)
authors    <- get_authors(webpage_nn)
year       <- get_year(webpage_nn)
source     <- get_source(webpage_nn)

all_nn <- tibble::as.tibble(cbind(book_title, dc_type, authors, year, source))
all_nn
table(all_pres$dc_type)
table(all_nn$dc_type)
get_papers_from_result_item <- function(url) {
    webpage <- xml2::read_html(url)

    dc_type    <- get_dc_type(webpage)
    book_title <- get_book_title(webpage)
    authors    <- get_authors(webpage)
    year       <- get_year(webpage)
    source     <- get_source(webpage)
    tibble::as.tibble(cbind(book_title, dc_type, authors, year, source))
}

get_papers_from_result_item(url_pres)
get_papers_from_result_item(url_nn)


Try the petro.One package in your browser

Any scripts or data that you put into this service are public.

petro.One documentation built on May 2, 2019, 3:10 p.m.