library(petro.One)

# search terms, one per line of the text block
keywords <- "    
    water injection
    water-injection
    water injection machine learning
    water injection artificial intelligence
"
# parse the block into a one-column data frame: every newline-separated
# line becomes a row of column `keyword`, trimmed of surrounding blanks
kw_tbl <- read.table(
    text = keywords,
    sep = "\n",
    header = FALSE,
    col.names = "keyword",
    strip.white = TRUE,
    stringsAsFactors = FALSE
)
kw_tbl

Build iteration loop

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)

Enhanced search. More keywords

library(petro.One)

# search terms, one per line of the text block
keywords <- "    
    water injection machine-learning
    water injection intelligence
    water injection intelligent
    water injection workflow
    water injection learning
    waterflooding intelligence
    waterflooding intelligent
    waterflooding machine learning
    waterflooding algorithm
    water flooding intelligence
    water flooding intelligent
    water flooding workflow
    water flooding control
    water flooding algorithm
    water flooding SVM
    water flooding loop
"
# parse the block into a one-column data frame: every newline-separated
# line becomes a row of column `keyword`, trimmed of surrounding blanks
kw_tbl <- read.table(
    text = keywords,
    sep = "\n",
    header = FALSE,
    col.names = "keyword",
    strip.white = TRUE,
    stringsAsFactors = FALSE
)
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)

Variation of the previous search, using plus sign

The result does not change. It is similar to the previous search.

library(petro.One)

# search terms, one per line; the words of the main phrase are joined by "+"
keywords <- "    
    water+injection machine-learning
    water+injection intelligence
    water+injection intelligent
    water+injection workflow
    water+injection learning
    waterflooding intelligence
    waterflooding intelligent
    waterflooding machine learning
    waterflooding algorithm
    water+flooding intelligence
    water+flooding intelligent
    water+flooding workflow
    water+flooding control
    water+flooding algorithm
    water+flooding SVM
    water+flooding loop
"
# parse the block into a one-column data frame: every newline-separated
# line becomes a row of column `keyword`, trimmed of surrounding blanks
kw_tbl <- read.table(
    text = keywords,
    sep = "\n",
    header = FALSE,
    col.names = "keyword",
    strip.white = TRUE,
    stringsAsFactors = FALSE
)
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)

New search using apostrophe and plus sign

There is no change either.

library(petro.One)

# search terms, one per line; phrases are wrapped in apostrophes and their
# words joined by "+"
keywords <- "    
    'water+injection' machine-learning
    'water+injection' intelligence
    'water+injection' intelligent
    'water+injection' workflow
    water+injection learning
    waterflooding intelligence
    waterflooding intelligent
    waterflooding 'machine+learning'
    waterflooding algorithm
    'water+flooding' intelligence
    'water+flooding' intelligent
    'water+flooding' workflow
    'water+flooding' control
    'water+flooding' algorithm
    'water+flooding' SVM
    'water+flooding' loop
"
# convert the text to a one-column data frame, one row per line.
# quote = "" switches off read.table()'s default quote handling ("\"'");
# with the default, the apostrophes are treated as quoting characters and
# removed, so the quoted phrases would never reach the search query.
kw_tbl <- read.table(text = keywords, header = FALSE, sep = "\n",
                     quote = "",
                     stringsAsFactors = FALSE, strip.white = TRUE,
                     col.names = "keyword")
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)
library(petro.One)

# search terms, one per line: several spellings of the same composite query
# (escaped apostrophes, backslashes, plain words, "+" joins, +AND+)
keywords <- "    
    \\'water+injection\\'+AND+\\'machine-learning\\'
     \\water injection\\+AND+machine-learning
     water injection machine-learning
     water+injection machine-learning
    'water+injection'+machine-learning
    'water+injection'+AND+'machine-learning'
"
# convert the text to a one-column data frame, one row per line.
# quote = "" switches off read.table()'s default quote handling ("\"'");
# with the default, the apostrophes are treated as quoting characters and
# removed, so the variants being compared would not be passed on as written.
kw_tbl <- read.table(text = keywords, header = FALSE, sep = "\n",
                     quote = "",
                     stringsAsFactors = FALSE, strip.white = TRUE,
                     col.names = "keyword")
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)
# the correct form for composite keywords: words of a phrase joined by "+",
# each phrase wrapped in apostrophes, phrases linked with +AND+
url_all <- make_search_url(
    query = "'water+injection'+AND+'machine-learning'",
    how = "all"
)
url_all
get_papers_count(url_all)
# count noted by the author: 143
library(petro.One)

# search terms, one per line: the same composite query written with escaped
# apostrophes, double quotes and plain apostrophes
keywords <- "   
    \\'water+injection\\'+AND+\\'machine-learning\\'
    \\'water+injection\\'+AND+\\'machine-learning\\'
    \"water+injection\"+AND+\"machine-learning\"
    'water+injection'+AND+'machine-learning'
"
# convert the text to a one-column data frame, one row per line.
# quote = "" switches off read.table()'s default quote handling ("\"'");
# with the default, both apostrophes and double quotes are treated as quoting
# characters and removed, so the variants could not be told apart.
kw_tbl <- read.table(text = keywords, header = FALSE, sep = "\n",
                     quote = "",
                     stringsAsFactors = FALSE, strip.white = TRUE,
                     col.names = "keyword")
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)

3.

library(petro.One)

# search terms, one per line: spelling variants of "optimization"
keywords <- "    
    water injection optimization
    water injection optimisation
    water injection optim
    waterflooding optimization
    waterflooding optimisation
    waterflooding optim
    water flooding optim
"
# parse the block into a one-column data frame: every newline-separated
# line becomes a row of column `keyword`, trimmed of surrounding blanks
kw_tbl <- read.table(
    text = keywords,
    sep = "\n",
    header = FALSE,
    col.names = "keyword",
    strip.white = TRUE,
    stringsAsFactors = FALSE
)
kw_tbl

# query OnePetro once per keyword and collect the paper counts
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # build the search URL for this keyword, then read its paper count
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    # store the observation and print a progress line
    rec[[i]] <- list(keyword = k, paper_count = count)
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# stack the per-keyword records into the final data table
data.table::rbindlist(rec)

This value is low because we are using "and", which is equivalent to "exact phrase" in OnePetro.

get papers data

library(petro.One)

# Download the paper listings for four query variants. The `rows` value of
# each query is hard-coded in this script.
url1 <- make_search_url(query = "water injection optimization",
                          how = "all",
                          rows = 129)
df1 <- onepetro_page_to_dataframe(url1)

url2 <- make_search_url(query = "water injection optimisation",
                          how = "all",
                          rows = 53)
df2 <- onepetro_page_to_dataframe(url2)

url3 <- make_search_url(query = "waterflooding optimization",
                          how = "all",
                          rows = 186)
df3 <- onepetro_page_to_dataframe(url3)

url4 <- make_search_url(query = "waterflooding optimisation",
                          how = "all",
                          rows = 75)
df4 <- onepetro_page_to_dataframe(url4)

# stack the results: the two "water injection" sets, then all four sets
df12 <- rbind(df1, df2)
df14 <- rbind(df1, df2, df3, df4)
df14

# write.csv() always writes the header row; it ignores any `col.names`
# argument and emits a warning, so the argument is not passed
write.csv(df14, file = "../notebooks/wi_wf.csv")
getwd()
library(petro.One)

# search terms, one per line
keywords <- "    
    ''water flooding'' genetic
    water injection algorithm
    water injection prediction
    water injection data-driven
    waterflooding data-driven
    waterflooding algorithm
    waterflooding prediction
    water flooding algorithm
    water flooding intelligent
    water flooding control
    water flooding machine
    water flooding neural
"
# convert the text to a one-column data frame, one row per line.
# quote = "" switches off read.table()'s default quote handling ("\"'");
# with the default, the apostrophes in the first keyword are treated as
# quoting characters and removed before the keyword is used.
kw_tbl <- read.table(text = keywords, header = FALSE, sep = "\n",
                     quote = "",
                     stringsAsFactors = FALSE, strip.white = TRUE,
                     col.names = "keyword")
kw_tbl

# for every keyword: read the paper count and, when there are papers,
# download them and append them to the cumulative `papers` data frame
papers <- data.frame()
rec <- list()
i <- 1
for (k in kw_tbl[["keyword"]]) {
    # search URL and paper count for this keyword
    url_all <- make_search_url(query = k, how = "all")
    count <- get_papers_count(url_all)
    rec[[i]] <- list(keyword = k, paper_count = count)
    if (count > 0) {
        # request as many rows as there are papers, then accumulate them
        url_gt0 <- make_search_url(query = k, how = "all", rows = count)
        df <- onepetro_page_to_dataframe(url_gt0)
        papers <- rbind(papers, df)
    }
    # progress line
    cat(sprintf("%30s %5d \n", k, count))
    i <- i + 1
    # pause between requests so the OnePetro website is not bothered too much
    Sys.sleep(5)
}
# table of keyword / paper_count records
dt <- data.table::rbindlist(rec)
dt

# keep only the keywords that returned at least one paper
dt[dt$paper_count > 0, ]
papers
library(petro.One)
# how = "any": papers containing either word, "neural" or "network"
url_any <- make_search_url(
    query = "neural network",
    how = "any"
)
url_any
get_papers_count(url_any)

# how = "all": papers containing "neural" and "network" at the same time
url_all <- make_search_url(
    query = "neural network",
    how = "all"
)
url_all
get_papers_count(url_all)

# how = "all" with two quoted phrases: 'neural+network' and 'water+injection'
url_all <- make_search_url(
    query = "'neural+network'+'water+injection'",
    how = "all"
)
url_all
get_papers_count(url_all)

Combining two major keywords neural network and water injection

We have to use the plus sign and the apostrophe to build a search query from major keywords that contain spaces.

# two quoted phrases joined by "+"; words inside each phrase joined by "+"
url_all <- make_search_url(
    query = "'neural+network'+'water+injection'",
    how = "all"
)
url_all
get_papers_count(url_all)
# count noted by the author: 319

This is the same as searching:

# same two quoted phrases, this time linked explicitly with +AND+
url_all <- make_search_url(
    query = "'neural+network'+AND+'water+injection'",
    how = "all"
)
url_all
get_papers_count(url_all)
# count noted by the author: 319

What happens if we remove the + sign?

# same two quoted phrases, but with spaces instead of "+" inside each phrase
url_all <- make_search_url(
    query = "'neural network'+'water injection'",
    how = "all"
)
url_all
get_papers_count(url_all)
# count noted by the author: 12205

The result is not what we expect. What the search returns is an OR search of "neural", "network", "water", and "injection", equivalent to this search:

# the four words searched with how = "any"
url_all <- make_search_url(
    query = "neural network water injection",
    how = "any"
)
url_all
get_papers_count(url_all)
# count noted by the author: 12205

Build composite queries

# three groups of keywords; expand.grid() builds every combination of them
major  <- c("water injection", "water flooding")
minor  <- c("machine-learning", "intelligent")
lesser <- c("neural network", "SVM", "genetic", "algorithm")
df <- expand.grid(
    major,
    minor,
    lesser,
    stringsAsFactors = FALSE
)
df
# two composite keywords: combine the first two columns of each row
for (i in seq_len(nrow(df))) {
    # split each keyword at blanks and re-join its words with "+"
    s1 <- strsplit(df[i, "Var1"], " ")[[1]]
    s2 <- strsplit(df[i, "Var2"], " ")[[1]]
    s1plus <- paste(s1, collapse = "+")
    s2plus <- paste(s2, collapse = "+")
    # wrap both keywords in apostrophes and link them with +AND+
    sf <- sprintf("'%s'+AND+'%s'", s1plus, s2plus)
    url <- make_search_url(sf, how = "all")
    # print row number, search string and paper count
    cat(i, sf, get_papers_count(url), "\n")
}
# for three composite keywords: combine all three columns of each row
for (i in seq_len(nrow(df))) {
    # split each keyword at blanks and re-join its words with "+"
    s1 <- unlist(strsplit(df[i, "Var1"], " "))
    s2 <- unlist(strsplit(df[i, "Var2"], " "))
    s3 <- unlist(strsplit(df[i, "Var3"], " "))
    s1plus <- paste(s1, collapse = "+")
    s2plus <- paste(s2, collapse = "+")
    s3plus <- paste(s3, collapse = "+")

    # wrap every keyword in apostrophes and link them with +AND+
    sf <- paste0("'", s1plus, "'+AND+'", s2plus, "'+AND+'", s3plus, "'")
    url <- make_search_url(sf, how = "all")
    cat(i, sf, get_papers_count(url), "\n")
    # pause between requests; R has no sleep() function, Sys.sleep() is the
    # base R call
    Sys.sleep(3)
}

algorithm that joins multiple composite keywords

# works for "n" columns
bool_op <- "AND"
# separator placed between two quoted keywords: closing apostrophe, the
# operator surrounded by plus signs, opening apostrophe -> "'+AND+'".
# Without the plus signs the operator is glued to the keywords ('a'AND'b')
# instead of the 'a'+AND+'b' form used by the other queries in this script.
sep     <- paste0("'+", bool_op, "+'")
for (i in seq_len(nrow(df))) {
    # split every keyword of the row at blanks and re-join its words with "+"
    plus_terms <- vapply(seq_len(ncol(df)), function(j) {
        paste(unlist(strsplit(df[i, j], " ")), collapse = "+")
    }, character(1))
    # join the keywords with the separator and add the outer apostrophes
    sf <- paste0("'", paste(plus_terms, collapse = sep), "'")
    # cat(i, sf, "\n")
    url <- make_search_url(sf, how = "all")
    cat(sprintf("%3d %60s %5d \n", i, sf, get_papers_count(url)))
    Sys.sleep(3)
}

Function using algorithm that joins multiple composite keywords

library(petro.One)

# keyword groups to be combined; expand.grid() builds every combination
major  <- c("water injection", "water flooding")
minor  <- c("machine-learning", "intelligent")
lesser <- c("neural network", "SVM", "genetic", "algorithm")
df <- expand.grid(
    major,
    minor,
    lesser,
    stringsAsFactors = FALSE
)
df


#' Count papers for every combination of keyword groups
#'
#' Builds all combinations of the supplied keyword vectors with
#' `expand.grid()`, turns each combination into one search string (words
#' inside a keyword joined by "+", every keyword wrapped in apostrophes,
#' keywords linked by `+<bool_op>+`), and reads the paper count of each
#' search string. One progress line is printed per combination.
#'
#' @param ...     input character vectors
#' @param bool_op boolean operator. It can be AND or OR
#' @param sleep   seconds to pause after each request (default 3)
#' @return invisibly, the data frame of combinations with the columns
#'   `paper_count`, `sf` (the search string) and `url` appended. When there
#'   are no combinations, the empty combinations data frame is returned
#'   without the extra columns.
join_keywords <- function(..., bool_op = "AND", sleep = 3) {
    # works for "n" columns
    df <- expand.grid(..., stringsAsFactors = FALSE)   # combine keywords
    n_rows <- nrow(df)
    if (n_rows == 0L) {
        # nothing to query; avoids the 1:0 loop that would index row 1 and 0
        return(invisible(df))
    }
    # separator placed between two quoted keywords: "'+AND+'" for AND.
    # The plus signs are required to get the 'a'+AND+'b' query form.
    sep <- paste0("'+", bool_op, "+'")
    rec <- vector("list", n_rows)
    # iterate through the rows of keyword combinations dataframe
    for (i in seq_len(n_rows)) {
        # split every keyword of the row at blanks, re-join words with "+"
        plus_terms <- vapply(seq_len(ncol(df)), function(j) {
            paste(unlist(strsplit(df[i, j], " ")), collapse = "+")
        }, character(1))
        # join the keywords with the separator, add the outer apostrophes
        sf <- paste0("'", paste(plus_terms, collapse = sep), "'")
        search_url <- make_search_url(sf, how = "all")
        paper_count <- get_papers_count(search_url)
        cat(sprintf("%3d %60s %5d \n", i, sf, paper_count))

        # build a record of results
        rec[[i]] <- list(paper_count = paper_count, sf = sf, url = search_url)
        Sys.sleep(sleep)                    # give OnePetro a break
    }
    rec.df <- data.table::rbindlist(rec)    # convert list to dataframe
    df <- cbind(df, rec.df)                 # join the results
    invisible(df)
}

#paper_count_from_vectors <- function() {}

# keyword groups to be combined
major  <- c("water injection", "water flooding")
minor  <- c("machine-learning", "intelligent")
lesser <- c("neural network", "SVM", "genetic", "algorithm")

# alternative calls kept for reference:
# join_keywords(major, minor, lesser)
# p.df <- join_keywords(major, minor)

# combine the three groups and keep the returned table
p.df <- join_keywords(
    major,
    minor,
    lesser
)
p.df
#' Combine keyword vectors and re-read the paper count of every query
#'
#' Calls `join_keywords()` on the supplied vectors, then requests the paper
#' count again for each URL in the returned table and stores it in the
#' `paper_count` column.
#'
#' @param ...     character vectors, passed on to `join_keywords()`
#' @param bool_op boolean operator, forwarded to `join_keywords()`
#' @param sleep   seconds to pause after each count request
#' @return the table returned by `join_keywords()` with `paper_count`
#'   overwritten by the re-read counts
paper_count_from_vectors <- function(..., bool_op = "AND", sleep = 3) {
    # forward bool_op; it used to be accepted here but never passed on, so
    # the queries were always built with the default operator
    joined <- join_keywords(..., bool_op = bool_op)
    urls <- joined$url
    for (i in seq_along(urls)) {
        joined$paper_count[[i]] <- get_papers_count(urls[[i]])
        Sys.sleep(sleep)
    }
    joined
}


# four keyword groups to be combined
major   <- c("water injection", "water flooding")
minor   <- c("machine-learning", "intelligent")
lesser  <- c("neural network", "SVM", "genetic")
another <- "algorithm"


# combine all four groups with the AND operator
paper_count_from_vectors(
    major,
    minor,
    lesser,
    another,
    bool_op = "AND"
)
# the correct form for composite keywords: words of a phrase joined by "+",
# each phrase wrapped in apostrophes, phrases linked with +AND+
url_all <- make_search_url(
    query = "'water+injection'+AND+'machine-learning'",
    how = "all"
)
url_all
get_papers_count(url_all)
# count noted by the author: 143
    # search "neural network" with how = "all" and dc_type = "standard"
    my_url <- make_search_url(
        query = "neural network",
        how = "all",
        dc_type = "standard"
    )
    # print(get_papers_count(my_url))
    # fetch the results with read_multipage() and show them
    df <- read_multipage(my_url)
    df
    # petro.One:::expect_equal_scale(nrow(df), 0, tolerance_pct = 0.01)
# Expectation helper: compares `object` with `expected` through
# testthat::expect_equal(), using `tolerance_pct` as the tolerance and
# `object` itself as the scale.
#
# object        value under test
# expected      reference value
# ...           accepted but not used
# tolerance_pct tolerance handed to expect_equal()
#
# `tolerance` and `scale` are passed by name; they were previously passed
# positionally and only reached the comparison through the `...` of
# expect_equal(), which depends on the argument order of the functions it
# calls.
# NOTE(review): `scale` is `object`, so an `object` of 0 gives a zero
# scale - confirm the comparison accepts that when the values differ.
expect_equal_scale2 <- function (object, expected, ..., tolerance_pct)
{
    testthat::expect_equal(
        object, expected,
        tolerance = tolerance_pct,
        scale = object,
        check.attributes = TRUE
    )
}

# expect the row count of `df` to equal 0, with tolerance_pct = 0.01
expect_equal_scale2(nrow(df), 0, tolerance_pct = 0.01)


f0nzie/petro.One documentation built on May 29, 2019, 12:05 a.m.