# Stray page text captured when this script was extracted from the web,
# commented out so the file parses as valid R: "Nothing"
# constructor for WOS HTML wildcard searches
#
# Bundles one bibliographic field's scrape configuration into a named list
# consumed by scrapeHTML_WOS():
#   wildcard_PRE  - pattern that locates candidate HTML lines
#   wildcard_POST - capturing pattern; group 1 is the value to keep
#   scrubs        - optional pattern removed (replaced by " ") after extraction
#   collapse      - paste all matching lines into one string first?
#   aNumber       - coerce the extracted text to numeric? (stored as "isNumeric")
#   split         - optional pattern to re-split the pre-scrubbed text on
#   multiline     - experimental multi-line abstract recovery flag
#   skipline      - offset (in lines) from the PRE match to the value line
newScrape <- function(wildcard_PRE = "",
                      wildcard_POST = "",
                      scrubs = NA,
                      collapse = FALSE,
                      aNumber = FALSE,
                      split = NA,
                      multiline = FALSE,
                      skipline = 0) {
  config <- list(wildcard_PRE, wildcard_POST, scrubs, collapse,
                 aNumber, split, multiline, skipline)
  # note: the aNumber argument is exposed downstream under the name "isNumeric"
  names(config) <- c("wildcard_PRE", "wildcard_POST", "scrubs", "collapse",
                     "isNumeric", "split", "multiline", "skipline")
  config
}
# a list of WOS bibliographic items and their wildcard search and scrub patterns
# Each entry is a newScrape() configuration consumed by scrapeHTML_WOS():
# argument 1 (wildcard_PRE) locates candidate HTML lines, argument 2
# (wildcard_POST) captures the value as group 1, and skipline = 1 means the
# value sits on the line immediately after the PRE match. The commented-out
# entries are older patterns kept for reference as WOS page markup changed.
scrapeWOSList <- list(
# authors: collapse matching lines into one string, then split back apart.
# NOTE(review): splitting on the literal text "alt" presumably exploits an
# HTML alt-attribute between author links -- confirm against current markup
author = newScrape("Find more records by this author\"", ".*\\((.+?)\\).*", collapse = TRUE, split = "alt"),
# author = newScrape("field=AU&value=", ".*field=AU&value=(.+?)\".*"), # single author papers
# authors = newScrape("author_name=", ".*author_name=(.+?)&.*"), # multi-author papers
# year = newScrape("publication_year", ".*publication_year%253D(.+?)%2526.*", aNumber = TRUE),
# year: gsub-scrub removes every whitespace-terminated token, leaving only the
# trailing token of the date string (presumably the year -- verify format)
year = newScrape("Published:", "<value>(.+?)</value>", scrubs = "([^\\s]+\\s+)", skipline = 1),
# title = newScrape("title%253D", ".*title%253D(.+?)%2526.*", scrubs = "%252520"),
title = newScrape("aria-label=\"Main content", "<value>(.+?)</value>", skipline = 1),
# journal = newScrape("journal%253D", ".*journal%253D(.+?)%2526.*", scrubs = "%252520"),
# journal = newScrape("LinksService&jTitle=", ".*LinksService&jTitle=(.+?)&link_type.*"),
journal = newScrape("<source_title_txt_label>", "<value>(.+?)</value>", skipline = 1),
# volume = newScrape("%252Evolume%253D", ".*volume%253D(.+?)%2526.*"),
volume = newScrape("Volume:", "<value>(.+?)</value>", skipline = 1),
# pages = newScrape("%252Epages%253D", ".*pages%253D(.+?)%2526.*"),
pages = newScrape("Pages:", "<value>(.+?)</value>", skipline = 1),
# NOTE(review): the dots in "dx.doi.org" are unescaped regex metacharacters;
# they match any character, which is imprecise but harmless for this pattern
DOI = newScrape("dx.doi.org", ".*dx.doi.org/(.+?)&.*"),
abstract = newScrape("FR_field", ".*FR_field\">(.+?)</p>.*", scrubs = "<br>", collapse = TRUE), # multiline = TRUE
# reference/citation counts: the value is the bold number inside the link text
N_references = newScrape("<a title=.*View this record.*s bibliography", ".*<b>(.+?)</b>.*", aNumber = TRUE),
N_citations = newScrape("<a title=.*View all of the articles that cite this one", ".*<b>(.+?)</b>.*", aNumber = TRUE),
#journal_IF = newScrape("Impact_Factor_table", ".*<tr> <td>(.+?)</td> <td>.*", aNumber = TRUE),
#journal_IF_year = newScrape("Impact_Factor_table", ".*<tr> <th>(.+?)</th> <th>.*", aNumber = TRUE)
# impact factor and its year share one PRE pattern but different POST captures
journal_IF = newScrape("Impact_Factor_table", ".*<tr> <td> (.+?) </td>.*", aNumber = TRUE),
journal_IF_year = newScrape("Impact_Factor_table", ".*</td> <td>(.+?)</td>.*", aNumber = TRUE)
)
# scrapes WOS HTML to locate and scrub bibliographic data
#
# theHTMLdata   - character vector of HTML lines from a WOS record page
# wildcard_PRE  - pattern identifying candidate lines
# wildcard_POST - capturing pattern; group 1 is the extracted value
# scrubs        - optional pattern replaced with " " after extraction
# collapse      - paste all matching lines into one string before scrubbing?
# isNumeric     - coerce the result to numeric (0 when nothing was extracted)?
# split         - optional pattern used to re-split the pre-scrubbed text
# multiline     - experimental recovery of abstracts spanning several HTML
#                 lines (work in progress; see dated note below)
# skipline      - offset in lines from the PRE match to the line with the value
#
# returns a character vector, or a numeric value when isNumeric = TRUE
scrapeHTML_WOS <- function(theHTMLdata,
                           wildcard_PRE,
                           wildcard_POST,
                           scrubs = NA,
                           collapse = FALSE,
                           isNumeric = FALSE,
                           split = NA,
                           multiline = FALSE,
                           skipline = 0) {
  # PRE-scrub of HTML: unique lines sitting skipline below each PRE match
  preScrub <- unique(theHTMLdata[which(!is.na(str_extract(theHTMLdata, wildcard_PRE))) + skipline])
  if (isTRUE(collapse)) preScrub <- paste(preScrub, collapse = "")
  if (!is.na(split)) preScrub <- unlist(str_split(preScrub, split))
  # abstract only: work in progress: recover multiline abstracts with HTML line
  # breaks, must remove collapse = TRUE for this to work. Only tested with
  # Journal of Ecology. Does not work with Diversity and Distributions.
  # Note: 12/20/2016
  if (isTRUE(multiline)) {
    firstSentence <- word(preScrub, -1)
    firstSentence <- firstSentence[!is.na(str_extract(firstSentence, "\\."))]
    # FIX: the original indexed with a possibly empty or multi-element match
    # vector and looped with no upper bound, so a missing "</p>" terminator
    # walked past the end of theHTMLdata and pasted "NA" forever; it also
    # never checked the first line itself, over-appending one-line abstracts
    if (length(firstSentence) > 0) {
      theStart <- which(!is.na(str_extract(theHTMLdata, firstSentence[1])))[1]
      if (!is.na(theStart)) {
        theAbstract <- theHTMLdata[theStart]
        aLine <- theStart
        # append lines until one closes the paragraph (".</p>") or input ends
        while (is.na(str_extract(theHTMLdata[aLine], ".</p>")) &&
               aLine < length(theHTMLdata)) {
          aLine <- aLine + 1
          theAbstract <- paste(theAbstract, theHTMLdata[aLine])
        }
        preScrub <- theAbstract
      }
    }
  }
  # POST-scrub of HTML: keep only capture group 1 of the POST wildcard
  theText <- unique(gsub(wildcard_POST, "\\1", preScrub))
  # additional scrubbing
  if (!is.na(scrubs)) theText <- gsub(scrubs, " ", theText)
  if (isTRUE(isNumeric)) {
    theText <- as.numeric(theText)
    if (length(theText) == 0) theText <- 0  # nothing found: report zero
  }
  return(theText)
}
# rdrr.io footer text accidentally captured with the script, commented out so
# the file remains valid R:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.