#' Scrape the first 110 Google Scholar results for a search term.
#'
#' Requests eleven consecutive result pages (offsets 0, 10, ..., 100) from
#' scholar.google.co.uk and collects, for every result, its title, year, up to
#' four authors, excerpt and link.
#'
#' Besides returning the results, the function keeps two legacy side effects:
#' it writes them to \code{scrapescholar_output.csv} in the working directory
#' and assigns them to a global variable named \code{output}.
#'
#' @param searchterm A single, non-empty character string to search for.
#' @return (Invisibly) a data frame with one row per scraped result and the
#'   character columns \code{title}, \code{year}, \code{author1},
#'   \code{author2}, \code{author3}, \code{author4}, \code{excerpt} and
#'   \code{link}. Fields a result does not have are \code{NA}.
#' @examples
#' \dontrun{
#' scrapescholar('Privatization in England')
#' }
scrapescholar <- function(searchterm) {
  stopifnot(
    is.character(searchterm),
    length(searchterm) == 1L,
    !is.na(searchterm),
    nzchar(trimws(searchterm))
  )

  # Build the query string: percent-encode every word so that characters such
  # as "&", "#" or non-ASCII letters cannot corrupt the URL, then join the
  # words with "+". Splitting on runs of whitespace avoids empty words.
  words <- strsplit(trimws(enc2utf8(searchterm)), "[[:space:]]+")[[1]]
  query <- paste(
    vapply(words, utils::URLencode, character(1), reserved = TRUE,
           USE.NAMES = FALSE),
    collapse = "+"
  )

  offsets <- seq(0, 100, by = 10)
  pages <- vector("list", length(offsets))

  for (k in seq_along(offsets)) {
    # Assemble the URL directly instead of regex-patching the previous one.
    page_url <- paste0(
      "https://scholar.google.co.uk/scholar?start=", offsets[k],
      "&q=", query, "&hl=en&as_sdt=0,5&as_vis=1"
    )
    page <- rvest::read_html(page_url)

    # Look every field up relative to its own result so that a result without
    # an excerpt or link yields NA instead of shifting the other columns
    # (html_node() returns exactly one, possibly missing, match per input).
    # NOTE(review): assumes each result is wrapped in a ".gs_ri" element --
    # confirm against the current Scholar markup.
    results <- rvest::html_nodes(page, ".gs_ri")
    title <- rvest::html_text(rvest::html_node(results, ".gs_rt"))
    byline <- rvest::html_text(rvest::html_node(results, ".gs_a"))
    excerpt <- rvest::html_text(rvest::html_node(results, ".gs_rs"))
    link <- rvest::html_attr(rvest::html_node(results, ".gs_rt a"), "href")

    # The byline holds both authors and year; the year is the first run of
    # four digits in it.
    year <- stringr::str_extract(byline, "[0-9]{4}")

    # Everything before the first "-" of the byline is the author list, whose
    # entries are separated by commas.
    author_part <- vapply(
      stringr::str_split(byline, "-", n = 2),
      function(parts) parts[1],
      character(1)
    )
    author_list <- stringr::str_split(author_part, ",")
    # n-th author of every result, whitespace-trimmed; NA when a result has
    # fewer than n authors.
    nth_author <- function(n) {
      vapply(
        author_list,
        function(authors) stringr::str_trim(authors[n]),
        character(1)
      )
    }

    pages[[k]] <- data.frame(
      title = title,
      year = year,
      author1 = nth_author(1),
      author2 = nth_author(2),
      author3 = nth_author(3),
      author4 = nth_author(4),
      excerpt = excerpt,
      link = link,
      stringsAsFactors = FALSE
    )
    message("Scraped results starting at offset ", offsets[k])
  }

  # Bind once after the loop rather than growing a data frame per page.
  scraped <- do.call(rbind, pages)

  # NOTE(review): the global assignment and the CSV file are legacy side
  # effects kept only for backward compatibility; prefer the return value.
  output <<- scraped
  utils::write.csv(scraped, "scrapescholar_output.csv")

  invisible(scraped)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.