# R/scrapescholar.R
#
# Defines functions: scrapescholar
#
# Documented in: scrapescholar

#' Scrape up to 110 entries from Google Scholar.
#'
#' Requests the result pages at offsets 0, 10, ..., 100 of a Google Scholar
#' search for \code{searchterm} and collects, for every hit, the title, the
#' year, up to four authors, the excerpt and the link. Paging stops early as
#' soon as a page contains no results.
#'
#' Besides returning the results, the function keeps its two historical side
#' effects: the data frame is assigned to a variable named \code{output}
#' outside the function (via \code{<<-}) and is written to
#' \code{scrapescholar_output.csv} in the current working directory.
#'
#' @param searchterm A search term (a single character string). Its words are
#'   URL-encoded and joined with \code{+} to form the query.
#' @return Invisibly, a data frame with the columns \code{title}, \code{year},
#'   \code{author1}, \code{author2}, \code{author3}, \code{author4},
#'   \code{excerpt} and \code{link}; one row per scraped result.
#' @examples
#' \dontrun{
#' scrapescholar("Privatization in England")
#' }
scrapescholar <- function(searchterm) {
  searchterm <- as.character(searchterm)
  if (length(searchterm) == 0L || is.na(searchterm[1L])) {
    stop("`searchterm` must be a non-missing character string.", call. = FALSE)
  }
  if (length(searchterm) > 1L) {
    warning(
      "`searchterm` has length > 1; only the first element is used.",
      call. = FALSE
    )
    searchterm <- searchterm[1L]
  }

  query <- scholar_query(searchterm)

  # Offsets of the result pages to request (11 pages of 10 hits each).
  offsets <- seq(0, 100, by = 10)
  pages <- vector("list", length(offsets))

  for (k in seq_along(offsets)) {
    # Build the URL from scratch for every page instead of regex-patching the
    # previous one.
    url <- paste0(
      "https://scholar.google.co.uk/scholar?start=", offsets[k],
      "&q=", query, "&hl=en&as_sdt=0,5&as_vis=1"
    )
    page <- rvest::read_html(url)
    pages[[k]] <- scholar_parse_page(page)

    if (nrow(pages[[k]]) == 0L) {
      message("No results at offset ", offsets[k], "; stopping.")
      break
    }
    message(
      "Scraped ", nrow(pages[[k]]), " results at offset ", offsets[k], "."
    )
  }

  # Pages after an early stop were never filled in; drop them before binding.
  pages <- Filter(Negate(is.null), pages)
  results <- do.call(rbind, pages)
  rownames(results) <- NULL

  # Legacy side effects kept for backward compatibility: earlier versions
  # exposed the result only through a variable named `output` outside the
  # function and through this CSV file. Prefer using the return value.
  output <<- results
  write.csv(results, "scrapescholar_output.csv")

  invisible(results)
}

# Turn a free-text search term into the value of the `q=` URL parameter:
# split on whitespace, percent-encode every word, join the words with "+".
scholar_query <- function(searchterm) {
  words <- strsplit(trimws(searchterm), "[[:space:]]+")[[1L]]
  words <- words[nzchar(words)]
  encoded <- vapply(
    words,
    function(word) {
      # repeated = TRUE: always encode, even if the word already contains
      # something that looks like a %XX escape.
      utils::URLencode(enc2utf8(word), reserved = TRUE, repeated = TRUE)
    },
    character(1),
    USE.NAMES = FALSE
  )
  paste(encoded, collapse = "+")
}

# Extract one row per search hit from a parsed Google Scholar result page.
# Returns a data frame with the columns title, year, author1..author4,
# excerpt and link (zero rows when the page has no `.gs_rt` nodes).
scholar_parse_page <- function(page) {
  titles <- rvest::html_nodes(page, ".gs_rt")

  # Look the other fields up relative to each title so that every column has
  # exactly one value per hit. html_node() (singular) yields one match per
  # input node and a missing node (-> NA) when nothing matches, so a hit
  # without excerpt or link no longer shifts or breaks the columns.
  # NOTE(review): assumes the byline (.gs_a) and the excerpt (.gs_rs) live
  # under the same parent element as the title (.gs_rt) -- confirm against
  # the current Scholar markup.
  entries <- rvest::html_node(titles, xpath = "..")

  # The `.gs_a` line holds the authors and, after a "-", the rest of the
  # citation including the year.
  byline <- rvest::html_text(rvest::html_node(entries, ".gs_a"))

  # Everything before the first "-" is taken as the author list.
  # NOTE(review): a hyphen inside an author name would cut the list short
  # here; the split rule is unchanged from the previous implementation.
  author_part <- vapply(
    stringr::str_split(byline, "-", n = 2),
    function(parts) parts[1L],
    character(1)
  )
  author_pieces <- stringr::str_split(author_part, ",")
  # k-th author of every hit (NA when a hit has fewer than k authors),
  # with surrounding whitespace removed.
  nth_author <- function(k) {
    stringr::str_trim(
      vapply(author_pieces, function(piece) piece[k], character(1))
    )
  }

  data.frame(
    title = rvest::html_text(titles),
    # First run of four digits in the byline.
    year = stringr::str_extract(byline, "[0-9]{4}"),
    author1 = nth_author(1L),
    author2 = nth_author(2L),
    author3 = nth_author(3L),
    author4 = nth_author(4L),
    excerpt = rvest::html_text(rvest::html_node(entries, ".gs_rs")),
    link = rvest::html_attr(rvest::html_node(titles, "a"), "href"),
    stringsAsFactors = FALSE
  )
}
# lubospernis/litreviewbuddy documentation built on May 28, 2019, 8:40 a.m.