R/getData_SingleApp.R

Defines functions getData_SingleApp

Documented in getData_SingleApp

#' Scrape a Google Play Store app page
#'
#' Reads the page at `app_url`, builds one XPath query per entry of
#' `tag_list`, and returns the text of the matching nodes as a named list.
#'
#' @param app_url A url to an app on the Google Play Store. It is handed
#'   unchanged to `read_html()`.
#' @param tag_list A list of tag descriptions, one per field to extract. Each
#'   element is itself a list with the fields `name` (the name of the entry in
#'   the result), `html_tag`, `html_attribute`, and `html_attribute_value`. A
#'   description matches every `html_tag` node whose `html_attribute` contains
#'   `html_attribute_value`.
#' @import rvest
#' @export
#' @return A named list of the data from the Google Play Store page for the
#'   app, with one character entry per tag name. Two names are treated
#'   specially: a `developer_and_genre` entry holding exactly two values is
#'   replaced by `developer` and `genre`, and an `updated_etc` entry that does
#'   not hold exactly one value is replaced by `updated`, `size`, `installs`,
#'   `current_version`, `requires_Android`, and `developer_email`. Entries
#'   without any matching node are `NA_character_`.
#' @examples
#' \dontrun{
#' # Needs network access. The tag description below only illustrates the
#' # expected structure; the attribute values that actually match depend on
#' # the current markup of the Google Play Store.
#' tag_list <- list(
#'   list(
#'     name = "title",
#'     html_tag = "h1",
#'     html_attribute = "class",
#'     html_attribute_value = "title"
#'   )
#' )
#' getData_SingleApp(
#'   "https://play.google.com/store/apps/details?id=com.dinaga.photosecret",
#'   tag_list
#' )
#' getData_SingleApp(
#'   "https://play.google.com/store/apps/details?id=com.jamezuki.hidden",
#'   tag_list
#' )
#' }

getData_SingleApp <- function(app_url, tag_list) {
  html <- xml2::read_html(app_url)
  body <- rvest::html_nodes(html, "body")

  # Build one XPath expression per tag description.
  search_text <- lapply(tag_list, function(tag) {
    paste0(
      "//", tag$html_tag,
      "[contains(@", tag$html_attribute, ",'", tag$html_attribute_value, "')]"
    )
  })

  # Collect the text of every node matched by each expression. Only the first
  # expression is used if a description expands to several.
  values <- lapply(search_text, function(xpath) {
    rvest::html_text(xml2::xml_find_all(body, xpath[[1]]))
  })

  # Name each result after its tag.
  names(values) <- lapply(tag_list, function(tag) tag$name)

  # NOTE: When this function was written, Google Play Store used the same
  # html_tag, html_attribute, and html_attribute_value for multiple fields. In
  # that case the values of all such fields show up as one vector under a
  # single tag name. The code below splits those vectors into individual,
  # individually named entries.
  #
  # `[[ ]]` is used instead of `$` so that only exactly named entries are
  # split; `$` would also pick up an entry whose name merely starts with
  # "developer_and_genre" or "updated_etc".

  developer_and_genre <- values[["developer_and_genre"]]
  if (length(developer_and_genre) == 2) {
    values[["developer"]] <- developer_and_genre[1]
    values[["genre"]] <- developer_and_genre[2]
    values[["developer_and_genre"]] <- NULL
  }

  updated_etc <- values[["updated_etc"]]
  if (length(updated_etc) != 1) {
    # Position of each field inside the combined vector. A position beyond
    # the end of the vector yields NA.
    field_positions <- c(
      updated = 1L,
      size = 3L,
      installs = 5L,
      current_version = 7L,
      requires_Android = 9L,
      developer_email = 19L
    )
    for (field in names(field_positions)) {
      values[[field]] <- updated_etc[field_positions[[field]]]
    }
    values[["updated_etc"]] <- NULL
  }

  # NOTE: Not all apps have entries for all tags. For example, if the app
  # doesn't have enough ratings, the ratings field doesn't show up on the
  # Google Play Store app page. Empty values are replaced with NA so that no
  # error occurs when the getData function tries to put the data in a data
  # frame.
  lapply(values, function(value) {
    if (length(value) == 0) NA_character_ else value
  })
}
stephaniereinders/RPlaystoreScraping documentation built on June 23, 2020, 11:04 a.m.