# R/extract_data.R

#' Extract important data from Geyer et al. 2017
#'
#' Scrapes all tables from the pdf with \code{tabulizer::extract_tables()} and
#' passes the raw result to \code{total_plastic()}, \code{sectors_waste()},
#' \code{plastics_waste()} and \code{sectors_plastic()}.
#'
#' @param location Path or URL of pdf containing raw data. Defaults to the
#'   supplementary material of Geyer et al. 2017.
#'
#' @return A named list with 4 items (\code{total_plastic},
#'   \code{sectors_waste}, \code{plastics_waste}, \code{sectors_plastic}).
#'   Each item holds the value returned by the extraction function of the
#'   same name.
#' @export
#'
#' @examples
#' \dontrun{
#' #Extract important data from Geyer et al. 2017
#' #Needs internet access, so it is not run automatically
#' all_data <- extract_all_data()
#' }

extract_all_data <- function(location = "https://advances.sciencemag.org/content/advances/suppl/2017/07/17/3.7.e1700782.DC1/1700782_SM.pdf"){

  #Scrape data from location
  extracted_data <- tabulizer::extract_tables(location)

  #Build the output directly so that no local variable shadows the
  #extraction function it was created with.
  return(list(
    #Get total plastic production
    total_plastic = total_plastic(extracted_data),
    #Get waste for each sector and plastic type
    sectors_waste = sectors_waste(extracted_data),
    plastics_waste = plastics_waste(extracted_data),
    #Get proportion of each plastic type per sector
    sectors_plastic = sectors_plastic(extracted_data)
  ))

}

###################################

#' Extract table with total plastic production over time
#'
#' Besides returning the table, the function writes it to
#' \code{./inst/extdata/total_plastic.csv} (relative to the working directory).
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'   The first two list items are row-bound, so they must have the same
#'   number of columns.
#'
#' @return A dataframe. The first two columns are numeric; column names are
#'   taken from the first scraped row.
#' @export

total_plastic <- function(extracted_data){

  #Data is across the first two list items.
  df <- as.data.frame(rbind(extracted_data[[1]], extracted_data[[2]]), stringsAsFactors = FALSE)

  #Give data frame proper column names
  #Header cells can contain carriage returns; replace them with spaces
  header <- unlist(df[1, ], use.names = FALSE)
  colnames(df) <- gsub(pattern = "\r", replacement = " ", x = header)

  #Remove text in top rows
  #Negative indexing stays correct when there are fewer than 3 rows
  #(3:nrow(df) would count backwards in that case)
  df <- df[-(1:2), , drop = FALSE]

  #Make cols numeric
  df[, 1] <- as.numeric(df[, 1])
  df[, 2] <- as.numeric(df[, 2])

  write.csv(x = df, file = "./inst/extdata/total_plastic.csv", row.names = FALSE)

  #Return the cleaned table (a bare return() yields NULL)
  return(df)

}

####################################################

#' Extract table with 2015 plastic production and waste per sector.
#'
#' Besides returning the table, the function writes it to
#' \code{./inst/extdata/sectors_waste.csv} (relative to the working directory).
#' Sector names are taken from \code{standard_names$sectors}, which is not
#' defined in this file.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe. The first column is \code{sector}; columns 2 and 3
#'   are numeric.
#' @export

sectors_waste <- function(extracted_data){

  #Data is in the sixth item.
  df <- as.data.frame(extracted_data[[6]], stringsAsFactors = FALSE)

  #Add column headers
  #Headers for cols 2 and 3 are built from the text in rows 1 and 3
  colnames(df)[1] <- "sector"
  colnames(df)[2:3] <- paste(unlist(df[1, 2:3], use.names = FALSE),
                             unlist(df[3, 2:3], use.names = FALSE))

  #Remove unwanted rows (three header rows and the last row)
  df <- df[c(-1:-3, -nrow(df)), ]

  #Give sectors standard names
  #N.B. Some of this needs to be done manually, because the order of the names is not consistent
  df$sector[3:nrow(df)] <- standard_names$sectors[-1:-2]

  #Make cols numeric
  df[, 2] <- as.numeric(df[, 2])
  df[, 3] <- as.numeric(df[, 3])

  write.csv(x = df, file = "./inst/extdata/sectors_waste.csv", row.names = FALSE)

  #Return the cleaned table (a bare return() yields NULL)
  return(df)

}

##################################################

#' Extract table with 2015 plastic production and waste per plastic type.
#'
#' Besides returning the table, the function writes it to
#' \code{./inst/extdata/plastics_waste.csv} (relative to the working
#' directory). Plastic names are taken from \code{standard_names$plastics},
#' which is not defined in this file.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe. The first column is \code{plastic}; columns 2 and 3
#'   are numeric.
#' @export

plastics_waste <- function(extracted_data){

  #Data is in the seventh item.
  df <- as.data.frame(extracted_data[[7]], stringsAsFactors = FALSE)

  #Add column headers
  #Headers for cols 2 and 3 are built from the text in rows 1 and 3
  colnames(df)[1] <- "plastic"
  colnames(df)[2:3] <- paste(unlist(df[1, 2:3], use.names = FALSE),
                             unlist(df[3, 2:3], use.names = FALSE))

  #Remove unwanted rows (three header rows plus rows 13 and 14)
  df <- df[c(-1:-3, -13:-14), ]

  #Add standard plastic names
  df$plastic <- standard_names$plastics

  #Make cols numeric
  df[, 2] <- as.numeric(df[, 2])
  df[, 3] <- as.numeric(df[, 3])

  write.csv(x = df, file = "./inst/extdata/plastics_waste.csv", row.names = FALSE)

  #Return the cleaned table (a bare return() yields NULL)
  return(df)

}

#' Extract table with proportion of plastic resins used in each sector.
#'
#' Besides returning the table, the function writes it to
#' \code{./inst/extdata/sectors_plastic.csv} (relative to the working
#' directory). Sector and plastic names are taken from \code{standard_names},
#' which is not defined in this file.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe with columns \code{sector}, \code{plastic} and
#'   \code{prop} (the scraped percentage divided by 100).
#' @export

sectors_plastic <- function(extracted_data){

  #Data is in the third item.
  df <- as.data.frame(extracted_data[[3]], stringsAsFactors = FALSE)

  #Treat empty cells as missing
  #Done in base R because dplyr::na_if() rejects a whole data frame
  #in dplyr >= 1.1.0
  df[] <- lapply(df, function(column){
    column[!is.na(column) & column == ""] <- NA
    column
  })

  #Extract quantitative data
  #Subset to just be quantitative columns
  df_data <- df[4:15, 1:9]
  df_data <- df_data[complete.cases(df_data), ]
  #Flatten column by column, so rows (sectors) vary fastest
  #This matches the ordering of expand.grid below
  percentages <- as.numeric(gsub(x = as.vector(as.matrix(df_data[, -1])), pattern = "%", replacement = ""))

  #Create a dataframe with all combinations of plastics and sectors
  #Exclude textiles PP&A fibres as these aren't included here
  #Use the full element name 'plastics' rather than relying on partial matching of '$plastic'
  output_df <- expand.grid(sector = standard_names$sectors[-7],
                           plastic = standard_names$plastics[-8], stringsAsFactors = FALSE)

  #Guard against silent recycling if the scraped table changes shape
  if(length(percentages) != nrow(output_df)){

    stop("Expected ", nrow(output_df), " percentage values but extracted ",
         length(percentages), ".", call. = FALSE)

  }

  #Add percentage data
  output_df$prop <- percentages/100

  write.csv(x = output_df, file = "./inst/extdata/sectors_plastic.csv", row.names = FALSE)

  #Return the table (a bare return() yields NULL)
  return(output_df)

}
# LiamDBailey/NatGeoDataViz documentation built on June 13, 2019, 8:36 a.m.