#' Extract important data from Geyer et al. 2017
#'
#' Scrapes every table from the pdf at `location` with
#' `tabulizer::extract_tables()` and hands the raw result to
#' `total_plastic()`, `sectors_waste()`, `plastics_waste()` and
#' `sectors_plastic()`, in that order.
#'
#' @param location URL of pdf containing raw data. Defaults to the
#'   supplementary-material pdf that was previously hard-coded here.
#'
#' @return A named list with 4 items (`total_plastic`, `sectors_waste`,
#'   `plastics_waste`, `sectors_plastic`). Each item is whatever the helper
#'   of the same name returns (documented as a data frame).
#' @export
#'
#' @examples
#' \dontrun{
#' # Extract important data from Geyer et al. 2017
#' all_data <- extract_all_data()
#' }
extract_all_data <- function(
    location = "https://advances.sciencemag.org/content/advances/suppl/2017/07/17/3.7.e1700782.DC1/1700782_SM.pdf"
) {
  # Scrape data from location
  extracted_data <- tabulizer::extract_tables(location)

  # Build the result directly instead of through local variables that
  # shadow the helper functions they are produced by.
  list(
    # Total plastic production
    total_plastic = total_plastic(extracted_data),
    # Waste for each sector and plastic type
    sectors_waste = sectors_waste(extracted_data),
    plastics_waste = plastics_waste(extracted_data),
    # Proportion of each plastic type per sector
    sectors_plastic = sectors_plastic(extracted_data)
  )
}
###################################
#' Extract table with total plastic production over time
#'
#' Stacks the first two scraped tables, takes the column names from the
#' first row, drops the two rows of header text and converts the first two
#' columns to numeric. The result is also written to
#' `./inst/extdata/total_plastic.csv` (relative to the working directory;
#' the directory is not created by this function).
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe.
#' @export
total_plastic <- function(extracted_data) {
  # Data is across the first two list items.
  raw <- rbind(extracted_data[[1]], extracted_data[[2]])
  df <- as.data.frame(raw, stringsAsFactors = FALSE)

  # Give data frame proper column names: the first row holds the headers,
  # with carriage returns where the pdf wrapped them.
  header <- unlist(df[1, ], use.names = FALSE)
  colnames(df) <- gsub(pattern = "\r", replacement = " ", x = header)

  # Remove text in top rows. A negative index (rather than 3:nrow(df))
  # stays correct when the table has fewer than three rows.
  df <- df[-(1:2), , drop = FALSE]
  rownames(df) <- NULL

  # Make cols numeric
  df[, 1] <- as.numeric(df[, 1])
  df[, 2] <- as.numeric(df[, 2])

  utils::write.csv(
    x = df,
    file = "./inst/extdata/total_plastic.csv",
    row.names = FALSE
  )

  # Return the table itself; a bare return() would give NULL.
  df
}
####################################################
#' Extract table with 2015 plastic production and waste per sector.
#'
#' Reads the sixth scraped table, builds column names from its first and
#' third rows, drops the three header rows and the last row, replaces the
#' sector labels from the third remaining row onwards with
#' `standard_names$sectors[-1:-2]`, and converts columns 2 and 3 to numeric.
#' The result is also written to `./inst/extdata/sectors_waste.csv`
#' (relative to the working directory; the directory is not created here).
#'
#' `standard_names` is not defined in this function and must be available
#' in the enclosing scope.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe.
#' @export
sectors_waste <- function(extracted_data) {
  # Data is in the sixth item.
  df <- as.data.frame(extracted_data[[6]], stringsAsFactors = FALSE)

  # Add column headers: the first column is the sector label, the next two
  # combine the text in rows 1 and 3 of the raw table.
  colnames(df)[1] <- "sector"
  colnames(df)[2:3] <- paste(
    unlist(df[1, 2:3], use.names = FALSE),
    unlist(df[3, 2:3], use.names = FALSE)
  )

  # Remove unwanted rows (three header rows and the final row)
  df <- df[-c(1:3, nrow(df)), , drop = FALSE]

  # Give sectors standard names
  # N.B. Some of this needs to be done manually, because the order of the
  # names is not consistent: the first two rows keep their scraped labels.
  # seq_len()[-(1:2)] is empty for short tables, unlike 3:nrow(df).
  renamed_rows <- seq_len(nrow(df))[-(1:2)]
  df$sector[renamed_rows] <- standard_names$sectors[-1:-2]

  # Make cols numeric
  df[, 2] <- as.numeric(df[, 2])
  df[, 3] <- as.numeric(df[, 3])

  utils::write.csv(
    x = df,
    file = "./inst/extdata/sectors_waste.csv",
    row.names = FALSE
  )

  # Return the table itself; a bare return() would give NULL.
  df
}
##################################################
#' Extract table with 2015 plastic production and waste per plastic type.
#'
#' Reads the seventh scraped table, builds column names from its first and
#' third rows, drops rows 1-3 and 13-14, replaces the plastic labels with
#' `standard_names$plastics`, and converts columns 2 and 3 to numeric.
#' The result is also written to `./inst/extdata/plastics_waste.csv`
#' (relative to the working directory; the directory is not created here).
#'
#' `standard_names` is not defined in this function and must be available
#' in the enclosing scope.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe.
#' @export
plastics_waste <- function(extracted_data) {
  # Data is in the seventh item.
  df <- as.data.frame(extracted_data[[7]], stringsAsFactors = FALSE)

  # Add column headers: the first column is the plastic label, the next two
  # combine the text in rows 1 and 3 of the raw table.
  colnames(df)[1] <- "plastic"
  colnames(df)[2:3] <- paste(
    unlist(df[1, 2:3], use.names = FALSE),
    unlist(df[3, 2:3], use.names = FALSE)
  )

  # Remove unwanted rows (three header rows, plus rows 13 and 14)
  df <- df[-c(1:3, 13:14), , drop = FALSE]

  # Add standard plastic names
  df$plastic <- standard_names$plastics

  # Make cols numeric
  df[, 2] <- as.numeric(df[, 2])
  df[, 3] <- as.numeric(df[, 3])

  utils::write.csv(
    x = df,
    file = "./inst/extdata/plastics_waste.csv",
    row.names = FALSE
  )

  # Return the table itself; a bare return() would give NULL.
  df
}
#' Extract table with proportion of plastic resins used in each sector.
#'
#' Reads the third scraped table, treats empty cells as missing, keeps the
#' complete rows among rows 4-15 of columns 1-9, strips the `%` signs from
#' columns 2-9 and pairs the resulting values (divided by 100) with every
#' combination of `standard_names$sectors[-7]` and
#' `standard_names$plastics[-8]`. Values are read column by column, which
#' matches `expand.grid()` varying `sector` fastest.
#' The result is also written to `./inst/extdata/sectors_plastic.csv`
#' (relative to the working directory; the directory is not created here).
#'
#' `standard_names` is not defined in this function and must be available
#' in the enclosing scope.
#'
#' @param extracted_data Raw scraped data generated by package tabulizer.
#'
#' @return A dataframe with columns `sector`, `plastic` and `prop`.
#' @export
sectors_plastic <- function(extracted_data) {
  # Data is in the third item. Blank cells become NA so that
  # complete.cases() can drop partially empty rows. Done in base R because
  # dplyr::na_if() no longer accepts a data frame with a scalar `y` in
  # dplyr >= 1.1.0.
  raw <- as.matrix(extracted_data[[3]])
  raw[which(raw == "")] <- NA
  df <- as.data.frame(raw, stringsAsFactors = FALSE)

  # Extract quantitative data
  # Subset to just be quantitative columns
  df_data <- df[4:15, 1:9]
  df_data <- df_data[stats::complete.cases(df_data), , drop = FALSE]

  # Column 1 is the row label; the rest are percentages stored as text.
  cells <- as.vector(as.matrix(df_data[, -1, drop = FALSE]))
  percentages <- as.numeric(
    gsub(pattern = "%", replacement = "", x = cells, fixed = TRUE)
  )

  # Create a dataframe with all combinations of plastics and sectors
  # Exclude textiles PP&A fibres as these aren't included here
  # `plastics` is spelled out in full rather than relying on `$` partial
  # matching of `plastic`.
  output_df <- expand.grid(
    sector = standard_names$sectors[-7],
    plastic = standard_names$plastics[-8],
    stringsAsFactors = FALSE
  )

  # Guard against silent recycling when the scraped table and the standard
  # names disagree in size.
  if (length(percentages) != nrow(output_df)) {
    stop(
      "Extracted ", length(percentages), " percentage values but expected ",
      nrow(output_df), " sector/plastic combinations.",
      call. = FALSE
    )
  }

  # Add percentage data
  output_df$prop <- percentages / 100

  utils::write.csv(
    x = output_df,
    file = "./inst/extdata/sectors_plastic.csv",
    row.names = FALSE
  )

  # Return the table itself; a bare return() would give NULL.
  output_df
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.