# This is the chunck indicates all of the different packages used in our Rmd library("leaflet") library("geojsonio") library("readr") library("data.table") library("purrr") library("geonames") library("tidyverse")
We decided to work on a wine review dataset that is available on Kaggle. This dataset contains information that can be found on the website "Wine Enthusiast". We created an interactive map that makes it easy to navigate through the different countries of the world. Indeed, we did not find any app or website that gave a clear overview of wine reviews throughout the world. With our map, one is able to see the average quality, the most common type and the number of wines that were reviewed for each country. Moreover, on the right side of the dashboard, one can find the "Wine data explorer". There, you can change the variety of the wine as well as the country, to see the relation between quality and price for the selected variety/varieties and country/countries. There is also an option on the dashboard to explore the wine review data. This allows you to search for a specific wine and access all important information, such as the price or the winery. One can also vary the price range in order to find wines that match one's own willingness to pay.
The original dataset, retrieved from Kaggle, contains the following 14
variables:
- X: Integer variable, indicates the observation number.
- Country: Character variable, shows which country the wine comes from.
- Description: Character variable, gives information about the wine.
- Designation: Character variable, indicates the vineyard of the winery.
- Points: Integer variable, contains the number of points that were given
to a specific wine. It varies from 1 to 100, where 100 indicates the best
possible wine and 1 the worst one.
- Price: Numerical variable, indicates the price (USD) of the bottle of wine.
- Province: Character variable, shows which province/state the wine comes
from.
- Region 1: Character variable, indicates the area the wine is from.
- Region 2: Character variable, indicates more specific information about the
region where the grapes grow.
- Taster Name: Character variable, indicates the name of the taster who
posted the review on the website Wine Enthusiast.
- Taster Twitter Handle: Character variable, includes information about
the Twitter account of the wine taster.
- Title: Character variable, contains information about the wine such as its
name, type and year.
- Variety: Character variable, indicates the type of grapes used for the wine
(e.g. "Pinot Gris").
- Winery: Character variable, indicates the winery.
This dataset contains 129971 observations.
We then deleted the unnecessary variables, i.e. the X, the Taster Name, and the Taster Twitter Handle. Moreover, we deleted the rows with missing information about the price, the country, the variety or the number of points. We also renamed two countries, "US" to "United States of America" and "England" to "United Kingdom", so that they would match the names in our second database (i.e. countries).
# Read the raw wine review data
winemag <- read_csv("/Users/Vanessa/Desktop/winemag-data-130k-v2.csv")

# Drop the unnecessary columns (positions 1, 10 and 11)
winemag <- winemag[, -c(1, 10, 11)]

# Keep only the rows that have a price (column 5), points (column 4),
# a country (column 1) and a variety (column 10)
winemag <- winemag[complete.cases(winemag[, c(5, 4, 1, 10)]), ]

# Rename countries so that they match the second dataset (countries)
winemag$country[winemag$country == "US"] <- "United States of America"
winemag$country[winemag$country == "England"] <- "United Kingdom"
Thus, this new database contains 11 variables and 101400 observations.
Lastly, we saved the database as an RDA file so that people can easily retrieve it when using our package.
# Store the cleaned `winemag` data as package data (an .rda file),
# replacing any previously saved copy.
usethis::use_data(winemag, overwrite = TRUE)
In order to get information about the countries' coordinates in the world, we used the geocountries function from the Geodata package. This allowed us to create our world map.
# Country polygons (data from the Geodata package), read from GeoJSON
# with what = "sp"
countries <- geojsonio::geojson_read(
  x = "/Users/Vanessa/Desktop/countries.geojson",
  what = "sp"
)
In this part, we first created a variable containing the average number of points, i.e. the quality of the wines, for each country. We then joined it to the respective countries in the database "countries". Following that, we determined the unique countries and varieties in our dataset. Finally, we created new labels containing the most popular wine and the total number of wines for each country. The results are displayed on the map with different colors: the darkest colors indicate either the countries with the highest average quality or those with the highest number of wines reviewed, depending on the label chosen in the shiny app.
# Load the wine reviews (`winemag`) and the country polygons (`countries`)
load("../data/winemag.rda")
data <- winemag
load("../data/countries.rda")

# Average wine quality (mean of `points`) per country
avg.quality <- data %>%
  group_by(country) %>%
  summarise(avg.quality = mean(points))

# One row per distinct country name in the polygon data. Building the frame
# directly from `unique(countries$ADMIN)` avoids hard-coding the number of
# countries, which would fail as soon as the polygon file changes.
df <- data.frame(
  country = as.character(unique(countries$ADMIN)),
  stringsAsFactors = FALSE
)

# Attach the average quality; countries without reviews get NA
df <- df %>%
  left_join(avg.quality, by = c("country" = "country"))

# Unique countries/varieties that are present in the wine data
unique.countries <- unique(data$country)
unique.varieties <- unique(data$variety)
# Create a label based on the most popular wine ------------------------------

# Number of reviews per (country, variety) pair
common_wines <- data %>%
  group_by(country, variety) %>%
  count()

# For every country keep the variety with the most reviews (first one on
# ties). This replaces a loop over a hard-coded 42 countries whose guard,
# `if (is.na(a) == FALSE)`, tested a whole data frame: a length > 1 `if`
# condition, which is an error in R >= 4.2.
# Columns keep the names V1 (country), V2 (variety) and V3 (review count) so
# that the columns joined onto `df` below are named as before.
wine.labels <- common_wines %>%
  group_by(country) %>%
  slice(which.max(n)) %>%
  ungroup() %>%
  rename(V1 = country, V2 = variety, V3 = n) %>%
  as.data.frame()

df <- df %>%
  left_join(wine.labels, by = c("country" = "V1"))

# Total number of reviewed wines per country
n.wines <- data %>%
  count(country)

df <- df %>%
  left_join(n.wines, by = c("country" = "country"))

# Store the per-country summary alongside the polygons
countries@data$quality <- df

# Create colours for the graph ------------------------------------------------

# Based on average quality: quintile breaks, padded with fixed outer limits
bins <- quantile(
  countries@data$quality$avg.quality,
  probs = 0.2 * 1:5,
  na.rm = TRUE,
  names = FALSE
)
bins.avg <- c(80, round(bins, 1), 93)
# `bins.avg` is an explicit vector of breaks, so `domain` does not alter the
# binning; it is set to the values actually being mapped.
pal.avg <- colorBin(
  "YlOrRd",
  domain = countries@data$quality$avg.quality,
  bins = bins.avg
)

# Based on number of wines: quintile breaks, padded with fixed outer limits
bins <- quantile(
  countries@data$quality$n,
  probs = 0.2 * 1:5,
  na.rm = TRUE,
  names = FALSE
)
bins.n <- c(0, bins, 60000)
pal.n <- colorBin(
  "YlOrRd",
  domain = countries@data$quality$n,
  bins = bins.n
)

# Palette functions, pre-computed fill colours and legend titles, in matching
# order (the first two entries both use the average-quality palette)
pal.fun <- list(pal.avg, pal.avg, pal.n)
pal <- list(
  pal.avg(countries@data$quality$avg.quality),
  pal.avg(countries@data$quality$avg.quality),
  pal.n(countries@data$quality$n)
)
pal.name <- c(rep("Average wine quality", 2), "Amount of wines")

# Change the countries database into an RDA file
usethis::use_data(countries, overwrite = TRUE)
# Base map with Mapbox tiles.
# The access token is read from the MAPBOX_ACCESS_TOKEN environment variable
# (set it e.g. in ~/.Renviron). Sys.getenv() takes the *name* of a variable,
# not the token itself, and the token must never be committed to source.
m <- leaflet(countries) %>%
  setView(0, 0, 2) %>%
  addProviderTiles(
    "MapBox",
    options = providerTileOptions(
      id = "mapbox.light",
      accessToken = Sys.getenv("MAPBOX_ACCESS_TOKEN")
    )
  )
# One row per distinct country name in the polygon data, with its average
# wine quality (NA where no wine was reviewed). The frame is built directly
# from the country names rather than from a hard-coded 255-row matrix.
df <- data.frame(
  country = as.character(unique(countries$ADMIN)),
  stringsAsFactors = FALSE
)
df <- df %>%
  left_join(avg.quality, by = c("country" = "country"))
countries@data$quality <- df

# Fixed one-point bins from 84 to 92. Because explicit breaks are given,
# `domain` does not alter the binning; it is set to the mapped values.
bins <- seq(84, 92, by = 1)
pal <- colorBin("YlOrRd", domain = df$avg.quality, bins = bins)

# Hover labels: country name in bold, average quality underneath
labels <- sprintf(
  "<strong>%s</strong><br/>%g",
  df$country, df$avg.quality
) %>%
  lapply(htmltools::HTML)

m %>%
  addPolygons(
    fillColor = ~pal(df$avg.quality),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # Spelled out in full instead of relying on partial matching of
    # `highlight`
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = labels,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto"
    )
  ) %>%
  addLegend(
    pal = pal,
    # The legend describes the plotted averages; `~density` resolved to the
    # stats::density function because the data has no `density` column.
    values = df$avg.quality,
    opacity = 0.7,
    title = "Average wine quality",
    position = "bottomleft"
  )
This map shows the average wine quality of each country in our database. As explained above, the darker the color, the better the quality. One can see the exact average by hovering the mouse over each country. For instance, the country with the highest average wine quality is the UK, with an average of 91.55, and the lowest is Ukraine, with an average of 84.07.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.