# load packages -----------------------------------------------------------------
# Each call sits on its own line: when the whole chunk is collapsed onto a
# single line, everything after the first `#` is treated as a comment and none
# of the packages are attached.
library(learnr)
library(gradethis)
library(tidyverse)
library(dsbox)

# set options for exercises and checking ---------------------------------------
gradethis_setup()

# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

# embed the image stored under images/ -----------------------------------------
knitr::include_graphics("images/madeleine-kohler-90Qn643Pq9c-unsplash.jpg")
Recent developments in Edinburgh regarding the growth of Airbnb and its impact on the housing market mean that a better understanding of Airbnb listings is needed. Using data provided by Airbnb, we can explore how Airbnb availability and prices vary by neighbourhood.
The data comes from the Kaggle database and was originally distributed by Inside Airbnb on 25 June 2019.
The data has been modified to better serve the goals of this exploration.
The goal of this tutorial is not to conduct a thorough analysis of Airbnb listings in Edinburgh, but instead to give you a chance to practice your data visualisation and interpretation skills.
We'll use the tidyverse
package for this analysis and the dsbox
package for the data. Run the following code to load these packages.
# Code the learner is asked to run. The two calls must be on separate lines:
# two expressions on one line without a separator do not parse in R.
library(tidyverse)
library(dsbox)

# Second copy of the same code.
# NOTE(review): presumably the reference solution the grader compares
# against -- confirm against the original chunk labels.
library(tidyverse)
library(dsbox)

# Message passed to grade_this_code() for this exercise.
grade_this_code("The tidyverse and dsbox packages are now loaded!")
The data is in the dsbox
package and it's called edibnb
.
You can view the data using
# Take a first look at the edibnb data.
glimpse(edibnb)
# Multiple-choice question on what a single row of `edibnb` stands for.
# Only the "individual Airbnb listing" option is marked correct.
question(
  "What does each row in the `edibnb` dataset represent?",
  answer("The values of a certain variable"),
  answer(
    "An individual Airbnb listing",
    correct = TRUE,
    message = "Each row in the dataset contains one observation of each field, each relating to a single Airbnb listing."
  ),
  answer("An Airbnb booking"),
  answer("An Edinburgh neighbourhood")
)
How many Airbnb listings are included in this dataset? Use the following code block to find out.
# Replace the blank with the data frame whose rows you want to count.
nrow(___)
# Check the submitted row count for `edibnb`.
# The result is accepted whether it is stored as a double or an integer.
grade_this({
  # `||` is the scalar, short-circuiting operator meant for `if` conditions;
  # both operands are single TRUE/FALSE values returned by identical().
  if (identical(.result, 13245) || identical(.result, 13245L)) {
    pass("There are 13245 observations in the dataset.")
  }

  # NOTE(review): 10 matches the number of variables listed for this data
  # set, so this branch presumably targets learners who counted columns.
  # Its message is identical to the pass message -- confirm whether a hint
  # was intended instead.
  if (identical(.result, 10) || identical(.result, 10L)) {
    fail("There are 13245 observations in the dataset.")
  }

  # Fallback for every other result.
  fail("Not quite. Each observation is represented in one row. Can you remember which function we use to calculate the number of rows?")
})
Each column represents a variable. We can get a list of the variables in the data frame using the names()
function.
# List the variable (column) names of the edibnb data frame.
names(edibnb)
The variables and their descriptions are given below:
| Variable Name          | Description                                                          |
|:-----------------------|:---------------------------------------------------------------------|
| `id`                   | ID number of the listing                                             |
| `price`                | Price, in GBP, for one night stay                                    |
| `neighbourhood`        | Neighbourhood listing is located in                                  |
| `accommodates`         | Number of people listing accommodates                                |
| `bathrooms`            | Number of bathrooms                                                  |
| `bedrooms`             | Number of bedrooms                                                   |
| `beds`                 | Number of beds (which can be different than the number of bedrooms)  |
| `review_scores_rating` | Average rating of property                                           |
| `number_of_reviews`    | Number of reviews                                                    |
| `listing_url`          | Listing URL                                                          |
Create a histogram of the distribution of Airbnb listing prices (nightly rates) in Edinburgh. Sample code is provided below, but you will need to fill in the blanks.
# Starter code: both the data frame and the x variable are left blank.
ggplot(data = ___, mapping = aes(x = ___)) +
  geom_histogram() +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh"
  )

# Same plot with the data frame filled in; the x variable is still blank.
ggplot(data = edibnb, mapping = aes(x = ___)) +
  geom_histogram() +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh"
  )

# Completed plot: histogram of `price` from `edibnb`.
ggplot(data = edibnb, mapping = aes(x = price)) +
  geom_histogram() +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh"
  )

# Message passed to grade_this_code() for this exercise.
grade_this_code("And note that there are a couple warnings, we'll get to those in a bit.")
# Two-question quiz on the price histogram.
# The first question's wording previously read "Airbnb edibnb in Edinburgh";
# it now refers to "Airbnb listings", matching the rest of the tutorial text.
quiz(
  caption = "",
  # Shape of the distribution: two options are marked correct.
  question(
    "Which of the following describes the shape of the distribution of nightly rates of Airbnb listings in Edinburgh? Check all that apply.",
    answer("Right skewed", correct = TRUE),
    answer("Left skewed", message = "Skew is on the side of the longer tail"),
    answer(
      "Symmetric",
      message = "If you were to draw a vertical line down the middle of the x-axis, would the left and right sides of the distribution look like mirror images?"
    ),
    answer("Unimodal", correct = TRUE),
    answer(
      "Bimodal",
      message = "How many prominent peaks, or modes, do you see? (recall \"bi\" means \"two\")"
    ),
    answer(
      "Multimodal",
      message = "A distribution is said to be multimodal if there are three or more prominent peaks. How many prominent peaks, or modes, do you see?"
    ),
    allow_retry = TRUE
  ),
  # Reading values off the histogram: the learner picks the false statement.
  question(
    "Which of the following is false?",
    answer("There are no listings with a nightly rate above £1,250."),
    answer("More than 50% of listings have a nightly rate below £250."),
    answer("More than 25% of listings have a nightly rate above £500.", correct = TRUE),
    answer("It is possible that some of the listings have a nightly rate of £0."),
    allow_retry = TRUE
  )
)
Create a faceted histogram where each facet represents a neighbourhood and displays the distribution of Airbnb prices in that neighbourhood. Sample code is provided below, but you will need to fill in the blanks.
# Starter code: data frame, x variable and faceting variable are all blank.
ggplot(data = ___, mapping = aes(x = ___)) +
  geom_histogram() +
  facet_wrap(~___) +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Data frame filled in; x variable and faceting variable still blank.
ggplot(data = edibnb, mapping = aes(x = ___)) +
  geom_histogram() +
  facet_wrap(~___) +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Data frame and x variable filled in; only the faceting variable is blank.
ggplot(data = edibnb, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~___) +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Completed plot: one histogram of `price` per `neighbourhood`.
ggplot(data = edibnb, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~neighbourhood) +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Message passed to grade_this_code() for this exercise.
grade_this_code("You've successfully created a faceted histogram.")
You'll see in the above code that R gives us a couple of warnings. Let's analyse these:
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The binwidth of a histogram defines the width each bar (bin) spans. Since we haven't specified a particular value in geom_histogram()
, R picks one for us, but it also lets us know that it would be better for us to pick a binwidth
intentionally. We'll get to practice this in a bit.
Warning: Removed 199 rows containing non-finite values (stat_bin).
This second warning is a consequence of the fact that we are working with a real-world dataset, and thus some of the values in the data are missing (we don't know the prices for those listings) and are encoded as NA
s. If we don't know the price, we can't represent it on the plot.
This doesn't stop R from creating the histogram, but it does mean some observations are omitted from this visualisation (199 of them, to be precise).
Instead of having R pick a binwidth
itself, try out different binwidths using the code below:
# Starter code for experimenting with bin widths: fill in the data frame,
# the x variable, the binwidth value and the faceting variable.
ggplot(data = ___, mapping = aes(x = ___)) +
  geom_histogram(binwidth = ___) +
  facet_wrap(~___) +
  labs(
    x = "Airbnb listing price, in £",
    y = "Frequency",
    title = "Distribution of Airbnb nightly rates in Edinburgh"
  )
What is a reasonable value for the binwidth
parameter? Put your answer in the block below:
# Check the binwidth the learner proposes.
# Values from 20 to 100 (inclusive) pass; smaller or larger values fail with
# a hint about the direction of the problem.
grade_this({
  # Validate before comparing. Without this guard an NA or a vector makes
  # the `if` conditions below error, a character value is compared as a
  # string, and the "Invalid binwidth." message can never be reached.
  # Non-positive and non-finite values are rejected here as well.
  is_valid <- is.numeric(.result) &&
    length(.result) == 1L &&
    is.finite(.result) &&
    .result > 0
  if (!is_valid) {
    fail("Invalid binwidth.")
  }

  if (.result < 20) {
    fail("That binwidth seems quite small. See how narrow the bins are using that width.")
  }
  if (.result > 100) {
    fail("That binwidth seems quite large. See how wide the bins are using that width.")
  }

  # Anything left is a single number in [20, 100].
  pass("That's a suitable binwidth")
})
Let’s deconstruct this code:
ggplot()
is the function we are using to build our plot, in layers.

Create a similar visualisation, this time showing the distribution of review scores (review_scores_rating
) across neighbourhoods.
# Starter code: data frame, x variable and faceting variable are all blank.
ggplot(data = ___, mapping = aes(x = ___)) +
  geom_histogram() +
  facet_wrap(~___) +
  labs(
    x = "Average rating score of property",
    y = "Frequency",
    title = "Distribution of Airbnb rating scores in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Completed plot: one histogram of `review_scores_rating` per `neighbourhood`.
ggplot(data = edibnb, mapping = aes(x = review_scores_rating)) +
  geom_histogram() +
  facet_wrap(~neighbourhood) +
  labs(
    x = "Average rating score of property",
    y = "Frequency",
    title = "Distribution of Airbnb rating scores in Edinburgh",
    subtitle = "By neighbourhood"
  )

# Message passed to grade_this_code() for this exercise.
grade_this_code("Your solution is correct!")
Which neighbourhood has the most ratings around 100%? Report your answer as a text string in quotation marks, e.g. for Tollcross, type in "Tollcross"
.
Did you forget to add the quotation marks?
# Check the neighbourhood name the learner typed as a string.
grade_this({
  # Compare case-insensitively and ignore leading/trailing whitespace so
  # that an answer such as " Leith " is not rejected.
  answer <- trimws(tolower(.result))

  if (identical(answer, "leith")) {
    pass("Leith has the highest number of perfect ratings.")
  }
  fail("Not quite. Try looking at the height of the bars and the labels of the x-axis.")
})
# Multiple-choice question on the overall tendency of the review scores.
# Retries are allowed; "Correct!" is supplied as the `correct` argument.
question(
  "How do users generally rate properties on Airbnb?",
  answer("Generally positively", correct = TRUE),
  answer("Generally negatively."),
  answer("Can't say from the visualisation."),
  correct = "Correct!",
  allow_retry = TRUE
)
Congratulations, you've finished the first tutorial! We hope that you enjoyed practising your data handling, visualisation, and interpretation skills along the way.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.