##### Chapter 2: Managing and Understanding Data -------------------
##### R data structures --------------------
## Vectors -----
# Build three parallel vectors describing three medical patients;
# position i in each vector refers to the same patient.
subject_name <- c("John Doe", "Jane Doe", "Steve Graves")
temperature <- c(98.1, 98.6, 101.4)
flu_status <- c(FALSE, FALSE, TRUE)
# A single positive index returns that element (the second temperature).
temperature[2]
## Examples of subsetting a vector
# A range of positions: elements 2 through 3.
temperature[2:3]
# A negative index excludes that position: everything except element 2.
temperature[-2]
# A logical vector keeps the positions marked TRUE (here elements 1 and 2).
temperature[c(TRUE, TRUE, FALSE)]
## Factors -----
# gender as a factor; its levels are taken from the values supplied
gender <- factor(c("MALE", "FEMALE", "MALE"))
gender

# blood type as a factor with all four levels declared explicitly, so "B"
# remains a valid level even though none of these patients has it
blood <- factor(
  c("O", "AB", "A"),
  levels = c("A", "B", "AB", "O")
)
blood

# symptom severity as an ordered factor: MILD < MODERATE < SEVERE
symptoms <- factor(
  c("SEVERE", "MILD", "MODERATE"),
  levels = c("MILD", "MODERATE", "SEVERE"),
  ordered = TRUE
)
symptoms

# ordered factors can be compared: which patients are worse than MODERATE?
symptoms > "MODERATE"
## Lists -----
# show the first patient's entry from each vector and factor in turn
subject_name[1]
temperature[1]
flu_status[1]
gender[1]
blood[1]
symptoms[1]

# gather the first patient's values into a single named list
subject1 <- list(
  fullname = subject_name[1],
  temperature = temperature[1],
  flu_status = flu_status[1],
  gender = gender[1],
  blood = blood[1],
  symptoms = symptoms[1]
)

# print the whole patient record
subject1

## ways to pull items out of a list
# single brackets with a position keep the list wrapper (a one-item list)
subject1[2]

# double brackets with a position return the item itself (a numeric vector)
subject1[[2]]

# the $ operator returns one item by name
subject1$temperature

# a character vector of names returns several items as a list
subject1[c("temperature", "flu_status")]

## lists accept vector-style indexing too
# items 2 and 3, returned as a list
subject1[2:3]
## Data frames -----
# combine the patient vectors and factors into one data frame;
# character columns stay character instead of being converted to factors
pt_data <- data.frame(
  subject_name, temperature, flu_status, gender, blood, symptoms,
  stringsAsFactors = FALSE
)

# print the whole data frame
pt_data

## ways to subset a data frame
# one column, returned as a vector
pt_data$subject_name

# several columns chosen by name (the result is still a data frame)
pt_data[c("temperature", "flu_status")]

# the same two columns chosen by position
pt_data[2:3]

# a single cell: row 1, column 2
pt_data[1, 2]

# rows 1 and 3 crossed with columns 2 and 4
pt_data[c(1, 3), c(2, 4)]

## an empty row or column index means "all of them"
# every row of column 1
pt_data[, 1]

# every column of row 1
pt_data[1, ]

# every row and every column
pt_data[, ]

# two equivalent selections of rows 1 and 3 with the temperature and gender
# columns: first by naming what to keep, then by dropping everything else
pt_data[c(1, 3), c("temperature", "gender")]
pt_data[-2, c(-1, -3, -5, -6)]
## Matrices -----
# matrix() fills column by column by default, so only one dimension needs
# to be given; the other is derived from the length of the data.
# Create a 2x2 matrix by fixing the number of rows.
m <- matrix(c(1, 2, 3, 4), nrow = 2)
m
# The same 2x2 matrix, this time fixing the number of columns.
m <- matrix(c(1, 2, 3, 4), ncol = 2)
m
# Six values with two rows give a 2x3 matrix.
m <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2)
m
# Six values with two columns give a 3x2 matrix.
m <- matrix(c(1, 2, 3, 4, 5, 6), ncol = 2)
m
# Extract single values with [row, column]; m is the 3x2 matrix at this point.
m[1, 1]
m[3, 2]
# Leave the column index blank to extract a whole row.
m[1, ]
# Leave the row index blank to extract a whole column.
m[, 1]
##### Managing data with R ------------
## saving, loading, and removing R data structures
# list the names of all objects currently in memory
ls()

# delete only the m and subject1 objects, then list again to confirm
rm(m, subject1)
ls()

# clear the workspace: remove every object that ls() reports
rm(list = ls())
##### Exploring and understanding data --------------------
## data exploration example using used car data
# load the CSV from the working directory, keeping text columns as character
usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE)

# inspect column names, types, and the first few values
str(usedcars)

## Exploring numeric variables -----
# summary statistics for one column, then for two columns at once
summary(usedcars$year)
summary(usedcars[c("price", "mileage")])

# mean of three example incomes, by hand and with mean()
(36000 + 44000 + 56000) / 3
mean(c(36000, 44000, 56000))

# median of the same three incomes
median(c(36000, 44000, 56000))

# smallest and largest used car price
range(usedcars$price)

# spread between the largest and smallest price
diff(range(usedcars$price))

# interquartile range of used car prices
IQR(usedcars$price)

# quantile() with default probabilities gives the five-number summary
quantile(usedcars$price)

# the 1st and 99th percentiles
quantile(usedcars$price, probs = c(0.01, 0.99))

# quintiles: cut points at every 20%
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))

# boxplots of used car prices and mileage
boxplot(
  usedcars$price,
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)
boxplot(
  usedcars$mileage,
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)

# histograms of used car prices and mileage
hist(
  usedcars$price,
  main = "Histogram of Used Car Prices",
  xlab = "Price ($)"
)
hist(
  usedcars$mileage,
  main = "Histogram of Used Car Mileage",
  xlab = "Odometer (mi.)"
)

# variance and standard deviation of price, then of mileage
var(usedcars$price)
sd(usedcars$price)
var(usedcars$mileage)
sd(usedcars$mileage)
## Exploring categorical variables -----
# One-way frequency tables: counts of each distinct year, model, and color.
table(usedcars$year)
table(usedcars$model)
table(usedcars$color)
# Convert the model counts into proportions of the total.
model_table <- table(usedcars$model)
prop.table(model_table)
# Express the color proportions as percentages rounded to one decimal place.
color_table <- table(usedcars$color)
color_pct <- prop.table(color_table) * 100
round(color_pct, digits = 1)
## Exploring relationships between variables -----
# scatterplot with mileage on the x axis and price on the y axis
plot(
  x = usedcars$mileage,
  y = usedcars$price,
  main = "Scatterplot of Price vs. Mileage",
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)"
)

# add a logical column: TRUE when the color is Black, Gray, Silver, or White
usedcars$conservative <-
  usedcars$color %in% c("Black", "Gray", "Silver", "White")

# count how many cars fall into each group
table(usedcars$conservative)

# two-way cross-tabulation of model against the conservative flag,
# using CrossTable() from the gmodels package
library(gmodels)
CrossTable(x = usedcars$model, y = usedcars$conservative)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.