# inst/doc/strikeoutsandhr.R

## ----nomessages, echo = FALSE-------------------------------------------------
# Default knitr options applied to every chunk
knitr::opts_chunk$set(
  fig.width = 5,
  fig.height = 5,
  tidy = FALSE,      # code is formatted by hand, so knitr should not tidy it
  collapse = TRUE,   # collapse all output into a single block
  message = FALSE,   # keep messages out of the output
  warning = FALSE    # keep warnings out of the output
)

# Number of digits to display in output; a single chunk can override this
# with the chunk option R.options = list(digits = ...)
options(digits = 4)

# Plot margins, given as bottom, left, top, right
par(mar = c(3, 3, 1, 1) + 0.1)

# Fix the random seed for reproducibility
set.seed(1234)

## ----load-packages------------------------------------------------------------
library(Lahman) 
library(ggplot2) 
library(dplyr)
library(car)

## ----Batting-names------------------------------------------------------------
# Load the Batting data set from the Lahman package
data(Batting, package = "Lahman")

# Inspect the structure of the complete, unmodified data set
str(Batting)

## ----Batting-filter-----------------------------------------------------------
# Collapse Batting to one row per yearID holding the yearly totals of the
# AB, SO and HR columns. The sums are written out explicitly because
# summarise_each() and funs() are deprecated in dplyr.
Batting <- Batting %>%
  group_by(yearID) %>%
  summarise(AB = sum(AB), SO = sum(SO), HR = sum(HR))

# Keep the years from 1950 onward and add the strikeout (SO) and home run
# (HR) rates, each expressed as a percentage of AB.
FullBatting <- Batting %>%
  filter(yearID >= 1950) %>%
  mutate(
    SO_rate = (SO / AB) * 100,
    HR_rate = (HR / AB) * 100
  )

some(FullBatting) # look at a set of random observations

## -----------------------------------------------------------------------------
# Dimensions (rows, columns) of the yearly data frame
dim(FullBatting)

## -----------------------------------------------------------------------------
# Total of the strikeout column
sum(FullBatting[["SO"]])

## -----------------------------------------------------------------------------
# Mean of the strikeout rate column
mean(FullBatting[["SO_rate"]])

## -----------------------------------------------------------------------------
# Total of the home run column
sum(FullBatting[["HR"]])

## -----------------------------------------------------------------------------
# Mean of the home run rate column
mean(FullBatting[["HR_rate"]])

## -----------------------------------------------------------------------------
# Run cor.test() (default settings) on the yearly strikeout rate and home run
# rate columns of FullBatting and store the result in `corr`.
corr <- cor.test(FullBatting$SO_rate, FullBatting$HR_rate)
corr # print the test result: correlation between strikeout rate and home run rate

## -----------------------------------------------------------------------------
# Fit a linear model on FullBatting with SO_rate as the response and HR_rate
# as the single predictor.
Model_Totals <- lm(SO_rate~HR_rate, data=FullBatting)
summary(Model_Totals) # print the summary of the fitted model

## -----------------------------------------------------------------------------
# Scatter plot of the two rates, one point per row of FullBatting:
# strikeout rate on the x axis, home run rate on the y axis.
plot <- ggplot(FullBatting, aes(x = SO_rate, y = HR_rate)) +
  geom_point() +
  labs(
    x = "Strikeout Rate",
    y = "Home Run Rate",
    title = "Relationship Between Strikeouts and Home Runs"
  )

# Overlay the straight line that stat_smooth() fits with method "lm".
# NOTE(review): the smoother regresses the y aesthetic on x (HR_rate on
# SO_rate), whereas Model_Totals regresses SO_rate on HR_rate -- confirm
# which orientation is intended.
plot + stat_smooth(method = "lm")

# Try the Lahman package in your browser

# Any scripts or data that you put into this service are public.

# Lahman documentation built on May 4, 2023, 9:11 a.m.