# data-raw/process_datasets.R

## Ball-by-ball information of IPL matches in 2008 - 2020

library(tidyverse)
library(janitor)

# Build `deliveries`: take the match id, season year and match date from the
# match-level file, then attach them to the ball-by-ball file. The right join
# keeps every row of the ball-by-ball file.
deliveries <- read_csv("data-raw/ipl.csv") %>%
  clean_names() %>%
  transmute(
    id,
    year = format(as.Date(date, format = "%Y-%m-%d"), "%Y"),
    date = as.Date(date, format = "%Y-%m-%d")
  ) %>%
  right_join(read_csv("data-raw/deliveries.csv"), by = "id") %>%
  mutate(
    # Identifiers are stored as text, the season as a number.
    across(c(id, inning), as.character),
    year = as.numeric(year),
    # Categorical columns are stored as factors.
    across(
      c(non_boundary, is_wicket, extras_type, player_dismissed),
      as.factor
    )
  )
usethis::use_data(deliveries, overwrite = TRUE)



## Helper function used to build the `teams` data set

# Summarise every match that one or more teams took part in during a season.
#
# Reads the file-level `deliveries` table and produces one row per
# (match, batting team) with the runs scored, the overs faced and the match
# result.
#
# Arguments:
#   teams  Character vector of team names; a match is included when any of
#          them is the batting or the bowling side.
#   yr     Season year, compared against `deliveries$year`.
#
# Returns an ungrouped tibble ordered by `date` with the columns
#   id, year, date, batting_team, match_runs, winning_team, n_overs
# where `winning_team` is the name of the higher-scoring side, "Draw" when
# both sides scored the same, or "No Result" when only one side batted.
# A season with no matching deliveries yields a zero-row tibble.
winning_team <- function(teams, yr) {
  match_keys <- c("id", "year", "date", "batting_team")

  # Decide the result of ONE match from its (at most two) innings rows.
  decide_winner <- function(batting_team, match_runs) {
    if (length(batting_team) < 2) {
      return("No Result")
    }
    if (match_runs[1] == match_runs[2]) {
      return("Draw")
    }
    batting_team[which.max(match_runs)]
  }

  # Every delivery of every match the requested team(s) played that season.
  season_balls <- deliveries %>%
    filter(
      batting_team %in% teams | bowling_team %in% teams,
      year == yr
    )

  # Total runs per innings.
  team_runs <- season_balls %>%
    group_by(id, year, date, batting_team) %>%
    summarise(match_runs = sum(total_runs), .groups = "drop")

  # Overs faced per innings, counting only deliveries whose `extras_type` is
  # "byes", "legbyes" or missing (presumably the legal balls -- wides and
  # no-balls are excluded). An innings whose ball count falls short of
  # 6 * overs is written in "overs.balls" notation, e.g. 19.3 for 19 overs
  # and 3 balls; this assumes only the last over is incomplete.
  team_overs <- season_balls %>%
    filter(extras_type %in% c("byes", "legbyes") | is.na(extras_type)) %>%
    group_by(id, year, date, batting_team) %>%
    summarise(
      n_balls = n(),
      n_overs = n_distinct(over),
      .groups = "drop"
    ) %>%
    mutate(
      diff = n_balls - n_overs * 6L,
      n_overs = ifelse(
        diff < 0L,
        as.numeric(sprintf("%d.%d", n_overs - 1L, 6L + diff)),
        as.numeric(n_overs)
      )
    ) %>%
    select(-c(n_balls, diff))

  # Resolve the winner per match id in a single grouped pass. Grouping by id
  # (rather than looping over row pairs) stays correct when single-innings
  # matches are adjacent and when several teams are requested at once.
  team_runs %>%
    left_join(team_overs, by = match_keys) %>%
    group_by(id) %>%
    mutate(winning_team = decide_winner(batting_team, match_runs)) %>%
    ungroup() %>%
    select(
      id, year, date, batting_team, match_runs, winning_team, n_overs
    ) %>%
    arrange(date)
}


## Information on the winning team, overs faced and runs made for each team in
## each IPL match played in 2008-2020

# Build `teams`: for every season from 2008 to 2020, call `winning_team()`
# once per team that batted in that season and stack all the results.
teams <- map_dfr(2008:2020, function(season) {
  season_teams <- unique(deliveries$batting_team[deliveries$year == season])
  map2_df(season_teams, season, winning_team)
})

# `winning_team()` returns both innings of each match, so every match comes
# back once per participating team; keep a single copy of each row.
teams <- teams %>%
  distinct() %>%
  ungroup()
usethis::use_data(teams, overwrite = TRUE)

# Clean the batsman data set: replace every non-alphanumeric character in the
# player names with a space and trim the ends, standardise the column names,
# then give the digit-led columns descriptive names.
batsman_100 <- read_csv("data-raw/batsman.csv") %>%
  mutate(PLAYER = trimws(gsub("[^[:alnum:]]", " ", PLAYER))) %>%
  clean_names() %>%
  rename(num_100 = x100, num_50 = x50, num_4s = x4s, num_6s = x6s)
usethis::use_data(batsman_100, overwrite = TRUE)

# ipl: match-level data read from ipl.csv, with the id stored as text, the
# toss decision as a factor, the date as a Date, and a `year` column derived
# from the date (kept as a character string such as "2008").
ipl <- read_csv("data-raw/ipl.csv") %>%
  clean_names() %>%
  mutate(
    id = as.character(id),
    toss_decision = as.factor(toss_decision),
    date = as.Date(date, format = "%Y-%m-%d"),
    year = format(date, "%Y")
  )
usethis::use_data(ipl, overwrite = TRUE)

## Clean the bowlers data set: replace every non-alphanumeric character in the
## player names with a space and trim the ends, standardise the column names,
## then give the digit-led columns descriptive names.
bowlers_100 <- read_csv("data-raw/bowlers.csv") %>%
  mutate(PLAYER = trimws(gsub("[^[:alnum:]]", " ", PLAYER))) %>%
  clean_names() %>%
  rename(num_4w = x4w, num_5w = x5w)
usethis::use_data(bowlers_100, overwrite = TRUE)
# Swaha294/ipl documentation built on May 10, 2022, 3:23 p.m.