In schubertjan/giro2019: Constains Data From Giro 2019 Roadbook

knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.width = 7,
  fig.height = 5,
  collapse = TRUE,
  comment = "#>"
)

library(giro2019)
library(DT)

In this blog post I will be looking at different stats from Giro roadbook(s). If you enjoy it and would like to play with the data more, you can download the whole package including the data and all the reports with devtools::install_github("schubertjan/giro2019").

May 16

#scrape data from wiki
stages <- stage.scrape()

#calculate cummulative sum after stage 6
dist.after.6 <- sapply(stages, function(x){
  x <- x[!is.na(x[, 4]), ]
  d <- x[, 4]
  cumsum(d)[6]
})

#plot
barplot(height = dist.after.6, 
        names.arg = names(dist.after.6),
        main = "Distance (km) Covered After Stage 6",
        ylab = "Distance (km)",
        cex.names = .7,
        cex.axis = .7,
        las=2)
abline(h = tail(dist.after.6,1), lty = 2)
legend("topright", legend = c("Giro 2019 - 1046 (km)"), lty = 2, cex = .5)

May 12

dat <- list(Roglic = tt[tt$rider == "Roglic", ],
            Dumoulin = tt[tt$rider == "Dumoulin", ],
            Yates = tt[tt$rider == "Yates S.", ])

#we will remove tour de romandie 2018 ITT as that was uphill and an outlier
dat$Roglic <- dat$Roglic[- c(5), ]

par(mfrow = c(3,1), mar = c(3,4,2,4))
x <- lapply(1:length(dat), function(i) {
  d <- dat[[i]]
  plot(d$len, d$time,
       xlab = "",
       ylab = "",
       main = paste0("ITT 2018/2019 - ", names(dat)[i]))
  abline(lm(time ~ len -1, data = d, w = 1/d$len))
  text(d$len, d$time,labels = d$race, cex = .8)
  title(ylab="Time (min)", xlab = "Length (km)", line=2)
})

#we will remove tour de romandie 2018 ITT as that was uphill and an outlier
m <- list(roglic_m = lm(time ~ len - 1, data = dat$Roglic, w = 1/dat$Roglic$len),
          dumoulin_m = lm(time ~ len - 1, data = dat$Dumoulin, w = 1/dat$Dumoulin$len),
          yates_m = lm(time ~ len - 1, data = dat$Yates, w = 1/dat$Yates$len))
lapply(m, summary)

pred <- sapply(m, predict, data.frame(len = c(5.9)))
obs <- c("6:48", "7:06", "6:59")
s <-  as.numeric(format(strptime(obs, format = "%M:%S"), format = "%S")) * 1/60^2
m <-  as.numeric(format(strptime(obs, format = "%M:%S"), format = "%M")) * 1/60
t <- (m+s)*60

(t - pred)*60/100
t/pred

May 10

dat <- list(Roglic = tt[tt$rider == "Roglic", ],
            Dumoulin = tt[tt$rider == "Dumoulin", ],
            Yates = tt[tt$rider == "Yates S.", ])

#we will remove tour de romandie 2018 ITT as that was uphill and an outlier
dat$Roglic <- dat$Roglic[- c(5), ]

par(mfrow = c(3,1), mar = c(3,4,2,4))
x <- lapply(1:length(dat), function(i) {
  d <- dat[[i]]
  plot(d$len, d$time,
       xlab = "",
       ylab = "",
       main = paste0("ITT 2018/2019 - ", names(dat)[i]))
  abline(lm(time ~ len -1, data = d, w = 1/d$len))
  text(d$len, d$time,labels = d$race, cex = .8)
  title(ylab="Time (min)", xlab = "Length (km)", line=2)
})

#we will remove tour de romandie 2018 ITT as that was uphill and an outlier
m <- list(roglic_m = lm(time ~ len - 1, data = dat$Roglic, w = 1/dat$Roglic$len),
          dumoulin_m = lm(time ~ len - 1, data = dat$Dumoulin, w = 1/dat$Dumoulin$len),
          yates_m = lm(time ~ len - 1, data = dat$Yates, w = 1/dat$Yates$len))
lapply(m, summary)

par(mfrow = c(1,1), mar = c(5,4,5,4))
plot(c(7,35), c(8,45), 
     type = "n",
     xlab = "Length (km)",
     ylab = "Time (min)",
     main = "Expected Time in ITT")
abline(m$roglic_m, col = "gold", lwd = 2)
abline(m$dumoulin_m, col = "firebrick3", lwd = 2)
abline(m$yates_m, col = "black", lwd = 2)
abline(v = c(8,34.8,17), lty = 2)
text(10,45,labels = "ITT1 - 8km", cex = .8)
text(32.5,45,labels = "ITT2 - 34.8km", cex = .8)
text(19.5,45,labels = "ITT3 - 17km", cex = .8)
legend("bottomright", 
       legend = c("Roglic", "Dumoulin", "Yates"), 
       col = c("gold", "firebrick3", "black"),
       lwd = c(2,2,2),
       cex = .8)

out <- sapply(m, predict, data.frame(len = c(8,34.8,17)))
m <- round( floor((out %% 1)*100) * 0.6, 0)
m <- ifelse(nchar(m) == 1, paste0("0",m),m)
out_print <- matrix(paste0(floor(out),
                           ":", 
                           m),
                    ncol = 3,
                    nrow = 3)
#out_print <- as.data.frame(out_print)
colnames(out_print) <- c("Roglic", "Dumoulin", "Yates")
row.names(out_print) <- c("ITT1", "ITT2", "ITT3")
knitr::kable(out_print)

out_diff <- apply(out, 2, function(x) out[, 3]-x)
out_diff <- as.data.frame(out_diff[, -3])
out_diff[4, ] <- colSums(out_diff)
out_diff <- as.matrix(out_diff)
m <- round(floor((out_diff %% 1)*100) * 0.6, 0)
m <- ifelse(nchar(m) == 1, paste0("0",m),m)
out_d_print <- matrix(paste0(floor(out_diff),
                           ":", 
                           m),
                    ncol = 2,
                    nrow = 4)
colnames(out_d_print) <- c("Roglic", "Dumoulin")
row.names(out_d_print) <- c("ITT1", "ITT2", "ITT3", "Total")
knitr::kable(out_d_print)

May 7

Today's data comes from Giro road book and has been downloaded using the tabulizer package: tabulizer::extract_tables("http://www.giroditalia.it/wp-content/uploads/2019/04/Garibaldi_Giro_italia_2019_low_res.pdf", pages = c(17, 18))

tabulizer::extract_tables("https://static2.giroditalia.it/wp-content/uploads/2017/04/Garibaldi_2017.pdf", pages = c(10))

tabulizer::extract_tables("https://legaciclismo.files.wordpress.com/2018/05/garibaldi-2018.pdf", pages = c(19, 20))

par(mfcol = c(3,2), mar = c(2,4,2,4))
hist(climbs_2017$p.media, 
     breaks = 10,
     xlim = c(2, 13),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     xlab = "Average Gradient",
     main = "Distribution of average gradient\n categorized climbs")
hist(climbs_2018$p.media, 
     breaks = 10,
     xlim = c(2, 13),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     main = "")
hist(climbs$p.med, 
     breaks = 10,
     xlim = c(2, 13),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     main = "")

hist(climbs_2017$p.max, 
     breaks = 10,
     xlim = c(5, 22),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     xlab = "Max Gradient",
     main = "Distribution of max gradient\n categorized climbs")
hist(climbs_2018$p.max, 
     breaks = 10,
     xlim = c(5, 22),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     main = "")
hist(climbs$p.max, 
     breaks = 10,
     xlim = c(5, 22),
     cex.main = 1,
     cex.lab = .8,
     cex.axis = .8,
     main = "")

l <- list(giro_2017 = climbs_2018[, c("length.km", "p.media","p.max")],
          giro_2018 = climbs_2017[, c("length.km", "p.media","p.max")],
          giro_2019 = climbs[, c("lungh.(km)", "p.med","p.max")]
          )
lapply(l, function(x) summary(x)[4, ])

May 5

tabulizer::extract_tables("https://static2.giroditalia.it/wp-content/uploads/2017/04/Garibaldi_2017.pdf", pages = c(10))

tabulizer::extract_tables("https://legaciclismo.files.wordpress.com/2018/05/garibaldi-2018.pdf", pages = c(19, 20))

Riders did an extra 3000m of climbing in 2017 compared with 2019. 2019 sees 1800m more categorized climbing than last year, or one big category 1 climb. All is building up for week 2 and 3. Unusually late GC action for Giro. On paper, stage 20 looks like the hardest in the last 3 years. Despite little GC action in week 1, riders will end week 2 with about the same amount of gained elevation as in previous 2 years. Week 3 is tougher than last year. Will be extra important to time the form right!

#fill the data with all stages
fill <- data.frame(tappa = 1:21)
climbs_2019 <- merge(fill, climbs, by = "tappa", all.x = TRUE)
fill <- data.frame(stage = 1:21)
climbs_2018 <- merge(fill, climbs_2018, by = "stage", all.x = TRUE)
climbs_2017 <- merge(fill, climbs_2017, by = "stage", all.x = TRUE)

#replace NA
climbs_2019$`disl.(m)` <- ifelse(is.na(climbs_2019$`disl.(m)`), 0, climbs_2019$`disl.(m)`)
climbs_2018$disl.m <- ifelse(is.na(climbs_2018$disl.m), 0, climbs_2018$disl.m)
climbs_2017$disl.m <- ifelse(is.na(climbs_2017$disl.m), 0, climbs_2017$disl.m)

#create colours
col_2017 <- rgb(200, 79, 178, maxColorValue = 255)
col_2018 <- rgb(105, 147, 45, maxColorValue = 255)
col_2019 <- rgb(85, 130, 169, maxColorValue = 255)

comp <- data.frame(Year = factor(c("2017", "2018", "2019")),
                   Total.elev = c(sum(climbs_2017$disl.m, na.rm = TRUE),
                                  sum(climbs_2018$disl.m, na.rm = TRUE),
                                  sum(climbs_2019$`disl.(m)`, na.rm = TRUE)))
with(comp,barplot(Total.elev, 
                  names.arg = Year,
                  xlab = "Stage",
                  ylab = "Total Elevation (m)",
                  main = "Total Elevation Gained in Categorized Climbs",
                  cex.axis = .8,
                  cex.names = .8,
                  col = c(col_2017, col_2018, col_2019)))

DT::datatable(comp, rownames = FALSE)

#summary by stage
s_2019 <- with(climbs_2019, tapply(`disl.(m)`, tappa, sum))
s_2018 <- with(climbs_2018, tapply(disl.m, stage, sum))
s_2017 <- with(climbs_2017, tapply(disl.m, stage, sum))

par(mfrow = c(3,1), mar = c(2,4,2,4))
plot(names(s_2017), 
     s_2017, 
     type = "b",
     lwd = 3, 
     col = col_2017,
     ylim = c(0, max(c(s_2017,
                       s_2018,
                       s_2019))),
     main = "Total Elevation Gained in Categorized Climbs By Stages",
     ylab = "Total Elevation (m)",
     xlab = "Stage",
     cex.axis = .85,
     cex.lab = .85,
     cex.main = 1.2,
     xaxt="n",
     pch = 16)
axis(1, labels = names(s_2017), at = names(s_2017), cex.axis = .85)
legend("topleft", 
       legend = c("2017", "2018", "2019", "Rest days"), 
       col = c(col_2017, col_2018, col_2019, "black"),
       lwd = c(3,3,3,1),
       lty = c(1,1,1,2),
       cex = .8)
abline(v = 3.5, lty = 2)
abline(v = 9.5, lty = 2)
abline(v = 15.5, lty = 2)
plot(names(s_2017), 
     s_2018, 
     type = "b",
     lwd = 3, 
     col = col_2018,
     ylim = c(0, max(c(s_2017,
                       s_2018,
                       s_2019))),
     ylab = "Total Elevation (m)",
     cex.axis = .85,
     cex.lab = .85,
     xaxt="n",
     pch = 16)
axis(1, labels = names(s_2017), at = names(s_2017), cex.axis = .85)
abline(v = 3.5, lty = 2)
abline(v = 9.5, lty = 2)
abline(v = 15.5, lty = 2)
plot(names(s_2017), 
     s_2019, 
     type = "b",
     lwd = 3, 
     col = col_2019,
     ylim = c(0, max(c(s_2017,
                       s_2018,
                       s_2019))),
     ylab = "Total Elevation (m)",
     cex.axis = .85,
     cex.lab = .85,
     xaxt="n",
     pch = 16)
axis(1, labels = names(s_2017), at = names(s_2017), cex.axis = .85)
abline(v = 9.5, lty = 2)
abline(v = 15.5, lty = 2)

#summary by stage
cs_2017 <- cumsum(s_2017)
cs_2018 <- cumsum(s_2018)
cs_2019 <- cumsum(s_2019)
#replace NA

par(mfrow = c(1,1), mar = c(5,4,4,2))
plot(names(s_2017), 
     cs_2017, 
     type = "b",
     lwd = 3, 
     col = col_2017,
     ylim = c(0, max(c(cs_2017,
                       cs_2018,
                       cs_2019))),
     main = "Cumulative Total Elevation Gained in Categorized Climbs By Stages",
     ylab = "Cumulative Total Elevation (m)",
     xlab = "Stage",
     cex.main = .8,
     cex.axis = .8,
     cex.lab = .8,
     xaxt="n",
     pch = 16)
legend("topleft", 
       legend = c("2017", "2018", "2019"), 
       col = c(col_2017, col_2018, col_2019),
       lwd = c(3,3,3),
       cex = .8)
lines(cs_2018, lwd = 3, pch = 16,col = col_2018, type = "b")
lines(cs_2019, lwd = 3, pch = 16, col = col_2019, type = "b")
axis(1, labels = names(s_2017), at = names(s_2017), cex.axis = .8)
legend("topleft", 
       legend = c("2017", "2018", "2019"), 
       col = c(col_2017, col_2018, col_2019),
       lwd = c(3,3,3),
       lty = c(1,1,1),
       cex = .8)

DT::datatable(data.frame(giro2017 = s_2017,
                         giro2018 = s_2018,
                         giro2019 = s_2019))

May 4

The data comes from 2019 Giro road book and has been downloaded using the tabulizer package tabulizer::extract_tables("http://www.giroditalia.it/wp-content/uploads/2019/04/Garibaldi_Giro_italia_2019_low_res.pdf", pages = c(17, 18)).

Below is a detailed overview of all classified climbs by stages. The x axis shows the length of a climb (km), the y axis elevation difference (m). All charts use the same axis to allow relative comparison. The colour represents climb's category.

Black = cat 1
Red = cat 2
Orange = cat 3
Green = cat 4

Most of climbing action is packed between 1 and 2 rest day. Stage 20 is packed with climbing.

#pre-process data
climbs_l <- split(climbs,climbs$tappa)
n <- lapply(climbs_l,function(x) nrow(x) + 1)
#creare NA for stages with  < 5 climbs 
for(i in 1:length(climbs_l)){
  if(n[i] != 6){
    for(j in n[[i]]:5){
      climbs_l[[i]][j, ] <- NA
      }
  }
}
climbs_full <- do.call(rbind, climbs_l)


par(mfrow = c(19,5), mar = c(1,1,2,1))
x <- lapply(1:nrow(climbs_full), function(i) {
  #create data
  x <- c(0, climbs_full$`lungh.(km)`[i])
  y <- c(0, climbs_full$`disl.(m)`[i])
  #if no data plot empty else plot climb
  if(is.na(x[2])) {
      plot(0,0,type = "n", axes = FALSE,ann = FALSE)
  } else {
  dat <- giro2019::createhilldata(x = climbs_full$`lungh.(km)`[i],
                                  y = climbs_full$`disl.(m)`[i],
                                  cat = climbs_full$cat[i])
  giro2019::hillplot(x = dat$x,
                     y = dat$y,
                     col_plot = dat$plot_col,
                     x_lim = max(climbs_full$`lungh.(km)`, na.rm = TRUE),
                     y_lim = max(climbs_full$`disl.(m)`, na.rm = TRUE),
                     stage = climbs_full$tappa[i],
                     climb_name = climbs_full$gpm[i],
                     showaxis = FALSE)
  }
})

DT::datatable(climbs[, c(1,4,5,11,12,13)], rownames = FALSE)