R/convert-html.R

Defines functions get.game.start.end process.pl.old p.split process.pl.new process.es.new integrate.new.pieces integrate.old.pieces

# Functions for processing HTML files raw.

get.game.start.end <- function(es.html) {
    ## es.html <- '<td align="center" style="font-size: 10px;font-weight:bold">Start&nbsp;12:06&nbsp;PDT</td>'

    es.html <- gsub("&nbsp;", " ", es.html)

    start.time.proto <- unlist(regmatches(es.html, gregexpr("([Ss]tart|[Ss]tart Time|[Dd]ebut|D\u0082but) *([0-9]{1,2}:[0-9]{2} *[APM]{0,2} *[A-Z]{0,3})", es.html)))
    start.time <- gsub ("([Ss]tart|[Ss]tart Time|[Dd]ebut|D\u0082but) *([0-9]{1,2}:[0-9]{2} *[APM]{0,2} *[A-Z]{0,3})", "\\2", start.time.proto)

    end.time.proto <- unlist(regmatches(es.html, gregexpr("(End|End Time|Fin) *([0-9]{1,2}:[0-9]{2} *[APM]{0,2} *[A-Z]{0,3})", es.html)))
    end.time <- gsub ("(End|End Time|Fin) *([0-9]{1,2}:[0-9]{2} *[APM]{0,2} *[A-Z]{0,3})", "\\2", end.time.proto)

    return(c(start.time, end.time))
  
}


process.pl.old <- function (pl.html) {

  date <- unlist(strsplit (gsub(" *([A-Za-z]+), ([A-Za-z]+) ([0-9]+), ([0-9]+).*",
                                "\\1;\\2;\\3;\\4",
                                pl.html[grep("[A-Za-z]+, [A-Za-z]+ [0-9]+, [0-9]+", pl.html)]),
                           ";"))
  
  nums <- range(grep("[Pp][Rr][Ee]>", pl.html))
  if (nums[1] == nums[2]) nums[2] <- length(pl.html)
  extract <- pl.html[(nums[1]+3):(nums[2]-1)]
  extract <- gsub("<i>(.*)</i>", "\\1", extract)
  extract <- gsub("<I>(.*)</I>", "\\1", extract)
  extract <- gsub("<strong>", "", extract)
  extract <- gsub("</strong>", "", extract)
  extract <- gsub("<STRONG>", "", extract)
  extract <- gsub("</STRONG>", "", extract)
  extract <- gsub("Penalty Shot,", "Penalty Shot", extract)
  extract <- gsub("\\t ", "", extract)
  extract <- gsub("\\t", "", extract)

  

  goal.lines <- which(substr(extract,5,5) == " ")
  #event.rec <- extract[gsub(" +([0-9]+).*", "\\1", extract) != extract]
  if (length(goal.lines)>0) event.rec <- extract[-goal.lines] else event.rec <- extract
  event.rec <- event.rec[substr(event.rec,5,5) %in% as.character(0:9) &
                         substr(event.rec,1,1) == " "]

  goal.players <- extract[goal.lines]
  goal.players <- goal.players[grep("^ *[A-Z\\.]{3}", goal.players)]
  goal.players <- t(matrix(goal.players[!grepl(" +SO +", goal.players)], nrow=2))
                    
  new.event <- data.frame(event=0, period=0, seconds=0, etype="", etext="",
                          a1="", a2="", a3="", a4="", a5="", a6="",
                          h1="", h2="", h3="", h4="", h5="", h6="",
                          ev.team="",
                          
                          ev.player.1="", ev.player.2="", ev.player.3="",
                          distance=NA, type="", homezone="",
                          stringsAsFactors=FALSE)
  
  playbyplay <- NULL; for (kk in 1:length(event.rec)) playbyplay <- rbind(playbyplay, new.event)
  for (kk in c(4:20,22:24)) playbyplay[,kk] <- as.character(playbyplay[,kk])
  
  playbyplay$event <- as.numeric(substr(event.rec,1,5))
  playbyplay$period <- as.numeric(substr(event.rec,6,9))
  playbyplay$seconds <- 1200*(playbyplay$period-1)+60*as.numeric(substr(event.rec,12,13))+as.numeric(substr(event.rec,15,16))
  playbyplay$etype <- substr(event.rec,18,34)
  playbyplay$ev.team <- substr(event.rec,35,37)
  playbyplay$etext <- substr(event.rec,43,nchar(event.rec))

  #BLOCK, GOAL, MISS, SHOT      FAC,    GIVE, TAKE      PENL    HIT    
  #STOP  PEND 
  event.types <- matrix(c("BLOCK", "BLOCKED SHOT     ",  "GOAL",  "GOAL             ",
                          "MISS",  "MISSED SHOT      ",  "SHOT",  "SHOT             ",
                          "FAC",   "FACE-OFF         ",  "GIVE",  "GIVEAWAY         ",
                          "TAKE",  "TAKEAWAY         ",  "PENL",  "PENALTY          ",
                          "HIT",   "HIT              ",  "STOP",  "STOPPAGE         ",
                          "PULL",  "GOALIE           ",  "HIT",   "HIT (!)          ",
                          "HIT",   "HIT (*)          ",  "SHOT",  "SHOT (!)         ",
                          "SHOT",  "SHOT (*)         "), nrow=2)
  playbyplay$etype <- event.types[1, match(playbyplay$etype, event.types[2,])]
  playbyplay$etype[is.na(playbyplay$etype)] <- ""

  playbyplay <- playbyplay[!(playbyplay$etype %in% c("STOP","", "PEND")),]
  
  #playbyplay[playbyplay$etype=="GOAL",-(6:17)]
  # FAC.
  #c1 <- which(playbyplay$etype=="FAC")
  #teamwon;homezone;ateam;anum;aname;hteam;hnum;hname
  p1 <- sapply(gsub(" *([A-Z'\\.\\-]+) won - ([a-z]+) zone\\. ([A-Z'\\.\\-]+) +([0-9]+) ([A-Z' \\.\\-]+) vs ([A-Z'\\.\\-]+) +([0-9]+) ([A-Z' \\.\\-]+)",
                    "\\1;\\2;\\3;\\4;\\5;\\6;\\7;\\8",
                    playbyplay$etext[playbyplay$etype=="FAC"]),
               function(tt) unlist(strsplit (tt, ";")))
  teams <- p1[c(3,6), 1]
  playbyplay$ev.team[playbyplay$etype=="FAC"] <- p1[1,]
  playbyplay$homezone[playbyplay$etype=="FAC"] <- p1[2,]

  playbyplay$ev.player.1[playbyplay$etype=="FAC"][p1[1,]==p1[3,]] <- paste(p1[4,], p1[5,])[p1[1,]==p1[3,]]
  playbyplay$ev.player.1[playbyplay$etype=="FAC"][p1[1,]!=p1[3,]] <- paste(p1[7,], p1[8,])[p1[1,]!=p1[3,]]
  playbyplay$ev.player.2[playbyplay$etype=="FAC"][p1[1,]==p1[3,]] <- paste(p1[7,], p1[8,])[p1[1,]==p1[3,]]
  playbyplay$ev.player.2[playbyplay$etype=="FAC"][p1[1,]!=p1[3,]] <- paste(p1[4,], p1[5,])[p1[1,]!=p1[3,]]

  
  #BLOCK
  #num;name
  if (sum(playbyplay$etype=="BLOCK") > 0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Z' \\.\\-]+)", "\\1;\\2",
                      playbyplay$etext[playbyplay$etype=="BLOCK"]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[playbyplay$etype=="BLOCK"] <- paste(p1[1,], p1[2,])
  }

  #MISS
  #num;name
  if (sum(playbyplay$etype=="MISS") > 0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Z' \\.\\-]+),.*", "\\1;\\2",
                      playbyplay$etext[playbyplay$etype=="MISS"]),
                 function(tt) {
                   text1 <- unlist(strsplit (tt, ";"))
                   if (length(text1) == 1) text1 <- unlist(strsplit(gsub(" *([0-9]+) +([A-Z' \\.\\-]+) .*", "\\1;\\2", text1), ";"))
                   text1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="MISS"] <- paste(p1[1,], p1[2,])
  }

  #HIT, TAKE, GIVE
  #num;name
  if (sum(playbyplay$etype %in% c("HIT", "GIVE", "TAKE")) > 0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Z' \\.\\-]+)", "\\1;\\2",
                      playbyplay$etext[playbyplay$etype %in% c("HIT", "TAKE", "GIVE")]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[playbyplay$etype %in% c("HIT", "TAKE", "GIVE")] <- paste(p1[1,], p1[2,])
  }

  #PENL
  #num;name;type
  if (sum(playbyplay$etype=="PENL") > 0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Z' \\.\\-]+), (.*)", "\\1;\\2;\\3",
                      playbyplay$etext[playbyplay$etype=="PENL"]),
                 function(tt) {
                   text1 <- unlist(strsplit (tt, ";"))
                   if (length(text1) != 3) text1 <- c("","","Bench Minor")
                   text1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="PENL"] <- paste(p1[1,], p1[2,])
    playbyplay$type[playbyplay$etype=="PENL"] <- p1[3,]
  }
  

  
  #SHOT
  #name happens one time in mixed case: 38 Robinson, 0304-2-0333
  if (sum(playbyplay$etype=="SHOT") > 0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Za-z' \\.\\-]+), +([A-Za-z '\\-]+), +([0-9]+) .*", "\\1;\\2;\\3;\\4",
                      playbyplay$etext[playbyplay$etype=="SHOT"]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[playbyplay$etype=="SHOT"] <- paste(p1[1,], toupper(p1[2,]))
    playbyplay$type[playbyplay$etype=="SHOT"] <- p1[3,]
    playbyplay$distance[playbyplay$etype=="SHOT"] <- as.numeric(p1[4,])
  }
  
  #GOAL
  #2 assists
  c2 <- which(playbyplay$etype=="GOAL" & grepl("A: +[0-9]+ +[A-Z' \\.\\-]+, +[0-9]+", playbyplay$etext))
  if (length(c2)>0) {
    p1 <- sapply(gsub(" *([0-9]+) +([A-Z' \\.\\-]+),? +A: +([0-9]+) +([A-Z' \\.\\-]+), +([0-9]+) +([A-Z' \\.\\-]+),? *([A-Za-z '\\-]+), +([0-9]+) .*",
                      "\\1;\\2;\\3;\\4;\\5;\\6;\\7;\\8",
                      playbyplay$etext[c2]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[c2] <- paste(p1[1,], p1[2,])
    playbyplay$ev.player.2[c2] <- paste(p1[3,], p1[4,])
    playbyplay$ev.player.3[c2] <- paste(p1[5,], p1[6,])
    playbyplay$type[c2] <- p1[7,]
    playbyplay$distance[c2] <- as.numeric(p1[8,])
  }
  
  #1 assist
  #name happens one time in mixed case: 22 Pock- 0304-2-1136
  c2 <- which(playbyplay$etype=="GOAL" & grepl("A: +[0-9]+ +[A-Z' \\.\\-]+, +[A-Za-z]+", playbyplay$etext))
  if (length(c2)>0) {
    p1 <- sapply(gsub(" *([0-9]+) ([A-Za-z' \\.\\-]+), A: +([0-9]+) ([A-Z' \\.\\-]+), ([A-Za-z '\\-]+), +([0-9]+) .*",
                      "\\1;\\2;\\3;\\4;\\5;\\6",
                      playbyplay$etext[c2]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[c2] <- paste(p1[1,], toupper(p1[2,]))
    playbyplay$ev.player.2[c2] <- paste(p1[3,], toupper(p1[4,]))
    playbyplay$type[c2] <- p1[5,]
    playbyplay$distance[c2] <- as.numeric(p1[6,])
  }
  
  #unassisted
  c2 <- which(playbyplay$etype=="GOAL" & grepl("A: +[A-Za-z'\\-]+, +[A-Za-z]+", playbyplay$etext))
  if (length(c2)>0) {
    p1 <- sapply(gsub(" *([0-9]+) ([A-Z' \\.\\-]+), A: [A-Za-z'\\-]+, ([A-Za-z '\\-]+), +([0-9]+).*",
                      "\\1;\\2;\\3;\\4",
                      playbyplay$etext[c2]),
                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[c2] <- paste(p1[1,], p1[2,])
    playbyplay$type[c2] <- p1[3,]
    playbyplay$distance[c2] <- as.numeric(p1[4,])
  }


  
  # players on ice for goals.
  #goal.rec <- array(0, c(dim(goal.players), 6))
  goal.evs <- which(playbyplay$etype=="GOAL")
  if (length(goal.evs) > 0)
    for (ii in 1:dim(goal.players)[1]) {
      for (jj in 1:2) {
                                        #home team?
        home <- 1*(gsub(" +([A-Z\\.]+).*", "\\1", goal.players[ii,jj])==teams[2])
        sq <- unlist(strsplit(goal.players[ii,jj], " "))
        #manual fix for space names like DE VRIES.
        for (xx in length(sq):2)
          if (grepl("[A-Z]+", sq[xx]) & grepl("[A-Z]+", sq[xx-1])) {
            sq[xx-1] <- paste(sq[xx-1], sq[xx])
            sq <- sq[-xx]
          }
        
        nums <- grep("[0-9]+", sq)
        playbyplay[goal.evs[ii],5+6*home+1:length(nums)] <- paste(sq[nums], sq[nums+1])
      }
    }

  playbyplay$homezone[playbyplay$homezone == "defensive"] <- "Def"
  playbyplay$homezone[playbyplay$homezone == "offensive"] <- "Off"
  playbyplay$homezone[playbyplay$homezone == "neutral"] <- "Neu"

  
  #remove text.
  playbyplay <- playbyplay[,-5]

  #flip zone to home zone referent.
  flip.1 <- playbyplay$ev.team == teams[1] & playbyplay$homezone=="Off"
  flip.2 <- playbyplay$ev.team == teams[1] & playbyplay$homezone=="Def"
  playbyplay$homezone[flip.1] <- "Def"
  playbyplay$homezone[flip.2] <- "Off"

  #Add this.
  playbyplay$homezone[playbyplay$etype %in% c("SHOT", "BLOCK", "GOAL", "MISS") & playbyplay$ev.team == teams[1]] <- "Def"
  playbyplay$homezone[playbyplay$etype %in% c("SHOT", "BLOCK", "GOAL", "MISS") & playbyplay$ev.team == teams[2]] <- "Off"


  output <- list(playbyplay=playbyplay,
                 teams=teams,
                 date=date)
                 
  return(output)
  
}



p.split <- function(bit, pieces) {
  out1 <- unlist(strsplit (bit, ";"))
  if (length(out1) == 1) out1 <- rep("", pieces)   
  out1
}

process.pl.new <- function (pl.html) {
  #pl.html = game.rec$pl
  
  teams <- pl.html[grep(".*On Ice.*", pl.html)][1:2]
  teams <- gsub("<td width=\"10%\" class=\"heading \\+ bborder\" align=\"center\">([A-Z\\.]+) On Ice</td>.*", "\\1", teams)

  date <- pl.html[grep("<td align=\"center\" style=\"font-size: 10px;font-weight:bold\">([A-Za-z]+), ([A-Za-z]+) ([0-9]+), ([0-9]+)</td>", pl.html)][1]
  date <- gsub("<td align=\"center\" style=\"font-size: 10px;font-weight:bold\">([A-Za-z]+), ([A-Za-z]+) ([0-9]+), ([0-9]+)</td>", "\\1;\\2;\\3;\\4", date)
  date <- unlist(strsplit(date, ";"))
    
  pl.main <- {
    find.1 <- grep("class=\"[a-z \\+]* bborder\"", pl.html)
    find.2 <- grep("<font style=\"cursor:hand;\"", pl.html)
    pl.reduced <- pl.html[sort(c(find.1, find.2))]
    pl.reduced <- gsub("&nbsp;", ", ", pl.reduced)
    pl.reduced <- gsub("; ", ", ", pl.reduced)
    pl.reduced <- gsub("[\uC9\uCA\uCB]", "E", pl.reduced)

    # p: player id.
    pl.reduced.1 <- gsub("<font style=\"cursor:hand;\" title=\"([A-Za-z ]+) - ([A-Za-z' \\.\\(\\)-]+)\">([0-9]+).*", "p;\\1;\\2;\\3", pl.reduced)
    # s: time
    pl.reduced.1 <- gsub("<td class=\"[a-z \\+]* \\+ bborder\" align=\"center\">([0-9]+):([0-9]+)<br.*", "s;\\1;\\2;", pl.reduced.1)
    # q: period number.
    pl.reduced.1 <- gsub("<td class=\"[a-z]* \\+ bborder\" align=\"center\">([0-9]+)</td>.*", "q;\\1;;", pl.reduced.1)
    
    # t: event type;
    pl.reduced.2 <- gsub("<td class=\"[a-z]* \\+ bborder\">([A-Z]{3,})</td>", "t;\\1;;", pl.reduced.1)
    pl.reduced.2 <- gsub("<td class=\"[a-z]* \\+ bborder\" align=\"center\">([A-Z]{3,})</td>", "t;\\1;;", pl.reduced.2)

    # x: event text;
    #pl.reduced.3 <- gsub("<td class=\"[a-z]* \\+ bborder\">([A-Za-z0-9',# \\(\\)\\.-]{3,})</td>", "x;\\1;;", pl.reduced.2)
    #pl.reduced.3 <- gsub("<td class=\"[a-z]* \\+ bborder\" align=\"center\">([A-Za-z0-9',# \\(\\)\\.-]{3,})</td>", "x;\\1;;", pl.reduced.3)
    pl.reduced.3 <- gsub("<td class=\"[a-z]* \\+ bborder\">(.{3,})</td>", "x;\\1;;", pl.reduced.2)
    pl.reduced.3 <- gsub("<td class=\"[a-z]* \\+ bborder\" align=\"center\">(.{3,})</td>", "x;\\1;;", pl.reduced.3)

    # e: event number;
    pl.reduced.4 <- gsub("<td align=\"center\" class=\"[a-z \\+]* \\+ bborder\">([0-9]+)</td>", "e;\\1;;", pl.reduced.3)

    # b: break;
    pl.reduced.5 <- gsub("<td class=\"[a-z \\+]* \\+ bborder\">", "b;;;", pl.reduced.4)
    
    pl.reduced.5[substr(pl.reduced.5, 1, 1) != "<"]
  }

  #fix one.
  pl.main <- gsub("Penalty Shot,", "Penalty Shot", pl.main)

  
  pl.grid <- sapply(pl.main, function(tt) {
    tt <- unlist(strsplit(tt, ";"))
    while (length(tt) < 4) tt <- c(tt, "")
    tt
  })
    
  playbyplay <- NULL
  new.event <- data.frame(event=0, period=0, seconds=0, etype="", etext="",
                         
                          a1=NA, a2=NA, a3=NA, a4=NA, a5=NA, a6=NA,
                          h1=NA, h2=NA, h3=NA, h4=NA, h5=NA, h6=NA)
  
  started <- FALSE; current.event <- NULL

  a.h <- rep(0,dim(pl.grid)[2])
  sw <- 1
  for (kk in 1:length(pl.main)) {
    if (pl.grid[1,kk] %in% c("e","b")) sw <- 1-sw
    a.h[kk] <- sw
  }
  pl.grid <- rbind(pl.grid, a.h)
  players <- unique(t(pl.grid[2:5, pl.grid[1,]=="p"]))
  rownames(players) <- NULL

  
  for (row in 1:length(pl.main)) {
    if (pl.grid[1, row] == "e") {
      if (started) playbyplay <- rbind(playbyplay, current.event) else started <- TRUE
      current.event <- new.event
      current.event$event <- as.numeric(pl.grid[2,row])
      coln <- 6
      ttype <- FALSE
      homeyet <- FALSE
    }
    if (pl.grid[1, row] == "q") current.event$period <- as.numeric(pl.grid[2,row])
    if (pl.grid[1, row] == "s") current.event$seconds <- 60*as.numeric(pl.grid[2,row])+as.numeric(pl.grid[3,row])
    if (pl.grid[1, row] == "t") current.event$etype <- pl.grid[2,row]
    if (pl.grid[1, row] == "x") current.event$etext <- pl.grid[2,row]

    if (pl.grid[1, row] == "p") {
#      current.event[1,coln] <- match(paste(pl.grid[3:4,row], collapse=" "),
#                                     paste(players[,2],players[,3]))
      current.event[1,coln] <- paste(pl.grid[4:3,row], collapse=" ")  #pl.grid[4,row]  #
      if (coln != 17 && coln != 11) coln <- coln + 1  
    }
    if (pl.grid[1, row] == "b") coln <- 12    #home team now.
  }

  playbyplay$ev.team <- substr(playbyplay$etext, 1, 3)

  playbyplay$ev.player.1 <- ""
  playbyplay$ev.player.2 <- ""
  playbyplay$ev.player.3 <- ""
  playbyplay$distance <- NA
  playbyplay$type <- ""
  playbyplay$homezone <- ""

  

  #BLOCK, GOAL, MISS, SHOT      FAC, GIVE, TAKE    PENL      HIT    
  #STOP  PEND 
  playbyplay <- playbyplay[!(playbyplay$etype %in% c("PSTR","GEND","ICING","OFFSIDE")),]


  
  #SHOT
  c1 <- which(playbyplay$etype=="SHOT")
  if (length(c1)>0) {
    p1 <- sapply(gsub("[A-Z \\.#-]*?([0-9]+) ([A-Z' \\.-]+), ([A-Za-z-]+),[A-Za-z,\\. ]*([0-9]+).*", "\\1;\\2;\\3;\\4", playbyplay$etext[c1]),
                 function(tt) {
                   out1 <- unlist(strsplit (tt, ";"))
                   if (length(out1) != 4) {  #No shot type listed.
                     out1 <- gsub(".*([0-9]+) ([A-Z' \\.-]+),[A-Za-z,\\. ]*([0-9]+).*", "\\1;\\2;\\3", out1)
                     out1 <- p.split(out1, 3)
                     out1 <- c(out1[1:2], "", out1[3])
                   }
                   out1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="SHOT"] <- paste(p1[1,], p1[2,])
    playbyplay$distance[playbyplay$etype=="SHOT"] <- as.numeric(p1[4,])
    playbyplay$type[playbyplay$etype=="SHOT"] <- p1[3,]
  }
  
  #BLOCK
  #num;name;num2;name2;type
  c1 <- which(playbyplay$etype=="BLOCK")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+) BLOCK[A-Z\\. ]+#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z \\.]+),.*", "\\1;\\2;\\3;\\4;\\5",
                      playbyplay$etext[c1]),
                 function(tt) {
                   out1 <- unlist(strsplit (tt, ";"))
                   if (length(out1)==1) {
                     out1 <- gsub(".*#([0-9]+) ([A-Z' \\.-]+) BLOCK[A-Z\\. ]+#([0-9]+) ([A-Z' \\.-]+),.*",
                                  "\\1;\\2;\\3;\\4",
                                  out1)
                     out1 <- p.split(out1, 4)
                     out1 <- c(out1, "")
                   }
                   out1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="BLOCK"] <- paste(p1[1,], p1[2,])
    playbyplay$ev.player.2[playbyplay$etype=="BLOCK"] <- paste(p1[3,], p1[4,])
    playbyplay$type[playbyplay$etype=="BLOCK"] <- p1[5,]
  }
  
  #MISS
  #num;name;type;homezone;distance
  #Wide of net; crossbar; post  -- ignored for now.

  c1 <- which(playbyplay$etype=="MISS")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".* #([0-9]+) ([A-Z' \\.-]+), ([A-Za-z -]+),[A-Za-z,\\. ]*[rt], ([A-Za-z]+)\\..* ([0-9]+) .*", "\\1;\\2;\\3;\\4;\\5", playbyplay$etext[c1]),
                 function(tt) {
                   out1 <- unlist(strsplit (tt, ";"))
                   if (length(out1)==1) { #assume missing net location.
                     out1 <- gsub(".* #([0-9]+) ([A-Z' \\.-]+), ([A-Za-z -]+), ([A-Za-z]+)\\..* ([0-9]+) .*",
                                  "\\1;\\2;\\3;\\4;\\5",
                                  out1)
                     out1 <- p.split(out1, 5)
                   }
                   out1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="MISS"] <- paste(p1[1,], p1[2,])
    playbyplay$distance[playbyplay$etype=="MISS"] <- as.numeric(p1[5,])
    playbyplay$type[playbyplay$etype=="MISS"] <- p1[3,]
  }
  
  #GOAL
  #num;name;type;distance
  #c2 <- which(playbyplay$etype=="GOAL" & grepl("Assists:", playbyplay$etext))
  c1 <- which(playbyplay$etype=="GOAL")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".* #([0-9]+) ([A-Z' \\.-]+)[\\(\\)0-9, ]+([A-Za-z -]+), [A-Za-z]+\\..* ([0-9]+) .*",
                      "\\1;\\2;\\3;\\4", playbyplay$etext[c1]),
                 function(tt) {
                   out1 <- unlist(strsplit (tt, ";"))
                   if (length(out1)==1) { #assume missing shot type.
                     out2 <- gsub(".* #([0-9]+) ([A-Z' \\.-]+)[\\(\\)0-9, ]+ [A-Za-z]+\\..* ([0-9]+) .*",
                                  "\\1;\\2;;\\3",
                                  out1)
                     if (out2 == out1) #missing zone.
                       out2 <- gsub(".* #([0-9]+) ([A-Z' \\.-]+)[\\(\\)0-9, ]+([A-Za-z -]+), ([0-9]+) .*",
                                  "\\1;\\2;\\3;\\4",
                                  out1)
                     out1 <- p.split(out2, 4)
                   }
                   out1
                 })
    playbyplay$ev.player.1[playbyplay$etype=="GOAL"] <- paste(p1[1,], p1[2,])
    playbyplay$distance[playbyplay$etype=="GOAL"] <- as.numeric(p1[4,])
    playbyplay$type[playbyplay$etype=="GOAL"] <- p1[3,]
  }
  
  c2 <- which(playbyplay$etype=="GOAL" & grepl("Assists:", playbyplay$etext))
  if (length(c2) > 0) {
    p1 <- sapply(gsub(".*Assists: #([0-9]+) ([A-Z' \\.-]+)[\\(\\)0-9]+, #([0-9]+) ([A-Z' \\.-]+)[\\(\\)0-9, ]+.*", "\\1;\\2;\\3;\\4", playbyplay$etext[c2]),
                 p.split, 4)
#                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.2[c2] <- paste(p1[1,], p1[2,])
    playbyplay$ev.player.3[c2] <- paste(p1[3,], p1[4,])
  }
  
  c1 <- which(playbyplay$etype=="GOAL" & grepl("Assist:", playbyplay$etext))
  if (length(c1) > 0) {
    p1 <- sapply(gsub(".*Assist: #([0-9]+) ([A-Z' \\.-]+).*", "\\1;\\2;\\", playbyplay$etext[c1]),
                 p.split, 2)
    playbyplay$ev.player.2[c1] <- paste(p1[1,], p1[2,])
  }
  
  
  #FAC
  #zone;num1;name1;num2;name2
  
  c1 <- which(playbyplay$etype=="FAC")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".* won ([A-Za-z]+)\\..*#([0-9]+) ([A-Z' \\.-]+) .*#([0-9]+) ([A-Z' \\.-]+).*", "\\1;\\2;\\3;\\4;\\5",
                      playbyplay$etext[c1]),
                 p.split, 5)
               
    playbyplay$ev.player.1[c1] <- paste(p1[2,], p1[3,])
    playbyplay$ev.player.2[c1] <- paste(p1[4,], p1[5,])
    playbyplay$homezone[c1] <- p1[1,]
  }
  
  # GIVE
  #num;name;zone
  c1 <- which(playbyplay$etype=="GIVE")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z]+)\\..*", "\\1;\\2;\\3",
                      playbyplay$etext[playbyplay$etype=="GIVE"]),
                 p.split, 3)
    
#                                function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[playbyplay$etype=="GIVE"] <- paste(p1[1,], p1[2,])
    playbyplay$homezone[playbyplay$etype=="GIVE"] <- p1[3,]
  }
  
  # TAKE
  #num;name;zone
  c1 <- which(playbyplay$etype=="TAKE")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z]+)\\..*", "\\1;\\2;\\3",
                      playbyplay$etext[playbyplay$etype=="TAKE"]),
                 p.split, 3)
    playbyplay$ev.player.1[playbyplay$etype=="TAKE"] <- paste(p1[1,], p1[2,])
    playbyplay$homezone[playbyplay$etype=="TAKE"] <- p1[3,]
  }

  # PENL
  c1 <- which(playbyplay$etype=="PENL" & grepl("Drawn By:", playbyplay$etext))
  if (length(c1) > 0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z\\(\\)0-9 -]+), ([A-Za-z]+)\\..*#([0-9]+) ([A-Z' \\.-]+).*", "\\1;\\2;\\3;\\4;\\5;\\6",
                      playbyplay$etext[c1]),
                 p.split, 6)
    playbyplay$ev.player.1[c1] <- paste(p1[1,], p1[2,])
    playbyplay$ev.player.2[c1] <- paste(p1[5,], p1[6,])
    playbyplay$homezone[c1] <- p1[4,]
    playbyplay$type[c1] <- p1[3,]
  }
  
  c1 <- which(playbyplay$etype=="PENL" & !grepl("Drawn By:", playbyplay$etext))
  if (length(c1) > 0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z\\(\\)0-9 :#-]+), ([A-Za-z]+)\\..*", "\\1;\\2;\\3;\\4",
                      playbyplay$etext[c1]),
                 function(tt) {
                   out1 <- unlist(strsplit (tt, ";"))
                   if (length(out1) == 1) out1 <- c("","","Bench minor", "")
                   out1
                 })
    playbyplay$ev.player.1[c1] <- paste(p1[1,], p1[2,])
    playbyplay$homezone[c1] <- p1[4,]
    playbyplay$type[c1] <- p1[3,]
  }
  

  #HIT
  c1 <- which(playbyplay$etype=="HIT")
  if (length(c1)>0) {
    p1 <- sapply(gsub(".*#([0-9]+) ([A-Z' \\.-]+) HIT.*#([0-9]+) ([A-Z' \\.-]+), ([A-Za-z]+).*", "\\1;\\2;\\3;\\4;\\5",
                      playbyplay$etext[playbyplay$etype=="HIT"]),
                 p.split, 5)
#                 function(tt) unlist(strsplit (tt, ";")))
    playbyplay$ev.player.1[playbyplay$etype=="HIT"] <- paste(p1[1,], p1[2,])
    playbyplay$ev.player.2[playbyplay$etype=="HIT"] <- paste(p1[3,], p1[4,])
    playbyplay$homezone[playbyplay$etype=="HIT"] <- p1[5,]
  }

  ########################################################################
  # replace number-last with number-first-last.

  #on.ice.players <- NULL
  #for (kk in 6:17) on.ice.players <- c(on.ice.players, playbyplay[,kk])
  #on.ice.players <- unique(on.ice.players)
  

  # seconds in game.
  playbyplay$seconds <- playbyplay$seconds + 1200*(playbyplay$period-1)

  # add approximate changes.
  orig.rec <- dim(playbyplay)[1]
  for (kk in 1:(orig.rec-1)) {
    past <- na.omit(unlist(playbyplay[kk,6:17]))
    future <- na.omit(unlist(playbyplay[kk+1,6:17]))
    if (playbyplay[kk,3] < playbyplay[kk+1,3]) 
      if (length(past) != length(future)) {
        new.record <- playbyplay[kk,]
        new.record[1,4] <- "CHANGE"
        new.record[1,5] <- "CHANGE"
        new.record$ev.team <- ""
        new.record[1,3] <- mean(c(playbyplay[kk,3], playbyplay[kk+1,3]))
        playbyplay <- rbind(playbyplay, new.record)
      } else if (any(past != future)) {
        new.record <- playbyplay[kk,]
        new.record[1,4] <- "CHANGE"
        new.record[1,5] <- "CHANGE"
        new.record$ev.team <- ""
        new.record[1,3] <- mean(c(playbyplay[kk,3], playbyplay[kk+1,3]))
        playbyplay <- rbind(playbyplay, new.record)
      }
  }
  playbyplay <- playbyplay[order(playbyplay[,3]),]
  playbyplay[,1] <- 1:dim(playbyplay)[1]

  #remove text.
  playbyplay <- playbyplay[,-5]

  #flip zone to home zone referent.
  flip.1 <- playbyplay$ev.team == teams[1] & playbyplay$homezone=="Off"
  flip.2 <- playbyplay$ev.team == teams[1] & playbyplay$homezone=="Def"
  playbyplay$homezone[flip.1] <- "Def"
  playbyplay$homezone[flip.2] <- "Off"

  playbyplay$homezone[playbyplay$etype=="CHANGE"] <- "Neu"
  stops <- which(playbyplay$ev.team != "ICI" & playbyplay$homezone == ""); playbyplay$homezone[stops] <- playbyplay$homezone[stops+1]

  playbyplay$homezone[playbyplay$etype %in% c("SHOT", "BLOCK", "GOAL", "MISS") & playbyplay$ev.team == teams[1]] <- "Def"
  playbyplay$homezone[playbyplay$etype %in% c("SHOT", "BLOCK", "GOAL", "MISS") & playbyplay$ev.team == teams[2]] <- "Off"

  #faceoffs follow stoppages, so cut them.
  playbyplay <- playbyplay[playbyplay$etype != "STOP",]
  
  
  output <- list(playbyplay=playbyplay,
                 players=players,
                 teams=teams,
                 date=date)
  return(output)
}



process.es.new <- function(es.html) {

  es.html <- gsub ("\uC9","E",es.html)
  es.html <- gsub ("\uCA","E",es.html)
  es.html <- gsub ("\uC8","E",es.html)
  rel.rows.1 <- grep("<tr", es.html)
  rel.rows.2 <- sort(c(outer(rel.rows.1, 1:3, "+")))
  picks <- es.html[rel.rows.2[grep(" *<td [awc ].*", es.html[rel.rows.2])]]

  picks.1 <- gsub(" *<td.*> *([0-9]{1,2})</td>", "n;\\1", picks)
  picks.2 <- gsub(" *<td.*>([CLRDG])</td>", "p;\\1", picks.1)
  picks.3 <- gsub(" *<td.*>([A-Za-z'\\.\\(\\), -]{6,})</td>", "m;\\1", picks.2)
  picks.3 <- picks.3[substr(picks.3, 1, 1) != "<"]

  if (length(picks)>2) {
    rk <- substr(picks.3,1,1)
    nms <- substr(picks.3,3,nchar(picks.3))
    lr <- length(rk)
    rk1 <- rbind(rk[1:(lr-2)], rk[2:(lr-1)], rk[3:lr])
    sts <- which(apply(rk1 == c("n","p","m"), 2, prod)>0)
  
    division <- grep("TEAM TOTALS", picks.3)[1]
  } else {nms <- rep("",3); sts <- NULL; division=3}
  
  names.mat <- data.frame(number=nms[sts],
                          pos=nms[sts+1],
                          lastfirst=toupper(nms[sts+2]),
                          hometeam=1*(sts>division), stringsAsFactors=FALSE)
  names.mat <- names.mat[grepl(",", names.mat$lastfirst),]
  names.mat$last <- gsub("(.*), .*", "\\1", names.mat$lastfirst)
  names.mat$first <- gsub(".*, (.*)", "\\1", names.mat$lastfirst)
  names.mat$numlast <- paste(names.mat$number, names.mat$last)
  names.mat$numfirstlast <- paste(names.mat$number, names.mat$first, names.mat$last)

  #remove duplicates.
  lastfirst <- unique(names.mat$lastfirst)
  if (length(lastfirst) != dim(names.mat)[1]) for (mm in lastfirst) {
    rws <- rev(which(names.mat$lastfirst == mm))
    if (length(rws)>1) names.mat <- names.mat[-rws[2:length(rws)],]
  }
  
  names.mat <- names.mat[order(names.mat$hometeam, as.numeric(names.mat$number)),]
  return(names.mat)
  
}



integrate.new.pieces <- function (es.html, pl.html) {
  #load("nhlr-data/20082009-20022.RData")
  #es.html=game.rec$es; pl.html=game.rec$pl
  
  es.table <- try(process.es.new(es.html), TRUE)
  if (class(es.table)=="try-error") es.table <- NULL
  pl.table <- try(process.pl.new(pl.html), TRUE)
  if (class(pl.table)=="try-error") pl.table <- NULL

  game.times <- get.game.start.end(es.html)

  # replace with full names.
  for (ii in which(colnames(pl.table$playbyplay) %in% c("ev.player.1", "ev.player.2", "ev.player.3")))
    pl.table$playbyplay[,ii] <- es.table$numfirstlast[match(pl.table$playbyplay[,ii],
                                                             es.table$numlast)]
    
  pl.table$players <- es.table
  pl.table$game.times <- game.times
  return(pl.table)
  
}

integrate.old.pieces <- function (home.image, away.image,
                                  es.html, pl.html, game) {
  #home.image=game.rec$imh; away.image=game.rec$imv; es.html=game.rec$es; pl.html=game.rec$pl

    
  es.table <- try(process.es.new(es.html), TRUE)
  if (class(es.table)=="try-error") es.table <- NULL
  pl.table <- try(process.pl.old(pl.html), TRUE)
  if (class(pl.table)=="try-error") pl.table <- NULL

  game.times <- get.game.start.end(es.html)
  
  if (length(pl.table) > 0) {
    if (substr(game,1,1)=="2") pl.table$playbyplay <- pl.table$playbyplay[pl.table$playbyplay$seconds <= 3900,]
  
  # replace with full names.
    for (ii in which(colnames(pl.table$playbyplay) %in% c("ev.player.1", "ev.player.2", "ev.player.3")))
      pl.table$playbyplay[,ii] <- es.table$numfirstlast[match(pl.table$playbyplay[,ii],
                                                              es.table$numlast)]
    for (ii in which(colnames(pl.table$playbyplay) %in% c(paste0("a",1:6), paste0("h",1:6))))
      pl.table$playbyplay[pl.table$playbyplay$etype=="GOAL",ii] <-
        es.table$numfirstlast[match(pl.table$playbyplay[pl.table$playbyplay$etype=="GOAL",ii],
                                    es.table$numlast)]
    es.home <- es.table[es.table$hometeam==1,]; es.home <- rbind(es.home, "")
    es.away <- es.table[es.table$hometeam==0,]; es.away <- rbind(es.away, "")
    
    if (length(home.image) == 0) warning ("Home image file does not exist.")
    if (length(away.image) == 0) warning ("Away image file does not exist.")
  
    home.players <- try(pick.out.features (pl.table$playbyplay$seconds, home.image), TRUE)
    if (class(home.players)=="try-error") home.players <- array(0, c(6, length(pl.table$playbyplay$seconds)))
    away.players <- try(pick.out.features (pl.table$playbyplay$seconds, away.image), TRUE)
    if (class(away.players)=="try-error") away.players <- array(0, c(6, length(pl.table$playbyplay$seconds)))
    
    home.players[home.players==0] <- nrow(es.home)
    away.players[away.players==0] <- nrow(es.away)

  # Players on ice.
    home.players.names <- t(array(es.home$numfirstlast[home.players], dim(home.players)))
    away.players.names <- t(array(es.away$numfirstlast[away.players], dim(away.players)))
    
    pl.table$playbyplay[pl.table$playbyplay$etype!="GOAL",5:10] <- away.players.names[pl.table$playbyplay$etype!="GOAL",]
    pl.table$playbyplay[pl.table$playbyplay$etype!="GOAL",6+5:10] <- home.players.names[pl.table$playbyplay$etype!="GOAL",]


  #Add changes.
    orig.rec <- dim(pl.table$playbyplay)[1]
    for (kk in 1:(orig.rec-1)) {
      past <- na.omit(unlist(pl.table$playbyplay[kk,6:17-1]))
      future <- na.omit(unlist(pl.table$playbyplay[kk+1,6:17-1]))
      if (pl.table$playbyplay[kk,3] < pl.table$playbyplay[kk+1,3]) 
        if (length(past) != length(future)) {
          new.record <- pl.table$playbyplay[kk,]
          new.record[1,4] <- "CHANGE"
          new.record$ev.team <- ""
          new.record[1,3] <- mean(c(pl.table$playbyplay[kk,3], pl.table$playbyplay[kk+1,3]))
          pl.table$playbyplay <- rbind(pl.table$playbyplay, new.record)
        } else if (any(sort(past) != sort(future))) {
          new.record <- pl.table$playbyplay[kk,]
          new.record[1,4] <- "CHANGE"
          new.record$ev.team <- ""
          new.record[1,3] <- mean(c(pl.table$playbyplay[kk,3], pl.table$playbyplay[kk+1,3]))
          pl.table$playbyplay <- rbind(pl.table$playbyplay, new.record)
        }
    }
    pl.table$playbyplay <- pl.table$playbyplay[order(pl.table$playbyplay[,3]),]
    pl.table$playbyplay[,1] <- 1:dim(pl.table$playbyplay)[1]
    pl.table$playbyplay$homezone[pl.table$playbyplay$etype=="CHANGE"] <- "Neu"
    
    pl.table$players <- es.table
    pl.table$game.times <- game.times
  } else {pl.table <- list(players=NULL, playbyplay=NULL, teams=NULL, date=NULL, game.times=game.times)}  
  
  return(pl.table)

}

Try the nhlscrapr package in your browser

Any scripts or data that you put into this service are public.

nhlscrapr documentation built on May 29, 2017, 12:41 p.m.