# Functions for loading data from the HTML files.
full.game.database <- function () {
game.roster <- NULL
seasons <- c("20022003", "20032004", "20052006", "20062007",
"20072008", "20082009", "20092010", "20102011",
"20112012", "20122013", "20132014")
games <- c(rep(1230,9), 720, 1230)
#Noted difficulties with existing data in regular season.
bad.game.list <- list(c(1:127, 134,135, #Not in system. 0203
#419, 483,
582, 598, 872), #bad images.
c(10, 251, 453, 456, 482, 802), #0304
c(18, 140, #Visitor GIF makes segfault. 0506
298, #wrong ES file -- 398 instead of 298.
458, #bogus ES file.
974), #0506
c(1024), #missing line of players for a goal. #0607
c(), c(259, 409, 1077), #0708 0809
c(81, 827, 836, 857, 863, 874), c(429), #0910 1011
c(259), c(), c()) #1112 1213
bad.playoff <- matrix(c("20032004", "0134",
"20052006", "0233"), nrow=2) #ucase TR TD
# Playoff brackets.
game.rec <- array(NA, c(7*15, 3)); count <- 0
for (kk in 1:4) for (ll in 1:2^(4-kk)) for (mm in 1:7) {
count <- count+1
game.rec[count,] <- c(kk,ll,mm)
}
gnum <- paste0("0", game.rec[,1], game.rec[,2], game.rec[,3])
#Game roster. Game session; Game number;
#To add: away team; home team. score. Date.
for (ss in 1:length(seasons)) {
gn1 <- as.character(1:games[ss]); while(any(nchar(gn1)<4)) gn1[nchar(gn1)<4] <- paste0("0", gn1[nchar(gn1)<4])
df1 <- data.frame(season=seasons[ss],
session=c(rep("Regular", games[ss]), rep("Playoffs", length(gnum))),
gamenumber=c(gn1, gnum),
awayteam="",
hometeam="",
awayscore="",
homescore="",
length="",
date="",
valid=c(!(1:games[ss] %in% bad.game.list[[ss]]), rep(TRUE, length(gnum))),
stringsAsFactors=FALSE)
game.roster <- rbind(game.roster, df1)
}
game.roster[,1] <- as.character(game.roster[,1])
game.roster[,2] <- as.character(game.roster[,2])
for (kk in 1:dim(bad.playoff)[2]) game.roster$valid[game.roster$season==bad.playoff[1,kk] &
game.roster$session=="Playoffs" &
game.roster$gamenumber==bad.playoff[2,kk]] <- FALSE
game.roster$gcode <- paste0(2+1*(game.roster$session=="Playoffs"), game.roster$gamenumber)
return(game.roster)
}
download.single.game <- function (season="20122013", gcode="20001", rdata.folder="nhlr-data", verbose=TRUE) {
#season="20122013"; gcode="20018"; rdata.folder="nhlr-data"; verbose=TRUE
if (!(season %in% c("20022003", "20032004", "20052006", "20062007",
"20072008", "20082009", "20092010", "20102011",
"20112012", "20122013", "20132014"))) stop(paste("Invalid season: ",season))
if (verbose) message(paste("Downloading files for game", season, gcode))
error.free <- TRUE
game.rec <- list()
infile <- paste("http://www.nhl.com/scores/htmlreports/",season,"/ES0",gcode,".HTM",sep="")
game.rec$es <- try(unlist(strsplit(gsub("\\t", "", gsub("\\r", "", getURL(infile))), "\n")), TRUE)
if (class(game.rec$es) == "try-error") game.rec$es <- NULL #error.free <- FALSE
infile <- paste("http://www.nhl.com/scores/htmlreports/",season,"/PL0",gcode,".HTM",sep="")
game.rec$pl <- try(unlist(strsplit(gsub("\\t", "", gsub("\\r", "", getURL(infile))), "\n")), TRUE)
if (class(game.rec$pl) == "try-error") game.rec$pl <- NULL #error.free <- FALSE
#see if x-y is there.
infile <- paste0("http://live.nhl.com/GameData/",season,"/",substr(season,1,4),"0",gcode,"/PlayByPlay.json")
file2 <- try(getURL(infile), TRUE)
if (class(file2) != "try-error") {
game.rec$xy <- try(fromJSON(file2))
if (class(game.rec$xy) == "try-error") game.rec$xy <- NULL
} else game.rec$xy <- NULL
if (season %in% c("20022003", "20032004", "20052006", "20062007")) {
infile <- paste("http://www.nhl.com/scores/htmlreports/",season,"/SCH",gcode,".gif",sep="")
outfile <- paste0(rdata.folder,"/",season,"H",gcode,".gif")
g1 <- try(download.file(infile, outfile, mode="wb"), TRUE)
if (class(g1) == "try-error") {
game.rec$imh <- NULL #error.free <- FALSE
} else {game.rec$imh <- read.gif(outfile)$image; file.remove(outfile)}
infile <- paste("http://www.nhl.com/scores/htmlreports/",season,"/SCV",gcode,".gif",sep="")
outfile <- paste0(rdata.folder,"/",season,"V",gcode,".gif")
g1 <- try(download.file(infile, outfile, mode="wb"), TRUE)
if (class(g1) == "try-error") {
game.rec$imv <- NULL #error.free <- FALSE
} else {game.rec$imv <- read.gif(outfile)$image; file.remove(outfile)}
}
#if (!error.free) game.rec <- NULL
suppressWarnings(dir.create(rdata.folder))
if (length(game.rec$es) > 10 & length(game.rec$pl) > 10) save (game.rec, file=paste0(rdata.folder, "/", season, "-", gcode, ".RData")) else error.free <- FALSE
return (error.free)
}
download.games <- function (games=full.game.database(), rdata.folder="nhlr-data") {
#games=full.game.database(); games = games[games$session=="Playoffs" & games[,1] == "20122013",]
for (kk in 1:dim(games)[1]) if (games$valid[kk])
games$valid[kk] <- download.single.game(games$season[kk],
paste0(2+1*(games$session[kk]=="Playoffs"),
games$gamenumber[kk]))
return(games)
}
process.single.game <- function (season="20122013", gcode="20001",
rdata.folder="nhlr-data",
override.download=FALSE,
save.to.file=TRUE) {
#season="20122013"; gcode="20712"; rdata.folder="nhlr-data"; override.download=FALSE; save.to.file=TRUE
if (!file.exists(paste0(rdata.folder, "/", season, "-", gcode, ".RData")) | override.download) {
#For Matt Taddy, 7-10-13.
#message("process.single.game -- Current directory:",getwd()); message(season, gcode, rdata.folder)
dl.time <- download.single.game(season, gcode, rdata.folder)
}
if (file.exists(paste0(rdata.folder, "/", season, "-", gcode, ".RData"))) {
game.rec <- NULL #loaded in next line.
load (paste0(rdata.folder, "/", season, "-", gcode, ".RData"))
if (season %in% c("20022003", "20032004", "20052006", "20062007") & (is.null(game.rec$imh) | is.null(game.rec$imv))) {
message("Re-downloading single game files due to incompleteness in graphics files.")
dl.time <- download.single.game(season, gcode, rdata.folder)
load (paste0(rdata.folder, "/", season, "-", gcode, ".RData"))
}
#game.rec
if (!is.null(game.rec)) {
if (length(game.rec$es)>10 & length(game.rec$pl>10)) {
if (season %in% c("20022003", "20032004", "20052006", "20062007")) {
suppressWarnings(game.info <- integrate.old.pieces (game.rec$imh, game.rec$imv,
game.rec$es, game.rec$pl, gcode))
} else {
suppressWarnings(game.info <- integrate.new.pieces (game.rec$es, game.rec$pl))
}
} else game.info <- list(playbyplay=data.frame(), teams=c("",""),
date=rep("",4), players=data.frame())
} else game.info <- list(playbyplay=data.frame(), teams=c("",""),
date=rep("",4), players=data.frame())
if (length(game.info$playbyplay) > 0) {
#xy?
game.info$playbyplay$xcoord <- NA; game.info$playbyplay$ycoord <- NA
if (!is.null(game.rec$xy)) game.info <- match.xy(game.info, game.rec$xy)
playbyplay <- game.info$playbyplay
#Changes need scrubbing
playbyplay$distance[playbyplay$etype=="CHANGE"] <- NA
playbyplay$type[playbyplay$etype=="CHANGE"] <- ""
playbyplay$homezone[playbyplay$etype=="CHANGE"] <- "Neu"
playbyplay$ev.player.1[playbyplay$etype=="CHANGE"] <- ""
playbyplay$ev.player.2[playbyplay$etype=="CHANGE"] <- ""
playbyplay$ev.player.3[playbyplay$etype=="CHANGE"] <- ""
playbyplay$xcoord[playbyplay$etype=="CHANGE"] <- NA
playbyplay$ycoord[playbyplay$etype=="CHANGE"] <- NA
#teams.
playbyplay$awayteam <- game.info$teams[1]
playbyplay$hometeam <- game.info$teams[2]
dateinfo <- strptime(paste(game.info$date, collapse=" "), "%a %b %d %Y")
ydays <- cumsum(c(0, 365, 366,
365, 365, 365, 366,
365, 365, 365, 366,
365, 365, 365, 366)) #through 2016.
refdate <- dateinfo$yday+ydays[as.numeric(game.info$date[4])-2001]
if (length(refdate) == 0) refdate <- 0
playbyplay <- cbind(season, gcode, refdate, playbyplay)
## add indicator if period is a shootout
playbyplay$shootout <- 0
playbyplay$shootout[which(as.numeric(as.character(playbyplay$gcode))<30000 & playbyplay$period==5)] <- 1
#goals and game score.
playbyplay$away.score <- playbyplay$home.score <- 0
home.goals <- which(playbyplay$etype=="GOAL" & playbyplay$ev.team==game.info$teams[2])
if (length(home.goals)>0) for (gg in 1:length(home.goals)) if (home.goals[gg] < dim(playbyplay)[1])
playbyplay$home.score[(home.goals[gg]+1):dim(playbyplay)[1]] <-
playbyplay$home.score[(home.goals[gg]+1):dim(playbyplay)[1]] + 1
away.goals <- which(playbyplay$etype=="GOAL" & playbyplay$ev.team==game.info$teams[1])
if (length(away.goals)>0) for (gg in 1:length(away.goals)) if (away.goals[gg] < dim(playbyplay)[1])
playbyplay$away.score[(away.goals[gg]+1):dim(playbyplay)[1]] <-
playbyplay$away.score[(away.goals[gg]+1):dim(playbyplay)[1]] + 1
#event length.
playbyplay$event.length <- playbyplay$seconds
playbyplay$event.length[2:dim(playbyplay)[1]] <-
playbyplay$event.length[2:dim(playbyplay)[1]] - playbyplay$event.length[1:(dim(playbyplay)[1]-1)]
#identify goaltenders, remove from player lists.
player.list <- game.info$players
player.list <- rbind(player.list, "")
playbyplay$away.G <- ""; for (kk in paste0("a",1:6)) {
playbyplay[is.na(playbyplay[,kk]),kk] <- ""
column <- playbyplay[,kk];
pl.match <- match(column, player.list$numfirstlast)
picks <- player.list$pos[pl.match]; picks[is.na(picks)] <- ""
if (sum(picks=="G")>0) {
playbyplay$away.G[picks=="G"] <-
player.list$numfirstlast[pl.match][picks=="G"]
playbyplay[picks=="G", kk] <- ""
}
}
playbyplay$home.G <- ""; for (kk in paste0("h",1:6)) {
playbyplay[is.na(playbyplay[,kk]),kk] <- ""
column <- playbyplay[,kk];
pl.match <- match(column, player.list$numfirstlast)
picks <- player.list$pos[pl.match]; picks[is.na(picks)] <- ""
if (sum(picks=="G")>0) {
playbyplay$home.G[picks=="G"] <-
player.list$numfirstlast[pl.match][picks=="G"]
playbyplay[picks=="G", kk] <- ""
}
}
#patch.for.shottypes <- function(playbyplay)
playbyplay$home.skaters <- apply(1*(playbyplay[,c(paste0("h",1:6), "home.G")] != ""), 1, sum)
playbyplay$away.skaters <- apply(1*(playbyplay[,c(paste0("a",1:6), "away.G")] != ""), 1, sum)
etypes <- c("GOAL","SHOT","MISS","BLOCK")
shot.rows <- which(playbyplay$etype %in% etypes)
playbyplay$type[shot.rows][playbyplay$type[shot.rows] == "Tip-in"] <- "Tip-In"
playbyplay$type[shot.rows][playbyplay$type[shot.rows] == "Wrap-around"] <- "Wrap"
shotstyles <- c("Backhand", "Tip-In", "Wrist", "Snap", "Slap", "Wrap", "Deflected")
playbyplay$type[shot.rows][!(playbyplay$type[shot.rows] %in% shotstyles)] <- "Unspecified"
playbyplay$distance <- as.numeric(playbyplay$distance)
rownames(playbyplay) <- 1:nrow(playbyplay)
game.info$playbyplay <- playbyplay
}
if (save.to.file) save (game.info, file=paste0(rdata.folder, "/", season, "-", gcode, "-processed.RData"))
} else game.info <- NULL
return(game.info)
}
process.games <- function (games=full.game.database(),
rdata.folder="nhlr-data",
override.download=FALSE) {
#games=full.game.database(); games = games[5341:5456,]
item <- list()
for (kk in which(games$valid)) {
message (paste(kk, games[kk,1], paste0(2+1*(games$session[kk]=="Playoffs"), games$gamenumber[kk])))
item <- #c(item,
process.single.game(
games$season[kk],
games$gcode[kk],
rdata.folder=rdata.folder,
override.download=override.download,
save.to.file=TRUE)
# )
}
#save(item, file="all-archived.RData")
return(TRUE)
}
reset.season <- function (games=full.game.database(),
season="20132014",
rdata.folder="nhlr-data") {
games$valid[games$season==season] <- TRUE
for (kk in which(games$season==season)) {
#Delete the games with empty play by play tables.
f1p <- paste0(rdata.folder, "/", games$season[kk], "-", games$gcode[kk], "-processed.RData")
f1o <- paste0(rdata.folder, "/", games$season[kk], "-", games$gcode[kk], ".RData")
if (file.exists(f1p)) {
load(f1p)
if (length(game.info$playbyplay)==0) {file.remove(f1p); file.remove(f1o)}
}
}
return(games)
}
retrieve.game <- function (season="20122013",
gcode="20001",
rdata.folder="nhlr-data",
force=TRUE) {
#season="20122013"; gcode="20001"; rdata.folder="nhlr-data"
if (!file.exists(paste0(rdata.folder, "/", season, "-", gcode, "-processed.RData"))) {
if (force) game.info <- process.single.game (season, gcode, rdata.folder, save.to.file=TRUE) else game.info <- NULL
} else load (paste0(rdata.folder, "/", season, "-", gcode, "-processed.RData"))
return(game.info)
}
augment.game <- function (game.info, player.list) {
#game.info=sample.game; player.list=roster; season=""; gcode=""
playbyplay <- game.info$playbyplay; teams <- game.info$teams
if (length(playbyplay) == 0) stop ("Play-by-play table does not exist.")
if (length(player.list) == 0) stop ("Player roster does not exist.")
#replace players with ID numbers.
for (cc in c(paste0("a",1:6), paste0("h",1:6), "away.G", "home.G", "ev.player.1", "ev.player.2", "ev.player.3")) {
replacement <- player.list$player.id[match(playbyplay[,cc], player.list$numfirstlast)]
if (is.null(replacement)) replacement <- rep(NA, dim(playbyplay)[1])
playbyplay[,cc] <- replacement
playbyplay[is.na(playbyplay[,cc]),cc] <- 1
}
#playbyplay <- patch.for.shottypes(playbyplay)
return(playbyplay)
}
fix.names.manually <- function (master.list) {
#one name, two players.
master.list$first[which(master.list$last=="PICARD" & master.list$first=="ALEXANDRE" & master.list$pos == "D")] <- "ALEXANDRE R."
#manual fixes.
master.list$last[which(master.list$last=="ANDERSSON" & master.list$first=="CRAIG")] <- "ANDERSON"
master.list$first[which(master.list$last=="ANTROPOV" & master.list$first=="NIKOLAI")] <- "NIK"
master.list$first[which(master.list$last=="AULD" & master.list$first=="ALEXANDER")] <- "ALEX"
master.list$first[which(master.list$last=="AXELSSON" & master.list$first=="PER JOHAN")] <- "P.J."
master.list$first[which(master.list$first=="P. J. ")] <- "P.J."
master.list$first[which(master.list$last=="BAILEY" & master.list$first=="JOSHUA")] <- "JOSH"
master.list$first[which(master.list$last=="BARCH" & master.list$first=="KRYSTOFER")] <- "KRYS"
master.list$first[which(master.list$last=="BARKER" & master.list$first=="CAMERON")] <- "CAM"
master.list$first[which(master.list$last=="BERGFORS" & master.list$first=="NICKLAS")] <- "NICLAS"
master.list$first[which(master.list$last=="BLACKBURN" & master.list$first=="DANIEL")] <- "DAN"
master.list$first[which(master.list$last=="BLAKE" & master.list$first=="ROBERT")] <- "ROB"
master.list$first[which(master.list$last=="BLUNDEN" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="BOURQUE" & master.list$first=="CHRISTOPHER")] <- "CHRIS"
master.list$first[which(master.list$last=="BOYNTON" & master.list$first=="NICHOLAS")] <- "NICK"
master.list$first[which(master.list$last=="BRIERE" & master.list$first=="DANNY")] <- "DANIEL"
master.list$first[which(master.list$last=="BRYZGALOV" & master.list$first=="ILJA")] <- "ILYA"
master.list$first[which(master.list$last=="BURROWS" & master.list$first=="ALEXANDRE")] <- "ALEX"
master.list$first[which(master.list$last=="CAMMALLERI" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="CARCILLO" & master.list$first=="DANIEL")] <- "DAN"
master.list$first[which(master.list$last=="CARLE" & master.list$first=="MATTHEW")] <- "MATT"
master.list$first[which(master.list$last=="CLEARY" & master.list$first=="DAN")] <- "DANIEL"
master.list$first[which(master.list$last=="CLEARY" & master.list$first=="DANNY")] <- "DANIEL"
master.list$first[which(master.list$last=="CORVO" & master.list$first=="JOSEPH")] <- "JOE"
master.list$first[which(master.list$last=="CRABB" & master.list$first=="JOSEPH")] <- "JOEY"
master.list$first[which(master.list$last=="CROMBEEN" & master.list$first=="BJ")] <- "B.J."
master.list$first[which(master.list$last=="CROMBEEN" & master.list$first=="BRANDON")] <- "B.J."
master.list$first[which(master.list$last=="DADONOV" & master.list$first=="EVGENII")] <- "EVGENY"
master.list$first[which(master.list$last=="DOWD" & master.list$first=="JAMES")] <- "JIM"
master.list$first[which(master.list$last=="DOWELL" & master.list$first=="JACOB")] <- "JAKE"
master.list$first[which(master.list$last=="DRAZENOVIC" & master.list$first=="NICHOLAS")] <- "NICK"
master.list$first[which(master.list$last=="DUMONT" & master.list$first=="J P")] <- "JEAN-PIERRE"
master.list$first[which(master.list$last=="DUMONT" & master.list$first=="J-P")] <- "JEAN-PIERRE"
master.list$first[which(master.list$last=="EARL" & master.list$first=="ROBBIE")] <- "ROBERT"
master.list$first[which(master.list$last=="FERNANDEZ" & master.list$first=="EMMANUEL")] <- "MANNY"
master.list$first[which(master.list$last=="FROLOV" & master.list$first=="ALEXANDER")] <- "ALEX"
master.list$first[which(master.list$first=="TJ")] <- "T.J."
master.list$first[which(master.list$last=="GAUTHIER" & master.list$first=="DENIS JR.")] <- "DENIS"
master.list$first[which(master.list$last=="GIGUERE" & master.list$first=="J")] <- "JEAN-SEBASTIEN"
master.list$first[which(master.list$last=="GIRARDI" & master.list$first=="DAN")] <- "DANIEL"
master.list$first[which(master.list$last=="GREENE" & master.list$first=="ANDY")] <- "ANDREW"
master.list$first[which(master.list$last=="GREER" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="GROSSMAN" & master.list$first=="NIKLAS")] <- "NICKLAS"
master.list$last[which(master.list$last=="GROSSMAN" & master.list$first=="NICKLAS")] <- "GROSSMANN"
master.list$first[which(master.list$last=="GUENIN" & master.list$first=="NATE")] <- "NATHAN"
master.list$first[which(master.list$last=="HALKO" & master.list$first=="STEVE")] <- "STEVENN"
master.list$first[which(master.list$last=="HIGGINS" & master.list$first=="CHRISTOPHER")] <- "CHRIS"
master.list$last[which(master.list$last=="HILLEN III" & master.list$first=="JOHN")] <- "HILLEN"
master.list$first[which(master.list$last=="HILLEN" & master.list$first=="JOHN")] <- "JACK"
master.list$first[which(master.list$last=="HOLIK" & master.list$first=="ROBERT")] <- "BOBBY"
master.list$first[which(master.list$last=="HOWARD" & master.list$first=="JAMES")] <- "JIMMY"
master.list$first[which(master.list$last=="IRWIN" & master.list$first=="MATTHEW")] <- "MATT"
master.list$first[which(master.list$last=="JACKMAN" & master.list$first=="RICHARD")] <- "RIC"
master.list$first[which(master.list$last=="JACQUES" & master.list$first=="J-F")] <- "JEAN-FRANCOIS"
master.list$first[which(master.list$last=="JOHANSSON" & master.list$first=="MATTIAS")] <- "MATHIAS"
master.list$first[which(master.list$last=="KALINSKI" & master.list$first=="JONATHON")] <- "JON"
master.list$last[which(master.list$last=="KASTSITSYN")] <- "KOSTITSYN"
master.list$first[which(master.list$last=="KOSTITSYN" & master.list$first=="SIARHEI")] <- "SERGEI"
master.list$first[which(master.list$last=="KILLORN" & master.list$first=="ALEXANDER")] <- "ALEX"
master.list$first[which(master.list$last=="KING" & master.list$first=="DWAYNE")] <- "D.J."
master.list$first[which(master.list$first=="DJ")] <- "D.J."
master.list$first[which(master.list$last=="KNUBLE" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="KOLANOS" & master.list$first=="KRYSTOFER")] <- "KRYS"
master.list$first[which(master.list$last=="KOMISAREK" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="KONDRATIEV" & master.list$first=="MAX")] <- "MAXIM"
master.list$first[which(master.list$last=="KOVALEV" & master.list$first=="ALEXEI")] <- "ALEX"
master.list$first[which(master.list$last=="KRONVALL")] <- "KRONWALL"
master.list$first[which(master.list$last=="LEGACE" & master.list$first=="EMMANUEL")] <- "MANNY"
master.list$first[which(master.list$last=="LETANG" & master.list$first=="KRISTOPHER")] <- "KRIS"
master.list$first[which(master.list$last=="MACIAS" & master.list$first=="RAYMOND")] <- "RAY"
master.list$first[which(master.list$last=="MACLEAN" & master.list$first=="DONALD")] <- "DON"
master.list$last[which(master.list$last=="MAGNAN-GRENIER")] <- "MAGNAN"
master.list$first[which(master.list$last=="MAYOROV" & master.list$first=="MAXIM")] <- "MAKSIM"
master.list$first[which(master.list$last=="MCCOLLUM" & master.list$first=="TOM")] <- "THOMAS"
master.list$first[which(master.list$last=="MCGILLIS" & master.list$first=="DAN")] <- "DANIEL"
master.list$last[which(master.list$last=="MEYER IV")] <- "MEYER"
master.list$first[which(master.list$last=="MEYER" & master.list$first=="FREDDY")] <- "FREDERICK"
master.list$first[which(master.list$last=="MILLER" & master.list$first=="ANDREW")] <- "DREW"
master.list$first[which(master.list$last=="MILLS" & master.list$first=="BRADLEY")] <- "BRAD"
master.list$first[which(master.list$last=="MODANO" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="MODIN" & master.list$first=="FREDDY")] <- "FREDRIK"
master.list$first[which(master.list$last=="NEIL" & master.list$first=="CHRISTOPHER")] <- "CHRIS"
master.list$first[which(master.list$last=="ODUYA" & master.list$first=="DAVID JOHNNY")] <- "JOHNNY"
master.list$first[which(master.list$last=="ODUYA" & master.list$first=="JOHN")] <- "JOHNNY"
master.list$last[which(master.list$last=="ORTMYER" & master.list$first=="JED")] <- "ORTMEYER"
master.list$first[which(master.list$last=="OVECHKIN" & master.list$first=="ALEXANDER")] <- "ALEX"
master.list$first[which(master.list$last=="PARENTEAU" & master.list$first=="PIERRE")] <- "P.A."
master.list$first[which(master.list$last=="PARENTEAU" & master.list$first=="PA")] <- "P.A."
master.list$first[which(master.list$last=="PELLEY" & master.list$first=="RODNEY")] <- "ROD"
master.list$first[which(master.list$last=="PEVERLEY" & master.list$first=="JOHN")] <- "RICH"
master.list$first[which(master.list$last=="POULIOT" & master.list$first=="MARC")] <- "MARC-ANTOINE"
master.list$first[which(master.list$last=="PROSPAL" & master.list$first=="VINNY")] <- "VACLAV"
master.list$first[which(master.list$last=="PURCELL" & master.list$first=="EDWARD")] <- "TEDDY"
master.list$last[which(master.list$last=="PUSHKAREV" & master.list$first=="KONSTANTIN")] <- "PUSHKARYOV"
master.list$first[which(master.list$last=="REINPRECHT" & master.list$first=="STEVE")] <- "STEVEN"
master.list$first[which(master.list$last=="RISSMILLER" & master.list$first=="PAT")] <- "PATRICK"
master.list$first[which(master.list$last=="RUPP" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="SANTORELLI" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="SCUDERI" & master.list$first=="ROBERT")] <- "ROB"
master.list$first[which(master.list$last=="SESTITO" & master.list$first=="TOMMY")] <- "TOM"
master.list$last[which(master.list$last=="SHISKANOV" & master.list$first=="TIMOFEI")] <- "SHISHKANOV"
master.list$first[which(master.list$last=="SILLINGER" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="SIM" & master.list$first=="JON")] <- "JONATHAN"
master.list$first[which(master.list$last=="SIMON" & master.list$first=="BEN")] <- "BENJAMIN"
master.list$first[which(master.list$last=="STAJAN" & master.list$first=="MATTHEW")] <- "MATT"
master.list$first[which(master.list$last=="STEEN" & master.list$first=="ALEXANDER")] <- "ALEX"
master.list$last[which(master.list$last=="ST LOUIS" & master.list$first=="MARTIN")] <- "ST. LOUIS"
master.list$first[which(master.list$last=="STORTINI" & master.list$first=="ZACHERY")] <- "ZACK"
master.list$last[which(master.list$last=="ST PIERRE" & master.list$first=="MARTIN")] <- "ST. PIERRE"
master.list$last[which(master.list$last=="STREBAK" & master.list$first=="MARTIN")] <- "STRBAK"
master.list$first[which(master.list$first=="PK")] <- "P.K."
master.list$first[which(master.list$last=="TAYLOR" & master.list$first=="TIMOTHY")] <- "TIM"
master.list$first[which(master.list$last=="THOMAS" & master.list$first=="TIMOTHY JR.")] <- "TIM"
master.list$first[which(master.list$last=="THOMAS" & master.list$first=="WILLIAM")] <- "BILL"
master.list$first[which(master.list$first=="RJ")] <- "R.J."
master.list$first[which(master.list$last=="VALICEVIC" & master.list$first=="ROBERT")] <- "ROB"
master.list$first[which(master.list$last=="VALIQUETTE" & master.list$first=="STEVE")] <- "STEPHEN"
master.list$first[which(master.list$last=="VANDERMEER" & master.list$first=="JAMES")] <- "JIM"
master.list$first[which(master.list$last=="VARLAMOV" & master.list$first=="SIMEON")] <- "SEMYON"
master.list$last[which(master.list$last=="VANDE VELDE" & master.list$first=="CHRIS")] <- "VANDEVELDE"
master.list$first[which(master.list$last=="WOZNIEWSKI" & master.list$first=="ANDREW")] <- "ANDY"
master.list$first[which(master.list$last=="WYMAN" & master.list$first=="JT")] <- "JAMES"
master.list$first[which(master.list$last=="YORK" & master.list$first=="MICHAEL")] <- "MIKE"
master.list$first[which(master.list$last=="ZHERDEV" & master.list$first=="NIKOLAY")] <- "NIKOLAI"
master.list$first[which(master.list$last=="ZOLNIERCZYK" & master.list$first=="HARRISON")] <- "HARRY"
master.list <- master.list[order(master.list[,2],
master.list[,3],
master.list[,1]),]
return(master.list)
}
make.unique.player.list <- function (master.list) {
#add Blanky.
#master.list <- rbind("", master.list)
#master.list <- fix.names.manually (master.list)
master.list$index <- 1:dim(master.list)[1]
for (kk in 2:dim(master.list)[1])
if (all(master.list[kk,2:3] == master.list[kk-1,2:3]))
master.list[kk,5] <- master.list[kk-1,5]
player.id.proto <- unique(master.list[,5])
master.list$player.id <- match(master.list[,5], player.id.proto)
return(master.list)
}
construct.rosters <- function (games=full.game.database(),
rdata.folder="nhlr-data",
roster.master=NULL,
positions=NULL) {
#games=full.game.database(); rdata.folder="nhlr-data"; roster.master=NULL; roster.unique=NULL; positions=NULL
message ("Begin: Loading rosters from game files.")
roster.collection <- list()
miscount <- 0
for (kk in 1:dim(games)[1]) #which(games$season=="20122013" & games$session == "Playoffs")) #
if (games$valid[kk]) {
tryme <- try({
game.info <- retrieve.game(games$season[kk], games$gcode[kk], rdata.folder, force=FALSE)
if (length(game.info$players)>0)
roster.collection[[kk]] <- fix.names.manually(game.info$players[,c(2,5,6,8)])
games$awayteam[kk] <- game.info$teams[1]
games$hometeam[kk] <- game.info$teams[2]
games$date[kk] <- paste(game.info$date, collapse=" ")
games$awayscore[kk] <- length(which(game.info$playbyplay$etype=="GOAL" &
game.info$playbyplay$ev.team==game.info$teams[1] &
game.info$playbyplay$shootout==0))
games$homescore[kk] <- length(which(game.info$playbyplay$etype=="GOAL" &
game.info$playbyplay$ev.team==game.info$teams[2] &
game.info$playbyplay$shootout==0))
##add one goal for shootout winner
if (max(game.info$playbyplay$shootout)==1){
if (max(game.info$playbyplay$away.score) > max(game.info$playbyplay$home.score)) {
games$awayscore[kk] <- length(which(game.info$playbyplay$etype=="GOAL" &
game.info$playbyplay$ev.team==game.info$teams[1] &
game.info$playbyplay$shootout==0)) +1
} else if (max(game.info$playbyplay$away.score) < max(game.info$playbyplay$home.score)) {
games$homescore[kk] <- length(which(game.info$playbyplay$etype=="GOAL" &
game.info$playbyplay$ev.team==game.info$teams[2] &
game.info$playbyplay$shootout==0)) +1
}
}
if (max(game.info$playbyplay$period)==3) {
games$length[kk] <- "REG"
} else if (max(game.info$playbyplay$period)==4) {
games$length[kk] <- "OT"
} else if (max(game.info$playbyplay$period)==5 & games$session[kk]=="Regular") {
games$length[kk] <- "SO"
} else if (max(game.info$playbyplay$period)>=5 & games$session[kk]=="Playoff") {
games$length[kk] <- "OT"
}
if (length(game.info$playbyplay) == 0) games$valid[kk] <- FALSE
}, TRUE)
if (class(tryme) == "try-error") {games$valid[kk] <- FALSE; miscount <- miscount+1}
if (kk %% 100 == 0) message(paste("Roster loading: game",kk,"of",dim(games)[1],"with",miscount,"skips."))
}
message ("End: Loading rosters from game files.")
blanky <- data.frame (pos="", last="", first="", numfirstlast="",
firstlast="", index=1,
player.id=1, stringsAsFactors=FALSE)
if (is.null(roster.master))
roster.master <- blanky
for (kk in 1:length(roster.collection)) if (games$valid[kk]) {
if (kk %% 500 == 0) message(paste("Roster merger: game",kk,"of",dim(games)[1]))
for (pp in 1:dim(roster.collection[[kk]])[1]) {
m1 <- match(roster.collection[[kk]]$numfirstlast[pp],
roster.master$numfirstlast)
if (is.na(m1)) { #no exact match?
rec1 <- blanky
rec1$pos <- roster.collection[[kk]]$pos[pp];
rec1$first <- roster.collection[[kk]]$first[pp];
rec1$last <- roster.collection[[kk]]$last[pp];
rec1$numfirstlast <- roster.collection[[kk]]$numfirstlast[pp];
rec1$firstlast <- paste(rec1$first, rec1$last)
rec1$index <- dim(roster.master)[1] + 1
m2 <- match(rec1$firstlast,
roster.master$firstlast)
if (is.na(m2)) rec1$player.id <- max(roster.master$player.id)+1 else rec1$player.id <- roster.master$player.id[m2]
roster.master <- rbind(roster.master, rec1)
}
}
}
#positions table and unique player list.
message("Finding most likely position for each player.")
if (is.null(positions)) {
zeroes <- rep(0, dim(roster.master)[1])
positions <- data.frame(pC=zeroes, pL=zeroes, pR=zeroes, pD=zeroes, pG=zeroes)
} else while (dim(positions)[1] < dim(roster.master)[1]) positions <- rbind(positions, 0)
for (kk in 1:length(roster.collection)) if (games$valid[kk]) {
r1.match <- match(roster.collection[[kk]]$numfirstlast,
roster.master$numfirstlast)
positions$pC[r1.match[roster.collection[[kk]]$pos=="C"]] <-
positions$pC[r1.match[roster.collection[[kk]]$pos=="C"]] + 1
positions$pL[r1.match[roster.collection[[kk]]$pos=="L"]] <-
positions$pL[r1.match[roster.collection[[kk]]$pos=="L"]] + 1
positions$pR[r1.match[roster.collection[[kk]]$pos=="R"]] <-
positions$pR[r1.match[roster.collection[[kk]]$pos=="R"]] + 1
positions$pD[r1.match[roster.collection[[kk]]$pos=="D"]] <-
positions$pD[r1.match[roster.collection[[kk]]$pos=="D"]] + 1
positions$pG[r1.match[roster.collection[[kk]]$pos=="G"]] <-
positions$pG[r1.match[roster.collection[[kk]]$pos=="G"]] + 1
}
unique.entries <- match(1:max(roster.master$player.id), roster.master$player.id)
roster.unique <- roster.master[unique.entries,]
roster.unique$pos <- sapply(1:max(roster.master$player.id), function(kk) {
pcount <- apply(rbind(positions[roster.master$player.id==kk,]), 2, sum)
c("C","L","R","D","G")[min(which(pcount==max(pcount)))]
})
#write.csv(roster.master, "master.list.csv", quote=FALSE); write.csv(games, "games.csv", quote=FALSE)
return(list(roster.master=roster.master,
roster.unique=roster.unique,
games=games,
positions=positions))
}
fold.frames <- function(frame.list) {
#frame.list = all.games
repeat {
hold.list <- list()
for (kk in 1:floor(length(frame.list)/2))
hold.list[[kk]] <- tryCatch(
rbind(frame.list[[2*kk-1]], frame.list[[2*kk]]),
warning = function(war) message(paste(kk, war)),
error = function(err) message(paste(kk, err)),
finally = {})
if (length(frame.list) %% 2 == 1) hold.list[[kk]] <- rbind(hold.list[[kk]], frame.list[[2*kk+1]])
frame.list <- hold.list
rm(hold.list)
print(length(frame.list))
if (length(frame.list) == 1) break
}
return(frame.list[[1]])
}
assemble.mega.file <- function (roster.master=NULL,
games=full.game.database(),
rdata.folder="nhlr-data",
output.file="nhlscrapr-record.RData") {
if (is.null(roster.master)) {
roster.main <- construct.rosters (games)
roster.master <- roster.main$roster.master
}
all.games <- list()
for (kk in 1:dim(games)[1])
# if (games$valid[kk])
{
tryme <- try({
game.info <-
retrieve.game(games$season[kk],
games$gcode[kk],
rdata.folder, force=FALSE)
if (length(game.info$playbyplay)>0)
all.games[[kk]] <- augment.game(game.info, roster.master)
}, TRUE)
if (kk %% 100 == 0) message(paste("Event assembly: game",kk,"of",dim(games)[1]))
}
all.games <- all.games[!sapply(all.games, is.null)]
grand.data <- fold.frames(all.games)
save(grand.data, roster.master, games, file=output.file)
return(TRUE)
}
nhlscrapr.everything <- function () {
#What are all games that can/should be downloaded? valid=FALSE implies previously screened problems.
#Just a subset for testing.
games <- full.game.database()
#Takes HTML files and (possibly) GIF images and produces event and player tables for each game.
process.games (games)
#Take all the game table roster lists and produce both a unique player
#list and an improved game table.
roster.main <- construct.rosters (games)
roster.master <- roster.main$roster.master
roster.unique <- roster.main$roster.unique
games <- roster.main$games
save(roster.master, roster.unique, games, file="nhl20022014.RData")
#Do it all at once for a big database.
assemble.mega.file (roster.master, games, output.file="mynhlscrapes.RData")
#library(nhlscrapr); load("mynhlscrapes.RData"); grand.data <- fold.frames(all.games); save(grand.data, file="grand10.RData")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.