knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%" )
The goal of cardid is to allow you to use the information of basketball and baseball sports trading cards in order to find the (1) player and (2) year that each card refers to. In addition, it allows the creation of a card identifier.
You can install the development version from GitHub with:
install.packages("remotes") remotes::install_github("jk2049/cardid")
Below you can find an example of the functionalities offered. Please keep in mind that this is an early version. As a result, the results of this code have only been tested when running the functions in the same order as in the below example:
library(cardid) library(dplyr) library(stringr) library(data.table) library(plyr)
The package offers three datasets. The first is a dataset with information about all the player who have played in the MLB:
head(dt.baseball.players)
The second is a dataset with information about all the player who have played in the NBA:
head(dt.basketball.players)
The third is a subset of a dataset that contains information about sports trading cards that were listed on eBay:
head(dt.example) dt.example.titles <- dt.example[, list(sports, card_title)] dt.example.basketball.titles <- dt.example.titles[sports == "Basketball", ] dt.example.baseball.titles <- dt.example.titles[sports == "Baseball", ] dt.example.years <- dt.example[, list(sports, card_title, product_section_year)]
First, we use the functions ''baseball.preprocess'' and ''baseball.preprocess'' in order to manipulate the player names of the MLB and the NBA players respectively. Standard NLP actions are taken (i.e., remove stopwords etc.) that allow us to build a dictionary of player names. This is important as the same player names (e.g., shaquille o'neal) can be written in different ways in the card titles (e.g., shaquille o'neal, shaquille o neal, shaquille oneal etc.).
baseball.players <- baseball.preprocess(dt.baseball.players) basketball.players <- basketball.preprocess(dt.basketball.players)
Second, we use the function ''data.preprocess'' on the card title columns in order to manipulate the card titles (i.e., remove stopwords etc.)
dt.example.basketball.titles$card_title <- data.preprocess(dt.example.basketball.titles$card_title) dt.example.baseball.titles$card_title <- data.preprocess(dt.example.baseball.titles$card_title) dt.example.years$card_title_2 <- data.preprocess(dt.example.years$card_title)
Then, we use the function ''get.player'' to find the player(s) that each card title refers to.
get.player <- function(dt.input, card.title.col.name = "card_title", sport){ card.title.index <- which(names(dt.input) == card.title.col.name) if(sport == "basketball"){ players.in.title <- basketball.players[str_detect(dt.input[card.title.index], basketball.players$player), "original_player"] }else if(sport == "baseball"){ players.in.title <- baseball.players[str_detect(dt.input[card.title.index], baseball.players$player), "original_player"] }else{ stop(paste("Sport ", sport, " is not supported.")) } n.players <- length(players.in.title) players.in.title <- paste(players.in.title, sep = ",") if(length(players.in.title) == 0) players.in.title <- NA dt.output <- c(dt.input, "n_players" = n.players, players.in.title) dt.output <- as.data.frame(t(dt.output)) return(dt.output) }
dt.example.basketball.titles <- rbindlist( apply(dt.example.basketball.titles, 1, get.player, card.title.col.name = "card_title", sport = "basketball"), fill=TRUE) dt.example.baseball.titles <- rbindlist( apply(dt.example.baseball.titles, 1, get.player, card.title.col.name = "card_title", sport = "baseball"), fill=TRUE)
The next step is to identify the year that each card refers to. We can identify the card year with at most two ways: One is by analyzing the card title through the function ''get.year.from.title'', while the other is to analyze the information contained in the product section field of the listed cards through the function ''get.year.from.product.section''. The product section field may or may not have been filled in by the seller.
Once we are done, we use ''get.product.year'' to construct the card year column.
# Year from the card title dt.example.years <- suppressWarnings(rbindlist(apply(dt.example.years, 1, get.year.from.title, card.title.col.name = "card_title"), fill=TRUE)) # Year from the product section information dt.example.years <- rbindlist(apply(dt.example.years, 1, get.year.from.product.section, product.section.year.col.name = "product_section_year"), fill=TRUE) # Combine the two dt.example.years <- get.product.year(dt.example.years)
Now that we have the identified player and year of every card, we can construct
the product identifier. If there is no identified player or no identified year
for a card, then that we insert a missing value (i.e.,
dt.example.names <- rbind.fill(dt.example.baseball.titles, dt.example.basketball.titles) rm(dt.example.baseball.titles, dt.example.basketball.titles) dt.example.names <- as.data.table(dt.example.names) dt.example.names <- dt.example.names[!duplicated(dt.example.names)] dt.example <- merge(dt.example.years, dt.example.names, by.x = c("sports", "card_title_2"), by.y = c("sports", "card_title"), all.x = TRUE) dt.example[, product_id := paste0(V4, "-", product_id_year)] # If any component is NA then replace the ID with NA dt.example[is.na(V4), product_id := NA] dt.example[is.na(product_id_year), product_id := NA]
Although we have constructed a product identifier, there are still some pending issues that can only be discovered upon closer inspection of the data:
remove.duplicate.players <- function(dt.input, sport){ # Subset data to specific sport dt.input$sports <- tolower(dt.input$sports) dt.input <- dt.input[sports == sport, ] if(sport == "basketball"){ all.players <- basketball.players$original_player ## Keep duplicate entries all.players.duplicates <- all.players[duplicated(all.players)] }else if(sport == "baseball"){ all.players <- baseball.players$original_player ## Keep duplicate entries all.players.duplicates <- all.players[duplicated(all.players)] all.players.duplicates <- c(all.players.duplicates, all.players[grepl("jr\\.", all.players)]) }else{ stop(paste("Sport ", sport, " is not supported.")) } rm(all.players) i.limit <- as.numeric(max(dt.input$n_players)) + 3 dt.input$flag <- 0 dt.input$sports <- tolower(dt.input$sports) for(i in 4:i.limit){ col.player.name <- paste0("V", i) dt.input[get(col.player.name) %in% all.players.duplicates, flag := 1] rm(col.player.name) } indices <- which(dt.input$flag == 1) # Fix columns # Some V columns have the same players for(i in 1:length(indices)){ # print(paste0("Loop 1 - ", sport, " ", i, " out of ", length(indices))) card.info <- dt.input[indices[i], ] # Get all the players and insert them into a vector players.in.title <- c() for(j in 4:i.limit){ col.player.name <- paste0("V", j) players.in.title <- append(players.in.title, card.info[, get(col.player.name)]) rm(col.player.name) } # Remove the NAs players.in.title <- players.in.title[!is.na(players.in.title)] # Get the number of duplicate players n.duplicated.player.in.title <- sum(duplicated(players.in.title)) # Get the duplicate players duplicated.player.in.title <- players.in.title[duplicated(players.in.title)] # For every duplicate player in the vector j <- 1 # Insert all the players into a new vector insert.players.in.title <- players.in.title while(j <= length(duplicated.player.in.title)){ # Find the index of all the entries of that player duplicated.player.in.title.index <- which(players.in.title == duplicated.player.in.title[j]) # Remove the first index (we want to keep the first - i.e. one - entry) duplicated.player.in.title.index <- duplicated.player.in.title.index[-1] # Replace the duplicate indices with NA insert.players.in.title[duplicated.player.in.title.index] <- NA rm(duplicated.player.in.title.index) j <- j + 1 } # Remove the NAs insert.players.in.title <- insert.players.in.title[!is.na(insert.players.in.title)] # Change n_players dt.input[indices[i], n_players := as.numeric(length(insert.players.in.title))] # Lengthen the new vector so that it has i.limit elements (corresponding to V4-Vi.limit) insert.players.in.title <- c(insert.players.in.title, rep(NA, i.limit-length(insert.players.in.title))) # Insert the new elements into the current columns for(z in 1:i.limit){ col.player.name <- paste0("V", z+3) dt.input[indices[i], (col.player.name) := insert.players.in.title[z]] rm(col.player.name) } rm(card.info, players.in.title, n.duplicated.player.in.title, duplicated.player.in.title, j, z, insert.players.in.title) } return(dt.input) } fix.names <- function(dt.input, sport){ if(sport == "baseball"){ dt.input <- dt.input[sports == sport, ] i.limit <- as.numeric(max(dt.input$n_players)) + 3 dt.input$flag <- 0 for(i in 4:i.limit){ col.player.name <- paste0("V", i) dt.input[grepl(" jr\\.", get(col.player.name)), flag := 1] rm(col.player.name) } indices <- which(dt.input$flag == 1) for(i in 1:length(indices)){ # print(paste0("Loop 2 - sport ", sport, " ", i, " out of ", length(indices))) card.info <- dt.input[indices[i], ] # Get all the players and insert them into a vector players.in.title <- c() for(j in 4:i.limit){ col.player.name <- paste0("V", j) players.in.title <- append(players.in.title, card.info[, get(col.player.name)]) rm(col.player.name) } # Remove the jr. part players.in.title.jr.removed <- gsub(" jr\\.", "", players.in.title) # Remove the NAs players.in.title <- players.in.title[!is.na(players.in.title)] players.in.title.jr.removed <- players.in.title.jr.removed[!is.na(players.in.title.jr.removed)] # Get the number of duplicate players n.duplicated.player.in.title <- sum(duplicated(players.in.title.jr.removed)) # Get the duplicate players duplicated.player.in.title <- players.in.title.jr.removed[duplicated(players.in.title.jr.removed)] # For every duplicate player in the vector j <- 1 # Insert all the players into a new vector insert.players.in.title <- players.in.title while(j <= length(duplicated.player.in.title)){ # Find the index of all the entries of that player duplicated.player.in.title.index <- which(grepl(duplicated.player.in.title[j], players.in.title)) # Remove the first index (we want to keep the first - i.e. one - entry) duplicated.player.in.title.index <- duplicated.player.in.title.index[-2] # Replace the duplicate indices with NA insert.players.in.title[duplicated.player.in.title.index] <- NA rm(duplicated.player.in.title.index) j <- j + 1 } # Remove the NAs insert.players.in.title <- insert.players.in.title[!is.na(insert.players.in.title)] # Change n_players dt.input[indices[i], n_players := as.numeric(length(insert.players.in.title))] # Lengthen the new vector so that it has 8 elements (corresponding to V4-V11) insert.players.in.title <- c(insert.players.in.title, rep(NA, 8-length(insert.players.in.title))) # Insert the new elements into the current columns for(z in 1:i.limit){ col.player.name <- paste0("V", z+3) dt.input[indices[i], (col.player.name) := insert.players.in.title[z]] rm(col.player.name) } rm(card.info, players.in.title, n.duplicated.player.in.title, duplicated.player.in.title, j, insert.players.in.title, z) } dt.input[, flag := NULL] return(dt.input) }else{ stop(paste("This function is not valid for sport ", sport, ". It's only valid for baseball.")) } } player.tie.break <- function(dt.input, sport){ if(sport == "basketball"){ all.players <- dt.basketball.players$player }else if(sport == "baseball"){ all.players <- dt.baseball.players$player }else{ stop(paste("Sport ", sport, " is not supported.")) } all.players.duplicates <- all.players[duplicated(all.players)] if(sport == "basketball"){ all.players <- dt.basketball.players }else if(sport == "baseball"){ # Reconstruct flag dt.input$flag <- 0 i.limit <- as.numeric(max(dt.input$n_players)) + 3 for(i in 4:i.limit){ col.player.name <- paste0("V", i) dt.input[get(col.player.name) %in% all.players.duplicates, flag := 1] rm(col.player.name) } rm(i, i.limit) all.players <- dt.baseball.players } indices <- which(dt.input$flag == 1) for(i in 1:length(indices)){ # print(paste0("Loop 3 - ", sport, " player ", i, " out of ", length(indices))) card.info <- dt.input[indices[i], ] card.title <- card.info[, card_title_2] # Get all the players and insert them into a vector players.in.title <- c() col.names <- c() i.limit <- as.numeric(max(dt.input$n_players)) + 3 for(j in 4:i.limit){ col.player.name <- paste0("V", j) col.names <- append(col.names, col.player.name) players.in.title <- append(players.in.title, card.info[, get(col.player.name)]) rm(col.player.name) } # Keep only the names that are in the names of the duplicated vector players.in.title <- players.in.title[players.in.title %in% all.players.duplicates] players.in.title.count <- 1 while(players.in.title.count <= length(players.in.title)){ # Get active years for every player player.active.year <- all.players[player == players.in.title[players.in.title.count], list(player, from, to)] # Compare with card title year player.active.year[, potential := ((as.numeric(from)-1) <= as.numeric(card.info[, product_id_year]) & as.numeric(to) >= as.numeric(card.info[, product_id_year]))] player.count <- sum(player.active.year$potential == TRUE) player.index <- which(player.active.year$potential == TRUE) if(!is.na(player.count) & player.count == 1){ # One player fits player.name <- player.active.year[potential == TRUE, player] new.player.name <- paste0(player.name, "-", player.index) }else if(!is.na(player.count) & player.count == 0){ # No players fit player.name <- player.active.year[1, player] new.player.name <- paste0(player.name, "-no-fit") }else if(!is.na(player.count) & player.count > 1){ # All players fit player.name <- player.active.year[1, player] new.player.name <- paste0(player.name, "-all-fit") }else if(is.na(player.count)){ # All players fit player.name <- player.active.year[1, player] new.player.name <- paste0(player.name, "-NA") } rm(player.active.year, player.count, player.index) pot.cols <- card.info[, col.names, with = FALSE] col.ind <- which(pot.cols == player.name) rm(pot.cols, player.name) col.player.name <- paste0("V", col.ind+3) dt.input[indices[i], (col.player.name) := new.player.name] rm(new.player.name, col.ind, col.player.name) players.in.title.count <- players.in.title.count + 1 } rm(card.info, card.title, players.in.title, players.in.title.count) rm(col.names) } dt.input$flag <- NULL return(dt.input) } flag.id.valid <- function(dt.input){ # Flag valid dt.input[, valid := 0] dt.input[n_players == 1 & !grepl("-no-fit", V4) & !grepl("-all-fit", V4) & !grepl("-NA", V4) & !grepl("-no-fit", V5) & !grepl("-all-fit", V5) & !grepl("-NA", V5) & !grepl("-no-fit", V6) & !grepl("-all-fit", V6) & !grepl("-NA", V6) & !grepl("-no-fit", V7) & !grepl("-all-fit", V7) & !grepl("-NA", V7) & !grepl("-no-fit", V8) & !grepl("-all-fit", V8) & !grepl("-NA", V8) & !grepl("-no-fit", V9) & !grepl("-all-fit", V9) & !grepl("-NA", V9), valid := 1] dt.input[is.na(product_id_year), valid := 0] return(dt.input) }
dt.example.5.1.1 <- remove.duplicate.players(dt.example, "basketball") dt.example.5.1.2 <- remove.duplicate.players(dt.example, "baseball") dt.example.5.2.2 <- fix.names(dt.example.5.1.2, "baseball") dt.example.5.3.1 <- player.tie.break(dt.example.5.1.1, "basketball") dt.example.5.3.2 <- player.tie.break(dt.example.5.2.2, "baseball") dt.example.5 <- rbind.fill(dt.example.5.3.1, dt.example.5.3.2) dt.example.5 <- as.data.table(dt.example.5) dt.example.valid <- flag.id.valid(dt.example.5)
Using the function ''drop.useless.columns'' we can drop some columns that are no longer needed.
drop.useless.columns <- function(dt.input){ # Columns that are not needed any longer are dropped dt.input$four_digit_year <- NULL dt.input$four_digit_year_count <- NULL dt.input$two_digit_year <- NULL dt.input$two_digit_year_count <- NULL dt.input$two_to_four_digit_year <- NULL dt.input$four_digit_product_section_year <- NULL dt.input$four_digit_product_section_year_count <- NULL dt.input$two_digit_product_section_year <- NULL dt.input$two_digit_product_section_year_count <- NULL dt.input$two_to_four_digit_product_section_year <- NULL # Also drop the V columns that are empty for(i in 4:11){ col.player.name <- paste0("V", i) if(col.player.name %in% colnames(dt.input)){ if(sum(is.na(dt.input[, get(col.player.name)])) == nrow(dt.input)){ dt.input[, (col.player.name) := NULL] } rm(col.player.name) } } return(dt.input) }
colnames(dt.example.valid) dt.example.valid <- drop.useless.columns(dt.example.valid) colnames(dt.example.valid)
Our initial sports player datasets contain information that we have not included. Using the function ''add.player.info'' we add this information to our dataset.
add.player.info <- function(dt.input){ # dt.input <- dt.example.valid[1347, ] baseball.players.keep <- baseball.players basketball.players.keep <- basketball.players baseball.players$player <- baseball.players$original_player baseball.players$original_player <- NULL baseball.players <- baseball.players[!duplicated(baseball.players), ] basketball.players$player <- basketball.players$original_player basketball.players$original_player <- NULL basketball.players <- basketball.players[!duplicated(basketball.players), ] dt.baseball.players <- as.data.table(baseball.players) dt.basketball.players <- as.data.table(basketball.players) dt.basketball.players[, player := tolower(player)] dt.baseball.players[, player := tolower(player)] # # Find how many times each name appears dt.basketball.players[, n := .N, by = player] dt.baseball.players[, n := .N, by = player] # # Create an index for the names dt.basketball.players[, index := 1:.N, by = player] dt.baseball.players[, index := 1:.N, by = player] # dt.baseball.players[n == 5, ] # # For those names that appear more than once change name to name-index dt.basketball.players[, player_name := player] dt.baseball.players[, player_name := player] dt.basketball.players$A <- 1 dt.basketball.players$A <- NULL dt.baseball.players$A <- 1 dt.baseball.players$A <- NULL dt.basketball.players[n != 1, player_name := paste0(player_name, "-", index)] dt.baseball.players[n != 1, player_name := paste0(player_name, "-", index)] dt.basketball.players$sports <- "basketball" dt.baseball.players$sports <- "baseball" dt.players <- rbind(dt.basketball.players[, list(sports, player_name, from, to, active, hall_of_famer)], dt.baseball.players[, list(sports, player_name, from, to, active, hall_of_famer)]) col.names <- colnames(dt.input) col.names <- c(col.names, colnames(dt.players)) col.names <- unique(col.names) col.names <- col.names[col.names != "player_name"] # Use this new name to add HoF to dt.cards dt.input <- merge(dt.input, dt.players, by.x = c("sports", "V4"), by.y = c("sports", "player_name"), all.x = TRUE) setcolorder(dt.input, col.names) return(dt.input) }
dt.example.valid.hof <- add.player.info(dt.example.valid) head(dt.example.valid.hof)
Let's inspect the important columns of our produced dataset. The first column (''card_title'') shows the original card title. The second (''n_players'') shows the number of players that were identified using our dictionary, while the third (''V4'') shows the first player name that was identified by our dictionary. If there is more than one identified player, then each following name is inserted into columns ''V5'', ''V6'', etc. (these columns are not displayed here to save space). In the fourth column (''product_id_year'') we can find the identified card year, while in the fifth column (''product_id'') we see the product id. In the sixth column we can find whether the product id is valid (''valid'' equals 1). A product id is valid only if (1) a year has been identified for this card and (2) exactly one player name has been identified for this same card. In the columns that follow we can find the year that the identified player started (''from'') and stopped (''to'') playing in the MLB/NBA, as well as whether the identified player is currently active in the MLB/NBA (''active'' equals 1) and whether the player has been inducted to the hall of fame (''hall of famer'' equals 1).
dt.example.valid.hof <- dt.example.valid.hof[, list(card_title, n_players, V4, product_id_year, product_id, valid, from, to, active, hall_of_famer)] head(dt.example.valid.hof[valid == 1, ]) head(dt.example.valid.hof[valid == 0, ])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.