# Set the default chunk option so each chunk's source code is echoed in the output.
knitr::opts_chunk$set(echo = TRUE)
Re-creation and expansion of this blog
Get data from kaggle-data.
Save it into the `data/` folder in the root of the repo and unzip it. Rename the
extracted file to `fd_data.csv`.
Get functions
# Attach the project package and the libraries the analysis below relies on.
# One statement per line: a `library()` call that follows `#` on the same line
# is part of the comment and is never executed.

# LINE FOR INSTALLING FROM GITHUB HERE
library(JP.fraudDetection)

# remove once dependencies setup
library(data.table)
library(ggplot2)
library(ggridges)
# load data
# The file is read from the `data` folder under the current working directory,
# with column types given explicitly so `fread` does not have to guess them.
location <- file.path(getwd(), "data")
file <- "fd_data.csv"
dt <- fread(
  file.path(location, file),
  colClasses = list(
    character = c("type", "nameOrig", "nameDest"),
    numeric = c(
      "amount", "oldbalanceOrg", "newbalanceOrig",
      "oldbalanceDest", "newbalanceDest"
    ),
    integer = c("step", "isFlaggedFraud"),
    factor = c("isFraud")
  )
)

head(dt)
dim(dt)

# imbalance in classes: fraud is uncommon
# Row count and share of all rows for each level of isFraud.
dt[, .(.N, prop = .N / nrow(dt)), by = isFraud]
Sample the data for plotting, to save memory and time.
# Build the plotting sample: every row with isFraud == 1, plus a random
# fraction `p` of the rows with isFraud == 0.
p <- 0.2
set.seed(1) # fixed seed so the same rows are drawn on every run
sample_dt <- rbindlist(list(
  dt[isFraud == 1, ],
  # sample.int(.N, ...) draws row indices directly and avoids the 1:.N
  # sequence, which degenerates to c(1, 0) when there are no rows.
  dt[isFraud == 0, ][sample.int(.N, size = floor(.N * p)), ]
))

# log1p(x) computes log(1 + x), so a zero amount maps to 0.
sample_dt[, logAmount := log1p(amount)]
The amount is usually higher in fraudulent cases.
# Ridge-density plot of logAmount with one ridge per level of isFraud.
pl <- ggplot(sample_dt, aes(x = logAmount, y = isFraud)) +
  geom_density_ridges(fill = "goldenrod1") +
  theme_minimal()

# Print the plot as its own top-level statement so it is rendered.
pl
Fraud only affects the TRANSFER and CASH_OUT transaction types.
# Ridge-density plot of logAmount with one ridge per transaction type,
# filled by isFraud; partial transparency keeps overlapping fills visible.
pl <- ggplot(sample_dt, aes(x = logAmount, y = type, fill = isFraud)) +
  geom_density_ridges(alpha = 0.6) +
  theme_minimal()

# Print the plot as its own top-level statement so it is rendered.
pl

# Fraction of all rows in the full data whose type is TRANSFER or CASH_OUT.
prop_types <- dt[type %in% c("TRANSFER", "CASH_OUT"), .N / nrow(dt)]
These make up `r round(prop_types*100, 1)`% of the data.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.