# Global knitr chunk option: show each chunk's source code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)

Setup

Re-creation and expansion of this blog

Get the data from kaggle-data. Save it into the `data` folder in the root of the repo, unzip it, and rename the file to `fd_data.csv`.

Get functions

# LINE FOR INSTALLING FROM GITHUB HERE
library(JP.fraudDetection)

# remove once dependencies setup
library(data.table)
library(ggplot2)
library(ggridges)

Data

# load data
# The CSV is expected at <working directory>/data/fd_data.csv. The path
# component is given without a leading slash so file.path() does not produce
# a doubled separator ("//data").
location <- file.path(getwd(), "data")
file <- "fd_data.csv"
# Column types are pinned explicitly instead of letting fread() guess them;
# isFraud is read as a factor.
dt <- fread(
  file.path(location, file),
  colClasses = list(
    character = c("type", "nameOrig", "nameDest"),
    numeric = c("amount", "oldbalanceOrg", "newbalanceOrig",
                "oldbalanceDest", "newbalanceDest"),
    integer = c("step", "isFlaggedFraud"),
    factor = c("isFraud")
  )
)
# Quick look at the first rows and the overall dimensions.
head(dt)
dim(dt)
# imbalance in classes: fraud is uncommon
# Row count and share of all rows for each isFraud level.
dt[, .(.N, prop = .N / nrow(dt)), by = isFraud]

Plots

Sample the data for plotting, to save memory and time: all fraud rows are kept, together with a 20% random sample of the non-fraud rows.

# Fraction of the non-fraud rows to keep for plotting.
p <- 0.2
# Fixed seed so the same rows are drawn on every run.
set.seed(1)
# Keep every fraud row, plus floor(.N * p) randomly chosen non-fraud rows.
# sample.int(.N, ...) draws row indices directly and avoids building 1:.N.
sample_dt <- rbindlist(list(
  dt[isFraud == 1, ],
  dt[isFraud == 0, ][sample.int(.N, size = floor(.N * p)), ]
))
# log1p(amount) is log(amount + 1): defined at amount == 0 and more accurate
# for very small amounts. Added by reference as a new column.
sample_dt[, logAmount := log1p(amount)]

We can see that the amount is usually higher in fraudulent cases.

# Ridge-line densities of logAmount, one ridge per isFraud level.
pl <- ggplot(data = sample_dt, mapping = aes(x = logAmount, y = isFraud))
pl <- pl + geom_density_ridges(fill = "goldenrod1")
pl <- pl + theme_minimal()
# Auto-print the finished plot.
pl

Fraud only affects the TRANSFER and CASH_OUT transaction types.

# Ridge-line densities of logAmount, one ridge per transaction type, with the
# fill colour split by isFraud.
pl <- ggplot(
  data = sample_dt,
  mapping = aes(x = logAmount, y = type, fill = isFraud)
)
pl <- pl + geom_density_ridges(alpha = 0.6)
pl <- pl + theme_minimal()
# Auto-print the finished plot.
pl
# Share of all rows in the full data set whose type is TRANSFER or CASH_OUT.
prop_types <- nrow(dt[type %in% c("TRANSFER", "CASH_OUT")]) / nrow(dt)

These make up `r round(prop_types*100, 1)`% of the data.



jphelps13/fraud-detection documentation built on May 12, 2019, 4:36 a.m.