scripts/test_code/uptake/simulate-uptake-lsoa.R

library(sf)
library(dplyr)
library(xgboost)

pct.all <- readRDS("../cyipt-securedata/pct-routes-all.Rds")
pct.all <- as.data.frame(pct.all)
pct.all$geometry <- NULL
gc()

#Remove Unneded Data
pct.all <- pct.all[,names(pct.all)[!names(pct.all) %in% c("bicycle_16_24","bicycle_25_34","bicycle_35_49","bicycle_50_64","bicycle_65_74","bicycle_75p",
                                                         "bicycle_male_16p","bicycle_male_16_24","bicycle_male_25_34","bicycle_male_35_49","bicycle_male_50_64",
                                                         "bicycle_male_65_74","bicycle_male_75p","bicycle_female_16p","bicycle_female_16_24","bicycle_female_25_34",
                                                         "bicycle_female_35_49","bicycle_female_50_64","bicycle_female_65_74","bicycle_female_75p",
                                                         "pct.gov","pct.gen","pct.dutch","pct.ebike","ID","lsoa1","lsoa2","workathome","waypoint","is_two_way","co2_saving")]]

#Get an Idea if public transport is an option
pct.all$publictrans <- (pct.all$train + pct.all$underground + pct.all$bus) / pct.all$all_16p

pct.all <- pct.all[,names(pct.all)[!names(pct.all) %in% c("underground","train","bus","taxi","motorcycle","carorvan","passenger","other","onfoot")]]



#change to numeric
for(i in 1:ncol(pct.all)){
  pct.all[,i] <- as.numeric(pct.all[,i])
}

### Function
# Generic fucntion for testing model

test.model <- function(traindata, rounds){
  mat <- as.matrix(traindata[,names(traindata)[!names(traindata) %in% "pct.census"] ])
  model <- xgboost(data = mat, label = traindata$pct.census, nrounds = rounds)
  mat.all <- mat.all <- as.matrix(pct.all[,colnames(mat)])
  predict <- predict(object = model, mat.all)
  message(paste0("Correlation = ", round(cor(predict, pct.all$pct.census)^2,4) ))
  importance <- xgb.importance(model = model, feature_names = colnames(mat))
  xgb.plot.importance(importance)
  result <- data.frame(actual = pct.all$pct.census, predicted = predict)
  plot(sample_frac(result, 0.01))
  abline(a = 0, b = 1, col = "Red", lwd = 2)
  return(result)
}


###########

# Get a random sample of the data to train on
train <- sample_frac(pct.all, 0.1)


#####################################################################################

# Test Some models

# Idea 1: Basic Distance and Hilliness Just like the PCT

m1 <- test.model(traindata = train[,c("pct.census","all_16p","length","av_incline")], rounds = 10)

# Correlation = 0.4051

# Idea 2: Add In Busyness Score

m2 <- test.model(traindata = train[,c("pct.census","all_16p","length","av_incline","busyness")], rounds = 10)

# Correlation = 0.4051

# Idea 3: Try All Physical Characteritics

m3 <- test.model(traindata = train[,c("pct.census","all_16p","length","time","cum_hill",
                                "change_elev","dif_max_min","up_tot","down_tot",
                                "av_incline","calories","busyness")], rounds = 10)

# Correlation = 0.4111
# Average Incline (0.1) and Busyness (0.5) are the key factors

# Idea 4: Try Demographic Data Just Ages

m4 <- test.model(traindata = train[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p")], rounds = 10)

# Correlation = 0.3155
# Oddly 50 - 60s and 35 - 49s are the best predictors

# Idea 5: Demographics Age and Gender

m5 <- test.model(traindata = train[,c("pct.census","all_16p","male_16p","male_16_24","male_25_35","male_35_49","male_50_64","male_65_74","male_75p",
                                "female_16p","female_16_24","female_25_34","female_35_49","female_50_64","female_65_74","female_75p")], rounds = 10)

# Correlation = 0.3399
# Males of all kinds best predictor, better than total population, followed by other younger males groups

# Idea 6: Demographics and Physical

m6 <- test.model(traindata = train[,c("pct.census","all_16p","male_16p","male_16_24","male_25_35","male_35_49","male_50_64","male_65_74","male_75p",
                                "female_16p","female_16_24","female_25_34","female_35_49","female_50_64","female_65_74","female_75p",
                                "length","time","cum_hill","change_elev","dif_max_min","up_tot","down_tot","av_incline","calories","busyness")], rounds = 10)

# Correlation = 0.4471
# Dominated by young males and then average incline and length


# Idea 7: Just Age non Generte and Physical Characteristics

m7 <- test.model(traindata = train[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p",
                                "length","time","cum_hill",
                                "change_elev","dif_max_min","up_tot","down_tot",
                                "av_incline","calories","busyness")], rounds = 10)

# Correlation = 0.4306

# Now average incline and numbe of younger people more important

# Idea 8: Remove the cases with low cycling and train on a more relevant dataset

train2 <- pct.all[pct.all$pct.census > 1, ]
train2 <- sample_frac(train2, 0.1)

m8 <- test.model(traindata = train2[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p",
                                "length","time","cum_hill",
                                "change_elev","dif_max_min","up_tot","down_tot",
                                "av_incline","calories","busyness")], rounds = 10)

# Correlation = 0.4048

# Similar Patteern to Idea 7 but better fit

# Idea 9: Try more Rounds on Idea 8

m9 <- test.model(traindata = train2[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p",
                                 "length","time","cum_hill",
                                 "change_elev","dif_max_min","up_tot","down_tot",
                                 "av_incline","calories","busyness")], rounds = 20)

# Correlation = 0.4125

# Idea 10: Only Train on data with high percentage cycling

train3 <- pct.all
train3$pcycle <- train3$pct.census / train3$all_16p
train3 <- train3[train3$pcycle > 0.1,]
train3 <- sample_frac(train3, 0.5)

m10 <- test.model(traindata = train3[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p",
                                        "length","time","cum_hill",
                                        "change_elev","dif_max_min","up_tot","down_tot",
                                        "av_incline","calories","busyness")], rounds = 10)
# Correlation = 0.3744


#Idea 11: weight sample based on % cycling

train4 <- sample_frac(pct.all, size = 0.1, weight = pct.all$pct.census / pct.all$all_16p)

m11 <- test.model(traindata = train4[,c("pct.census","all_16p","all_16_24","all_25_34","all_35_49","all_50_64","all_65_74","all_75p",
                                        "length","time","cum_hill",
                                        "change_elev","dif_max_min","up_tot","down_tot",
                                        "av_incline","calories","busyness")], rounds = 10)
cyipt/cyipt documentation built on Aug. 16, 2020, 10:24 p.m.