#' ---
#' title: "File to prep data for the gravity analysis of fluscape data"
#' author: Steven Riley
#' date: 2024-01-04
#' output: html_document
#' ---
#' This file is designed to be run with the private fluscape repo sat next to this development
#' version of the fluscapeR package. It will source some functions not available to fluscape
#' and will make versions of th key data that are not available to the public. It will also create
#' the jittered public versions of the data that are included in the package.
#' Clear memory, setup required libraries and set local data dir
rm(list=ls(all=TRUE))
library(devtools)
library(raster)
library(tidyverse)
library(raster)
load_all()
#' Assumes that the soure package fluscapeR is next to main private fluscape repo
local_data_dir <- "~/tmp" # Steven
local_data_dir <- "D:/tmp" # Jon
local_data_dir <- "./" # Vivi
fluscape_top_dir <- "../fluscape/"
fluscape_top_dir <- "../fluscape/" # Vivi
source(paste0(fluscape_top_dir,"source/R/mob_utility_private.r"))
source(paste0(fluscape_top_dir,"source/R/fluscape_copy_stevensRfunctions.R"))
source(paste0(fluscape_top_dir,"source/R/GeneralUtility.r"))
#' Load the snapshot of landscan saved into the fluscape directory
x1 <- fsc.load.wide.raster(fluscapetopdir=fluscape_top_dir)
#' This is not yet saved as a package data object, until I see how it get used later
#' ## Make the contacts data base for visit 1 and a jittered version
#'
#' The non-jittered version is NOT used in the package, but is generated here so that the
#' analysis can be run by the fluscape team from exactly the same code that others would
#' use to test the methods.
#' Load the fluscape data sets, and trim to visit 1
participants <- load_particpant_data_long(topdir = fluscape_top_dir)
participants <- participants[participants$VISIT==1,]
participants$pid = paste(participants$LOC_ID, participants$HH_ID, participants$PARTICIPANT_ID, sep="_")
households <- load_hh_data_long(topdir = fluscape_top_dir)
# recreate locations information from households
locations <- unique( households[,c("LOC_ID","LOC_Lat","LOC_Long","URBAN","DIST_FRM_GZ")] )
#' Set area boundary
long.lim = c( 112.8, 114.2 )
lat.lim = c( 22.6, 24.0 )
#' These are old versions of the data
# part_all <- load_particpant_data_long()
# cont_all <- read.csv("../fluscape/data/clean_datasets/ContactsAll.csv")
# dim(part_all)
# names(part_all)
#' Keep old data line in just in case. Not run.
## contacts_0 <- mob_load_old_contact_data( locations, households, participants )
#' If you have saved contacts_fluscape_V1 in your local,
#' you can load this contacts_fluscape_V1.rda directly and skip line 71-80,
#' otherwise you need to run line 71-80 and save it as a local file
#' In the future, we may intergrate it in the pacakge.
load(file=paste0(local_data_dir,"contacts_fluscape_V1.rda"))
#' Generate actual potentially identifiable data and main jittered data
#' The approximate conversions are: Latitude: 1 deg = 110.574 km. Longitude: 1 deg = 111.320*cos(latitude) km. So at 23 degrees, 1 degree of logitude is 110*cos(23/180*pi) = 101 ~= 100
#' SO a jitter error of 100 metres is a jitter of 0.001
contacts_fluscape_V1 <- mob_generate_contacts_data_from_scratch(
locations,
households,
participants,
jitterxy=FALSE,
topdir=fluscape_top_dir
)
contacts_jit <- mob_generate_contacts_data_from_scratch(
locations,
households,
participants,
jitterxy=TRUE,
jitter_error=0.001,
x1=x1,
topdir=fluscape_top_dir
)
# assess the jittering
hist(contacts_fluscape_V1$lat,breaks=c(0,seq(23,24,0.1),40))
hist(contacts_jit$lat-contacts_fluscape_V1$lat)
plot( contacts_fluscape_V1$HH_Long, contacts_fluscape_V1$HH_Lat, pch=16, cex=.3, col="blue", xlim=c(113.5,113.6), ylim=c(23.2,23.3))
points( contacts_jit$HH_Long, contacts_jit$HH_Lat, pch=16, cex=.3, col="red")
#' rename and write the potentially personally identifiable version to the local temp directory
save(contacts_fluscape_V1,file=paste0(local_data_dir,"/","contacts_fluscape_V1.rda"))
#' Rename and use 100m jittered data
contacts_V1_jittered_100m <- contacts_jit
usethis::use_data(contacts_V1_jittered_100m, overwrite = TRUE)
#' ## Prep for making the S matrix
lat.lim2 <- as.vector(c(22.11667,24.50833))
long.lim2 <- as.vector(c(112.2667,114.8000))
margin <- 0
ext <- extent(
long.lim2[1]-margin,long.lim2[2]+margin,lat.lim2[1]-margin,lat.lim2[2]+margin
)
gz_pop_raster <- crop(x1,ext)
saveRDS(gz_pop_raster, file = paste0(local_data_dir,"/data/","gz_pop_raster.rds"))
#' Brings in the previously generated S matrix for called pop_S_mat_fluscape
load("~/dbox/shares/me_jr_hm_dc_gravity/current/to_upload/pop_S_mat_fluscape.rda") # steven
load("G:/OneDrive - Imperial College London/Postdoc/Fluscape/fluscapeR/fluscapeR/pop_S_mat_fluscape.rda") # vivi
load("D:/tmp/pop_S_mat_fluscape.rda") # jon
# transfer this binary to jons machine
str(pop_S_mat_fluscape)
#' Firstly, we need to check that we can run this just for a small example. This test will also be
#' repeated in one of the package vingettes.
popsize_vector = values(popmatrix)
D=ncell(popsize_vector)
gz_pop_raster_agg <- aggregate(gz_pop_raster,popsize_vector,D,10)
pop_S_mat_fluscape_agg <- mob_calc_S_mat(gz_pop_raster_agg)
dim(pop_S_mat_fluscape_agg)
#' Now I need to load up the pre-calculated object and compare it with the size of this then
#' find the right size of matrix so I can check the radiation model runs.
# pop_S_mat_fluscape <- readRDS( "~/dbox/projects/mobility/socio-spatial-behaviour/tmpdata/read_et_al_S_margin.rds")
#' ## Code below here from Jon and must be an alternative way of making the S matrix. I just need to figure ot out. I will only need a little bit of it
#Next - check that the gravity model produces sensible results and try to calculate the likelihood of the
# data
# Up to here XXXX needs to work below
fnLog <- paste0(local_data_dir,"/gravity_log_debug.csv")
if (!file.exists(fnLog)) {
dftmp <- make.data.df(nrow=0)
write.csv(dftmp,fnLog,row.names=FALSE)
}
# fit radiation model to previous existing instance of S matrix, pop_S_mat_fluscape
# the input of tmp1 will only return default likelihood
tmp1 <- fit.mobility.model(
contacts_fluscape_V1,
gz_pop_raster,
Smat = pop_S_mat_fluscape,
noRepeats = 1,
logfile = fnLog,
optfun = fit.offset.radiation.optim,
psToFit = c("offset"),
psLB = c(1),
psUB = c(20*1000),
datasubset = "ALL",
fdebug=TRUE,
lognote = ""
)
# this input will run harriet.offset.radiation
tmp2 <- fit.mobility.model( contacts_fluscape_V1,
gz_pop_raster,
Smat = pop_S_mat_fluscape,
logfile=fnLog,
optfun = NULL,
psToFit = NULL,
psLB = NULL,
psUB = NULL,
datasubset="ALL",
fdebug=TRUE,
lognote="")
#------------------ works up to here.... ---------------------#
contacts <- contacts_fluscape_V1
# contacts_tmp <- contacts[contacts$LOC_ID<=5 & contacts$LOC_ID!=2,]
contacts_tmp <- contacts
contacts_small <- contacts_tmp[!is.na(contacts_tmp$long) | !is.na(contacts_tmp$lat) | !is.na(contacts_tmp$HH_Long) | !is.na(contacts_tmp$HH_Lat),]
range(contacts_small$long)
range(contacts_small$lat)
originx <- contacts_small$HH_Long
originy <- contacts_small$HH_Lat
dim(contacts_small)
# Find the spatial extent of destination long-lats, and crop landscan density raster.
# ext <- extent(113.2,114.5,22.5,23.5) # suitable for LOC_ID==1
# ext <- extent(113.2,114.5,22.7,23.6) # suitable for LOC_ID<=5 & LOC_ID!=2
ext <- extent(
min(contacts_small$long),
max(contacts_small$long),
min(contacts_small$lat),
max(contacts_small$lat)) # suitable for all locations
x2 <- crop(x1,ext)
#' Generating S matrix for radiation model will take very long time (~ a week).
#' For people who want to run the models by themselves, instead of generating full size
#' S matrix, we write a function to downsize the raw data and generate a sample S matrix.
#' Use agg_num to adjust the size - larger agg_num, smaller size
#' If agg_num = 0, it will calculate based on the full size of the data.
S.gravity <- generate.agg.Smatrix(contacts_fluscape_V1, x2, 0, "gravity")
S.raiation <- generate.agg.Smatrix(contacts_fluscape_V1, x2, 10, "radiation")
#' generate.agg.Smatrix includes codes from line 198 to line 299.
#' People can test if generate.agg.Smatrix() generates the same results as line 198-299 does
#' by using fundtion identical().
#destindices <- fsc.gen.dest.index(x2)
#originindices <- fsc.gen.origin.index(originx,originy,x2)
#distances <- fsc.gen.dist.matrix(x2,originindices,destindices)
# faster non-zero destination indices calculation:
tmp = getValues(x2)
cells = which( !is.na(tmp) & tmp>0 )
destindices <- data.frame(index=1:length(cells), cells=cells)
# faster origin indices calculation:
cells = unique( cellFromXY(x2, contacts_small[,c("HH_Long","HH_Lat")] ) )
originindices <- data.frame(index=1:length(cells), cells=cells)
# faster distances calculation:
a = xyFromCell( x2, destindices$cell )
b = xyFromCell( x2, originindices$cell )
distances = pointDistance( b, a, longlat=T )
# Next steps
# - generate a matrix of observations
# - generate a gravity model
# - compare the gravity models with the data
# number of origin and possible destination cells
noorig <- dim(originindices)[1] # number of origin cells
nodest <- dim(destindices)[1] # number of possible destination cells, large!
# make a matrix of observed contact origin-destinations
contactindices = cellFromXY( x2, contacts_small[,c("long","lat")] ) # in which cells did contact occur?
hhindices = cellFromXY( x2, contacts_small[,c("HH_Long","HH_Lat")] ) # which cells have participant households?
obs.tab = matrix( 0, nrow=noorig, ncol=nodest ) # matrix of observations
pb = txtProgressBar(min = 0, max = noorig, initial = 0)
for (k in 1:noorig) {
j = originindices$cell[k]
i = contactindices[hhindices==j]
i = i[!is.na(i)]
z = table(i)
i.indices = match(as.numeric(names(z)),destindices$cells)
obs.tab[k,i.indices] = as.numeric(z)
setTxtProgressBar(pb,k)
}
close(pb)
# faster gravity model generation
M <- 1
# Make a matrix of predicted origin-destination weights for gravity model
# with an offset of 500 and a power term M.
gravmodel <- matrix( nrow=noorig, ncol=nodest )
pb = txtProgressBar(min = 0, max = noorig, initial = 0)
for (i in 1:noorig) {
orcell <- originindices$cell[i]
dscell <- destindices$cell
n_o <- extract(x2,orcell)
n_d <- extract(x2,dscell)
dist <- distances[i, ]
gravmodel[i, ] <- (n_o * n_d) / (500 + dist^M)
setTxtProgressBar(pb,i)
}
close(pb)
pb = txtProgressBar(min = 0, max = noorig, initial = 0)
for (i in 1:noorig) {
gravmodel[i,] <- gravmodel[i,] / sum(gravmodel[i,])
setTxtProgressBar(pb,i)
}
close(pb)
#' test if generate.agg.S.radiation() can generate the same result
identical(S.gravity, gravmodel)
### WORK IN PROGRESS... NOT WORKING. IGNORE. START
#cells_in_range <- function( i, j, dist )
#{ destindices$cell[ which(dist<=dist[j]) ]
#}
#sum.n <- function( x2, dist, j )
#{
# cells <- destindices$cell[ which(dist<=dist[j]) ]
# sum(extract(x2,cells))
#}
#S <- matrix( nrow=noorig, ncol=nodest )
#i = 1
# dist <- distances[i, ]
# j = 5438
# no_ij <- sum.n( x2, dist, j )
#
# no_ij <- sum.n( x2, i, j )
### WORK IN PROGRESS... NOT WORKING. IGNORE. END
# Find radiation density sums, s_ij, for each orig-dest cell combination.
# Finds the total population density within equivalent distance from the origin.
# Warning, this is slow and takes a long time.
# Save to disk after generation for future use.
S <- matrix( nrow=noorig, ncol=nodest )
for (i in 1:noorig) {
print(paste(i,"/",noorig))
dist <- distances[i, ]
pb = txtProgressBar(min = 0, max = nodest, style = 3)
for (j in 1:nodest) {
within_d = which(dist<=dist[j])
radcell <- destindices$cell[within_d] # all destination cells within radius dist
S[i,j] = sum( extract(x2,radcell) )
setTxtProgressBar( pb, j )
close(pb)
}
}
# write.csv(S, "S.csv", row.names=FALSE)
# S <- as.matrix( read.csv("S.csv") ) # To read back in.
# jon: see D:\Users\jon\Dropbox\work\Fluscape\source\R\gravy_functions.R
gravy.fast.calc.S <- function( distances, x2, originindices, noorig, nodest,
write.to.file=TRUE, filename="auxilliary/tmp_S.csv" ) {
# Faster function to calculate S_ij, the sum of population densities within radius r_ij,
# for each origin-destination pair.
# Inputs:
# distances
# x2
# originindices
# noorig
# nodest
# Outputs:
# S -- matrix of S_ij, where rows correspond to originindices, and columns to destinations.
max.r = max(distances) # maximum distance
max.rij = which( distances==max.r, arr.ind=TRUE ) # which pair is it?
# ADD CHECK ***HERE*** THAT x2 IS BIG ENOUGH GIVEN ORIGINS AND max.rij
print("Warning: no automatic check that x2 is large enough to generate correct S_ij.")
allindices <- gravy.gen.dest.index( x2 )
all.distances <- gravy.gen.dist.matrix( x2, originindices, allindices ) # distances from origins to all possible cells
maxj = nrow(allindices)
S = matrix( 0, nrow=noorig, ncol=maxj )
for (i in 1:noorig) {
png(filename="S_prog.png")
plot( i, main=paste( "i =", i ) )
dev.off()
cell.i = originindices$cells[i]
n_i = extract( x2, cell.i ) # density in cell i
cell.j = destindices[ ,2]
r.ij = all.distances[i, ]
rank.r.ij <- rank( r.ij, ties.method="min" )
d.ij <- data.frame( j=cell.j, rank.d=rank.r.ij, d=r.ij, n=0 )
d.ij = d.ij[order(d.ij$d), ]
k.seq = unique(d.ij$rank)
# k: 1
nk = length(k.seq)
k1 = which( all.distances[i,]>=0 &
all.distances[i,]<=d.ij$d[1] )
j = allindices$cells[ k1 ]
n.j = extract( x2, j )
d.ij$S[1] = sum( n.j )
# k: 2 ... nk
pb = txtProgressBar(min=0,max=nk,style=3)
for (k in k.seq[2:nk]) {
setTxtProgressBar(pb,k)
k1 = which( all.distances[i,]>d.ij$d[k-1] &
all.distances[i,]<=d.ij$d[k] )
j = allindices$cells[ k1 ]
n.j = extract( x2, j )
d.ij$n[k] = sum( n.j )
}
close(pb)
d.ij$S = cumsum( d.ij$n )
k2 = match( d.ij$j, allindices$cells )
S[i,k2] <- d.ij$S
}
if (write.to.file==TRUE) {
write.csv( S, filename, row.names=F )
}
return( S )
}
# fast radiation model, given S_ij
radmodel <- matrix( nrow=noorig, ncol=nodest)
for (i in 1:noorig) {
orcell <- originindices$cell[i]
dscell <- destindices$cell
n_o <- extract(x2,orcell)
n_d <- extract(x2,dscell)
n_rad <- S[i,]
radmodel[i,] <- (n_o * n_d) / (n_d + S[i,])*(n_d + n_o + S[i,])
}
for (i in 1:noorig) {
radmodel[i, ] <- radmodel[i, ] / sum(radmodel[i, ])
}
# write.csv( radmodel, file="radmodel.csv", row.names=F, col.names=F )
par(mfcol=c(2,nrow(originindices)), mar=c(1,1,1,1) )
xy = xyFromCell(x2,destindices$cell)
for (i in 1:nrow(originindices))
{ col.seq = radmodel[i,]/max(radmodel[i,] )
plot( xy, pch=15, cex=.6, col=rgb(col.seq,0,1-col.seq) )
points( xyFromCell( x2, originindices$cell[i] ), pch=3, cex=1, col=3 )
points( xy[obs.tab[i,]>0,], pch=1, cex=.75, col=3 )
plot( distances[i,]/1000, radmodel[i,], pch=16, col=rgb(0,0,0,.1) )
}
# errors/warnings:
# 1: In extract(x2, radcell) :
# returning values at CELL NUMBERS (not coordinates) : 1919 and 1920
#2: In extract(x2, radcell) :
# returning values at CELL NUMBERS (not coordinates) : 1921 and 1922
#3: In extract(x2, radcell) :
# returning values at CELL NUMBERS (not coordinates) : 2075 and 2076
#4: In extract(x2, radcell) :
# returning values at CELL NUMBERS (not coordinates) : 2077 and 2078
fsc.calc.model <- function()
# Why are there zeros above!
# The zeros above need to be fixed first
# !!!!! Starting code not run
# It should be possible to adapt the code below
# to calculate the log likelihood of the data given
# a gravity type model
dataMatrix <- gravmodel
dataMatrix[,] <- 1
veccurrentN <- colSums(dataMatrix)
veccurrentP <- vector(length=noorig)
veccurrentP[] <- 1
lnlike = 0
for (i in 1:(nodest-1)) {
for (j in 1:noorig) {
n <- dataMatrix[i,j]
if (n > 0) {
p <- gravmodel[i,j]
lnlike <- lnlike + dbinom(n,veccurrentN[j],p/veccurrentP[j],log=TRUE)
if (is.na(lnlike)) browser()
veccurrentN[j] <- veccurrentN[j] - n
veccurrentP[j] <- veccurrentP[j] - p
if (veccurrentP[j] < -1e100) browser()
}
}
}
# !!!!! Ending code not run
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.