R_scripts/import_tidy_transform_envdata.R

## @knitr setup_param

## Setup path to the datasets (both input and output)
path_to_output_datasets<-"/home/ptaconet/react/datasets/data_BF"
## Setup the path to the file containing the EarthData credentials (for NASA datasets). See example at system.file("example-data/credentials_earthdata.txt", package = "getRemoteData")
path_to_earthdata_credentials<-"/home/ptaconet/react/datasets/credentials_earthdata.txt"
## Setup the path to the input dataset of dates, latitudes and longitudes of interest. In our case the dataset is stored in a geopackage database. For reproducibility, we also provide the path to the same dataset as a simple csv file (see first chunk of section "Workflow preparation")
path_to_db<-"/home/ptaconet/react/datasets/react_db.gpkg"
query_input_sampling_points<-"SELECT idpointdecapture as id, date_capture as date, X as longitude, Y as latitude FROM ent_hlcmetadata WHERE codepays='BF'"
# Setup buffer sizes in meters (i.e. buffers within which the statistics will be calculated). Multiple sizes might be provided as a vector.
buffer_sizes<-c(500,1000,2000)
## Specific to time series data:
# Setup the time frame/lag (in days) for time-series data
lagTime_timeSeries<-40
hourStart_Dday<-"18"
hourEnd_Dday<-"08"
# GPM, TAMSAT and ERA-5 data are originally provided at spatial resolutions of approximately 10 km, 4 km and 27 km respectively. Should they be resampled spatially? If TRUE, resampling will use bilinear interpolation
gpm_resample<-TRUE
tamsat_resample<-TRUE
era5_resample<-TRUE
gpm_resample_output_res<-250 # Spatial resolution (in meters) of the output resampled data. Only needs to be set if the corresponding xxx_resample parameter is TRUE
tamsat_resample_output_res<-250
era5_resample_output_res<-1000
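# For reference (illustrative), these resolutions can be expressed in decimal degrees with
# getRemoteData::convertMetersToDegrees (used later in this script); the exact value returned
# depends on the latitude passed, e.g. (latitude 11 is indicative for BF, expected result roughly 0.002 degrees):
# getRemoteData::convertMetersToDegrees(250, latitude_4326 = 11)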

## For some covariates (Settlements and population, Built-up, Topography, Pedology, Land use / cover) we use our own data. These data are stored in the geopackage database (under "path_to_db"). We provide the names of the tables that we will use (parameters starting with "db_table")

# Specific to Settlements and population :
db_table_pop_by_village<-"rst_villages"  # table containing the population for each village
db_table_loc_households_by_village<-"rst_menages"  # table containing the location of each house in each village

# Specific to Population (HRSL)
path_to_hrls_raster<-"/home/ptaconet/react/datasets/data_BF/HRSL/hrsl_bfa_pop.tif"

## Specific to Topography and hydrographic network:
path_to_grassApplications_folder<-"/usr/lib/grass74" # Path to the GRASS application folder, for the setup of the rgrass7 package. Can be retrieved in the terminal with grass74 --config path . More info on the use of rgrass7 at https://grasswiki.osgeo.org/wiki/R_statistics/rgrass7
threshold_accumulation_raster<-800  # threshold for the accumulation raster dataset (all values above this threshold are considered part of the hydrographic network). # For CIV: 800 ; For BF: 1000

## Specific to Pedology :
db_table_pedology_rast<-"lco_cipedology"  # Name of pedology raster in the DB
hydromorphic_classes_pixels<-c(11,14,5,2,13) # pixel values of the classes considered hydromorphic.   for CIV: c(11,14,5,2,13)  for BF: c(2,3,8,9,10)

## Specific to Land use / cover :
db_table_metadata_lulc<-"landcovers_metadata" # Name of the land use / cover metadata table in the DB. This table contains the information used to compute land use / land cover landscape metrics: the various LU/LC rasters, the paths to the rasters, etc.
layers_lulc<-c("lco_ciL2","lco_ciL3","lco_ciL4","lco_ciL5","ESACCI-LC-L4-LC10-Map-20m-P1Y-2016-v1.0","W020N20_ProbaV_LC100_epoch2015_global_v2.0.1") # Names of the LU/LC layers in the table "db_table_metadata_lulc" to use for the computation of landscape metrics
lsm_to_extract<-c("lsm_l_shdi","lsm_l_siei","lsm_l_prd","lsm_c_np","lsm_c_pland","lsm_c_ed")  # landscape metrics to calculate for the LU/LC maps. A list of available LSM can be found with landscapemetrics::list_lsm()


## @knitr setup_packages

require(raster)
require(sp)
require(sf)
require(spatstat)
require(maptools)
require(rgrass7)
require(rgdal)
require(gdalUtils)
require(rgeos)
require(tidyverse)
require(httr)
require(furrr)
require(reticulate)
require(RSQLite)
require(landscapemetrics)
if(!require(getRemoteData)){
  require(devtools)
  devtools::install_github("ptaconet/getRemoteData")
}
require(getRemoteData)


## @knitr setup_functions

# Reshaping a list to get a data.frame of data to download, used as input of the getRemoteData::downloadData function.
.getDftoDl<-function(data_md){
  DftoDl<- data_md %>%
    flatten %>%
    do.call(rbind,.) %>%
    distinct %>%
    set_names("name","url","destfile")
  return(DftoDl)
}
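# Illustrative structure of the returned data.frame (the values below are hypothetical,
# only the column names are set by the function above):
#   name                   url                                destfile
#   "MOD11A1.006.2017001"  "https://opendap.cr.usgs.gov/..."  "<path_to_output_datasets>/MOD11A1.006/..."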

# Get the names of the time-series rasters to use for each date
.getRastersNames_timeSeries<-function(df_sampling_points,xxxData_md,dataCollection){
  rastsNames<-map(df_sampling_points$date_numeric,~pluck(xxxData_md,as.character(.))) %>%
    map(.,pluck(dataCollection)) %>%
    map(.,pluck("name")) %>%
    map(.,as.character)
  return(rastsNames)
}

# Get the path of local rasters
.getRastersPaths<-function(xxxData_md,dataCollection){
  path<-modify(xxxData_md,dataCollection) %>%
    map(~list_modify(.,"url" = NULL)) %>%
    map(data.frame) %>%
    reduce(bind_rows) %>%
    dplyr::select(name,destfile) %>%
    unique
  return(path)
}

# Time series data: Extract a summary of a variable over 1 buffer
.extractVar_ts_singleBuff<-function(rasts,names_rasts_to_use,spPoints,buffer_size,fun_summarize){
  res<-rasts[names_rasts_to_use] %>%  # filter only the rasters of the time series
    map_dfr(~raster::extract(.,spTransform(spPoints,proj4string(.)),buffer=buffer_size,fun=eval(parse(text=fun_summarize)), na.rm=TRUE, small=TRUE)) %>% # for each raster, calculate the stats
    #set_names(origin_date-as.numeric(names(.))) %>% ## to name by the number of days separating the date of interest from the day of the data
    set_names(seq(0,ncol(.)-1,1)) %>%   ## to name by the lag index
    mutate(id=as.character(spPoints$id)) %>%
    mutate(buffer=buffer_size) %>%
    gather(time_lag,val,-c(id,buffer)) %>%
    mutate(time_lag=as.numeric(time_lag))
  return(res) # to put in wide format : res <- res %>% unite(var,var,time_lag) %>% spread(key=var,value=val)
}


# Time series data: Extract a summary of a variable over multiple buffers
.extractVar_ts<-function(buffer_sizes,df_sampling_points,names_rasts_to_use,rasters,fun_summarize){
  res<-buffer_sizes %>% # for each buffer, calculate stats
    set_names %>%
    future_map_dfr(~pmap_dfr(list(names_rasts_to_use,df_sampling_points$sp_points,.,fun_summarize),
                             ~.extractVar_ts_singleBuff(rasters,..1,..2,..3,..4)))  %>%
    #mutate(var=var_name) %>%
    dplyr::select(id,buffer,time_lag,val)
  # to put in wide format : lst_min <- lst_min %>% unite(var,var,time_lag) %>% spread(key=var,value=val)
  return(res)
}
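# Illustrative output of .extractVar_ts (long format, one row per point x buffer x time lag;
# the values below are hypothetical, only the column names come from the code above):
#   id       buffer  time_lag   val
#   "p001"      500         0  28.4
#   "p001"      500         1  27.9
#   "p001"     1000         0  28.1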

# Static (non time-series) data: Extract a summary of a variable over 1 buffer
.extractVar_rast_singleBuff<-function(rasts,spPoints,buffer_size,fun_summarize){
  res<-raster::extract(rasts,spTransform(spPoints,proj4string(rasts)),buffer=buffer_size,fun=eval(parse(text=fun_summarize)), na.rm=TRUE, small=TRUE,df=TRUE) %>%
    mutate(id=as.character(spPoints$id)) %>%
    mutate(buffer=buffer_size) %>%
    select(-ID) %>%
    gather(var,val,-c(id,buffer))
  return(res)
}

# Static (non time-series) data: Extract a summary of line (vector) variable over 1 buffer
.extractVar_line_singleBuff<-function(network_sf,sfPoints,buffer_size){
  buffer<-st_buffer(sfPoints,dist=buffer_size) # Create the buffer around each point
  sfPoints <- sfPoints %>%
    rename(geometry_pt=geometry) # rename the point geometry column, otherwise an error is raised further down the pipe
  res<-st_join(buffer,network_sf, join = st_intersects,left = TRUE) %>% # Get the lines inside each buffer
    left_join(as.data.frame(network_sf)) %>%  # get the line geometry
    rename(geometry_line=geom) %>% # rename line geometry column to be more clear
    left_join(as.data.frame(sfPoints),by="id")  %>%  # get the point geometry
    mutate(dist_line_to_pt=st_distance(geometry_pt,geometry_line,by_element = T)) %>% # get the distance between each point and line
    st_drop_geometry() %>%
    dplyr::select(id,length_stream,dist_line_to_pt,accumulation) %>%
    group_by(id) %>%
    summarise(mean_dist_to_stream=as.numeric(mean(dist_line_to_pt)),min_dist_to_stream=as.numeric(min(dist_line_to_pt)),length_stream=as.numeric(sum(length_stream)),mean_acc_by_dist=as.numeric(mean(accumulation*dist_line_to_pt))) %>% # compute stats related to hydrographic network
    mutate(mean_dist_to_stream=na_if(mean_dist_to_stream,0)) %>%
    mutate(min_dist_to_stream=na_if(min_dist_to_stream,0)) %>%
    mutate(length_stream = if_else(is.na(length_stream), 0, length_stream))


  res <- res %>%
    mutate(buffer=buffer_size) %>%
    gather(var,val,-c(id,buffer))
  return(res)
}

# Spatially resample a product (using bilinear interpolation)
.resampleProd<-function(rast,resample_output_res){
  resample_output_res<-getRemoteData::convertMetersToDegrees(resample_output_res,latitude_4326=mean(c(extent(rast)[3],extent(rast)[4])))
  r<-rast
  res(r)<-resample_output_res
  rast<-raster::resample(rast,r,method='bilinear')
  return(rast)
}
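# Illustrative usage (assuming 'r' is a hypothetical raster in EPSG:4326):
# r_250m <- .resampleProd(r, 250)  # resample 'r' to a ~250 m cell size using bilinear interpolation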

# Impute NAs and setup quality flag
# qval=1 is for values that were retrieved using the LST 1km/1day products (best quality)
# qval=2 is for values that were retrieved using the temperature of the upper buffer(s) for the same day / time lag
# qval=3 is for values that were retrieved using the mean temperature of the previous and the following day / time lag
# qval=4 is for values that were retrieved using the LST 1km/8day temperature for the week of interest
# qval=5 is for values that were retrieved using the mean of the previous and the following LST 1km/8day temperatures
# qval=0 is for NAs remaining after these imputations

# qval=1
.modis_fillNA_l1<-function(df,qvalue){
  df <- df %>%
    mutate(qval=if_else(!is.na(val),qvalue,0))
  return(df)
}

# qval=2
.modis_fillNA_l2<-function(df,qvalue){
  df <- df %>%
    arrange(id,time_lag,buffer) %>%
    group_by(id,time_lag) %>%
    fill(val, .direction = "up") %>%
    mutate(qval=if_else((qval==0 & !is.na(val)),qvalue,qval)) %>%
    as.data.frame()
  return(df)
}

# qval=3
.modis_fillNA_l3<-function(df,qvalue){
  df <- df %>%
    arrange(id,buffer,time_lag) %>%
    mutate(val_ahead=lead(val)) %>%
    mutate(val_lag=lag(val)) %>%
    mutate(val_mean=map2_dbl(.x=val_ahead,.y=val_lag,~mean(c(.x,.y),na.rm=T))) %>%
    mutate(val_mean=replace(val_mean, which(is.nan(val_mean)), NA)) %>%
    mutate(val=if_else(!is.na(val),val,val_mean)) %>%
    mutate(qval=if_else((qval==0 & !is.na(val)),qvalue,qval))  %>%
    dplyr::select(-c(val_ahead,val_lag,val_mean))
  return(df)
}

# qval=4
.modis_fillNA_l4<-function(df,df_lst8day,qvalue){
  lst8day_as_1day <- df_lst8day %>%
    slice(rep(1:n(), each = 7)) %>%
    group_by(id,buffer) %>%
    mutate(time_lag=seq(0,n()-1,1))

  df<-left_join(df,lst8day_as_1day,by=c("id","buffer","time_lag")) %>%
    mutate(val.x=if_else(!is.na(val.x),val.x,val.y)) %>%
    dplyr::select(-c(val.y,var.y,qval.y)) %>%
    rename(val=val.x,var=var.x,qval=qval.x) %>%
    mutate(qval=if_else((qval==0 & !is.na(val)),qvalue,qval))
  return(df)
}
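# Minimal illustrative example of the imputation chain on a toy (hypothetical) dataset:
# a value missing at buffer 500 is taken from the upper buffer (1000 m) and flagged qval=2.
# toy <- data.frame(id="p1", buffer=c(500,1000), time_lag=0, val=c(NA,25.1), var="TMD") %>%
#   .modis_fillNA_l1(1) %>%
#   .modis_fillNA_l2(2)
# toy$val   # 25.1 25.1
# toy$qval  # 2 1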

## @knitr prepare_datesloc

con <- dbConnect(RSQLite::SQLite(),path_to_db)
raw_sampling_points<-dbGetQuery(con, query_input_sampling_points)
##   to reproduce without access to the DB, you might use :
# raw_sampling_points<-read.csv(system.file("extdata/CIV_sampling_points.csv", package = "malamodpkg"),stringsAsFactors = F)
head(raw_sampling_points)

## @knitr prepare_roi

roi<-sf::st_as_sf(raw_sampling_points, coords = c("longitude", "latitude"), crs = 4326) %>%
  sf::st_bbox()

mean_roi_latitude<-mean(c(roi$ymin,roi$ymax))
roi[1]<-roi[1]-0.05-getRemoteData::convertMetersToDegrees(max(buffer_sizes),mean_roi_latitude) #xmin
roi[2]<-roi[2]-0.05-getRemoteData::convertMetersToDegrees(max(buffer_sizes),mean_roi_latitude) #ymin
roi[3]<-roi[3]+0.05+getRemoteData::convertMetersToDegrees(max(buffer_sizes),mean_roi_latitude) #xmax
roi[4]<-roi[4]+0.05+getRemoteData::convertMetersToDegrees(max(buffer_sizes),mean_roi_latitude) #ymax

roi<-roi %>%
  sf::st_as_sfc() %>%
  sf::st_sf()

utm_zone<-getRemoteData::getUTMepsg(roi)
roi_utm<-st_transform(roi,utm_zone)

# Convert points as sp and sf objects for further processing
spPoints <- SpatialPointsDataFrame(coords=data.frame(raw_sampling_points$longitude,raw_sampling_points$latitude),data=raw_sampling_points,proj4string=CRS("+init=epsg:4326"))
sfPoints <- st_as_sf(spPoints)

## @knitr prepare2_datesloc
df_sampling_points<-raw_sampling_points %>%
  mutate(date=as.Date(date)) %>%
  group_by(date) %>%
  arrange(date) %>%
  nest(latitude,longitude,id) %>%
  set_names(c("date_date","coords")) %>%
  mutate(sp_points=map(coords,~SpatialPointsDataFrame(coords=data.frame(.$longitude,.$latitude),data=.,proj4string=CRS("+init=epsg:4326")))) %>%
  dplyr::select(-coords) %>%
  mutate(date_numeric=as.integer(date_date))  %>%
  mutate(lagTime_timeSeries=lagTime_timeSeries)
df_sampling_points

datesDday<-as.Date(unique(raw_sampling_points$date))
head(datesDday)

datesTimesSeries<-df_sampling_points$date_date %>%
  map(~seq(.,.-lagTime_timeSeries,-1)) %>%
  unlist %>%
  unique %>%
  sort %>%
  as.Date(origin="1970-01-01")
head(datesTimesSeries)

## @knitr prepare_parralel
plan(multiprocess)
options(future.globals.maxSize= 20000*1024^2) # 20 GB for the max size to be exported for the furrr future expression (https://stackoverflow.com/questions/40536067/how-to-adjust-future-global-maxsize-in-r)

## @knitr prepare_grass
loc <- rgrass7::initGRASS(path_to_grassApplications_folder, home=path_to_output_datasets, gisDbase="GRASS_TEMP", override=TRUE,mapset = "PERMANENT" )
rgrass7::execGRASS("g.proj",flags="c",parameters = list(proj4=sf::st_crs(roi_utm)$proj4string))

## @knitr prepare_earthdata
earthdata_credentials<-readLines(path_to_earthdata_credentials)
username_EarthData<-strsplit(earthdata_credentials,"=")[[1]][2]
password_EarthData<-strsplit(earthdata_credentials,"=")[[2]][2]
httr::set_config(authenticate(user=username_EarthData, password=password_EarthData, type = "basic"))
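# The credentials file is expected to contain two lines of the form below (illustrative;
# only the text after "=" on each line is used by the parsing above):
# username=my_earthdata_username
# password=my_earthdata_password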

## @knitr modis_import
##########  1/ we extract the set of parameters that will be parsed in the `getRemoteData::getData_modis` function
# Get MODIS tile
modisCrs<-"+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs" # MODIS data come with a specific CRS. We first need to transform the CRS of our ROI from EPSG 4326 to the specific MODIS CRS.
modisTile<-getRemoteData::getMODIStileNames(roi)
roi_bbox<-sf::st_bbox(sf::st_transform(roi,modisCrs))

cat("Retrieving information to download MODIS data on the OpeNDAP servers...\n")
modisOpenDAP_md<-c("MOD11A1.006","MYD11A1.006","MOD11A2.006","MYD11A2.006","MOD13Q1.006","MYD13Q1.006","MOD16A2.006","MYD16A2.006") %>%
  data.frame(stringsAsFactors = F) %>%
  set_names("source") %>%
  mutate(OpendapURL=paste0("https://opendap.cr.usgs.gov/opendap/hyrax","/",source,"/",modisTile,".ncml")) %>%
  mutate(Opendap_timeVector=map(OpendapURL,~getRemoteData::getOpenDAPvector(.,"time"))) %>%
  mutate(Opendap_xVector=map(OpendapURL,~getRemoteData::getOpenDAPvector(.,"XDim"))) %>%
  mutate(Opendap_yVector=map(OpendapURL,~getRemoteData::getOpenDAPvector(.,"YDim"))) %>%
  mutate(Opendap_minLon=map(Opendap_xVector,.f=~which.min(abs(.x-roi_bbox$xmin))-1)) %>%
  mutate(Opendap_maxLon=map(Opendap_xVector,.f=~which.min(abs(.x-roi_bbox$xmax))-1)) %>%
  mutate(Opendap_minLat=map(Opendap_yVector,.f=~which.min(abs(.x-roi_bbox$ymax))-1)) %>%
  mutate(Opendap_maxLat=map(Opendap_yVector,.f=~which.min(abs(.x-roi_bbox$ymin))-1)) %>%
  mutate(roiSpatialIndexBound=pmap(list(Opendap_minLat,Opendap_maxLat,Opendap_minLon,Opendap_maxLon),.f=~c(..1,..2,..3,..4))) %>%
  mutate(destFolder=file.path(path_to_output_datasets,source)) %>%
  mutate(dimensionsToRetrieve=case_when(source %in% c("MOD11A1.006","MYD11A1.006","MOD11A2.006","MYD11A2.006") ~ list(c("LST_Day_1km","LST_Night_1km")),
                                        source %in% c("MOD13Q1.006","MYD13Q1.006") ~ list(c("_250m_16_days_NDVI","_250m_16_days_EVI")),
                                        source %in% c("MOD16A2.006","MYD16A2.006") ~ list(c("ET_500m"))
  ))

##########  2/ we retrieve the URLs of all the MODIS datasets to download using the `getRemoteData::getData_modis` function
modisData_md<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~pmap(list(.,pluck(modisOpenDAP_md,"source"),pluck(modisOpenDAP_md,"destFolder"),pluck(modisOpenDAP_md,"dimensionsToRetrieve"),pluck(modisOpenDAP_md,"Opendap_timeVector"),pluck(modisOpenDAP_md,"roiSpatialIndexBound")),
            ~getRemoteData::getData_modis(timeRange = c(..1-lagTime_timeSeries,..1),
                                          collection=..2,
                                          destFolder=..3,
                                          modisTile=modisTile,
                                          dimensions=unlist(..4),
                                          OpenDAPtimeVector=unlist(..5),
                                          roiSpatialIndexBound=unlist(..6),
                                          download = F)) %>%
        set_names(pluck(modisOpenDAP_md,"source")))
head(modisData_md)
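# Illustrative: inspect the files to download for the MOD11A1.006 collection at the first
# capture date. getData_modis returns a 3-column data.frame (file name, download URL,
# destination file, cf. .getDftoDl above); the exact column names may differ:
# head(modisData_md[[1]][["MOD11A1.006"]])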


##########  3/ we download the data using the `getRemoteData::downloadData` function.
DftoDl<-.getDftoDl(modisData_md)
cat("Downloading MODIS data...\n")
Dl_res<-getRemoteData::downloadData(DftoDl,username_EarthData,password_EarthData,parallelDL=TRUE)
# Check if the datasets were downloaded
head(file.exists(DftoDl$destfile))


## @knitr temperature_tidytransform
# Local function to extract MODIS LST data
.extract_modis_lst<-function(MOD_collection, MYD_collection){
  # For each date of interest, get the related time-series rasters names (i.e. the whole set of 40 days rasters)
  rastsNames_LST<-.getRastersNames_timeSeries(df_sampling_points,modisData_md,MOD_collection)

  # Build the path to the Terra (MOD) and Aqua (MYD) products
  path_to_mod<-.getRastersPaths(modisData_md,MOD_collection)
  path_to_myd<-.getRastersPaths(modisData_md,MYD_collection)
  path_to_mod_myd<-merge(path_to_mod,path_to_myd,by="name",suffixes = c("_mod","_myd"))

  # Pre-process LstMin
  rasts_LstMin<-path_to_mod_myd %>%
    mutate(rast_mod=map(destfile_mod,~getRemoteData::prepareData_modis(.,"LST_Night_1km"))) %>%  # open Terra products as raster
    mutate(rast_myd=map(destfile_myd,~getRemoteData::prepareData_modis(.,"LST_Night_1km"))) %>%  # Open Aqua products as raster
    mutate(rast=map2(rast_mod,rast_myd,~min(.x,.y,na.rm = T))) %>%  # Take the minimum value available between Terra and Aqua
    mutate(rast=map(rast,~.-273.15)) %>%  # convert to degrees Celsius
    pluck("rast") %>%
    set_names(path_to_mod_myd$name)

  # Pre-process LstMax
  rasts_LstMax<-path_to_mod_myd %>%
    mutate(rast_mod=map(destfile_mod,~getRemoteData::prepareData_modis(.,"LST_Day_1km"))) %>%
    mutate(rast_myd=map(destfile_myd,~getRemoteData::prepareData_modis(.,"LST_Day_1km"))) %>%
    mutate(rast=map2(rast_mod,rast_myd,~max(.x,.y,na.rm = T))) %>%
    mutate(rast=map(rast,~.-273.15)) %>%
    pluck("rast") %>%
    set_names(path_to_mod_myd$name)

  # Extract the variables
  cat("Extracting LstMin...\n")
  lstMin<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_LST,rasts_LstMin,"mean") # to put in wide format : lstMin <- lstMin %>% unite(var,var,time_lag) %>% spread(key=var,value=val)
  lstMin$var<-"lstMin"
  cat("Extracting LstMax...\n")
  lstMax<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_LST,rasts_LstMax,"mean")
  lstMax$var<-"lstMax"
  #head(lstMin)

  return(list(lstMin,lstMax))
}

# Extract MODIS LST 1 day
cat("extracting MODIS LST 1 day...\n")
lst1day<-.extract_modis_lst("MOD11A1.006","MYD11A1.006")
lst1day[[1]]$var<-"TND"
lst1day[[2]]$var<-"TMD"
# Extract MODIS LST 8 days
cat("extracting MODIS LST 8 days...\n")
lst8day<-.extract_modis_lst("MOD11A2.006","MYD11A2.006")
lst8day[[1]]$var<-"TNW"
lst8day[[2]]$var<-"TMW"

## Impute NAs
lst8day<-lst8day %>%
  map(.,~.modis_fillNA_l1(.,1))

lst1day<-lst1day %>%
  map(.,~.modis_fillNA_l1(.,1)) %>%
  map(.,~.modis_fillNA_l2(.,2)) %>%
  map(.,~.modis_fillNA_l3(.,3)) %>%
  map2(.x=.,.y=lst8day,~.modis_fillNA_l4(.x,.y,4))

lst8day<-lst8day %>%
  map(.,~.modis_fillNA_l2(.,2)) %>%
  map(.,~.modis_fillNA_l3(.,3))

lst1day<-lst1day %>%
  map2(.x=.,.y=lst8day,~.modis_fillNA_l4(.x,.y,5))


TND<-lst1day[[1]]
TMD<-lst1day[[2]]
TNW<-lst8day[[1]]
TMW<-lst8day[[2]]

# Export
write.csv(TND,file.path(path_to_output_datasets,"envCov_TND.csv"),row.names = F)
write.csv(TMD,file.path(path_to_output_datasets,"envCov_TMD.csv"),row.names = F)
write.csv(TNW,file.path(path_to_output_datasets,"envCov_TNW.csv"),row.names = F)
write.csv(TMW,file.path(path_to_output_datasets,"envCov_TMW.csv"),row.names = F)


## @knitr vegetation_tidytransform
# For each date of interest, get the related time-series rasters names (i.e. the whole set of 40 days rasters)
rastsNames_VI<-map2(.getRastersNames_timeSeries(df_sampling_points,modisData_md,"MOD13Q1.006"),.getRastersNames_timeSeries(df_sampling_points,modisData_md,"MYD13Q1.006"),c) %>%
  map(as.numeric) %>%
  map(~sort(.,decreasing=TRUE)) %>%
  map(as.character)

# Build the path to the Terra (MOD) and Aqua (MYD) products
path_to_mod<-.getRastersPaths(modisData_md,"MOD13Q1.006")
path_to_myd<-.getRastersPaths(modisData_md,"MYD13Q1.006")
path_to_mod_myd<-rbind(path_to_mod,path_to_myd) %>% arrange(name)

# Pre-process NDVI. Bad quality pixels are directly set to NA. We keep all the other pixels.
rasts_ndvi<-path_to_mod_myd %>%
  mutate(rast=map(destfile,~getRemoteData::prepareData_modis(.,"_250m_16_days_NDVI"))) %>%
  pluck("rast") %>%
  set_names(path_to_mod_myd$name)

# Pre-process EVI
rasts_evi<-path_to_mod_myd %>%
  mutate(rast=map(destfile,~getRemoteData::prepareData_modis(.,"_250m_16_days_EVI"))) %>%
  pluck("rast") %>%
  set_names(path_to_mod_myd$name)

# Extract the variables
cat("Extracting ndvi...\n")
VNI<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_VI,rasts_ndvi,"mean")
VNI$var<-"VNI"
cat("Extracting evi...\n")
VEI<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_VI,rasts_evi,"mean")
VEI$var<-"VEI"
rm(rasts_ndvi,rasts_evi)

## Impute NAs
VNI<-VNI %>%
  .modis_fillNA_l1(.,1) %>%
  .modis_fillNA_l2(.,2) %>%
  .modis_fillNA_l3(.,3)

VEI<-VEI %>%
  .modis_fillNA_l1(.,1) %>%
  .modis_fillNA_l2(.,2) %>%
  .modis_fillNA_l3(.,3)

# Export
write.csv(VNI,file.path(path_to_output_datasets,"envCov_VNI.csv"),row.names = F)
write.csv(VEI,file.path(path_to_output_datasets,"envCov_VEI.csv"),row.names = F)


## @knitr evapotransiration_tidytransform
# For each date of interest, get the related time-series rasters names (i.e. the whole set of 40 days rasters)
rastsNames_Et<-.getRastersNames_timeSeries(df_sampling_points,modisData_md,"MOD16A2.006")

# Build paths to data
path_to_mod<-.getRastersPaths(modisData_md,"MOD16A2.006")
path_to_myd<-.getRastersPaths(modisData_md,"MYD16A2.006")
path_to_mod_myd<-merge(path_to_mod,path_to_myd,by="name",suffixes = c("_mod","_myd"))

# Pre-process
rasts_Et<-path_to_mod_myd %>%
  mutate(rast_mod=map(destfile_mod,~getRemoteData::prepareData_modis(.,"ET_500m"))) %>%
  mutate(rast_myd=map(destfile_myd,~getRemoteData::prepareData_modis(.,"ET_500m"))) %>%
  mutate(rast_mod=map(rast_mod,~clamp(.x,upper=32760,useValues=FALSE))) %>% ## Set pixel values >= 32760 (quality pixel values) to NA
  mutate(rast_myd=map(rast_myd,~clamp(.x,upper=32760,useValues=FALSE))) %>% ## Set pixel values >= 32760 (quality pixel values) to NA
  mutate(rast=map2(rast_mod,rast_myd,~mean(.x,.y,na.rm = T))) %>%
  pluck("rast") %>%
  set_names(path_to_mod_myd$name)

# Extract
cat("Extracting et...\n")
EVT<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_Et,rasts_Et,"mean")
EVT$var<-"EVT"
rm(rasts_Et)

## Impute NAs
EVT<-EVT %>%
  .modis_fillNA_l1(.,1) %>%
  .modis_fillNA_l2(.,2) %>%
  .modis_fillNA_l3(.,3)

# Export
write.csv(EVT,file.path(path_to_output_datasets,"envCov_EVT.csv"),row.names = F)


## @knitr precipitationGpm_import
#########  Extract set of information that will be parsed in the getData_gpm function
roi_bbox<-st_bbox(roi)
cat("Retrieving information to download GPM data on the OpeNDAP servers...\n")
gpmOpenDAP_md<-c("GPM_3IMERGDF.06","GPM_3IMERGHH.06") %>%
  data.frame(stringsAsFactors = F) %>%
  set_names("source") %>%
  mutate(OpendapURL="https://gpm1.gesdisc.eosdis.nasa.gov/opendap/GPM_L3/GPM_3IMERGHH.06/2016/001/3B-HHR.MS.MRG.3IMERG.20160101-S000000-E002959.0000.V06B.HDF5") %>%
  mutate(Opendap_xVector=map(OpendapURL,~getOpenDAPvector(.,"lon"))) %>%
  mutate(Opendap_yVector=map(OpendapURL,~getOpenDAPvector(.,"lat"))) %>%
  mutate(Opendap_minLon=map(Opendap_xVector,.f=~which.min(abs(.x-roi_bbox$xmin))-4)) %>%
  mutate(Opendap_maxLon=map(Opendap_xVector,.f=~which.min(abs(.x-roi_bbox$xmax))+4)) %>%
  mutate(Opendap_minLat=map(Opendap_yVector,.f=~which.min(abs(.x-roi_bbox$ymin))-4)) %>% ## careful, this line is not the same as for Modis. ymax has become ymin.
  mutate(Opendap_maxLat=map(Opendap_yVector,.f=~which.min(abs(.x-roi_bbox$ymax))+4)) %>%
  mutate(roiSpatialIndexBound=pmap(list(Opendap_minLat,Opendap_maxLat,Opendap_minLon,Opendap_maxLon),.f=~c(..1,..2,..3,..4))) %>%
  mutate(destFolder=file.path(path_to_output_datasets,source)) %>%
  mutate(dimensionsToRetrieve=case_when(source %in% c("GPM_3IMERGDF.06") ~ list(c("precipitationCal")),
                                        source %in% c("GPM_3IMERGHH.06") ~ list(c("precipitationCal","precipitationQualityIndex"))
  ))

#########  Build list of datasets to download for GPM Daily and for all dates
gpmOpenDAP_md_daily<-gpmOpenDAP_md %>%
  filter(source=="GPM_3IMERGDF.06")

gpmData_md_daily<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~pmap(list(.,pluck(gpmOpenDAP_md_daily,"source"),pluck(gpmOpenDAP_md_daily,"destFolder"),pluck(gpmOpenDAP_md_daily,"dimensionsToRetrieve"),pluck(gpmOpenDAP_md_daily,"roiSpatialIndexBound")),
            ~getRemoteData::getData_gpm(timeRange = c(..1-lagTime_timeSeries,..1),
                                        collection=..2,
                                        destFolder=..3,
                                        dimensions=unlist(..4),
                                        roiSpatialIndexBound=unlist(..5))) %>%
        set_names(pluck(gpmOpenDAP_md_daily,"source")))

#########  Build list of datasets to download for GPM half-hourly and for all dates
gpmOpenDAP_md_hhourly<-gpmOpenDAP_md %>%
  filter(source=="GPM_3IMERGHH.06")

gpmData_md_hhourly<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~pmap(list(.,pluck(gpmOpenDAP_md_hhourly,"source"),pluck(gpmOpenDAP_md_hhourly,"destFolder"),pluck(gpmOpenDAP_md_hhourly,"dimensionsToRetrieve"),pluck(gpmOpenDAP_md_hhourly,"roiSpatialIndexBound")),
            ~getRemoteData::getData_gpm(timeRange = c(paste0(as.Date(..1,origin="1970-01-01")," ",hourStart_Dday,":00:00"),paste0(as.Date(..1,origin="1970-01-01")+1," ",hourEnd_Dday,":00:00")),
                                        collection=..2,
                                        destFolder=..3,
                                        dimensions=unlist(..4),
                                        roiSpatialIndexBound=unlist(..5))) %>%
        set_names(pluck(gpmOpenDAP_md_hhourly,"source")))

#########  Download the data
DftoDl<-rbind(.getDftoDl(gpmData_md_daily),.getDftoDl(gpmData_md_hhourly))
cat("Downloading GPM data...\n")
Dl_res<-getRemoteData::downloadData(DftoDl,username_EarthData,password_EarthData,parallelDL=TRUE)
# Check if the datasets were downloaded
head(file.exists(DftoDl$destfile))

## @knitr precipitationGpmDaily_tidytransform
# For each date of interest, get the related time-series rasters names (i.e. the whole set of 40 days rasters)
rastsNames_gpmDay<-.getRastersNames_timeSeries(df_sampling_points,gpmData_md_daily,"GPM_3IMERGDF.06")

# Build paths to data
path_to_gpmDay<-.getRastersPaths(gpmData_md_daily,"GPM_3IMERGDF.06")

# Pre-process
rasts_gpmDay<-path_to_gpmDay %>%
  mutate(rast=future_map(destfile,~getRemoteData::prepareData_gpm(.,"precipitationCal"))) %>%
  pluck("rast") %>%
  set_names(path_to_gpmDay$name)

# resampling to 250 m
if(gpm_resample){
  rasts_gpmDay<-future_map(rasts_gpmDay,~.resampleProd(.,gpm_resample_output_res))
}

# Extract
cat("Extracting daily precipitations (gpm)...\n")
RGD<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_gpmDay,rasts_gpmDay,"mean")
RGD$var<-"RGD"
RGD$qval<-1
rm(rasts_gpmDay)

# Export
write.csv(RGD,file.path(path_to_output_datasets,"envCov_RGD.csv"),row.names = F)


## @knitr precipitationGpmHhourly_tidytransform
rastsNames_gpmHhour<-.getRastersNames_timeSeries(df_sampling_points,gpmData_md_hhourly,"GPM_3IMERGHH.06")

# Build paths to data
path_to_gpmHhour<-.getRastersPaths(gpmData_md_hhourly,"GPM_3IMERGHH.06")

# Pre-process
rasts_gpmHhour<-path_to_gpmHhour %>%
  mutate(rast=future_map(destfile,~getRemoteData::prepareData_gpm(.,"precipitationCal"))) %>%
  mutate(rast_qval=future_map(destfile,~getRemoteData::prepareData_gpm(.,"precipitationQualityIndex"))) %>%
  mutate(rast_qval=future_map(rast_qval,~clamp(.x,lower=0.4,useValues=FALSE))) %>% ## Set pixel values <= 0.4 (quality pixel values) to NA. See https://pmm.nasa.gov/sites/default/files/document_files/IMERGV06_QI.pdf for additional info (threshold of 0.4 is given in this doc)
  mutate(rast=future_map2(rast,rast_qval,~raster::mask(.x,.y))) %>%
  pluck("rast") %>%
  set_names(path_to_gpmHhour$name)

# resampling to 250 m
if(gpm_resample){
  rasts_gpmHhour<-future_map(rasts_gpmHhour,~.resampleProd(.,gpm_resample_output_res))
}

# Set pixel to 1 if there was rain (precip. >=0.05mm), else 0
rasts_gpmHhour<-future_map(rasts_gpmHhour,~raster::reclassify(.,c(-Inf,0.05,0, 0.05,Inf,1)))

# Extract
cat("Extracting half hourly precipitations (gpm)...\n")
RGH<-.extractVar_ts(buffer_sizes=10,df_sampling_points,rastsNames_gpmHhour,rasts_gpmHhour,"sum")  # This gives, for each half-hour of each night (i.e. each time lag), whether it has rained (1) or not (0). This dataset is useful to study the detailed rhythms (cycles) of the mosquitoes. Note that only 1 cell is intersected (buffer_sizes=10), so the summarizing function has little effect
# Now we extract the variable that we want: the proportion of half-hours with rain over the total duration of each night. There are quite a few NAs: we remove them (i.e. they are not taken into account when computing the proportions)
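# Worked example (illustrative): a night from 18:00 to 08:00 spans 28 half-hours;
# if it rained during 7 of them (and none is NA), val = round(7/28*100) = 25.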
RGH<-RGH %>%
  filter(!is.na(val)) %>%
  group_by(id,buffer) %>%
  summarise(val=round(sum(val)/n()*100))
RGH$var<-"RGH"
RGH$qval<-1
rm(rasts_gpmHhour)

# Export
write.csv(RGH,file.path(path_to_output_datasets,"envCov_RGH.csv"),row.names = F)

## @knitr precipitationTamsat_import
tamsat_md<-data.frame(output_time_step=c("daily"), #,"monthly","monthly"),
                      output_product=c("rainfall_estimate"), #,"rainfall_estimate","anomaly"),
                      output_output=c("individual"), #,"individual","individual"),
                      stringsAsFactors = F) %>%
  mutate(source=paste(output_time_step,output_product,output_output,sep="_"))  %>%
  mutate(destFolder=file.path(path_to_output_datasets,"TAMSAT",source))

tamsatData_md<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~pmap(list(.,pluck(tamsat_md,"destFolder"),pluck(tamsat_md,"output_time_step"),pluck(tamsat_md,"output_product"),pluck(tamsat_md,"output_output")),
            ~getRemoteData::getData_tamsat(timeRange = c(..1-lagTime_timeSeries,..1),
                                           destFolder=..2,
                                           output_time_step=..3,
                                           output_product=..4,
                                           output_output=..5)) %>%
        set_names(pluck(tamsat_md,"source")))

## Download datasets
DftoDl<-.getDftoDl(tamsatData_md)
cat("Downloading TAMSAT data...\n")
Dl_res<-getRemoteData::downloadData(DftoDl,parallelDL=TRUE)
# Check if the datasets were downloaded
head(file.exists(DftoDl$destfile))

## @knitr precipitationTamsat_tidytransform
rastsNames_tamsat<-.getRastersNames_timeSeries(df_sampling_points,tamsatData_md,"daily_rainfall_estimate_individual")

# Build paths to data
path_to_tamsat<-.getRastersPaths(tamsatData_md,"daily_rainfall_estimate_individual")

# Pre-process
rasts_tamsat<-path_to_tamsat %>%
  mutate(rast=future_map(destfile,~getRemoteData::prepareData_tamsat(.,roi))) %>%
  pluck("rast") %>%
  set_names(path_to_tamsat$name)

# resampling to 250 m
if(tamsat_resample){
  rasts_tamsat<-future_map(rasts_tamsat,~.resampleProd(.,tamsat_resample_output_res))
}

# Extract
cat("Extracting daily precipitations (tamsat)...\n")
RTD<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_tamsat,rasts_tamsat,"mean")
RTD$var<-"RTD"
RTD$qval<-1
rm(rasts_tamsat)

# Export
write.csv(RTD,file.path(path_to_output_datasets,"envCov_RTD.csv"),row.names = F)

## @knitr lightSettlements_import
viirsDnbData_md<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~map(.,
           ~getRemoteData::getData_viirsDnb(timeRange = .,
                                            roi=roi,
                                            dimensions=c("Monthly_AvgRadiance","Monthly_CloudFreeCoverage"),
                                            destFolder=file.path(path_to_output_datasets,"viirs_dnb"))) %>%
        set_names("viirs_dnb"))


## Download datasets
DftoDl<-.getDftoDl(viirsDnbData_md)
cat("Downloading VIIRS DNB data...\n")
Dl_res<-getRemoteData::downloadData(DftoDl,parallelDL=TRUE)
# Check if the datasets were downloaded
head(file.exists(DftoDl$destfile))

## @knitr lightSettlements_tidytransform
# Get the names of the rasters to use for each date
rastsNames_viirsDnb<-.getRastersNames_timeSeries(df_sampling_points,viirsDnbData_md,"viirs_dnb") %>%
  map(~keep(.,str_detect(.,"Monthly_AvgRadiance")))

# Build paths to data
path_to_viirsDnb_AvgRadiance<-.getRastersPaths(viirsDnbData_md,"viirs_dnb") %>%
  mutate(date=substr(name,nchar(name)-4,nchar(name))) %>%
  filter(str_detect(name,"AvgRadiance")) %>%
  set_names(c("name_AvgRadiance","destfile_AvgRadiance","date"))

path_to_viirsDnb_CloudFreeCoverage<-.getRastersPaths(viirsDnbData_md,"viirs_dnb") %>%
  mutate(date=substr(name,nchar(name)-4,nchar(name))) %>%
  filter(str_detect(name,"CloudFreeCoverage")) %>%
  set_names(c("name_CloudFreeCoverage","destfile_CloudFreeCoverage","date"))

path_to_viirsDnb<-merge(path_to_viirsDnb_AvgRadiance,path_to_viirsDnb_CloudFreeCoverage,by="date")

# Pre-process
rasts_viirsDnb<-path_to_viirsDnb %>%
  mutate(rast_AvgRadiance=map(destfile_AvgRadiance,~raster(.))) %>%
  mutate(rast_CloudFreeCoverage=map(destfile_CloudFreeCoverage,~raster(.))) %>%
  mutate(rast_CloudFreeCoverage=map(rast_CloudFreeCoverage,~clamp(.x,lower=1,useValues=FALSE))) %>% # Quality control : if there are 0 cloud free obs, we set the pixel to NA
  mutate(rast=map2(rast_AvgRadiance,rast_CloudFreeCoverage,~mask(.x,.y))) %>%
  pluck("rast") %>%
  set_names(path_to_viirsDnb_AvgRadiance$name_AvgRadiance)

# Extract
cat("Extracting nighttime lights (VIIRS DNB...\n")
LNL<-.extractVar_ts(buffer_sizes,df_sampling_points,rastsNames_viirsDnb,rasts_viirsDnb,"mean")
LNL$time_lag<-NULL
LNL$var<-"LNL"
LNL$qval<-1
rm(rasts_viirsDnb)

# Export
write.csv(LNL,file.path(path_to_output_datasets,"envCov_LNL.csv"),row.names = F)

## @knitr lightMoon_import
moonData_md<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~map(.,
           ~getRemoteData::getData_imcce(timeRange = .,
                                         roi=roi,
                                         destFolder=file.path(path_to_output_datasets,"moon_imcce"))) %>%
        set_names("moon_imcce"))

## Download datasets
df_DataToDL<-.getDftoDl(moonData_md)
cat("Downloading IMCCE moon data...\n")
Dl_res<-getRemoteData::downloadData(df_DataToDL,parallelDL=TRUE)

## @knitr lightMoon_tidytransform
# Get the names of the files to use for each date
DfNames_moon<-.getRastersNames_timeSeries(df_sampling_points,moonData_md,"moon_imcce") %>%
  unlist() %>%
  cbind(names(moonData_md)) %>%
  data.frame(stringsAsFactors = F) %>%
  set_names(c("name","date"))

# Build paths to data
path_to_moon<-.getRastersPaths(moonData_md,"moon_imcce")

# Pre-process
moon_magnitude_dfs<-path_to_moon %>%
  mutate(df=map(destfile,~read.csv(.,skip=10))) %>%
  mutate(V.Mag=map_dbl(df,"V.Mag"))

# Extract
LMN<-raw_sampling_points %>%
  dplyr::select(id,date) %>%
  mutate(date=as.character(as.numeric(as.Date(date)))) %>%
  left_join(DfNames_moon,by="date") %>%
  left_join(moon_magnitude_dfs,by="name") %>%
  dplyr::select(id,V.Mag) %>%
  set_names("id","val") %>%
  mutate(var="LMN") %>%
  mutate(buffer=NA) %>%
  mutate(qval=1) %>%
  dplyr::select(id,buffer,val,var,qval)

# Export
write.csv(LMN,file.path(path_to_output_datasets,"envCov_LMN.csv"),row.names = F)

## @knitr wind_import
era5_md<-data.frame(dimensionsToRetrieve=c("10m_u_component_of_wind","10m_v_component_of_wind"),
                    stringsAsFactors = F) %>%
  mutate(source=dimensionsToRetrieve)

era5Data_md<-datesDday %>%
  set_names(as.numeric(.)) %>% # names will be numeric format of the dates (days since 1970-01-01)
  map(~pmap(list(.,pluck(era5_md,"source"),pluck(era5_md,"dimensionsToRetrieve")),
            ~getRemoteData::getData_era5(timeRange = c(paste0(as.Date(..1,origin="1970-01-01")," ",hourStart_Dday,":00:00"),paste0(as.Date(..1,origin="1970-01-01")+1," ",hourEnd_Dday,":00:00")),
                                         roi=roi,
                                         destFolder=file.path(path_to_output_datasets,..2),
                                         dimensions=..3)) %>%
        set_names(era5_md$source))

# First create directories in the wd
directories<-era5_md$source %>%
  file.path(path_to_output_datasets,.) %>%
  as.list()  %>%
  lapply(dir.create,recursive = TRUE)

cat("Downloading ERA-5 wind data...\n")
# Then download
#import python CDS-API
cdsapi <- reticulate::import('cdsapi')
#for this step, a .cdsapirc file must exist in the user's home directory (e.g. "/home/ptaconet")
server = cdsapi$Client() #start the connection
# for now we have not found a better solution than using a "for" loop (it does not work as expected with purrr)
for (i in 1:length(era5Data_md)){
  for (j in 1:length(era5Data_md[[i]])){
    for (k in 1:length(era5Data_md[[i]][[j]]$destfile)){
      if (!file.exists(era5Data_md[[i]][[j]]$destfile[k])){
        server$retrieve("reanalysis-era5-single-levels",
                        era5Data_md[[i]][[j]]$url[k][[1]],
                        era5Data_md[[i]][[j]]$destfile[k])
      }
    }
  }
}

## @knitr wind_tidytransform
# Get the names of the rasters to use for each date
rastsNames_era5<-.getRastersNames_timeSeries(df_sampling_points,era5Data_md,"10m_u_component_of_wind")

# Build paths to data
path_to_u_wind<-.getRastersPaths(era5Data_md,"10m_u_component_of_wind")
path_to_v_wind<-.getRastersPaths(era5Data_md,"10m_v_component_of_wind")
path_to_u_v_wind<-merge(path_to_u_wind,path_to_v_wind,by="name",suffixes = c("_u","_v"))

# source : https://stackoverflow.com/questions/21484558/how-to-calculate-wind-direction-from-u-and-v-wind-components-in-r
rasts_wind<-path_to_u_v_wind %>%
  mutate(rast_u_wind=future_map(destfile_u,~getRemoteData::prepareData_era5(.)[[1]])) %>%
  mutate(rast_v_wind=future_map(destfile_v,~getRemoteData::prepareData_era5(.)[[1]])) %>%
  mutate(rast_wind_speed=future_map2(rast_u_wind,rast_v_wind,~overlay(.x,.y,fun=function(.x,.y){return(sqrt(.x^2+.y^2))}))) %>%
  mutate(rast_wind_dir_cardinal=future_map2(rast_u_wind,rast_v_wind,~overlay(.x,.y,fun=function(.x,.y){return(atan2(.x,.y))}))) %>%
  mutate(rast_wind_dir_cardinal=future_map(rast_wind_dir_cardinal,~.*180/pi)) %>%
  mutate(rast_wind_dir_cardinal=future_map(rast_wind_dir_cardinal,~.+180)) %>%
  select(name,rast_wind_speed,rast_wind_dir_cardinal)

## resample wind speed but not wind direction
if(era5_resample){
  rasts_wind <- rasts_wind %>%
    mutate(rast_wind_speed=future_map(rast_wind_speed,~.resampleProd(.,era5_resample_output_res)))
}

rasts_wind_speed <- rasts_wind %>%
  pluck("rast_wind_speed") %>%
  set_names(path_to_u_wind$name)

rasts_wind_dir_cardinal <- rasts_wind %>%
  pluck("rast_wind_dir_cardinal") %>%
  set_names(path_to_u_wind$name)

rm(rasts_wind)

# Extract
cat("Extracting wind speed...\n")
WSP<-.extractVar_ts(buffer_sizes=10,df_sampling_points,rastsNames_era5,rasts_wind_speed,"sum") # Only 1 cell is intersected (buffer_sizes=10), so the summarizing function has little effect
WSP <- WSP %>%
  group_by(id,buffer) %>%
  summarise(val=mean(val))
WSP$var<-"WSP"
WSP$qval<-1
rm(rasts_wind_speed)


cat("Extracting wind direction...\n")
# formula :  https://en.wikipedia.org/wiki/Mean_of_circular_quantities (see section "Example")
WDR<-.extractVar_ts(buffer_sizes=10,df_sampling_points,rastsNames_era5,rasts_wind_dir_cardinal,"sum") # Same as above
WDR <- WDR %>%
  mutate(sin_angle=sin(val*(pi/180))) %>%
  mutate(cos_angle=cos(val*(pi/180))) %>%
  group_by(id,buffer) %>%
  summarise(mean_sin=mean(sin_angle),mean_cos=mean(cos_angle)) %>%
  mutate(val=atan(mean_sin/mean_cos)/(pi/180)) %>%
  mutate(val=case_when(mean_sin>0 & mean_cos>0 ~ val,
                       mean_cos<0 ~ val+180,
                       mean_sin<0 & mean_cos>0 ~ val+360)) %>%
  select(id,buffer,val)
WDR$var<-"WDR"
WDR$qval<-1
rm(rasts_wind_dir_cardinal)

# Export
write.csv(WSP,file.path(path_to_output_datasets,"envCov_WSP.csv"),row.names = F)
write.csv(WDR,file.path(path_to_output_datasets,"envCov_WDR.csv"),row.names = F)
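# Sanity check of the circular mean used above (illustrative, using base R's atan2, which is
# equivalent to the atan + case_when quadrant correction): the mean direction of 350 and 10
# degrees is ~0/360 degrees, not 180.
# a <- c(350, 10) * pi/180
# (atan2(mean(sin(a)), mean(cos(a))) * 180/pi) %% 360  # ~0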

## @knitr topography_setup
destFolder_dem<-file.path(path_to_output_datasets,"DEM_SRTM")
dem_output_path<-file.path(destFolder_dem,"DEM.tif")
dem_depressionless_output_path<-file.path(destFolder_dem,"DEM_depressionless.tif")
slope_output_path<-file.path(destFolder_dem,"slope.tif")
aspect_output_path<-file.path(destFolder_dem,"aspect.tif")
accumulation_output_path<-file.path(destFolder_dem,"accumulation.tif")
accumulation_threshold_output_path<-gsub(".tif","_threshold.tif",accumulation_output_path)
tci_output_path<-file.path(destFolder_dem,"tci.tif")
twi_output_path<-file.path(destFolder_dem,"twi.tif")
streams_network_path<-file.path(destFolder_dem,"streams_network.gpkg")

## @knitr topography_import
DftoDl<-getRemoteData::getData_srtm(roi,destFolder=destFolder_dem)
cat("Downloading SRTM data...\n")
Dl_res<-getRemoteData::downloadData(DftoDl,username_EarthData,password_EarthData,parallelDL=TRUE)
# Check if the datasets were downloaded
head(file.exists(DftoDl$destfile))


## @knitr topography_tidytransform1
# Unzip and get path
strm_tiles<-map(DftoDl$destfile,~unzip(.,exdir=destFolder_dem))
# Create DEM in UTM projection and save to disk
DEM <- strm_tiles %>%
  map(~raster::raster(.)) %>%
  do.call(raster::merge, .) %>%
  raster::crop(roi) %>%
  raster::projectRaster(crs=sf::st_crs(roi_utm)$proj4string) %>%
  raster::writeRaster(dem_output_path,NAflag=0,overwrite=TRUE)

## @knitr topography_tidytransform2
# Import DEM to GRASS and set region
rgrass7::execGRASS("r.external", flags="o", parameters=list(input=dem_output_path, output="tmprast",band=1))
rgrass7::execGRASS("g.region", parameters=list(raster="tmprast"))

# Filters and generates a depressionless elevation map
rgrass7::execGRASS("r.fill.dir", flags="overwrite", parameters=list(input="tmprast", output="DEM",direction="dir"))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="DEM", output=dem_depressionless_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))

# Compute slope and aspect and save to disk
rgrass7::execGRASS("r.slope.aspect", flags="overwrite", parameters=list(elevation="DEM", slope="slope",aspect="aspect",format="percent", precision="FCELL",zscale=1,min_slope=0))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="slope", output=slope_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="aspect", output=aspect_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))

# Compute hydrography indices and save to disk
rgrass7::execGRASS("r.terraflow", flags="overwrite", parameters=list(elevation="DEM", accumulation="accumulation", tci="tci"))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="accumulation", output=accumulation_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="tci", output=tci_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))

# Compute the TWI index
rgrass7::execGRASS("r.topidx", flags="overwrite", parameters=list(input="DEM", output="twi"))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="twi", output=twi_output_path, format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))

## @knitr topography_tidytransform3
# Create accumulation threshold raster
# NB : GRASS has an algorithm 'r.stream.extract' that can do the job (see https://grass.osgeo.org/grass76/manuals/addons/r.accumulate.html)
acc_raster<-raster(accumulation_output_path) %>%
  raster::reclassify(c(-Inf,threshold_accumulation_raster,NA, threshold_accumulation_raster,Inf,1)) %>%
  raster::writeRaster(accumulation_threshold_output_path,datatype='INT2S',overwrite=TRUE)

# skeletonization (thinning extraction) and vectorization of stream network from flow accumulation map. See https://grass.osgeo.org/grass76/manuals/r.thin.html
rgrass7::execGRASS("r.external", flags="overwrite", parameters=list(input=accumulation_threshold_output_path, output="acc_threshold",band=1))
rgrass7::execGRASS("g.region", parameters=list(raster="acc_threshold"))
rgrass7::execGRASS("r.thin", flags="overwrite", parameters=list(input="acc_threshold",output="acc_thin"))
rgrass7::execGRASS("r.out.gdal", flags=c("t","m","overwrite"), parameters=list(input="acc_thin", output=file.path(path_to_dem_folder,"accumulation_thin.tif"), format="GTiff",  createopt="TFW=YES,COMPRESS=LZW" ))
rgrass7::execGRASS("r.to.vect", flags="overwrite", parameters=list(input="acc_thin", output="acc_thin_vect", type="line"))
# Next we need to execute the v.split algorithm. v.split splits the stream network into 20-meter segments, which are then used to measure the mean distance between each HLC point and the streams within the buffer
# v.split does not give the expected results when executed from R, although it returns no error (oddly, it works in QGIS through the GRASS plugin). For now this step is done by hand in QGIS. To do so, first export the vector:
rgrass7::execGRASS("v.out.ogr", flags=c("m","overwrite"), parameters=list(input="acc_thin_vect", type="line", output=file.path(destFolder_dem,"acc_thin_vect.gpkg")))
# and then manually run the v.split algorithm in the GRASS plugin of QGIS, saving the output file under streams_network_path
# In R, the equivalent would be:
#rgrass7::execGRASS("v.split", flags=c("overwrite","verbose"), parameters=list(input="acc_thin_vect", output="acc_thin_vect_split", length=20, units="map"))
#rgrass7::execGRASS("v.out.ogr", flags=c("m","overwrite"), parameters=list(input="acc_thin_vect_split", type="line", output=streams_network_path))

## @knitr topography_tidytransform4
dem_and_derivatives_rast <- brick(list(dem_depressionless_output_path,slope_output_path,aspect_output_path,accumulation_output_path,tci_output_path,twi_output_path))
names(dem_and_derivatives_rast)<-c("TEL","TSL","TAS","WAC","TCI","TWI")

TEL_TSL_TAS_WAC_TCI_TWI<-buffer_sizes %>% # for each buffer, calculate stats
  set_names %>%
  map_dfr(~.extractVar_rast_singleBuff(dem_and_derivatives_rast,spPoints,.,"mean")) %>%
  mutate(val=if_else(var=="WAC",val*res(dem_and_derivatives_rast)[1]*res(dem_and_derivatives_rast)[2]/10000,val)) # convert accumulation from number of pixels to surface in ha

# Export
TEL_TSL_TAS_WAC_TCI_TWI<-TEL_TSL_TAS_WAC_TCI_TWI %>%
  mutate(qval=1)

write.csv(TEL_TSL_TAS_WAC_TCI_TWI,file.path(path_to_output_datasets,"envCov_TEL_TSL_TAS_WAC_TCI_TWI.csv"),row.names = F)

## @knitr topography_tidytransform5
# open stream network
streams_network_sf<-sf::read_sf(streams_network_path) %>%
  st_zm() %>%
  mutate(length_stream=st_length(.)) %>%
  mutate(DN=seq(1,nrow(.),1))  %>%
  select(DN,length_stream,geom)

# Get the accumulation value on each piece of the network. For this, take the centroid of each piece of network and calculate the accumulation on this piece.
accumulation<-raster(accumulation_output_path)
accumulation<-accumulation*res(accumulation)[1]*res(accumulation)[2]/10000 # convert accumulation from number of pixels to surface in ha
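# Sanity check (illustrative): if the DEM has 30 m pixels, one accumulation pixel corresponds
# to 30*30/10000 = 0.09 ha (the actual pixel size of the SRTM-derived raster may differ).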

sp_cent <- rgeos::gCentroid(as(streams_network_sf, "Spatial"), byid=TRUE)
streams_network_sf<-raster::extract(accumulation,sp_cent,df=TRUE) %>%
  select(-ID) %>%
  mutate(DN=streams_network_sf$DN) %>%
  full_join(streams_network_sf,.)

# Convert the points to UTM projection
sfPoints_utm<-st_transform(sfPoints,utm_zone)

# Calculate the stats for all buffers
WAD_WMD_WLS_WAL<-buffer_sizes %>%
  set_names() %>%
  future_map_dfr(~.extractVar_line_singleBuff(streams_network_sf,sfPoints_utm,.))

# Export
WAD_WMD_WLS_WAL<-WAD_WMD_WLS_WAL %>%
  mutate(var=str_replace_all(var,c("mean_dist_to_stream"="WAD","min_dist_to_stream"="WMD","length_stream"="WLS","mean_acc_by_dist"="WAL"))) %>%
  mutate(qval=1)

write.csv(WAD_WMD_WLS_WAL,file.path(path_to_output_datasets,"envCov_WAD_WMD_WLS_WAL.csv"),row.names = F)

## @knitr settlementsPop_import
con <- DBI::dbConnect(RSQLite::SQLite(),dbname = path_to_db)
sf_households_by_village <- st_read(con,db_table_loc_households_by_village) %>%
  st_set_crs(4326) %>%
  st_intersection(roi) %>%  # keep only the points in our ROI
  st_transform(utm_zone)

## @knitr settlementsPop_tidytransform

.extract_pop_buffer<-function(sf_households_by_village,sfPoints,buffer_size){
  POP <- buffer_size %>%
    st_buffer(st_transform(sfPoints,utm_zone),dist=.) %>%
    st_join(sf_households_by_village, join = st_intersects,left = TRUE) %>%
    st_drop_geometry() %>%
    group_by(id) %>%
    summarise(val=sum(population)) %>%
    mutate(buffer=buffer_size)
  return(POP)
}

## Get stats: population in the buffers
POP <- c(100,500) %>%  # buffers of 100m and 500m
  map_dfr(~.extract_pop_buffer(sf_households_by_village,sfPoints,.)) %>%
  mutate(var="POP") %>%
  mutate(qval=1)

# Export
write.csv(POP,file.path(path_to_output_datasets,"envCov_POP.csv"),row.names = F)


## @knitr settlementsPopHRSL_import
rast_hrsl<-raster(path_to_hrls_raster)

## @knitr settlementsPopHRSL_tidytransform
## Get stats: population in the buffers
POH<-buffer_sizes %>% # for each buffer, calculate stats
  set_names %>%
  future_map_dfr(~.extractVar_rast_singleBuff(rast_hrsl,spPoints,.,"sum")) %>%
  mutate(var="POH") %>%
  mutate(qval=1)

# Export
write.csv(POH,file.path(path_to_output_datasets,"envCov_POH.csv"),row.names = F)


## @knitr buitup_tidytransform

# Get the villages convex hull (considered as the boundaries of the villages)
villages_convexhull <- sf_households_by_village %>%
  group_by(codevillage) %>%  # group by village
  summarize(geom_convHull=st_convex_hull(st_union(geom))) %>% #get convex hull polygon. We consider the convex hull as the edges of the village. The convex hull is the minimum polygon that encompasses all the locations of the households.
  st_transform(utm_zone)

## Get the distance from each catch point to the edge of the village
BDE <- sfPoints %>%
  st_transform(utm_zone) %>%
  mutate(codevillage=substr(id,2,4)) %>%  # create codevillage before joining with edges of the village
  left_join(as.data.frame(villages_convexhull),by="codevillage") %>%  # join convex hull geometry
  mutate(geom_convHull=st_cast(geom_convHull, "LINESTRING")) %>%  # convert convex hull geometry from polygon to linestring
  mutate(val=st_distance(geometry,geom_convHull,by_element=TRUE)) %>% # compute distance between catch point and edges of the village
  st_drop_geometry() %>%
  dplyr::select(id,val) %>%
  mutate(var="BDE") %>%
  mutate(buffer=NA) %>%
  mutate(qval=1)

## Get an index of the clustering or regularity of the households in each village. For this we use the household locations with the function spatstat::clarkevans (Clark-Evans aggregation index: R < 1 indicates clustering, R ~ 1 a random pattern, R > 1 regularity)

.extract_clarkevans_buffer<-function(sf_households_by_village,sfPoints,buffer_size){
  BCH <- buffer_size %>%
    st_buffer(st_transform(sfPoints,utm_zone),dist=.) %>%
    st_join(sf_households_by_village, join = st_intersects,left = TRUE) %>%
    st_drop_geometry() %>%
    filter(!is.na(X)) %>%
    group_by(id) %>%
    nest() %>%
    mutate(sp_points=map(data,~SpatialPoints(coords=data.frame(.$X,.$Y),proj4string=CRS("+init=epsg:4326")))) %>%
    mutate(ppp=map(sp_points,~as(., "ppp"))) %>%   # uses the maptools package
    mutate(ppp_n=as.numeric(map(ppp,~as.numeric(.$n)))) %>%
    filter(ppp_n>=5) %>% # if there are fewer than 5 households, the Clark-Evans index cannot be computed
    mutate(clark_index=map(ppp,~spatstat::clarkevans(.))) %>%
    mutate(clark_index=map_dbl(clark_index,~as.numeric(.[1]))) %>%
    dplyr::select(id,clark_index) %>%
    set_names("id","val") %>%
    mutate(buffer=buffer_size)

  return(BCH)
}

BCH <- c(100,500) %>%  # buffers of 100m and 500m
  map_dfr(~.extract_clarkevans_buffer(sf_households_by_village,sfPoints,.)) %>%
  mutate(var="BCH") %>%
  mutate(qval=1)

# Export
write.csv(BDE,file.path(path_to_output_datasets,"envCov_BDE.csv"),row.names = F)

## @knitr pedology_import
# we have not found a way to read the rasters directly from the gpkg database, so we export them from the DB and write them as GeoTIFFs:
path_to_pedology_rast<-file.path(path_to_output_datasets,"pedology","pedology.tif")
#gdalUtils::gdal_translate(path_to_db,path_to_pedology_rast,ot="Int16",of="GTiff",oo=paste0("TABLE=",db_table_pedology_rast))

## @knitr pedology_tidytransform
pedology_rast<-raster::raster(path_to_pedology_rast)
names(pedology_rast)="HYS"
# Set non-hydromorphic classes to 0 and hydromorphic classes to 1
pedology_rast[!(pedology_rast %in% hydromorphic_classes_pixels)]<-0
pedology_rast[pedology_rast!=0]<-1

HYS<-buffer_sizes %>% # for each buffer, calculate stats
  set_names %>%
  map_dfr(~.extractVar_rast_singleBuff(pedology_rast,spPoints,.,"sum"))

# convert surface from count of pixels to proportion (%) of hydromorphic soil in the buffer
HYS <- HYS %>%
  mutate(val=val*res(pedology_rast)[1]*res(pedology_rast)[2]) %>%
  mutate(val=val/(pi*buffer^2)*100) %>%
  mutate(qval=1)

# Export
write.csv(HYS,file.path(path_to_output_datasets,"envCov_HYS.csv"),row.names = F)

## @knitr lulc_import
metadata_landcovers <- tbl(con,db_table_metadata_lulc) %>%
  as.data.frame() %>%
  distinct(layer_label,layer_path,layer_id) %>%
  filter(layer_label %in% layers_lulc) %>%
  mutate(layer=seq(1,nrow(.),1))

lulc_rasts_list<-metadata_landcovers$layer_path %>%
  map(~raster(.)) %>%
  map(~crop(.,st_transform(roi,proj4string(.))))

# If the raster is not projected, project it to utm zone
for (i in 1:length(lulc_rasts_list)){
  if(proj4string(lulc_rasts_list[[i]])=="+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"){
    lulc_rasts_list[[i]]<-projectRaster(lulc_rasts_list[[i]],crs=sf::st_crs(roi_utm)$proj4string,method = "ngb")
  }
}

# Check if LU/LC rasters are ok prior to calculating LSM
for (i in 1:length(lulc_rasts_list)){
  print(landscapemetrics::check_landscape(lulc_rasts_list[[i]]))
}

## @knitr lulc_tidytransform

# We tried to run the whole process at once using furrr but it fails (probably because it takes too long).
# So we split the points into multiple chunks and run the process with a "for" loop.

.extractLSM_singlePoint<-function(lulc_rasts_list,buffer_sizes,sfPoint,lsm_to_extract,metadata_landcovers){

  res_th_pt <- buffer_sizes %>%
    set_names(buffer_sizes) %>%
    future_map_dfr(~landscapemetrics::sample_lsm(lulc_rasts_list,
                                                 sfPoint,
                                                 what = lsm_to_extract,
                                                 shape = "circle",
                                                 size = .),
                   .id = "buffer")

  res_th_pt <- res_th_pt %>%
    select(-c(id,percentage_inside)) %>%
    rename(val=value,class_pixel=class) %>%
    mutate(id=sfPoint$id) %>%
    mutate(buffer=as.numeric(buffer)) %>%
    left_join(metadata_landcovers) %>%
    select(id,buffer,layer_id,class_pixel,level,metric,val)

  return(res_th_pt)

}
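# Illustrative usage on a single point (the loop below applies this to all points in chunks):
# .extractLSM_singlePoint(lulc_rasts_list, buffer_sizes, sfPoints_list[[1]],
#                         lsm_to_extract, metadata_landcovers)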

sfPoints_list <- raw_sampling_points %>%
  filter(!(grepl("NAM",id))) %>% # only for CIV, because village NAM does not overlap the own LU/LC maps
  transpose() %>%
  map(~as.data.frame(.,stringsAsFactors=F))  %>%
  map(~st_as_sf(.,coords = c("longitude", "latitude"), crs = 4326)) %>%
  map(~st_transform(.,utm_zone))

div_pts<-unique(c(seq(0,length(sfPoints_list),50),length(sfPoints_list))) # chunk breakpoints; unique() avoids a duplicated last breakpoint when the number of points is a multiple of 50

LSM<-NULL

for (i in 1:(length(div_pts)-1)){

  cat(paste0(Sys.time()," : Processing points ",div_pts[i]+1," to ",div_pts[i+1]," over ",length(sfPoints_list)," in total \n"))

  plan(multiprocess)
  options(future.globals.maxSize= 20000*1024^2)

  LSM_th_pts <- sfPoints_list[seq(div_pts[i]+1,div_pts[i+1],1)] %>%
    future_map_dfr(~.extractLSM_singlePoint(lulc_rasts_list,buffer_sizes,.,lsm_to_extract,metadata_landcovers),.progress = T)

  LSM<-rbind(LSM,LSM_th_pts)

  # Export, overwriting the previous export at each iteration
  write.csv(LSM,file.path(path_to_output_datasets,"envCov_LSM.csv"),row.names = F)

}

## @knitr setup_close
# Remove grass temporary folder
system(paste0("rm -r ", fTile.path(path_to_output_datasets,"GRASS_TEMP")))
file.remove(file.path(path_to_output_datasets,".grassrc7"))
# Disconnect from the local database
dbDisconnect(con)