scripts/various/old/04.02.master.variogram.scores.new.R

#######################################################################################

###################  master script part 4 - variogram scores  #########################

#######################################################################################

# This script compares the downweigh sample covariance matrix and then PCAing it based on their variogram scores.
#
# 
# Files generated:
# 
# Directories: PCACov, GeoStat, ECC
# Data files (* in 1:12): PCA/CovRes_mon*.RData, PCA/diff_var_by_PC_m*.RData, PCA/var_sc_by_PC.RData, PCA/var_sc_by_PC_no_marg_corr.RData
#                         GeoStat/diff_var_geoStat_m*.RData, GeoStat/variogram_exp_m*.RData, GeoStat/var_sc.RData,
#                         ECC/ECC_fc.RData, ECC/diff_var_ECC_m*.RData, ECC/var_sc.RData
#
# Plots: mean_variogram_scores.pdf
#
# Requires previous run of 03.master.var.est.R
# with the same value of name_abbr as below.


##### setting up ######

rm(list = ls())

setwd("~/NR/SFE")
options(max.print = 1e3)

library(PostProcessing)
library(data.table)

name_abbr = "NAO_2" 

save_dir = paste0("~/PostClimDataNoBackup/SFE/Derived/", name_abbr,"/")

load(file = paste0(save_dir,"setup.RData"))


########################################

for_res_cov = function(dt,
                       weight_mat,
                       Y = 1985:2000,
                       M = 1:12,
                       save_dir = "~/PostClimDataNoBackup/SFE/PCACov/",
                       ens_size = 9,
                       version = "wrt_ens_mean")
{  
  land_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
  if(!identical(land_ids,integer(0)))
  {
    dt = dt[-land_ids,]
  }
  
  for(mon in M)
    {
      print(paste0("month =",mon))  
      
      dt_SC = copy(dt[month == mon & year %in% Y,])
      
      if(version == "wrt_ens_mean")
      {
        dt_SC[,"Res":= SST_bar - trc(Ens_bar + Bias_Est)]  
        cor_factor = 1 / sqrt( length(Y))
        
        sqrt_cov_mat = as.matrix(na.omit(dt_SC[,Res]))
        
        res_cov_sqrt = cor_factor * matrix(sqrt_cov_mat,ncol = length(Y) )
        
        Sigma = res_cov_sqrt %*% t(res_cov_sqrt)
        Sigma = weight_mat * Sigma
        
        
      }
      
      save(Sigma, file = paste0(save_dir,"CovRes_mon",mon,".RData"))
      
      rm(dt_SC)
    }
  
}



###############################
########### PCA ###############
###############################


# to skip this section, run instead:

# PCA_dir = paste0(save_dir, "PCA/")
# load(file = paste0(PCA_dir,"variogram_scores.RData"))


##### setting up ######

PCA_dir = paste0(save_dir,"PCA_dw/")
dir.create(PCA_dir, showWarnings = FALSE)

training_years = DT[!(year %in% validation_years),unique(year)]


# get weight matrix:
L = 2000

NA_rows = DT[,which(is.na(SST_bar) | is.na(SST_hat) | is.na(SD_hat))]

DT_NA_free = DT[-NA_rows,]

sp <- sp::SpatialPoints(cbind(x=DT_NA_free[YM == min(YM), Lon],
                              y=DT_NA_free[YM == min(YM), Lat]), 
                        proj4string = sp::CRS("+proj=longlat +datum=WGS84"))
Dist <- sp::spDists(sp, longlat = TRUE)

weights = GneitingWeightFct(Dist, L=L)
weight_mat = matrix(weights, nrow = dim(Dist)[1])

# get downweighted covariance matrices:

for_res_cov(Y = training_years,
            dt = DT, 
            weight_mat = weight_mat,
            save_dir = PCA_dir,
            ens_size = ens_size,
            version = "wrt_mean")

PCs = c(10,20,30,40,50,100,150,200,250) # range of PCs to test

  
#### variogram score computation: ####


for(m in months)
  { 
   # setup: get principal components and marginal variances for the given month m:
  
    print(paste0("m = ",m))
     load(file = paste0(PCA_dir,"CovRes_mon",m,".RData"))
  
    PCA = irlba::irlba(Sigma, nv = max(PCs))
    
    assign(paste0("PCA",m), PCA)
    
    
    land_ids = which(DT[year == min(year) & month == min(month),is.na(Ens_bar) | is.na(SST_bar)])
    
    PCA_DT = DT[year == min(year) & month == min(month),][-land_ids,.(Lon,Lat,grid_id)]
    
    for(d in  1:max(PCs))
    {
      PCA_DT [,paste0("PC",d) := PCA$d[d]*PCA$u[,d]]
    } 
    
    
    
    # also get marginal variances
    variances = list()
    d = 1 
    vec = PCA_DT[,eval(parse(text = paste0("PC",d)))]
    variances[[d]] = vec^2
    
    if(max(PCs)>1)
    {
      for(d in 2:max(PCs)){
        vec = PCA_DT[,eval(parse(text = paste0("PC",d)))]
        variances[[d]] = variances[[d-1]] + vec^2
      }
    }
    names(variances) = paste0("var",1:max(PCs))
    PCA_DT = data.table(PCA_DT,rbindlist(list(variances)))    
  
  # now move to getting the variogram scores:
    
  # without marginal correction:
    
  dummy_fct = function(y)
  {
    var_sc_PCA_old(m, y, DT, PCA = PCA, PCA_DT = PCA_DT, dvec = PCs, ens_size = ens_size,
                   file_name = "var_sc_by_PC_dw_before",
                   marginal_correction = FALSE, 
                   cov_dir = PCA_dir, save_dir = PCA_dir)
  }
  parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))

  # with marginal correction:
  
  dummy_fct = function(y)
  {
    var_sc_PCA_old(m, y, DT, PCA = PCA, PCA_DT = PCA_DT, dvec = PCs, ens_size = ens_size,
                   file_name = "var_sc_by_PC_dw_before",
                   marginal_correction = TRUE, 
                   cov_dir = PCA_dir, save_dir = PCA_dir)
  }
  parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))
}



###### combine: ######

scores_nmc = list()
k=0
for(m in months){
  for(y in validation_years)
  {k=k+1
  load(file = paste0(PCA_dir,"var_sc_by_PC_dw_before_nmc_m",m,"_y",y,".RData"))
  scores_nmc[[k]] = scores
  }
}
scores_nmc = rbindlist(scores_nmc)

scores_mc = list()
k=0
for(m in months){
  for(y in validation_years)
  {k=k+1
  load(file = paste0(PCA_dir,"var_sc_by_PC_dw_before_m",m,"_y",y,".RData"))
  scores_mc[[k]] = scores
  }
}
scores_mc = rbindlist(scores_mc)





pdf(file = paste0(plot_dir,"vs_by_dist_Gn_wf_PCA50_wrtm.pdf"))
plot(score_by_dist[,.(dist,mean_sc_mc)],type = "l",
     main = "variogram score for comp. supp. cov. fct.",
     xlab = "Parameter L", ylab = "mean vs")
dev.off()








print(paste0("Minimal variogram score is achieved for ",mean_sc[mean_sc == min(mean_sc),d][1],
             " principal components with a score of ",mean_sc[,min(mean_sc)],
             ". Without marginal correction it is achieved for ",mean_sc_nmc[mean_sc == min(mean_sc),d][1],
             " principal components with a score of ",mean_sc_nmc[,min(mean_sc)],"."))

# save

mean_sc = scores_mc[,mean_sc := mean(sc), by = d][,.(d,mean_sc)]
mean_sc = unique(mean_sc)


mean_sc_nmc = scores_nmc[,mean_sc := mean(sc), by = d][,.(d,mean_sc)]
mean_sc_nmc = unique(mean_sc_nmc)


save(scores_mc,scores_nmc,mean_sc,mean_sc_nmc,file = paste0(PCA_dir,"variogram_scores",weight_name_addition,".RData"))




#########################################
########### Geostationary ###############
#########################################

# to skip this section, run instead:

# geostat_dir = paste0(save_dir, "GeoStat/")
# load(file = paste0(geostat_dir,"variogram_scores.RData"))



###########################################

geostat_dir = paste0(save_dir, "GeoStat/")
dir.create(geostat_dir, showWarnings = FALSE)

geostationary_training(dt = DT, save_dir = geostat_dir,m = months)


for(m in months)
{ 
  # setup: get principal components and marginal variances for the given month m:
  
  print(paste0("m = ",m))
  
  load(paste0(geostat_dir,"variogram_exp_m",m,".RData")) 
  
  
 
  # variogram scores without marginal correction:
  
  dummy_fct = function(y)
  {
    var_sc_geoStat_new( DT, m, y, Mod = Mod, Dist = Dist,
                        weighted = weight_a_minute, weight_fct = weight_fct, 
                        file_name = paste0("var_sc",weight_name_addition),
                        save_dir = geostat_dir, data_dir = geostat_dir,
                        mar_var_cor = FALSE)
  }
  parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))
  
  # with marginal correction:
  
  dummy_fct = function(y)
  {
    var_sc_geoStat_newnew( DT,m, y, Mod = Mod, Dist = Dist,
                        weighted = weight_a_minute, weight_fct = weight_fct, 
                        file_name = paste0("var_sc",weight_name_addition),
                        save_dir = geostat_dir, data_dir = geostat_dir,
                        mar_var_cor = TRUE)
  }
  parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))
}


####### combine #########

scores_geostat_nmc = list()
k=0
for(m in months){
  for(y in validation_years)
  {k=k+1
  load(file = paste0(geostat_dir,"var_sc",weight_name_addition,"_nmc_m",m,"_y",y,".RData"))
  scores_geostat_nmc[[k]] = scores
  }
}
scores_geostat_nmc = rbindlist(scores_geostat_nmc)

scores_geostat_mc = list()
k=0
for(m in months){
  for(y in validation_years)
  {k=k+1
  load(file = paste0(geostat_dir,"var_sc",weight_name_addition,"_m",m,"_y",y,".RData"))
  scores_geostat_mc[[k]] = scores
  }
}
scores_geostat_mc = rbindlist(scores_geostat_mc)


mean_geostat_sc = scores_geostat_mc[,mean_sc := mean(sc)][,.(mean_sc)]
mean_geostat_sc = unique(mean_geostat_sc)


mean_geostat_sc_nmc = scores_geostat_nmc[,mean_sc := mean(sc)][,.(mean_sc)]
mean_geostat_sc_nmc = unique(mean_geostat_sc_nmc)

# save

save(scores_geostat_mc,scores_geostat_nmc,mean_geostat_sc,mean_geostat_sc_nmc,file = paste0(geostat_dir,"variogram_scores",weight_name_addition,".RData"))




#########################################
################ ECC ####################
#########################################

# to skip this section, run instead:

# ECC_dir = paste0(save_dir,"ECC/")
# load(file = paste0(ECC_dir,"var_sc.RData"))
# sc_ECC = sc

#########################################

ECC_dir = paste0(save_dir,"ECC/")
dir.create(ECC_dir, showWarnings = FALSE)

forecast_ECC(dt = DT[year %in% validation_years],save_dir = ECC_dir)

setup_var_sc_ECC(eval_years = validation_years,
                 months = months,
                 weighted = weight_a_minute, weight_fct = weight_fct, 
                 ens_size = ens_size,
                 file_name = paste0("diff_var_ECC",weight_name_addition,"m"),
                 data_dir = ECC_dir, save_dir = ECC_dir)

var_sc_ECC(months = months,eval_years = validation_years,save_dir = ECC_dir, 
           data_name = paste0("diff_var_ECC",weight_name_addition,"m"),file_name = paste0("var_sc",weight_name_addition,".RData"))

load(file = paste0(ECC_dir,"var_sc",weight_name_addition,".RData"))
sc_ECC = sc



#########################################
########### plotting scores: ##############
###########################################



# we leave out ECC, because its score is just too bad:
print(paste0("variogram score for ECC is ",sc_ECC[,mean(sc)]))


pdf(paste0(plot_dir,"/mean_variogram_scores_dw_before.pdf"))
  rr = range(c(scores_mc[,mean(sc),by =  d][,2],scores_nmc[,mean(sc),by = d][,2],mean_geostat_sc[,mean_sc],mean_geostat_sc_nmc[,mean_sc]))
  
  
  
  #### with marginal correction ####
  
  plot(x = mean_sc[[1]],
       y = mean_sc[[2]],
       ylim = rr,
       type = "b",
       col = "blue",
       main = paste0("mean variogram scores"),
       xlab = "number of principal components",
       ylab = "mean score")
  
  #---- add minima: -----
  #abline(h = min(mean_sc[[2]]), lty = "dashed", col = adjustcolor("blue",alpha = .5))
  
  opt_num_PCs = mean_sc[,which.min(mean_sc)]
  
  points(x = mean_sc[[1]][opt_num_PCs],
         y = mean_sc[[2]][opt_num_PCs],
         col = "blue",
         bg = "blue",
         pch = 21)
  
  #### without marginal correction ####
  
  lines(x = mean_sc_nmc[[1]],
       y = mean_sc_nmc[[2]],
       type = "b",
       col = "darkgreen")
  
  #---- add minima: -----
  #abline(h = min(mean_sc_nmc[[2]]), lty = "dashed", col = adjustcolor("darkgreen",alpha = .5))
  
  opt_num_PCs = mean_sc_nmc[,which.min(mean_sc)]
  
  points(x = mean_sc_nmc[[1]][opt_num_PCs],
         y = mean_sc_nmc[[2]][opt_num_PCs],
         col = "darkgreen",
         bg = "darkgreen",
         pch = 21)
  
  # --- add geostat value: ----
  
  abline(h = mean_geostat_sc[,mean_sc], lty = "dashed", col = adjustcolor("darkred"))
  abline(h = mean_geostat_sc_nmc[,mean_sc], lty = "dashed", col = adjustcolor("black"))
  #abline(h = sc_ECC[,mean(sc)], lty = "dashed", col = adjustcolor("pink"))
  
  
  legend("topright",legend = c("PCA, m.c.v.","PCA","geostat"),
         col = c("blue","darkgreen","darkred"),lty = c(1,1,2))
  # legend("topright",legend = c("PCA, m.c.v.","PCA","geostat, m.c.v.","geostat","ECC"),
  #        col = c("blue","darkgreen","darkred","black","pink"),lty = c(1,1,2,2,2))
dev.off()

##################

save.image(file = paste0(save_dir,"setup.RData"))
ClaudioHeinrich/pp.sst documentation built on March 12, 2020, 3:15 a.m.