########### shrinking of eigenvalues ###############
#' computes the covariance estimate by the eigenvalue-shrinking-method of Ledoit and Wolf,
#' but with the sample covariance matrix downweighted by distance of gridpoints before deriving the estimate. For some details and notation see Pourahmadi 2013, pp.101-102.
#'
#' @param dt The data table
#' @param weight_mat The matrix containing the weights for tapering: Has dimension n x n, where n is the number of grid_ids in water in dt.
#' Should be generated by the function weight_mat.
#' @param Y,M The years and months considered.
#' @param save_years For these years the covariance estimate is computed and saved,
#' using for each year all previous years.
#' @param save_dir The directory to save in.
#' @param file_name The saved files are named paste0(file_name,"_m",+,"_y",++,".RData"), where + is the month and ++ the year.
#'
#'
#' @export
cov_est_SE = function(dt, weight_mat,
Y=unique(dt[,year]), M = 1:12,
save_years = 2013:2018, save_dir, file_name = "cov_est_SE")
{
dt = dt[year %in% Y & month %in% M,]
na_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
if(!identical(na_ids,integer(0)))
{
dt = dt[-na_ids,]
}
num_loc = dt[year == min(year) & month == min(month),.N]
cov_est_by_month = function(m)
{
print(paste0("month =",m))
full_data_mat = matrix(dt[month == m,SST_bar - SST_hat],nrow = num_loc)
for(y in save_years)
{
num_years = max(which(Y < y))
data_mat = full_data_mat[,1:num_years]
sam_cov_mat = 1/num_years * data_mat %*% t(data_mat)
S = weight_mat * sam_cov_mat
p = dim(S)[1]
n = num_years
m_n = sum(diag(S))/p
d_n_sq = sum(diag( (S - diag(rep(m_n,p))) %*% (S - diag(rep(m_n,p))) ))/p
b_n_sq_bar = 0
for(k in 1:n)
{ x_k_sq = (data_mat[,k])^2
mat_temp = (diag(x_k_sq) - S) %*% (diag(x_k_sq) - S)
b_n_sq_bar = b_n_sq_bar + (1/n^2)*sum(diag(mat_temp))/p
}
b_n_sq = min(d_n_sq,b_n_sq_bar)
a_n_sq = d_n_sq - b_n_sq
Sigma = diag(rep(b_n_sq * m_n / d_n_sq,p)) + a_n_sq/d_n_sq * S
sin_val_dec = svd(Sigma)
# vectorize matrix mult:
temp = as.vector(t(matrix(rep(sqrt(sin_val_dec$d),dim(S)[1]),nrow = dim(S)[1])))
sqrt_Sigma = t(sin_val_dec$u * temp) # Sigma = t(sqrt_Sigma) %*% sqrt_Sigma
full_file_name = paste0(file_name,"_m",m,"_y",y,".RData")
save(sqrt_Sigma, file = paste0(save_dir,full_file_name))
}
}
parallel::mclapply(M,cov_est_by_month,mc.cores = length(M))
}
# var_sc_SE_new = function(m, y, dt, weight_mat,
# Sigma,
# ens_size = 9,
# save_dir = "~/PostClimDataNoBackup/SFE/Derived/SE/",
# file_name = "var_sc",
# weighted = FALSE,
# weight_fct = NULL)
# {
# # setting up:
#
# dt = dt[year %in% y & month %in% m,]
#
# land_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
# if(!identical(land_ids,integer(0)))
# {
# dt = dt[-land_ids,]
# }
#
#
#
# # build data table that contains pairs of coordinates for all coordinates contained in dt:
#
# dt_coor_1 = dt[year == min(year) & month == min(month),.(Lon,Lat,grid_id)]
# setnames(dt_coor_1,c("Lon1","Lat1","grid_id1"))
# # add dummy key, do outer full merge with a duplicate, and remove key again:
# dt_coor_1[,"key" := 1]
# dt_coor_2 = copy(dt_coor_1)
# setnames(dt_coor_2,c("Lon2","Lat2","grid_id2","key"))
# var_sc_prereq = merge(dt_coor_1,dt_coor_2, by = "key",allow.cartesian = TRUE)
# var_sc_prereq[, "key" := NULL]
#
# # get indices for pairs of locations
#
# n_loc_pair = dt_coor_1[,.N]
# id_1 = as.vector(mapply(rep,1:n_loc,times = n_loc_pair))
# id_2 = rep(1:n_loc_pair, times = n_loc_pair)
#
#
# # find variance and covariance locations in Sigma:
# n_loc = dt[,.N]
# var_id_1 = id_1 + (id_1-1)*n_loc
# var_id_2 = id_2 + (id_2-1)*n_loc
# cov_id = id_1 + (id_2 - 1)*n_loc
#
#
# #get weights for variogram score
#
# if(weighted)
# {
# if(is.null(weight_fct))
# {
# weight_fct = function(x)
# { y = rep(1, length(x))
# y[abs(x)>1] = (1/x[abs(x)>1]^2)
# return(y)
# }
# }
#
# var_sc_prereq[,"weights" := weight_fct(weight_dt[,mean_SST][id_1] - weight_dt[,mean_SST][id_2])]
# #normalizing:
# var_sc_prereq[,"weights" := weights/sum(weights)]
# } else {
# var_sc_prereq[,"weights" := 1/.N]
# }
#
#
#
# ##--- For each pair of coordinates (i,j) compute the variance of X_i-X_j
# ##--- for the predictive distribution with d principal components
#
# print("getting variances of differences:")
#
# diff_var = list()
#
#
# Sigma_wt = weight_mat * Sigma
#
# var_sc_prereq[,var:= Sigma_wt[var_id_1] + Sigma_wt[var_id_2] - 2*Sigma_wt[cov_id]]
#
# #complement var_sc_prereq by the squared differences of the mean vectors and squared differences of observation:
#
# var_sc_prereq[,sq_m_diff := (dt[,SST_hat][id_1]-dt[,SST_hat][id_2])^2]
#
# var_sc_prereq[,sq_obs_diff := (dt[,SST_bar][id_1]-dt[,SST_bar][id_2])^2]
#
#
# # get variogram scores
# print("done - compute variogram scores:")
#
#
# #var_sc_by_PC = function(d){
#
# return_data = data.table(year = y)
#
# dt_temp = var_sc_prereq[,.SD,.SDcols = c("var","sq_m_diff","sq_obs_diff","weights")]
# setnames(dt_temp,c("dvar","sq_mean_diff", "sq_obs_diff","weights"))
# return_data[,sc := variogram_score_nrm_p2(dt_temp)]
#
#
# scores = return_data[,month := m]
#
# save(scores, file = paste0(save_dir,file_name,"_m",m,"_y",y,".RData"))
# }
#
#' Forecasts by shrinking eigenvalues
#'
#' @description Generates forecasts for the SE post-processing approach.
#'
#' @param dt the data table.
#' @param Y,M Integer vectors containing year(s) and month(s).
#' @param n Integer. Size of the desired forecast ensemble.
#' @param cov_dir,cov_file_name Where the covariance estimates are stored and how they are named, see \code{cov_est_SE}.
#'
#' @return data table containing n columns with noise and n columns with forecasts.
#'
#' @examples \dontrun{}
#'
#' @author Claudio Heinrich
#'
#' @export
forecast_SE = function(dt, Y, M,
n = 10,
cov_dir, cov_file_name = "cov_est_SE")
{
#find land grid ids:
dt = dt[year %in% Y,][month %in% M,]
land_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
if(!identical(land_ids,integer(0)))
{
dt_water = dt[-land_ids,]
} else {
dt_water = dt
}
SD_cols = c("Lon","Lat","grid_id","month","year","YM",
"SST_hat","SST_bar","Ens_bar","Bias_Est","var_bar","SD_hat")
SD_cols = SD_cols[which(SD_cols %in% colnames(dt))]
fc_water <- na.omit( dt_water[,.SD,.SDcols = SD_cols])
forecast_by_month = function(m)
{
print( paste0("Month = ",m))
dt_month = fc_water[month == m,]
for(y in Y)
{
print(y)
load(file = paste0(cov_dir,cov_file_name,"_m",m,"_y",y,".RData"))
mcf = fc_water[year == y & month == m, SD_hat]/diag(sqrt_Sigma)
sqrt_Sigma_hat = diag(mcf) %*% sqrt_Sigma
no = rnorm(n = dim(sqrt_Sigma_hat)[1] * n)
no = sqrt_Sigma_hat %*% matrix(no,nrow = dim(sqrt_Sigma_hat)[1])
for (i in 1:n)
{
dt_month[year == y, paste0("no",i):= no[,i]]
dt_month[year == y,paste0("fc",i):= SST_hat + .SD,
.SDcols = paste0("no",i)]
}
}
return(dt_month)
}
temp = parallel::mclapply(M,forecast_by_month,mc.cores = length(M))
fc_water = rbindlist(temp)
#-------- add land --------------
if(!identical(land_ids,integer(0)))
{
fc_land = dt[land_ids,.SD,.SDcols = SD_cols]
fc_land[, paste0("no",1:n):= NA]
fc_land[, paste0("fc",1:n):= NA]
fc = rbindlist(list(fc_water,fc_land), fill = TRUE)
}
# order:
fc = fc[ order(year,month,Lon,Lat)]
return(fc)
}
# ###############################################
# ########### SE variogram score ################
# ###############################################
#
#
# # to skip this section, run instead:
#
# # SE_dir = paste0(save_dir, "SE/")
# # load(file = paste0(SE_dir,"variogram_scores.RData"))
#
#
# ##### setting up ######
#
# SE_dir = paste0(save_dir,"SE/")
# dir.create(SE_dir, showWarnings = FALSE)
#
# training_years = DT[!(year %in% validation_years),unique(year)]
#
# for_res_cov_SE(Y = training_years,
# dt = DT,
# save_dir = SE_dir,
# ens_size = ens_size)
#
#
#
# L = 2500
#
# NA_rows = DT[,which(is.na(SST_bar) | is.na(SST_hat) | is.na(SD_hat))]
#
# DT_NA_free = DT[-NA_rows,]
#
# sp <- sp::SpatialPoints(cbind(x=DT_NA_free[YM == min(YM), Lon],
# y=DT_NA_free[YM == min(YM), Lat]),
# proj4string = sp::CRS("+proj=longlat +datum=WGS84"))
# Dist <- sp::spDists(sp, longlat = TRUE)
#
# weights = weight_fct(Dist)
# weight_mat = matrix(weights, nrow = dim(Dist)[1])
#
#
#
# #### variogram score computation: ####
#
#
# for(m in months)
# {
# # setup: get principal components and marginal variances for the given month m:
#
# print(paste0("m = ",m))
# load(file = paste0(SE_dir,"CovRes_mon",m,".RData"))
#
# land_ids = which(DT[year == min(year) & month == min(month),is.na(Ens_bar) | is.na(SST_bar)])
#
#
# dummy_fct = function(y)
# {
# var_sc_SE_new(m, y, DT,weight_mat = weight_mat, Sigma = Sigma, ens_size = ens_size,
# weighted = FALSE,
# save_dir = SE_dir)
# }
# parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))
#
# }
#
#
#
# ###### combine: ######
#
# scores_SE = list()
# k=0
# for(m in months){
# for(y in validation_years)
# {k=k+1
# load(file = paste0(SE_dir,"var_sc_m",m,"_y",y,".RData"))
# scores_SE[[k]] = scores
# }
# }
# scores_SE = rbindlist(scores_SE)
#
# mean_sc_SE = scores_SE[,mean_sc := mean(sc)]
#
#
# # save
#
# save(scores_SE_mc,scores_SE_nmc,mean_sc,mean_sc_nmc,file = paste0(PCA_dir,"variogram_scores",weight_name_addition,".RData"))
#
# ##########################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.