# R/SE.R
#
# Defines the functions cov_est_SE and forecast_SE.

########### shrinking of eigenvalues ###############

#' Covariance estimation by shrinking of eigenvalues
#'
#' @description Computes the covariance estimate by the eigenvalue shrinkage method of Ledoit and Wolf,
#'   applied after the sample covariance matrix has been tapered, i.e. downweighted according to the 
#'   distance of grid points. For details and notation see Pourahmadi (2013), pp. 101-102.
#'
#' @param dt The data table.
#' @param weight_mat The matrix containing the weights for tapering. It has dimension n x n, 
#'                   where n is the number of water grid points (grid_ids) in dt, 
#'                   and should be generated by the function weight_mat.
#' @param Y,M The years and months considered.
#' @param save_years For each of these years the covariance estimate is computed from all 
#'                   previous years and saved.
#' @param save_dir The directory to save in.
#' @param file_name The saved files are named paste0(file_name,"_m",+,"_y",++,".RData"), where + is the month and ++ the year. 
#'   
#'   
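#' @examples \dontrun{
#' # Hypothetical usage sketch: DT is assumed to be a data table with columns year, month,
#' # grid_id, SST_bar, SST_hat and Ens_bar, and w_mat a tapering weight matrix
#' # generated by the function weight_mat.
#' cov_est_SE(dt = DT, weight_mat = w_mat,
#'            save_years = 2013:2018, save_dir = "~/SE/", file_name = "cov_est_SE")
#' }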
#' @export

cov_est_SE = function(dt, weight_mat,
                          Y=unique(dt[,year]), M = 1:12,
                          save_years = 2013:2018, save_dir, file_name = "cov_est_SE")
{  
  dt = dt[year %in% Y & month %in% M,]
  
  na_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
  if(length(na_ids) > 0)
  {
    dt = dt[-na_ids,]
  }
  
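  # number of water grid points in a single (year, month) slice: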
  num_loc = dt[year == min(year) & month == min(month),.N]
  
  cov_est_by_month = function(m)
  {
    print(paste0("month =",m))  
    
    full_data_mat = matrix(dt[month == m,SST_bar - SST_hat],nrow = num_loc)
    
    for(y in save_years)
    {
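      # use all years in Y that lie strictly before y as training years
      # (assumes Y is sorted increasingly and contains at least one year before y):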
      num_years = max(which(Y < y))
      data_mat = full_data_mat[,1:num_years]
      
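      # sample covariance matrix of the forecast residuals (treated as mean-zero),
      # tapered entry-wise by the weight matrix: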
      sam_cov_mat = 1/num_years * data_mat %*% t(data_mat) 
      S = weight_mat * sam_cov_mat
      
      p = dim(S)[1]
      n = num_years
      
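      # Ledoit-Wolf shrinkage quantities (notation following Pourahmadi 2013, pp. 101-102):
      # m_n is the average eigenvalue of S, d_n_sq the squared distance of S from m_n * I,
      # and b_n_sq_bar estimates the sampling error of S; it is capped at d_n_sq below.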
      m_n = sum(diag(S))/p
      d_n_sq = sum(diag( (S - diag(rep(m_n,p))) %*% (S - diag(rep(m_n,p))) ))/p
      
      b_n_sq_bar = 0
      for(k in 1:n)
      {
        x_k_sq = (data_mat[,k])^2
        mat_temp = (diag(x_k_sq) - S) %*% (diag(x_k_sq) - S)
        b_n_sq_bar = b_n_sq_bar + (1/n^2)*sum(diag(mat_temp))/p
      }
      
      b_n_sq = min(d_n_sq,b_n_sq_bar)
      a_n_sq = d_n_sq - b_n_sq
        
      Sigma = diag(rep(b_n_sq * m_n / d_n_sq,p)) + a_n_sq/d_n_sq * S
      
      sin_val_dec = svd(Sigma)
      
      # scale column j of u by sqrt(d_j), vectorized:
      temp = rep(sqrt(sin_val_dec$d), each = dim(S)[1])
      
      sqrt_Sigma = t(sin_val_dec$u * temp) # Sigma = t(sqrt_Sigma) %*% sqrt_Sigma
      
      full_file_name = paste0(file_name,"_m",m,"_y",y,".RData")
      save(sqrt_Sigma, file = paste0(save_dir,full_file_name))
    }
  }
  
  parallel::mclapply(M,cov_est_by_month,mc.cores = length(M))
}



# var_sc_SE_new = function(m, y, dt, weight_mat,
#                           Sigma, 
#                           ens_size = 9,
#                           save_dir = "~/PostClimDataNoBackup/SFE/Derived/SE/",
#                           file_name = "var_sc",
#                           weighted = FALSE,
#                           weight_fct = NULL)
# {
#   # setting up:
#   
#   dt = dt[year %in% y & month %in% m,]
#   
#   land_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
#   if(!identical(land_ids,integer(0)))
#   {
#     dt = dt[-land_ids,]
#   }
#   
#   
#   
#   # build data table that contains pairs of coordinates for all coordinates contained in dt:
#   
#   dt_coor_1 = dt[year == min(year) & month ==  min(month),.(Lon,Lat,grid_id)]
#   setnames(dt_coor_1,c("Lon1","Lat1","grid_id1"))
#   # add dummy key, do outer full merge with a duplicate, and remove key again:
#   dt_coor_1[,"key" := 1]
#   dt_coor_2 = copy(dt_coor_1) 
#   setnames(dt_coor_2,c("Lon2","Lat2","grid_id2","key"))
#   var_sc_prereq = merge(dt_coor_1,dt_coor_2, by = "key",allow.cartesian = TRUE)
#   var_sc_prereq[, "key" := NULL]
#   
#   # get indices for pairs of locations
#   
#   n_loc = dt_coor_1[,.N]
#   id_1 = rep(1:n_loc, each = n_loc)
#   id_2 = rep(1:n_loc, times = n_loc)
#   
#   
#   # find the positions of the variances and covariances in Sigma:
#   var_id_1 = id_1 + (id_1-1)*n_loc
#   var_id_2 = id_2 + (id_2-1)*n_loc
#   cov_id = id_1 + (id_2 - 1)*n_loc
#   
#   
#   #get weights for variogram score
#   
#   if(weighted)
#   {
#     if(is.null(weight_fct))
#     {
#       weight_fct = function(x)
#       { y = rep(1, length(x))
#       y[abs(x)>1] = (1/x[abs(x)>1]^2)
#       return(y)
#       }
#     }
#     
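#     # note: weight_dt (a data table with a mean_SST column) is assumed to exist
#     # in the calling environment: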
#     var_sc_prereq[,"weights" := weight_fct(weight_dt[,mean_SST][id_1] - weight_dt[,mean_SST][id_2])]
#     #normalizing:
#     var_sc_prereq[,"weights" := weights/sum(weights)]
#   } else {
#     var_sc_prereq[,"weights" := 1/.N]
#   }
#   
#   
#   
#   ##--- For each pair of coordinates (i,j) compute the variance of X_i - X_j 
#   ##--- under the predictive distribution
#   
#   print("getting variances of differences:")
#   
#   
#   Sigma_wt = weight_mat * Sigma
#   
#   var_sc_prereq[,var:= Sigma_wt[var_id_1] + Sigma_wt[var_id_2] - 2*Sigma_wt[cov_id]]
#   
#   #complement var_sc_prereq by the squared differences of the mean vectors and squared differences of observation:
#   
#   var_sc_prereq[,sq_m_diff := (dt[,SST_hat][id_1]-dt[,SST_hat][id_2])^2]
#   
#   var_sc_prereq[,sq_obs_diff := (dt[,SST_bar][id_1]-dt[,SST_bar][id_2])^2]
#   
#   
#   # get variogram scores
#   print("done - compute variogram scores:")
#   
#   
#   
#   return_data =  data.table(year = y)
#     
#   dt_temp = var_sc_prereq[,.SD,.SDcols = c("var","sq_m_diff","sq_obs_diff","weights")]
#   setnames(dt_temp,c("dvar","sq_mean_diff", "sq_obs_diff","weights"))
#   return_data[,sc := variogram_score_nrm_p2(dt_temp)]
#     
#     
#   scores = return_data[,month := m]  
#     
#   save(scores, file = paste0(save_dir,file_name,"_m",m,"_y",y,".RData"))
# }
# 


#' Forecasts by shrinking eigenvalues
#' 
#' @description Generates forecasts for the SE post-processing approach.
#' 
#' @param dt the data table.
#' @param Y,M Integer vectors containing year(s) and month(s).
#' @param n Integer. Size of the desired forecast ensemble. 
#' @param cov_dir,cov_file_name Where the covariance estimates are stored and how they are named, see \code{cov_est_SE}.
#' 
#' @return A data table containing n columns no1,...,non with the generated noise and n columns fc1,...,fcn with the corresponding forecasts.
#' 
#' @examples \dontrun{
#' # Hypothetical sketch: covariance estimates generated by cov_est_SE are
#' # assumed to be stored in "~/SE/":
#' fc = forecast_SE(dt = DT, Y = 2016, M = 1:12, n = 10,
#'                  cov_dir = "~/SE/", cov_file_name = "cov_est_SE")
#' }
#' 
#' @author Claudio Heinrich        
#' 
#' @export

forecast_SE = function(dt, Y, M, 
                       n = 10,
                       cov_dir, cov_file_name = "cov_est_SE")
{ 
  dt = dt[year %in% Y & month %in% M,]
  
  # find land grid ids:
  land_ids <- which(dt[, is.na(Ens_bar) | is.na(SST_bar)])
  if(length(land_ids) > 0)
  {
    dt_water = dt[-land_ids,]
  } else {
    dt_water = dt
  }
  
  SD_cols = c("Lon","Lat","grid_id","month","year","YM",
              "SST_hat","SST_bar","Ens_bar","Bias_Est","var_bar","SD_hat")
  SD_cols = SD_cols[which(SD_cols %in% colnames(dt))]
  fc_water <- na.omit( dt_water[,.SD,.SDcols = SD_cols])
  
  forecast_by_month = function(m)
    {
      print( paste0("Month = ",m))
    
      dt_month = fc_water[month == m,]
      
      for(y in Y)
        {
          print(y)
          load(file = paste0(cov_dir,cov_file_name,"_m",m,"_y",y,".RData"))  
          
          # marginal correction factors: rescale so that the marginal standard deviations
          # match SD_hat while the correlation structure of Sigma is preserved.
          # Since Sigma = t(sqrt_Sigma) %*% sqrt_Sigma, its diagonal is colSums(sqrt_Sigma^2):
          mcf = fc_water[year == y & month == m, SD_hat]/sqrt(colSums(sqrt_Sigma^2))
          
          # draw n standard normal vectors, correlate them via t(sqrt_Sigma), and rescale:
          no = rnorm(n = dim(sqrt_Sigma)[1] * n)
          no = mcf * (t(sqrt_Sigma) %*% matrix(no,nrow = dim(sqrt_Sigma)[1]))
          for (i in 1:n)
            {
            dt_month[year == y, paste0("no",i):= no[,i]]
            dt_month[year == y,paste0("fc",i):= SST_hat + .SD, 
                     .SDcols = paste0("no",i)]
            }
        }
      return(dt_month)
    }
  temp = parallel::mclapply(M,forecast_by_month,mc.cores = length(M))
  fc_water = rbindlist(temp)
  
  #-------- add land --------------
  
  if(length(land_ids) > 0)
  {
    fc_land = dt[land_ids,.SD,.SDcols = SD_cols]
    fc_land[,  paste0("no",1:n):= NA]
    fc_land[,  paste0("fc",1:n):= NA]
    
    fc = rbindlist(list(fc_water,fc_land), fill = TRUE)
  } else {
    fc = fc_water
  }
  
  # order:
  
  fc = fc[ order(year,month,Lon,Lat)]
  
  return(fc)
}


# ###############################################
# ########### SE variogram score ################
# ###############################################
# 
# 
# # to skip this section, run instead:
# 
# # SE_dir = paste0(save_dir, "SE/")
# # load(file = paste0(SE_dir,"variogram_scores.RData"))
# 
# 
# ##### setting up ######
# 
# SE_dir = paste0(save_dir,"SE/")
# dir.create(SE_dir, showWarnings = FALSE)
# 
# training_years = DT[!(year %in% validation_years),unique(year)]
# 
# for_res_cov_SE(Y = training_years,
#                dt = DT, 
#                save_dir = SE_dir,
#                ens_size = ens_size)
# 
# 
# 
# L = 2500
# 
# NA_rows = DT[,which(is.na(SST_bar) | is.na(SST_hat) | is.na(SD_hat))]
# 
# DT_NA_free = DT[-NA_rows,]
# 
# sp <- sp::SpatialPoints(cbind(x=DT_NA_free[YM == min(YM), Lon],
#                               y=DT_NA_free[YM == min(YM), Lat]), 
#                         proj4string = sp::CRS("+proj=longlat +datum=WGS84"))
# Dist <- sp::spDists(sp, longlat = TRUE)
# 
# weights = weight_fct(Dist)
# weight_mat = matrix(weights, nrow = dim(Dist)[1])
# 
# 
# 
# #### variogram score computation: ####
# 
# 
# for(m in months)
# { 
#   # setup: get principal components and marginal variances for the given month m:
#   
#   print(paste0("m = ",m))
#   load(file = paste0(SE_dir,"CovRes_mon",m,".RData"))
#   
#   land_ids = which(DT[year == min(year) & month == min(month),is.na(Ens_bar) | is.na(SST_bar)])
#   
#   
#   dummy_fct = function(y)
#   {
#     var_sc_SE_new(m, y, DT,weight_mat = weight_mat, Sigma = Sigma, ens_size = ens_size,
#                    weighted = FALSE,
#                    save_dir = SE_dir)
#   }
#   parallel::mclapply(validation_years,dummy_fct,mc.cores = length(validation_years))
#   
# }
# 
# 
# 
# ###### combine: ######
# 
# scores_SE = list()
# k=0
# for(m in months){
#   for(y in validation_years)
#   {k=k+1
#   load(file = paste0(SE_dir,"var_sc_m",m,"_y",y,".RData"))
#   scores_SE[[k]] = scores
#   }
# }
# scores_SE = rbindlist(scores_SE)
# 
# mean_sc_SE = scores_SE[,mean_sc := mean(sc)]
# 
# 
# # save
# 
# save(scores_SE, mean_sc_SE, file = paste0(SE_dir,"variogram_scores.RData"))
# 
# ##########################################