ProbCast: Flexible Probabilistic Forecasting

Documented in pinball

#' Pinball Score for \code{MultiQR} Objects
#' 
#' @description This function calculates the pinball score for each quantile in a
#' \code{MultiQR} object. Optionally, results are produced by cross-validation fold
#' or covariate, 95\% confidence intervals are estimated via bootstrap, and results
#' are plotted.
#' 
#' @author Jethro Browell, \email{jethro.browell@@strath.ac.uk}
#' @param qrdata \code{MultiQR} object.
#' @param realisations Vector of realisations corresponding to rows of \code{qrdata}.
#' \code{NA} accepted where realisations are missing.
#' @param kfolds Optional vector of fold/test labels corresponding to rows of \code{qrdata}.
#' Cannot be used with \code{subsets}.
#' @param subsets Optional vector of covariates to bin data by.
#' Breaks between bins are the empirical quantiles of
#' \code{subsets} by default or all unique factors or charater strings. Custom
#' breaks may be specifed, see \code{breaks}.
#' Cannot be used with \code{kfolds}.
#' @param breaks Either the number of quantiles to use to bin \code{subsets} by (resulting
#' in \code{breaks+1} bins, defaults to \code{breaks=4}), or, if \code{length(breaks) > 1}, a vector of spcific break
#' points. Only used if \code{subsets} provided.
#' @param bootstrap Number of boostrap samples used to generate 95\% confidence intervals.
#' @param plot.it \code{boolean}. Make a plot?
#' @param ... Additional arguments passed to \code{plot()}.
#' @details Missing values in \code{realisations} are handled by \code{na.rm=T} when
#' calculating average exceedence of a given quantile.
#' @return Quantile Score data and, if \code{plot.it=T}, a plot.
#' @export
pinball <- function(qrdata,realisations,kfolds=NULL,subsets=NULL,breaks=4,bootstrap=NULL,plot.it=T,...){
  
  if(nrow(qrdata)!=length(realisations)){stop("nrow(qrdata)!=length(realisations)")}
  if(!is.null(kfolds)){
    if(length(kfolds)!=length(realisations)){stop("!is.null(kfolds) & nrow(kfolds)!=length(realisations)")}
  }
  if(!is.null(subsets)){
    if(length(subsets)!=length(realisations)){stop("!is.null(subsets) & nrow(subsets)!=length(realisations)")}
  }
  if(!is.null(kfolds) & !is.null(subsets)){stop("Only one of subsets and kfolds can !=NULL.")}
  if(length(breaks) ==1 & breaks[1]<1){stop("breaks must be a positive integer.")}
  
  qs <- as.numeric(gsub(colnames(qrdata),pattern = "q",replacement = ""))/100
  
  if(!is.null(kfolds)){
    total <- "All_cv"} else{
      total <- "All"  
    }
  
  PBL <- data.frame(Quantile=qs,
                    Loss=as.numeric(rep(NA,length(qs))),
                    kfold=total,
                    subset=NA,
                    upper=NA,
                    lower=NA)
  for(q in qs){
    if(total == "All_cv"){
      PBL$Loss[which(qs==q)] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                        (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[kfolds!="Test"],
                                     na.rm = T)
      
      if(!is.null(bootstrap)){
        bs_data <- rep(NA,bootstrap)
        for(i in 1:bootstrap){
          data_length <- length(qrdata[[paste0("q",100*q)]][kfolds!="Test"])
          i_samp <- sample(1:data_length,size = data_length,replace = T)
          bs_data[i] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[kfolds!="Test"][i_samp],
                             na.rm = T)        
        }
        PBL$upper[which(qs==q)] <- quantile(bs_data,probs = 0.975)
        PBL$lower[which(qs==q)] <- quantile(bs_data,probs = 0.025)
      }
      
      
    } else{
      PBL$Loss[which(qs==q)] <- mean((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                       (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]),na.rm = T)
      
      
      if(!is.null(bootstrap)){
        bs_data <- rep(NA,bootstrap)
        for(i in 1:bootstrap){
          data_length <- length(qrdata[[paste0("q",100*q)]])
          i_samp <- sample(1:data_length,size = data_length,replace = T)
          bs_data[i] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[i_samp],na.rm = T)        
        }
        PBL$upper[which(qs==q)] <- quantile(bs_data,probs = 0.975)
        PBL$lower[which(qs==q)] <- quantile(bs_data,probs = 0.025)
      }
      
      
    }
  }
  
  # CV folds
  if(!is.null(kfolds)){
    kfolds[is.na(kfolds)]<-"Test"
    
    for(fold in unique(kfolds)){
      
      tempPBL <- data.frame(Quantile=qs,
                            Loss=as.numeric(rep(NA,length(qs))),
                            kfold=fold,
                            subset=NA,
                            upper=NA,
                            lower=NA)
      for(q in qs){
        tempPBL$Loss[which(qs==q)] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                              (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[kfolds==fold],
                                           na.rm = T)
        
        if(!is.null(bootstrap)){
          bs_data <- rep(NA,bootstrap)
          for(i in 1:bootstrap){
            data_length <- length(qrdata[[paste0("q",100*q)]][kfolds==fold])
            i_samp <- sample(1:data_length,size = data_length,replace = T)
            bs_data[i] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                  (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[kfolds==fold][i_samp],na.rm = T)        
          }
          tempPBL$upper[which(qs==q)] <- quantile(bs_data,probs = 0.975)
          tempPBL$lower[which(qs==q)] <- quantile(bs_data,probs = 0.025)
        }
        
        
      }
      
      PBL <- rbind(PBL,tempPBL)
    }
  }
  
  
  ## Subsets
  if(!is.null(subsets)){
    
    
    if(is.factor(subsets) | is.character(subsets)){
      
      for(i in unique(subsets)){
        indexs <- which(subsets==i)  
        
        
        tempPBL <- data.frame(Quantile=qs,
                              Loss=as.numeric(rep(NA,length(qs))),
                              kfold=NA,
                              subset = i,
                              upper=NA,
                              lower=NA)
        for(q in qs){
          
          tempPBL$Loss[which(qs==q)] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                                (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[indexs],
                                             na.rm = T)
          
          if(!is.null(bootstrap)){
            bs_data <- rep(NA,bootstrap)
            for(j in 1:bootstrap){
              data_length <- length(qrdata[[paste0("q",100*q)]][indexs])
              i_samp <- sample(1:data_length,size = data_length,replace = T)
              bs_data[j] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                    (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[indexs][i_samp],na.rm = T)
            }
            tempPBL$upper[which(qs==q)] <- quantile(bs_data,probs = 0.975)
            tempPBL$lower[which(qs==q)] <- quantile(bs_data,probs = 0.025)
          }
          
        }
        
        
        PBL <- rbind(PBL,tempPBL)
      }
      
      
      
    } else {
      
      if(length(breaks)==1){
        break_qs <- quantile(subsets,probs = seq(from = 1/(breaks+1),by=1/(breaks+1),length.out=breaks),na.rm = T)
        break_qs <- c(-Inf,break_qs,Inf)
      }else{
        break_qs <- c(-Inf,breaks,Inf)
      }
      
      for(i in 2:length(break_qs)){
        indexs <- which(subsets>break_qs[i-1] & subsets<=break_qs[i])  
        
        
        tempPBL <- data.frame(Quantile=qs,
                              Loss=as.numeric(rep(NA,length(qs))),
                              kfold=NA,
                              subset = i-1,
                              upper=NA,
                              lower=NA)
        for(q in qs){
          
          tempPBL$Loss[which(qs==q)] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                                (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[indexs],
                                             na.rm = T)
          
          if(!is.null(bootstrap)){
            bs_data <- rep(NA,bootstrap)
            for(j in 1:bootstrap){
              data_length <- length(qrdata[[paste0("q",100*q)]][indexs])
              i_samp <- sample(1:data_length,size = data_length,replace = T)
              bs_data[j] <- mean(((realisations-qrdata[[paste0("q",100*q)]])*q*(realisations>=qrdata[[paste0("q",100*q)]])+
                                    (realisations-qrdata[[paste0("q",100*q)]])*(q-1)*(realisations<qrdata[[paste0("q",100*q)]]))[indexs][i_samp],na.rm = T)
            }
            tempPBL$upper[which(qs==q)] <- quantile(bs_data,probs = 0.975)
            tempPBL$lower[which(qs==q)] <- quantile(bs_data,probs = 0.025)
          }
          
        }
        
        
        PBL <- rbind(PBL,tempPBL)
      }
      
    }
    
  }
  
  
  
  
  if(plot.it){
    
    
    if(!is.null(subsets)){
      
      plot(PBL[,1:2],type="b",pch=16,
           xlim=c(0,1),
           ylab="Pinball Loss",col="white",...)
      grid()
      
      lvls <- na.omit(unique(PBL$subset))
      
      for(br in seq_along(lvls)){
        
        lines(PBL[which(PBL$subset==lvls[br]),1:2],type="b",pch=16,col=rainbow(length(lvls))[br])
        
        if(!is.null(bootstrap)){
          polygon(x = c(PBL$Quantile[which(PBL$subset==lvls[br])],rev(PBL$Quantile[which(PBL$subset==lvls[br])])),
                  y=c(PBL$upper[which(PBL$subset==lvls[br])],rev(PBL$lower[which(PBL$subset==lvls[br])])),
                  col = rainbow(length(lvls),alpha = .3)[br],border = NA)
          
        }
        
      }
      
      if(is.factor(subsets) | is.character(subsets)){
        
        legend("topleft",lvls,
               lty=c(rep(1,length(lvls))),
               col=c(rainbow(length(lvls))),
               pch=c(rep(16,length(lvls))),bty = "n",cex=.7,ncol = 2)
        
      } else{
        if(length(breaks)==1 & breaks[1]==1){
          legend("topleft",c(paste0("<=",break_qs[2]),paste0(">",break_qs[2])),
                 lty=c(rep(1,breaks+1)),
                 col=c(rainbow(breaks+1)),
                 pch=c(rep(16,breaks+1)),bty = "n")
        }else{
          legend("topleft",c(paste0(c("<=",paste0(signif(break_qs[2:(length(break_qs)-2)],digits=2)," to "),">"),
                                    signif(break_qs[c(2:((length(break_qs)-2)+1),(length(break_qs)-2)+1)],digits=2))),
                 lty=c(rep(1,length(break_qs)-1)),
                 col=c(rainbow(length(break_qs)-1)),
                 pch=c(rep(16,length(break_qs)-1)),bty = "n")
        }
      }
      
      
    } else{
      
      plot(PBL[,1:2],type="b",pch=16,
           xlim=c(0,1),
           ylab="Pinball Loss",col="white",...)
      
      grid()
      lines(PBL[which(PBL$kfold==total),1:2],type="b",pch=16)
      
      if(!is.null(bootstrap)){
        polygon(x = c(PBL$Quantile[which(PBL$kfold==total)],rev(PBL$Quantile[which(PBL$kfold==total)])),
                y=c(PBL$upper[which(PBL$kfold==total)],rev(PBL$lower[which(PBL$kfold==total)])),
                col = rgb(0,0,1,alpha = 0.3),border = NA)
        
      }
      
      if(!is.null(kfolds)){
        for(fold in unique(kfolds)){
          if(fold!="Test"){
            
            lines(PBL[which(PBL$kfold==fold),1:2],type="b",pch=16,col="Grey50")
            
            if(!is.null(bootstrap)){
              polygon(x = c(PBL$Quantile[which(PBL$kfold==fold)],rev(PBL$Quantile[which(PBL$kfold==fold)])),
                      y=c(PBL$upper[which(PBL$kfold==fold)],rev(PBL$lower[which(PBL$kfold==fold)])),
                      col = grey(.5,alpha = 0.3),border = NA)
              
            }
            
          } else{
            lines(PBL[which(PBL$kfold==fold),1:2],type="b",pch=16,col="red")
            
            
            if(!is.null(bootstrap)){
              polygon(x = c(PBL$Quantile[which(PBL$kfold==fold)],rev(PBL$Quantile[which(PBL$kfold==fold)])),
                      y=c(PBL$upper[which(PBL$kfold==fold)],rev(PBL$lower[which(PBL$kfold==fold)])),
                      col = rgb(1,0,0,alpha = 0.3),border = NA)
              
            }
            
          }
          
        }
        lines(PBL[PBL$kfold==total,1:2],type="b",pch=16,col="blue")
      }
      
      
      
      if(!is.null(kfolds) & !("Test"%in%kfolds)){
        legend("topleft",c(total,"CV Folds"),lty=c(1,1),col=c("blue","Grey50"),pch=c(16,16),bty = "n")
      }else if("Test"%in%kfolds){
        lines(PBL[PBL$kfold=="Test",1:2],type="b",pch=16,col="red")
        legend("topleft",c("Test",total,"CV Folds"),lty=c(1,1,1),col=c("red","blue","Grey50"),pch=c(16,16,16),bty = "n")
      }
      
      
    }
  }
  
  return(PBL)
}