R/preparedata_fn.R

Defines functions preparedata_fn

Documented in preparedata_fn

#' A function to prepare mortality data with stratification (e.g. by product, sex/gender, country, cause of death etc.)
#'
#' Construct a 3-dimensional data array (dim 1: strata, dim 2: ages, dim 3: years) from raw data frames/arrays suitable for fitting various Bayesian Stochastic Mortality Models.
#' @param data_df_array data (normally death counts or exposures) to be provided either as a **data frame** or a **3-dimensional array**. If providing data frame, it should be structured as (column 1: ages, column 2: years, column 3: death/expo data, column 4: strata); if providing array, it should be as (dim 1: strata, dim 2: ages, dim 3: years). \cr
#' If only one stratum (AP data), either remove column 4 or ensure it contains only one stratum label. Alternatively, one can also provide a **2-dimensional matrix** (age\eqn{\times }year) but please set \code{data_matrix=TRUE}.
#' @param strat_name a vector of strings indicating names of each stratum.
#' @param ages a numeric vector indicating which ages to use.
#' @param years a numeric vector indicating which years to use.
#' @param data_matrix a logical value indicating if a 2-dimensional matrix (age\eqn{\times }year) is supplied.
#' @param round a logical value indicating whether data entries should be rounded.
#' @return A list with components:
#' \describe{
#'   \item{\code{data}}{A 3-dimensional data array (dim 1: strata, dim 2: ages, dim 3: years).}
#'   \item{\code{strat_name}}{A vector of strings describing the names of each stratum.}
#'   \item{\code{ages}}{A numeric vector describing the ages used.}
#'   \item{\code{years}}{A numeric vector describing the years used.}
#'   \item{\code{n_strat}}{A numeric value describing the number of strata.}
#'   \item{\code{n_ages}}{A numeric value describing the number of ages.}
#'   \item{\code{n_years}}{A numeric value describing the number of years.}
#' }
#' @keywords utilities
#' @concept data preprocessing
#' @concept data formatting
#' @concept mortality data
#' @concept stratification
#' @importFrom dplyr filter
#' @importFrom magrittr `%>%`
#' @importFrom rlang .data
#' @export
#' @examples
#' ##################
#' ##if p>1 (more than 1 strata)
#' ##################
#' 
#' #Input: data.frame
#' data("data_summarised")
#' head(data_summarised)
#' 
#' #prepare death data
#' death<-preparedata_fn(data_summarised[,c("Age","Year","Claim","Product")],
#' strat_name = c("ACI","DB","SCI"),ages=35:65)
#' #prepare exposure data
#' expo<-preparedata_fn(data_summarised[,c("Age","Year","Exposure","Product")],
#' strat_name = c("ACI","DB","SCI"),ages=35:65)
#' 
#' #visualise data
#' str(death);str(expo)
#' 
#' #works also if data.frame only contains only 1 stratum
#' death<-preparedata_fn(data_summarised[,
#' c("Age","Year","Claim","Product")][data_summarised$Product=="ACI",],ages=35:65)
#' 
#' expo<-preparedata_fn(data_summarised[,
#' c("Age","Year","Exposure","Product")][data_summarised$Product=="ACI",],ages=35:65)
#' 
#' str(death);str(expo)
#' 
#' #Input: 3D data array
#' data("dxt_array_product");data("Ext_array_product")
#' death<-preparedata_fn(dxt_array_product,ages=35:65)
#' expo<-preparedata_fn(Ext_array_product,ages=35:65)
#' 
#' str(death);str(expo)
#' 
#' ##################
#' ##if p=1 (only 1 stratum)
#' ##################
#' 
#' #specifying only one of the strats from the data.frame
#' death<-preparedata_fn(data_summarised[,c("Age","Year","Claim","Product")],
#'   strat_name = "ACI",ages=35:65)
#' expo<-preparedata_fn(data_summarised[,c("Age","Year","Exposure","Product")],
#'   strat_name = "ACI",ages=35:65)
#' str(death);str(expo)
#'
#' #if data.frame only contains 1 strat (4 columns)
#' death<-preparedata_fn(data_summarised[,c("Age","Year","Claim","Product")]
#'   [data_summarised$Product=="ACI",],ages=35:65)
#' expo<-preparedata_fn(data_summarised[,c("Age","Year","Exposure","Product")]
#'   [data_summarised$Product=="ACI",],ages=35:65)
#' str(death);str(expo)
#' 
#' #if data.frame only contains 1 strat (3 columns)
#' death<-preparedata_fn(data_summarised[,c("Age","Year","Claim")]
#'   [data_summarised$Product=="ACI",],ages=35:65)
#' expo<-preparedata_fn(data_summarised[,c("Age","Year","Exposure")]
#'   [data_summarised$Product=="ACI",],ages=35:65)
#' str(death);str(expo)
#' 
#' #Input: 3D data array
#' death<-preparedata_fn(dxt_array_product,strat_name="ACI",ages=35:65)
#' expo<-preparedata_fn(Ext_array_product,strat_name="ACI",ages=35:65)
#' str(death);str(expo)
#' 
#' #Input: 2D matrix 
#' death<-preparedata_fn(dxt_array_product["ACI",,],data_matrix=TRUE,ages=35:65)
#' expo<-preparedata_fn(Ext_array_product["ACI",,],data_matrix=TRUE,ages=35:65)
#' str(death);str(expo)

preparedata_fn<-function(data_df_array,data_matrix=FALSE,strat_name=NULL,
                         ages=NULL,years=NULL,round=TRUE){
  
  if (data_matrix){
    original_dimname<-dimnames(data_df_array)
    temp<-data_df_array
    
    p<-1
    A<-dim(temp)[1]
    T<-dim(temp)[2]
    
    if (is.null(strat_name)){strat_name<-paste0("strat_",1)}
    
    if (!is.null(original_dimname)){
      
      #data_df_array<-provideDimnames(data_df_array,base=list(strat_name,original_dimname[[1]],original_dimname[[2]]))
      
      if (!is.null(ages)){
        A<-length(ages)
        ages_name<-as.character(ages)
        #data_df_array<-data_df_array[,as.character(ages),,drop=FALSE]
        temp<-temp[ages_name,]
      }else{
        ages<-as.numeric(original_dimname[[1]])
        ages_name<-original_dimname[[1]]
        }
      
      if (!is.null(years)){
        T<-length(years)
        years_name<-as.character(years)
        #data_df_array<-data_df_array[,,as.character(years),drop=FALSE]
        temp<-temp[,years_name]
      }else{
        years<-as.numeric(original_dimname[[2]])
        years_name<-original_dimname[[2]]
        }
    }
    else {
      if (is.null(strat_name)){
        strat_name<-paste0("strat_",1)
      }
      
      if (is.null(ages)){
        ages<-1:A
        ages_name<-as.character(ages)
      } else {
        if (length(ages)<A){
          min_age<-min(ages)
          #data_df_array<-data_df_array[,ages-min_age+1,,drop=FALSE]
          temp<-temp[ages-min_age+1,]
          ages_name<-as.character(ages)
          A<-length(ages)
        } else if (length(ages)==A){ 
          ages_name<-as.character(ages)}
        else {stop("No such ages in the data.")}
      }
      
      if (is.null(years)){
        years<-1:T
        years_name<-as.character(years)
      } else {
        if (length(years)<T){
          min_year<-min(years)
          #data_df_array<-data_df_array[,,years-min_year+1,drop=FALSE]
          temp<-temp[,years-min_year+1]
          years_name<-as.character(years)
          T<-length(years)
        } else if (length(years)==T){ 
          years_name<-as.character(years)}
        else {stop("No such years in the data.")}
      }
      
    }
    
  data_df_array<-array(dim=c(1,A,T))
  data_df_array[1,,]<-temp
  data_df_array<-provideDimnames(data_df_array,base=list(strat_name,ages_name,years_name))  
    
  } else {
    
  dim_data<-dim(data_df_array)
  
  #data supplied in cube/3D-array
  if(length(dim_data)==3){
    p<-dim_data[1]
    A<-dim_data[2]
    T<-dim_data[3]
    
    original_dimname<-dimnames(data_df_array)
    temp<-data_df_array
    
    if (!is.null(original_dimname)){
      if (!is.null(strat_name)){
        data_df_array<-data_df_array[strat_name,,,drop=FALSE]
        p<-length(strat_name)
      }else{strat_name<-original_dimname[[1]]}
      
      if (!is.null(ages)){
        data_df_array<-data_df_array[,as.character(ages),,drop=FALSE]
        A<-length(ages)
      }else{ages<-as.numeric(original_dimname[[2]])}
      
      if (!is.null(years)){
        data_df_array<-data_df_array[,,as.character(years),drop=FALSE]
        T<-length(years)
      }else{years<-as.numeric(original_dimname[[3]])}
    }
    else {
      if (is.null(strat_name)){
        strat_name<-paste0("strat_",1:p)
      }
      
      if (is.null(ages)){
        ages<-1:A
        ages_name<-as.character(ages)
      } else {
        if (length(ages)<A){
          min_age<-min(ages)
          data_df_array<-data_df_array[,ages-min_age+1,,drop=FALSE]
          ages_name<-as.character(ages)
          A<-length(ages)
        } else if (length(ages)==A){ 
          ages_name<-as.character(ages)}
        else {stop("No such ages in the data.")}
      }
      
      if (is.null(years)){
        years<-1:T
        years_name<-as.character(years)
      } else {
        if (length(years)<T){
          min_year<-min(years)
          data_df_array<-data_df_array[,,years-min_year+1,drop=FALSE]
          years_name<-as.character(years)
          T<-length(years)
        } else if (length(years)==T){ 
          years_name<-as.character(years)}
        else {stop("No such years in the data.")}
      }
      
      data_df_array<-provideDimnames(data_df_array,base=list(strat_name,ages_name,years_name)) 
    }
    
  } #data supplied in 2D-data frame
  else if (length(dim_data)==2){
    if (all(class(data_df_array)!="data.frame")) stop ("Please provide a data.frame object.")
    
    if (ncol(data_df_array)==4){
      #2D-data frame with 4 columns (APP data)
      names(data_df_array)<-c("Age","Year","Data","Strat")
    
      p<-length(unique(data_df_array$Strat))
      Strat_name<-unique(data_df_array$Strat)
      
      if(is.null(ages)){
        ages <- unique(data_df_array$Age)
      }
      if(any(is.na(ages))){stop("ages contain NA!")}
      
      if(is.null(years)){
        years <- unique(data_df_array$Year)
      }
      if(any(is.na(years))){stop("years contain NA!")}
      
      if(is.null(strat_name)){
        strat_name <- unique(data_df_array$Strat)
      }
      if(any(is.na(strat_name))){stop("strata contain NA!")}
      
      data_df_array<-as.data.frame(data_df_array)
      
      data_df_array <- magrittr::`%>%`(
        data_df_array,
        dplyr::filter(.data$Age %in% ages,
                      .data$Year %in% years,
                      .data$Strat %in% strat_name)
      )
    
      for (i in 1:p){
        assign(paste0("data_",Strat_name[i]),data_df_array[data_df_array$Strat==Strat_name[i],])
        assign(paste0(Strat_name[i],"_length"),dim(get(paste0("data_",Strat_name[i])))[1])
      }
      
      #min_age<-min(data_df_array$Age);max_age<-max(data_df_array$Age)
      #A<-max_age-min_age+1
      #min_year<-min(data_df_array$Year);max_year<-max(data_df_array$Year)
      #T<-max_year-min_year+1
      
      A<-length(ages);T<-length(years)
      
      data_df_array<-array(dim=c(p,A,T))
    
      for (i in 1:p){
        #assign(paste0("data_df_array_",Strat_name[i]),matrix(NA,nrow=A,ncol=T))
        temp_temp_data<-matrix(NA,nrow=A,ncol=T)
        temp_data<-get(paste0("data_",Strat_name[i]))
      
        for (j in 1:get(paste0(Strat_name[i],"_length"))){
          #index_i<-temp_data$Age[j]-min_age+1
          #index_j<-temp_data$Year[j]-min_year+1
          #assign(paste0("data_df_array_",Strat_name[i],"[",index_i,index_j,"]"),temp_data[j])
          index_i<-which(temp_data$Age[j]==ages)
          index_j<-which(temp_data$Year[j]==years)
          temp_temp_data[index_i,index_j]<-temp_data$Data[j]
        
          #rownames(temp_temp_data)<-as.character((1:A)+min_age-1)
          #colnames(temp_temp_data)<-as.character((1:T)+min_year-1)
          #assign(paste0("data_df_array_",Strat_name[i]),temp_temp_data)
        
        }
      
        data_df_array[i,,]<-temp_temp_data
      
      }
      #ages_name<-as.character((1:A)+min_age-1)
      #years_name<-as.character((1:T)+min_year-1)
      ages_name<-as.character(ages)
      years_name<-as.character(years)
      data_df_array<-provideDimnames(data_df_array,base=list(Strat_name,ages_name,years_name))
    
      if (!is.null(strat_name)){
        p<-length(strat_name)
        if (p==1){
          temp<-data_df_array[strat_name,,]
          data_df_array<-array(dim=c(1,A,T))
          data_df_array[1,,]<-temp
          data_df_array<-provideDimnames(data_df_array,base=list(strat_name,ages_name,years_name))
        }
        else {
          data_df_array<-data_df_array[strat_name,,]
        }
      } else{
        strat_name<-Strat_name
      }
    
      if (!is.null(ages)){
        if(p==1){
          temp<-data_df_array[,as.character(ages),]
          A<-length(ages)
          data_df_array<-array(dim=c(1,A,T))
          data_df_array[1,,]<-temp
          data_df_array<-provideDimnames(data_df_array,base=list(strat_name,as.character(ages),years_name))
        }else{
          data_df_array<-data_df_array[,as.character(ages),]
          A<-length(ages)}
      } else{
        ages<-as.numeric(ages_name)
      }
    
      if (!is.null(years)){
        if (p==1){
          temp<-data_df_array[,,as.character(years)]
          T<-length(years)
          data_df_array<-array(dim=c(1,A,T))
          data_df_array[1,,]<-temp
          data_df_array<-provideDimnames(data_df_array,base=list(strat_name,as.character(ages),as.character(years)))
        }else{
          data_df_array<-data_df_array[,,as.character(years)]
          T<-length(years)}
      } else{
        years<-as.numeric(years_name)
      }
      } else if (ncol(data_df_array)==3) {
        #2D-data frame with 3 columns (AP data)
        names(data_df_array)<-c("Age","Year","Data")
      
        p<-1
        if (is.null(strat_name)){Strat_name<-paste0("strat_",1)}
        
        data_df_array$Strat<-Strat_name
        
        if(is.null(ages)){
          ages <- unique(data_df_array$Age)
        }
        if(any(is.na(ages))){stop("ages contain NA!")}
        
        if(is.null(years)){
          years <- unique(data_df_array$Year)
        }
        if(any(is.na(years))){stop("years contain NA!")}
        
        if(is.null(strat_name)){
          strat_name <- unique(data_df_array$Strat)
        }
        if(any(is.na(strat_name))){stop("strata contain NA!")}
        
        data_df_array<-as.data.frame(data_df_array)
        
        data_df_array <- magrittr::`%>%`(
          data_df_array,
          dplyr::filter(.data$Age %in% ages,
                        .data$Year %in% years,
                        .data$Strat %in% strat_name)
        )
      
        for (i in 1:p){
          assign(paste0("data_",Strat_name[i]),data_df_array[data_df_array$Strat==Strat_name[i],])
          assign(paste0(Strat_name[i],"_length"),dim(get(paste0("data_",Strat_name[i])))[1])
        }
      
          min_age<-min(data_df_array$Age);max_age<-max(data_df_array$Age)
          A<-max_age-min_age+1
          min_year<-min(data_df_array$Year);max_year<-max(data_df_array$Year)
          T<-max_year-min_year+1
      
        data_df_array<-array(dim=c(p,A,T))
      
        for (i in 1:p){
          #assign(paste0("data_df_array_",Strat_name[i]),matrix(NA,nrow=A,ncol=T))
          temp_temp_data<-matrix(NA,nrow=A,ncol=T)
          temp_data<-get(paste0("data_",Strat_name[i]))
        
          for (j in 1:get(paste0(Strat_name[i],"_length"))){
            index_i<-temp_data$Age[j]-min_age+1
            index_j<-temp_data$Year[j]-min_year+1
            #assign(paste0("data_df_array_",Strat_name[i],"[",index_i,index_j,"]"),temp_data[j])
            temp_temp_data[index_i,index_j]<-temp_data$Data[j]
          
            #rownames(temp_temp_data)<-as.character((1:A)+min_age-1)
            #colnames(temp_temp_data)<-as.character((1:T)+min_year-1)
            #assign(paste0("data_df_array_",Strat_name[i]),temp_temp_data)
          
          }
        
          data_df_array[i,,]<-temp_temp_data
        
        }
        ages_name<-as.character((1:A)+min_age-1)
        years_name<-as.character((1:T)+min_year-1)
        data_df_array<-provideDimnames(data_df_array,base=list(Strat_name,ages_name,years_name))
      
        if (!is.null(strat_name)){
          p<-length(strat_name)
          if (p==1){
            temp<-data_df_array[strat_name,,]
            data_df_array<-array(dim=c(1,A,T))
            data_df_array[1,,]<-temp
            data_df_array<-provideDimnames(data_df_array,base=list(strat_name,ages_name,years_name))
          }
          else {
            data_df_array<-data_df_array[strat_name,,]
          }
        } else{
          strat_name<-Strat_name
        }
      
        if (!is.null(ages)){
          if(p==1){
            temp<-data_df_array[,as.character(ages),]
            A<-length(ages)
            data_df_array<-array(dim=c(1,A,T))
            data_df_array[1,,]<-temp
            data_df_array<-provideDimnames(data_df_array,base=list(strat_name,as.character(ages),years_name))
          }else{
            data_df_array<-data_df_array[,as.character(ages),]
            A<-length(ages)}
        } else{
          ages<-as.numeric(ages_name)
        }
      
        if (!is.null(years)){
          if (p==1){
            temp<-data_df_array[,,as.character(years)]
            T<-length(years)
            data_df_array<-array(dim=c(1,A,T))
            data_df_array[1,,]<-temp
            data_df_array<-provideDimnames(data_df_array,base=list(strat_name,as.character(ages),as.character(years)))
          }else{
            data_df_array<-data_df_array[,,as.character(years)]
            T<-length(years)}
        } else{
          years<-as.numeric(years_name)
        }
      } else {
        stop("Data supplied incorrectly.")
     }
    } else{
      stop("Data supplied incorrectly.")
    }}
  
  if(round) data_df_array<-round(data_df_array)
  if(any(is.na(data_df_array))){warning("Data contains NA!")}
  list(data=data_df_array,strat_name=strat_name,ages=ages,years=years,n_strat=p,n_ages=A,n_years=T)
}

Try the BayesMoFo package in your browser

Any scripts or data that you put into this service are public.

BayesMoFo documentation built on Aug. 11, 2025, 1:07 a.m.