R/helper.r
In Rcan: Cancer Registry Data Analysis and Visualisation

core.error_variable <- function(df_data, varname, funcname,type="numeric") {
  
  if (!is.null(varname)) {
  
    if (!(varname%in% colnames(df_data))) {
      
      stop(paste0(varname, " is not a column of ",   deparse(substitute(df_data)), " ,see documentation: Help(", deparse(substitute(funcname)), ")"))
      
    }
    
    if (type == "numeric") {
      if (!is.numeric(df_data[[varname]])) {
        
        
       stop(paste0(varname, " type must be ",type," see documentation: Help(", deparse(substitute(funcname)), ")"))
        
      }
      
    } else if (type == "character"){
      if (!is.character(df_data[[varname]])) {
        stop(paste0(varname, " type must be ",type," see documentation: Help(", deparse(substitute(funcname)), ")"))
        
      }
      
    } else if (type == "factor"){
      if (!is.factor(df_data[[varname]])) {
        stop(paste0(varname, " type must be ",type," see documentation: Help(", deparse(substitute(funcname)), ")"))
        
      }
    }
  }
}

core.error_missingage <- function(df_data, var_age,missing_age,funcname) {
  
  if (!missing_age %in% df_data[[var_age]]) {
    stop(paste0("\n",missing_age, " value is not present in the variable ",var_age," see documentation: Help(", deparse(substitute(funcname)), ")"))
  }
}

core.error_age_parse <- function(df_data, var_age, missing_age,funcname) {

  dt_data <- as.data.table(df_data)
  dt_data <- dt_data[, c(var_age), with=FALSE]

  setnames(dt_data, var_age, 'CSU_A')
  if (!is.null(missing_age)) {
    dt_data[CSU_A == missing_age, CSU_A := NA]
  }

  if (!all(grepl(".*?(\\d{1,3}).*$",dt_data[!is.na(CSU_A)]$CSU_A, perl=TRUE)))
  {
    temp <- unique(as.character(dt_data[!grepl(".*?(\\d{1,3}).*$",dt_data[!is.na(CSU_A)]$CSU_A, perl=TRUE),]$CSU_A))
    stop(paste0("\nthe value ",temp,", in the variable ",var_age, "\ncannot be change to numeric and is not declared as missing age,\nSee documentation: Help(", deparse(substitute(funcname)), ")\n"))
  }

}



core.error_time_variable <- function(df_data, var_year, group_by, funcname) {
  
  
  if (is.null(group_by)) {
    df_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
  }
  
  dt_data <- data.table(df_data, key = c(group_by)) 
  dt_data$temp <- 1
  nrow_base <- nrow(dt_data)
  dt_test <- dt_data[ ,temp:=sum(temp), by=c(var_year, group_by)]
  nrow_test <-  nrow(dt_data[ ,sum(temp), by=c(var_year, group_by)]) 

  if (nrow_test != nrow_base) {
    setkeyv(dt_test, c(group_by,var_year))
    print(head(dt_test[temp>1, ]))
    dt_data <- NULL
    stop(paste0("There is more than 1 data per year (see above).\nUse the 'group_by' option"," ,see documentation: Help(", deparse(substitute(funcname)), ")\n","or call the function on a subset to define the sub-population of interest.\n"))
  }
}

core.csu_dt_rank <- function(df_data,
                        var_value = "CASES",
                        var_rank = "cancer_label",
                        group_by = NULL,
                        number = NULL, 
                        ties.method="min") {
  
  bool_dum_by <- FALSE
  if (is.null(group_by)) {
    
    df_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    bool_dum_by <- TRUE
  }
  
  df_data <- as.data.table(df_data)
  dt_rank <- df_data[, list(rank_value=sum(get(var_value))), by=c(var_rank, group_by)]
  dt_rank[, CSU_RANK:= frank(-rank_value, ties.method=ties.method), by=group_by]
  
  if (!is.null(number)){
    dt_rank <- dt_rank[CSU_RANK <= number,c(group_by, var_rank, "CSU_RANK"), with=FALSE]
  }
  
  df_data <- merge(dt_rank, df_data,by=c(group_by, var_rank), all.x=TRUE)
  
  if (bool_dum_by) {
    
    df_data[,CSU_dum_by:=NULL]
    
  }
  
  return(df_data)
  
}


core.csu_legend_wrapper <- function(label, width) {
  
  label <- sapply(strwrap(label, width = width, simplify = FALSE), paste, collapse="\n")
  return(label)
  
}



core.csu_tick_generator <- function(max,min = 0,logscale=FALSE) {
  
  
  
  if (!logscale) {
    
    if (min > 0) {
      min = 0
    } 
    
    if (max < 0) {
      max = 0
    }
    
    
    temp_log_max = 10^floor(log10(max-min))
    temp_unit_floor_max = floor((max-min)/(temp_log_max))
    
    if (temp_unit_floor_max < 2) {
      tick_space = 0.2*temp_log_max
    } else {
      if (temp_unit_floor_max < 5) {
        tick_space = 0.5*temp_log_max
      } else {
        tick_space = temp_log_max
      }
    }
    
    temp_top <- ceiling(max/tick_space)*tick_space
    temp_floor <- floor(min/tick_space)*tick_space
    tick_list <- seq(temp_floor, temp_top, tick_space)
    tick_minor_list <- NULL
    
  } else {
    
    
    temp_log_max = 10^floor(log10(max))
    temp_unit_floor_max = floor(max/(temp_log_max))
    
    temp_log_min <- 10^floor(log10(min))
    temp_unit_floor_min <- floor(min/(temp_log_min))
    
    if (temp_log_min == temp_log_max) {
      
      tick_list <- c(temp_unit_floor_min:(temp_unit_floor_max+1)*temp_log_min)
      
      
      if (temp_unit_floor_max == temp_unit_floor_min) {
        tick_minor_list <- c((temp_unit_floor_min*temp_log_min)+0:9*(temp_log_min/10)) 
      } else {
        tick_minor_list <- c((temp_unit_floor_min*temp_log_min)+0:19*(temp_log_min/10)) 
      }
      
    } else if (temp_log_max/temp_log_min < 1000) {
      
      
      if (temp_unit_floor_min < 6) {
        tick_list <- temp_unit_floor_min:5*temp_log_min 
        tick_list <- c(tick_list, temp_log_min*7) ## min . . 5 7
      } else  {
        tick_list <- temp_unit_floor_min*temp_log_min ## min 
      }
      
      tick_minor_list <- temp_unit_floor_min:19*temp_log_min ## min .  . 19
      
      while (temp_log_min != (temp_log_max/10)) {
        temp_log_min = temp_log_min*10 
        tick_list <- c(tick_list, c(1,2,3,5,7)*temp_log_min)
        tick_minor_list <- c(tick_minor_list, 2:19*temp_log_min)
      }
      
      tick_minor_list <- c(tick_minor_list, 2:(temp_unit_floor_max+1)*temp_log_max)
      
      if (temp_unit_floor_max <5) {
        tick_list <- c(tick_list, 1:(temp_unit_floor_max+1)*temp_log_max)
      } else if (temp_unit_floor_max <7) {
        tick_list <- c(tick_list, c(1,2,3,5,temp_unit_floor_max+1)*temp_log_max)
      } else {
        tick_list <- c(tick_list, c(1,2,3,5,7,temp_unit_floor_max+1)*temp_log_max)
      }
      
    } else {
      
      if (temp_unit_floor_min == 1) {
        tick_list <- c(1,2,3,5)*temp_log_min
      } else  if (temp_unit_floor_min == 2) {
        tick_list <- c(2,3,5)*temp_log_min
      } else  if (temp_unit_floor_min < 6) {
        tick_list <- c(5,7)*temp_log_min
      } else {
        tick_list <- 7*temp_log_min
      }
      tick_minor_list <- temp_unit_floor_min:9*temp_log_min ## min .  . 19
      
      while (temp_log_min != (temp_log_max/10)) {
        temp_log_min = temp_log_min*10 
        tick_list <- c(tick_list, c(1,2,5)*temp_log_min)
        tick_minor_list <- c(tick_minor_list, 2:9*temp_log_min)
      }
      
      tick_minor_list <- c(tick_minor_list, 2:(temp_unit_floor_max+1)*temp_log_max)
      
      if (temp_unit_floor_max <5) {
        tick_list <- c(tick_list, unique(c(1,2,temp_unit_floor_max+1)*temp_log_max))
      } else if (temp_unit_floor_max <6) {
        tick_list <-c(tick_list, c(1,2,5)*temp_log_max)
      } else if (temp_unit_floor_max < 7) {
        tick_list <- c(tick_list, c(1,2,5,7)*temp_log_max)
      } else {
        tick_list <- c(tick_list, c(1,2,5,7,temp_unit_floor_max+1)*temp_log_max)
      }
      
      
      
    }
    
  }
  
  return(list(tick_list=tick_list, tick_minor_list=tick_minor_list))
  
}
  


core.csu_axes_label <- function(l) {
  
  l <- format(l, big.mark = ",", scientific = FALSE, drop0trailing = TRUE)
  
}

core.csu_year_tick_generator <- function(min, max) {
  
  mod <- 5
  if (max - min < 10 ) {
    mod <- 1 
  } else if (max - min < 20){
    mod <- 2
  } 
  
  
  temp1 <- min - (min %% mod)
  temp2 <- max - (max %% mod) +ifelse(mod>=5,mod,0)
  
  if (temp2 - temp1 <= mod*6) {
    year_space <- mod 
    year_list <- seq(temp1,temp2,year_space)
    year_minor_list <- year_list
    
  } else  {
    year_space <- mod*2 
    if (temp1 %% mod*2 > 0) {
      year_list <- seq(temp1+mod,temp2,year_space)
      year_minor_list <-  seq(temp1,temp2,year_space/2)
    } else {
      year_list <- seq(temp1,temp2,year_space)
      year_minor_list <-  seq(temp1,temp2,year_space/2)
    }
  }
  
  return(list(tick_list=year_list, tick_minor_list=year_minor_list))
  
  
}


core.csu_format_export <- function(type, plot_title, landscape=FALSE) {
  
  #http://www.altelia.fr/actualites/calculateur-resolution-definition-format.htm
  
  # 6 inch = 15.24 cm
  #10,795
  
  #format
  #if (!is.null(format_export)) {
  #  filename <- gsub("[[:punct:][:space:]\n]", "_", plot_title)
  #  core.csu_format_export(format_export, plot_title = filename, landscape = FALSE)
  #}
  
  # help file
  # \item{format_export}{
  #   export the graph in different format:\cr
  #   \tabular{ll}{
  #     \code{NULL} \tab Plot in R studio windows.\cr
  #     \code{"pdf"} \tab Export in PDF format.\cr
  #     \code{"tiff"} \tab Export in TIFF 300dpi format.\cr
  #     \code{"png"} \tab Export in PNG 200dpi format.\cr
  #     \code{"svg"} \tab Export in SVG format. Can be edit with \url{https://inkscape.org/fr/}.\cr
  #   }
  #   The filename is the \code{plot_title} option.
  # }
  # \item{graph_dev}{
  #   If the plot is embedded in a graphics Device function (such as pdf()), the graph_dev option should be set to TRUE for the first graph to avoid a blank page.
  # }
  

  
  png_width <- ifelse(landscape, 2339 , 1654 )
  png_height <- ifelse(landscape, 1654 , 2339 )
  tiff_width <- ifelse(landscape, 3508 , 2480 )
  tiff_height <- ifelse(landscape, 2480 , 3508 )
  pdf_width <- ifelse(landscape, 11.692 , 8.267 )    
  pdf_height <- ifelse(landscape, 8.267 , 11.692 )   
  
  if (type == "pdf") {
    
    pdf(paste(plot_title,".",type, sep=""),width = pdf_width, height = pdf_height) 
    
  } else if (type == "svg") {
    
    svg(paste(plot_title,".",type, sep=""),width = pdf_width, height = pdf_height) 
    
  } else if (type == "png") {
    
    png(paste(plot_title,".",type, sep=""),width = png_width, height = png_height, units = "px",res = 200) 
    
    
  } else if (type == "tiff") {
    
    tiff(paste(plot_title,".",type, sep=""),width = tiff_width, height = tiff_height,units = "px",res = 300,compression ="lzw")
  }
}


core.csu_asr <- function(df_data, var_age, var_cases, var_py, group_by=NULL,
                         var_age_group=NULL, missing_age = NULL,db_rate = 100000,
                         first_age = 1, last_age = 18, pop_base = "SEGI", crude_rate = FALSE,
                         var_st_err=NULL,correction_info=FALSE, var_asr="asr",age_dropped = FALSE,
                         pop_base_count = NULL, age_label_list = NULL, Rcan_print=FALSE) 
{
  
  
  
  bool_dum_by <- FALSE
  bool_dum_age <- FALSE
  
  if (is.null(group_by)) {
    
    df_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    bool_dum_by <- TRUE
    
  }
  
  
  
  if (is.null(var_age_group)) {
    
    df_data$CSU_dum_age <- "dummy_age_gr"
    var_age_group <- "CSU_dum_age"
    group_by <- c(group_by, "CSU_dum_age")
    bool_dum_age <- TRUE
    
  }
  
  
  dt_data <- data.table(df_data, key = group_by) 
  setnames(dt_data, var_age, "CSU_A")
  setnames(dt_data, var_cases, "CSU_C")
  setnames(dt_data, var_py, "CSU_P")

  if (crude_rate == FALSE)
  {
    if (is.null(pop_base_count)) {
      dt_data <-  dt_data[,list( CSU_C=sum(CSU_C), CSU_P=sum(CSU_P)), by=c(group_by, "CSU_A")]
    }


    # create index to keep order
    index_order <- c(1:nrow(dt_data))
    dt_data$index_order <- index_order
    
    # missing age 
    dt_data[dt_data$CSU_A %in% missing_age,CSU_A:=NA ] 
    dt_data[is.na(dt_data$CSU_A),CSU_P:=0 ] 

    #parse age
    dt_data[,CSU_A :=  as.numeric(gsub(".*?(\\d{1,3}).*$", "\\1",CSU_A, perl=TRUE))]
    if (max(dt_data$CSU_A,na.rm=TRUE) > 25) {
      dt_data[,CSU_A := round((CSU_A/5)+1)]
    }
    
    #create age dummy: 1 2 3 4 --- 19
    dt_data$age_factor <- as.numeric(c(as.factor(dt_data$CSU_A)))

    
    # correction factor 
    dt_data$correction <- 1 
    if (!is.null(missing_age)) {
      
      
      dt_data[, total:=sum(CSU_C), by=group_by] #add total
      dt_data[!is.na(dt_data$age_factor) , total_known:=sum(CSU_C), by=group_by] #add total_know
      dt_data$correction <- dt_data$total / dt_data$total_know 
      dt_data[is.na(dt_data$correction),correction:=1 ] 
      dt_data$total <- NULL
      dt_data$total_known <- NULL
      
    }
    
    if (is.null(pop_base_count)) {
      
      # create world population DF for different nb of age group
      SEGI_pop <- c(12000,10000,9000,9000,8000,8000,6000,6000,6000,6000,5000,4000,4000,3000,2000,1000,500,500)
      EURO_pop <- c(8000,7000,7000,7000,7000,7000,7000,7000,7000,7000,7000,6000,5000,4000,3000,2000,1000,1000)
      EURO2_pop <- c(5000,5500,5500,5500,6000,6000,6500,7000,7000,7000,7000,6500,6000,5500,5000,4000,2500,2500)
      WHO_pop <- c(8860,8690,8600,8470,8220,7930,7610,7150,6590,6040,5370,4550,3720,2960,2210,1520,910,630)
    
    if (pop_base == "EURO") {
      pop <- EURO_pop
    } else if (pop_base == "EURO2") {
      pop <- EURO2_pop
    } else if (pop_base == "WHO") {
      pop <- WHO_pop
    } else {
      pop <- SEGI_pop
    }
      
      # calculated total pop for age selected 
      total_pop <- sum(pop[first_age:last_age])
      
      Standard_pop <- data.table(pop = pop, age_factor= c(1:18))
      
      pop[17] <- pop[17]+ pop[18]
      pop[18] <- 0
      Standard_pop$pop17 <- pop
      pop[16] <- pop[16]+ pop[17]
      pop[17] <- 0
      Standard_pop$pop16 <- pop
      pop[15] <- pop[15]+ pop[16]
      pop[16] <- 0
      Standard_pop$pop15 <- pop
      

      
      #age dropped option
      if (age_dropped) {
        dt_data$age_factor <- dt_data$age_factor + first_age -1   
      }
      
      
      # keep age selected 
      dt_data=dt_data[dt_data$age_factor %in% c(first_age:last_age) | is.na(dt_data$age_factor), ]
      
      # calculated maximum age group with population data
      if (last_age == 18) {
        dt_data <- merge(dt_data, dt_data[dt_data$CSU_P != 0,list(nb_age_group = max(age_factor)), by=var_age_group], by=var_age_group)  
      } else {
        dt_data$nb_age_group <- 18
      }
      
      # show population with less than 18 age group
      if (last_age == 18) {
        temp <- subset(dt_data,nb_age_group <18, select= c(var_age_group, "nb_age_group"))
        if (nrow(temp) >0) {
          setkey(temp,NULL)
          
          
          if (Rcan_print) {
          
            cat("\n")
            cat("Population with less than 18 age group:\n\n" )
            print
            print(unique(temp), row.names = FALSE)
            cat("\n")
            
          }
        }
        temp <- NULL
      }
      
      #regroup case for population with nb of age group <  18 
      for (i in 15:17) {
        
        if (i %in% dt_data$nb_age_group) {
          
          dt_data[nb_age_group == i & age_factor >= i , CSU_C:=sum(CSU_C), by=group_by] #add total_know
          dt_data[nb_age_group == i & age_factor > i & !is.na(age_factor), CSU_C := 0] 
          
        } 
      }
      
      #add world pop to database 
      dt_data <- merge(dt_data,Standard_pop, by =c("age_factor"), all.x=TRUE )

      Standard_pop <- NULL
      dt_data[nb_age_group==17, pop:=pop17]
      dt_data[nb_age_group==16, pop:=pop16]
      dt_data[nb_age_group==15, pop:=pop15]
      
      #return(dt_data)
      
    } else {
      
      #keep age group selected 
      dt_data <- dt_data[age_factor %in% (first_age:last_age), ]
      
      #calcul total pop for canreg
      total_pop <-sum(unique(dt_data[, c("age_factor", pop_base_count), with=FALSE])[[pop_base_count]])
      
      #get age group list variable
      if (is.null(age_label_list)) {
        age_label_list <- var_age
      }
      age_group_list <- as.character(unique(dt_data[[age_label_list]]))
      age_group_list <- paste(age_group_list,  collapse=" ")
      
      #rename variable population reference
      setnames(dt_data, pop_base_count, "pop")
    }
    
    #calcul ASR
    
    dt_data[dt_data$CSU_P != 0,rate:= dt_data$CSU_C[dt_data$CSU_P != 0]/ dt_data$CSU_P[dt_data$CSU_P != 0] * db_rate]
    dt_data$asr <- dt_data$rate * dt_data$pop
    dt_data[is.na(dt_data$asr),asr:=0 ] 
    
    dt_data$st_err <- ( dt_data$rate * (dt_data$pop^2) * (db_rate - dt_data$rate))/dt_data$CSU_P
    dt_data[is.na(dt_data$st_err),st_err:=0 ] 
    
    # to check order 
    dt_data<- dt_data[order(dt_data$index_order ),]
    dt_data<-  dt_data[,list( CSU_C=sum(CSU_C), CSU_P=sum(CSU_P),asr=sum(asr),st_err = sum(st_err),correction = max(correction)), by=group_by]
    
    dt_data$asr <- dt_data$asr / total_pop
    dt_data$asr <- dt_data$asr * dt_data$correction
    dt_data$st_err <- (dt_data$st_err / (total_pop^2))^(1/2)
    dt_data$st_err <- dt_data$st_err * dt_data$correction
    
    dt_data$asr <- round(dt_data$asr, digits = 2)
    dt_data$st_err <- round(dt_data$st_err, digits = 2)
    dt_data$correction <- round((dt_data$correction-1)*100, digits = 1)
    
    if (is.null(var_st_err)) {
      dt_data$st_err <- NULL
    } else {
      setnames(dt_data, "st_err", var_st_err)
    }

     if (var_asr!="asr") {
      setnames(dt_data, "asr", var_asr)
    }
    
    if (!correction_info) {
      dt_data$correction <- NULL
    }

  }
  else 
  {

     #parse age
    dt_data[,CSU_A :=  as.numeric(gsub(".*?(\\d{1,3}).*$", "\\1",CSU_A, perl=TRUE))]
    if (max(dt_data$CSU_A,na.rm=TRUE) > 25) {
      dt_data[,CSU_A := round((CSU_A/5)+1)]
    }
    
    #create age dummy: 1 2 3 4 --- 19
    dt_data$age_factor <- as.numeric(c(as.factor(dt_data$CSU_A)))
    
    #age dropped option
    if (age_dropped) {
      dt_data$age_factor <- dt_data$age_factor + first_age -1   
    }
        
    # keep age selected 
    dt_data=dt_data[dt_data$age_factor %in% c(first_age:last_age) | is.na(dt_data$age_factor), ]

    dt_data <- dt_data[,list( CSU_C=sum(CSU_C), CSU_P=sum(CSU_P)), by=c(group_by)]
    dt_data <- dt_data[, CSU_RESULT := CSU_C/CSU_P*db_rate]
    if (var_asr=="asr") {
      setnames(dt_data, "CSU_RESULT", 'crude_rate')
    }
    else 
    {
      setnames(dt_data, "CSU_RESULT", var_asr)
    }

  }

  #manage data.table input and output
  if (is.data.table(df_data)) 
  {
    df_data <- dt_data
  } else {
    df_data <- data.frame(dt_data)
  }

  
  if (bool_dum_age) {
    df_data$CSU_dum_age <- NULL
  }
  if (bool_dum_by) {
    df_data$CSU_dum_by <- NULL
  }
  
  setnames(df_data, "CSU_C", var_cases)
  setnames(df_data,  "CSU_P", var_py)
  
  
  if (is.null(pop_base_count)) {
    
    if (Rcan_print) {
      temp <- last_age*5-1
      if (last_age == 18)  temp <- "99+"

      if (crude_rate == FALSE)
      {
         cat("ASR have been computed for the age group ", (first_age-1)*5,"-", temp , "\n",  sep="" )
      }
      else {
        cat("Crude rate have been computed for the age group ", (first_age-1)*5,"-", temp , "\n",  sep="" )
      }
      
    }
    temp<- NULL
    
  } else {
    
    #cat("ASR have been computed for the age groups:\n",age_group_list , "\n",  sep="" )
    age_group_list<- NULL
    
  }
  
  return(df_data)
  
}

core.csu_cumrisk <- function(df_data, var_age, var_cases, var_py, group_by=NULL,
                         missing_age = NULL, last_age = 18,var_st_err=NULL,correction_info=FALSE,
                         var_cumrisk="cumrisk", age_label_list = NULL, Rcan_print=FALSE) 
{
  
  
  
  bool_dum_by <- FALSE
  
  if (is.null(group_by)) {
    
    df_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    bool_dum_by <- TRUE
    
  }
  
  
  dt_data <- data.table(df_data, key = group_by) 
  setnames(dt_data, var_age, "CSU_A")
  setnames(dt_data, var_cases, "CSU_C")
  setnames(dt_data, var_py, "CSU_P")

  dt_data <-  dt_data[,list( CSU_C=sum(CSU_C), CSU_P=sum(CSU_P)), by=c(group_by, "CSU_A")]

  # create index to keep order
  index_order <- c(1:nrow(dt_data))
  dt_data$index_order <- index_order
  
  # missing age 
  dt_data[dt_data$CSU_A %in% missing_age,CSU_A:=NA ] 
  dt_data[is.na(dt_data$CSU_A),CSU_P:=0 ] 

  #parse age
  dt_data[,CSU_A :=  as.numeric(gsub(".*?(\\d{1,3}).*$", "\\1",CSU_A, perl=TRUE))]
  if (max(dt_data$CSU_A,na.rm=TRUE) > 25) {
    dt_data[,CSU_A := round((CSU_A/5)+1)]
  }

  #create age dummy: 1 2 3 4 --- 19
  dt_data$age_factor <- as.numeric(c(as.factor(dt_data$CSU_A)))
  
  # correction factor 
  dt_data$correction <- 1 
  if (!is.null(missing_age)) {
    
    
    dt_data[, total:=sum(CSU_C), by=group_by] #add total
    dt_data[!is.na(dt_data$age_factor) , total_known:=sum(CSU_C), by=group_by] #add total_know
    dt_data$correction <- dt_data$total / dt_data$total_know 
    dt_data[is.na(dt_data$correction),correction:=1 ] 
    dt_data$total <- NULL
    dt_data$total_known <- NULL

    dt_data<- dt_data[!is.na(age_factor),]
    
  }

  if (!is.null(age_label_list)) {
  # calcul year interval from age group label
  
    dt_temp <- unique(dt_data[, c(age_label_list), with=FALSE])
    dt_temp[, min:=as.numeric(regmatches(get(age_label_list), regexpr("[0-9]+",get(age_label_list))))]
    dt_temp[, max:=shift(min, type ="lead")]
    dt_temp[, age_span := max-min]
    dt_temp <- dt_temp[, c("age_span",age_label_list), with=FALSE]
    dt_data <- merge(dt_data, dt_temp,by= age_label_list, all.x=TRUE)
  } else {
    
    dt_data[, age_span:=5]
  }

  #keep age group selected 
  age_max <- max(dt_data$age_factor)

  if (age_max-1 < last_age) {
    last_age <- age_max-1 
  }
  dt_data=dt_data[dt_data$age_factor <= last_age]  
  
  #calculate cum risk
  dt_data[,cumrisk:=age_span*(CSU_C/CSU_P)]
  dt_data[CSU_P==0,cumrisk:=0]
  dt_data[, st_err := (CSU_C/CSU_P)/CSU_P]
  dt_data[is.na(dt_data$st_err),st_err:=0 ] 

  # to check order 
  dt_data<- dt_data[order(dt_data$index_order ),]

  dt_data <- dt_data[,list( cumrisk=sum(cumrisk), CSU_P=sum(CSU_P),CSU_C=sum(CSU_C),st_err = sum(st_err),correction = max(correction)), by=group_by]
  dt_data[,cumrisk:=(1-exp(-cumrisk))*100*correction]
  
  dt_data[, st_err:=(st_err^(1/2))*100*5]
  dt_data[, st_err:=st_err*correction]

  dt_data[,cumrisk:=round(cumrisk, digits = 2)]
  dt_data[,st_err:=round(st_err, digits = 2)]
  dt_data[, correction:=round((correction-1)*100, digits = 1)]

  if (is.null(var_st_err)) {
    dt_data$st_err <- NULL
  } else {
    setnames(dt_data, "st_err", var_st_err)
  }
  
  if (var_cumrisk!="cumrisk") {
    setnames(dt_data, "cumrisk", var_cum_risk)
  }

  if (!correction_info) {
    dt_data$correction <- NULL
  }
  
  setnames(dt_data, "CSU_C", var_cases)
  setnames(dt_data,  "CSU_P", var_py)

   #manage data.table input and output
  if (is.data.table(df_data)) 
  {
    df_data <- dt_data
  } else {
    df_data <- data.frame(dt_data)
  }

  if (bool_dum_by) {
    df_data$CSU_dum_by <- NULL
  }  
    
  if (Rcan_print) {
    temp <- last_age*5-1
    cat("Cumulative risk have been computed for the age group ","0-", temp , "\n",  sep="" )
    temp<- NULL
  }
  
  return(df_data)
  
}


core.csu_eapc <- function(df_data,
           var_rate="asr",
           var_year="year",
           group_by= NULL,
           var_eapc="eapc", 
           CI_level = 0.95)
{

    
  

    
    #create fake group to have group_by optional 
    bool_dum_by <- FALSE
    
    if (is.null(group_by)) {
      
      df_data$CSU_dum_by <- "dummy_by"
      group_by <- "CSU_dum_by"
      bool_dum_by <- TRUE
    }
    
    dt_data <- data.table(df_data, key = c(group_by)) 

   
    
    setnames(dt_data, var_rate, "CSU_R")
    setnames(dt_data, var_year, "CSU_Y")

    # remove 0 data
    dt_data <- dt_data[!CSU_R == 0,]

    dt_data[, id_group:=.GRP, by=group_by]
    
    temp_max <- max(dt_data$id_group)
    for (i in 1:temp_max) {
      
      temp <- summary(glm(log(CSU_R) ~ CSU_Y,
                          family=gaussian(link = "identity"),
                          data=dt_data[dt_data$id_group  == i,] 
      ))
      
      dt_data[dt_data$id_group  == i, CSU_EAPC:=temp$coefficients[[2]]]
      dt_data[dt_data$id_group  == i, CSU_ST:=temp$coefficients[[4]]]
      
    }
    
    dt_data$CSU_UP <- 100*(exp(dt_data$CSU_EAPC-(qt((1-CI_level)/2, df = Inf)*dt_data$CSU_ST))-1)
    dt_data$CSU_LOW <- 100*(exp(dt_data$CSU_EAPC+(qt((1-CI_level)/2, df = Inf)*dt_data$CSU_ST))-1)
    dt_data$CSU_EAPC <- 100*(exp(dt_data$CSU_EAPC)-1)
    
    
    
    dt_data<-  dt_data[,list( CSU_EAPC=mean(CSU_EAPC), CSU_UP=mean(CSU_UP),CSU_LOW=mean(CSU_LOW)), by=group_by]
    
    
    setnames(dt_data, "CSU_EAPC", var_eapc)
    setnames(dt_data, "CSU_UP", paste(var_eapc, "up", sep="_"))
    setnames(dt_data, "CSU_LOW", paste(var_eapc, "low", sep="_"))
    
    #manage data.table input and output
    if (is.data.table(df_data)) 
    {
      df_data <- dt_data
    } else {
      df_data <- data.frame(dt_data)
    }

    if (bool_dum_by) {
      df_data$CSU_dum_by <- NULL
    }
    
    
  
    return(df_data)
    
    
    
}



core.csu_ageSpecific <-function(df_data,
           var_age="age",
           var_cases="cases",
           var_py="py",
           group_by = NULL,
           missing_age = NULL,
           db_rate = 100000,
           logscale=FALSE, 
           plot_title=NULL,
           legend=csu_trend_legend(),
           color_trend = NULL,
           CI5_comparison=NULL,
           linesize = 0.5,
           age_label_list = NULL,
           log_point=TRUE,
           plot_subtitle=NULL,
           plot_caption=NULL,
           xtitle = "Age at diagnosis",
           ytitle = "Age-specific incidence rate per",
           label_group_by = waiver())

{
    
    
    
    bool_CI5_comp <- FALSE
    CI5_cancer_label <- NULL
    bool_dum_by <- FALSE
    
    
    # manage CI5 import
    if (!is.null(CI5_comparison)) {
      
      bool_CI5_comp <- TRUE
      data(csu_ci5_mean, envir = e <- new.env())
      df_CI5 <- e$csu_ci5_mean
      dt_CI5 <- data.table(df_CI5)
      if (is.character(CI5_comparison)) {
        if (!(CI5_comparison%in% dt_CI5$ci5_cancer_label)) {
          stop('CI5_comparison value must be a correct cancer label, see documentation: Help(csu_ci5_mean)')
          
        } else {
          dt_CI5 <- dt_CI5[dt_CI5$ci5_cancer_label == CI5_comparison, ]
        }
        
      } else {
        if (is.numeric(CI5_comparison)) {
          if (!(CI5_comparison%in% dt_CI5$ci5_cancer_code)) {
            stop('CI5_comparison value must be a correct cancer code, see documentation: Help(csu_ci5_mean)')
            
          } else {
            dt_CI5 <- dt_CI5[dt_CI5$ci5_cancer_code == CI5_comparison, ]
          }
        }
      }
      CI5_cancer_label <- toString(dt_CI5$ci5_cancer_label[1])
    }
    
    
    # manage group by options
    if (is.null(group_by)) {
      
      df_data$CSU_dum_by <- "dummy_by"
      group_by <- "CSU_dum_by"
      bool_dum_by <- TRUE
    }
    
    
    dt_data <- data.table(df_data, key = group_by)
    setnames(dt_data, var_age, "CSU_A")
    setnames(dt_data, var_cases, "CSU_C")
    setnames(dt_data, var_py, "CSU_P")
    setnames(dt_data, group_by, "CSU_BY")
    
    ##group population (use sum)
    dt_data <- dt_data[, list(CSU_C=sum(CSU_C),CSU_P=sum(CSU_P)), by=c("CSU_BY", "CSU_A") ]

    ##change by to factor
    dt_data$CSU_BY <- factor(dt_data$CSU_BY)

    
    ##to calcul age group
    dt_data[CSU_A %in% missing_age,CSU_A:=NA ] 
    dt_data[is.na(CSU_A),CSU_P:=0 ] 

    #parse age
    dt_data[, temp_label:=CSU_A]
    dt_data[,CSU_A :=  as.numeric(gsub(".*?(\\d{1,3}).*$", "\\1",CSU_A, perl=TRUE))]
    if (max(dt_data$CSU_A,na.rm=TRUE) > 25) {
      dt_data[,CSU_A := round((CSU_A/5)+1)]
    }

    dt_temp_label <- unique(dt_data[, c("CSU_A", "temp_label"), with=FALSE])
    dt_data[, temp_label:= NULL]
    
    dt_data$CSU_age_factor <- as.numeric(c(as.factor(dt_data$CSU_A)))
    dt_data <- merge(dt_data, dt_data[dt_data$CSU_P != 0,list(nb_age_group = max(CSU_age_factor)), by="CSU_BY"], by="CSU_BY")   

  
    for (i in 15:17) {
      if (i %in% dt_data$nb_age_group) {

        dt_data[nb_age_group == i & CSU_age_factor >= i , CSU_C:=sum(CSU_C), by="CSU_BY"] ##add total_know
        dt_data[nb_age_group == i & CSU_age_factor > i & !is.na(CSU_age_factor), CSU_C := 0] 
      } 
    }

    dt_data <- dt_data[CSU_P!=0] 
    ##calcul rate 
    dt_data$rate <- dt_data$CSU_C/dt_data$CSU_P *db_rate

    ##create age label:
    if (is.null(age_label_list)) {
      
      ##create age dummy: 1 2 3 4 --- 18
      ##regroup case for population with nb of age group <  18 
      
      
      max_age <- max(dt_data$nb_age_group)
      
      if (max_age > 18) {
        stop('The function cannot have more than 18 age-group, see documentation: Help(csu_graph_ageSpecific)')
      }
      
      age_label <- c("0-4","5-9","10-14","15-19","20-24","25-39","30-34","35-39","40-44", "45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84","85+")
      
      age_label <- c(age_label[1:(max_age-1)], paste0((max_age-1)*5,"+"))
                     
    } else {
      age_label <-age_label_list
      max_age <- length(age_label)
    }
    
    ## to calcul breaks
    
    if (logscale) {
      min_tick_value <- min(dt_data[rate != 0,]$rate)
      
    } else {
      min_tick_value <- 0
    }
    
    
    tick <- core.csu_tick_generator(max = max(dt_data$rate), min=min_tick_value, logscale = logscale )
    tick_space <- tick$tick_list[length(tick$tick_list)] - tick$tick_list[length(tick$tick_list)-1]
    
    temp_top <- ceiling(max(dt_data$rate)/tick_space)*tick_space
    temp_expand_y <- max(dt_data$rate)/35
    temp_expand_y_up <- max(dt_data$rate)+temp_expand_y
    if (temp_expand_y_up > temp_top-(tick_space/2)) {
      temp_expand_y_up <- temp_top+temp_expand_y
    }
    
    th_legend <- list(theme(legend.position="none"))
    
    if (!bool_dum_by & legend$position == "bottom") {
      
      th_legend <- list(theme(
        legend.key = element_rect(fill="transparent"),
        legend.position = "bottom",
        legend.text = element_text(size = 14),
        legend.title = element_text(size = 14),
        legend.key.size=unit(1,"cm"),
        legend.margin = margin(0, 0, 0, 0)
      ))
    }
    
    if (bool_CI5_comp & is.null(age_label_list)) {
      
      if (max_age < 18) {
        dt_CI5[CSU_age_factor >= max_age , CSU_C:=sum(CSU_C)] ##add total_know
        dt_CI5[ CSU_age_factor >= max_age , CSU_P:=sum(CSU_P)]
        dt_CI5 <- dt_CI5[CSU_age_factor <= max_age]    
      }
      

      
      dt_CI5$rate <- dt_CI5$CSU_C/dt_CI5$CSU_P *db_rate
      
      if (logscale) {
        dt_CI5[rate == 0 , rate:= NA]
      }
      
    }
    
    
    ylim_inf <- min(c(tick$tick_list, tick$tick_minor_list))
    ylim_sup <- max(c(tick$tick_list, tick$tick_minor_list))

    
    ##csu_plot
    if (logscale) {
      base_plot <- ggplot(dt_data[, rate := ifelse(rate==0,NA, rate )], aes(CSU_age_factor, rate))
    } else {
      base_plot <- ggplot(dt_data, aes(CSU_age_factor, rate))
    }
    if (bool_CI5_comp) {
      
      pos_y_text = - tick_space
      if (temp_top/tick_space > 7) {
        
        pos_y_text = pos_y_text*1.5
        
      }
      
      if(is.null(plot_caption)) {
        plot_caption <- paste0("- - - - - - : Mean for ",CI5_cancer_label," cancer in CI5 XII")
      }
      
      
      base_plot <- base_plot + 
        geom_line(data = dt_CI5,
                  linewidth = 1,
                  linetype=2,
                  colour = "grey50", 
                  show.legend=FALSE)
      
    } 
    
    
    csu_plot <- base_plot+
      geom_line(aes(color=CSU_BY), linewidth = 1,na.rm=TRUE)+
      guides(color = guide_legend(override.aes = list(size=0.75)))+
      labs(title = plot_title,
           subtitle = plot_subtitle,
           caption = plot_caption)+
      scale_x_continuous(name = xtitle,
                         breaks=seq(1, max_age, 1),
                         labels = age_label,
                         minor_breaks = NULL,
                         expand = c(0.015,0.015)
      )
    
    if (logscale){
      if (log_point) {
        csu_plot <- csu_plot +
          geom_point(aes(fill=CSU_BY), size = 3,na.rm=TRUE,shape=21,stroke=0.5,colour="black", show.legend=FALSE)
      }
      csu_plot <- csu_plot +
        scale_y_continuous(name = paste(ytitle, formatC(db_rate, format="d", big.mark=",")),
                           breaks=tick$tick_list,
                           minor_breaks = tick$tick_minor_list,
                           limits=c(ylim_inf,ylim_sup),
                           labels=core.csu_axes_label,
                           trans = "log10"
        )
    } else {
      
      csu_plot <- csu_plot +
        coord_cartesian( ylim=c(-temp_expand_y, temp_expand_y_up),  expand = TRUE)+
        scale_y_continuous(name = paste(ytitle, formatC(db_rate, format="d", big.mark=",")),
                           breaks=tick$tick_list,
                           labels=core.csu_axes_label,
                           expand = c(0,0)
        )
    } 
    
    
    
    csu_plot <- csu_plot +
      theme(
        plot.background= element_blank(),
        panel.background = element_blank(),
        panel.grid.major= element_line(colour = "grey70"),
        panel.grid.minor= element_line(colour = "grey70"),
        plot.title = element_text(size=16, margin=margin(0,0,15,0),hjust = 0.5),
        plot.subtitle = element_text(size=15, margin=margin(0,0,15,0),hjust = 0.5),
        plot.caption = element_text(size=10, margin=margin(15,0,0,0)),
        axis.title = element_text(size=14),
        axis.title.y = element_text(margin=margin(0,15,0,0)),
        axis.title.x = element_text(margin=margin(15,0,0,0)),
        plot.margin=margin(20,20,20,20),
        axis.text = element_text(size=14, colour = "black"),
        axis.text.x = element_text(size=14, angle = 60,  hjust = 1),
        axis.ticks= element_line(colour = "black", linewidth = linesize),
        axis.ticks.length = unit(0.2, "cm"),
        axis.line.x = element_line(colour = "black", 
                                   linewidth = linesize, linetype = "solid"),
        axis.line.y = element_line(colour = "black", 
                                   linewidth = linesize, linetype = "solid")
      )+
      th_legend
    
      
    
    if (!is.null(color_trend)) {
      
      csu_plot <- csu_plot +
        scale_colour_manual(name=legend$title,
                            labels = label_group_by,
                            values= color_trend,
                            drop = FALSE)
      
      if (logscale) {
        csu_plot <- csu_plot +
          scale_fill_manual(labels = label_group_by,
                            values= color_trend,
                            drop = FALSE)
      }
      
      
    } else {
      csu_plot <- csu_plot +
        scale_colour_discrete(name=legend$title)
    }
    
    if (!bool_dum_by & legend$position=="right") {
      
      csu_plot <- csu_plot + 
        geom_text(data = dt_data[CSU_age_factor == nb_age_group, ],
                  aes(label = CSU_BY),
                  hjust=-0.05)+
        theme(plot.margin = unit(c(0.5, legend$right_space_margin, 0.5, 0.5), "lines"))
      
    } else {
      
      csu_plot <- csu_plot +
        guides(color = guide_legend(nrow=legend$nrow))
    }
    
    dt_data$nb_age_group <- NULL
    dt_data$CSU_age_factor <- NULL
    
    if (logscale){
      dt_data[, rate := ifelse(is.na(rate),0, rate )]
    }

    #get back age label 
    dt_data <- merge(dt_data, dt_temp_label, by=("CSU_A"), all.x=TRUE)
    temp = grep("CSU_A", colnames(dt_data))
    temp2 = length(colnames(dt_data))
    dt_data <- dt_data[,c(1:temp, temp2, (temp+2):temp2-1) , with=FALSE ]
    dt_data[, CSU_A := NULL]
    setnames(dt_data, "temp_label", "CSU_A")
    
    
    return(list(csu_plot = csu_plot, dt_data = dt_data, CI5_cancer_label = CI5_cancer_label,legend_position=legend$position,bool_dum_by = bool_dum_by))
    
  }



core.csu_ageSpecific_top <- function(df_data, 
                                     var_age, 
                                     var_cases, 
                                     var_py,
                                     var_top,
                                     group_by=NULL,
                                     missing_age=NULL,
                                     db_rate = 100000,
                                     logscale = FALSE,
                                     nb_top = 5,
                                     plot_title=NULL,
                                     plot_subtitle=NULL,
                                     label_group_by=NULL,
                                     xtitle = "Age at diagnosis",
                                     ytitle = "Age-specific incidence rate per",
                                     var_color=NULL,
                                     plot_caption=NULL,
                                     var_age_label_list = NULL,
                                     caption_bypass=FALSE) {
  
  
  
  
  bool_dum_by <- FALSE
  
  if (is.null(group_by)) {
    
    df_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    df_data$CSU_dum_by <- factor(df_data[[group_by]],levels=c("dummy_by"), labels=c(""))
    bool_dum_by <- TRUE
  } else {
    df_data[[group_by]] <- as.factor(df_data[[group_by]])
  }
  
  df_data <- core.csu_dt_rank(df_data, var_value = var_cases, var_rank = var_top,group_by = group_by, number = nb_top) 
  df_data[["dummy_top"]] <-core.csu_legend_wrapper(df_data[[var_top]], 14)
  
  
  plotlist <- list()
  datalist <- list()
  j <- 1 
  
  #dummmy variable to factorize variable
  df_data$CSU_dum_by <- as.factor(df_data[[group_by]])
  for (i in levels( df_data$CSU_dum_by)) {
    
    if (!is.null(label_group_by)) {
    
        label_group <- label_group_by[j]
    }
    else {
      label_group <- i
    }
    
    if (caption_bypass) {
      if (j == 1) {
        plot_caption <- ""
      } else {
        plot_caption <- plot_title
        plot_title <- ""
  
      }
    }
    
    dt_plot <- df_data[get("CSU_dum_by") == i]
    
    if (!is.null(var_color)) {
      dt_label_order <- setkey(unique(dt_plot[, c("dummy_top",var_color, "CSU_RANK"), with=FALSE]), CSU_RANK)
      dt_plot$dummy_top <- factor(dt_plot$dummy_top,levels = dt_label_order$dummy_top) 
      color_trend <- as.character(dt_label_order[[var_color]])
    } else {
      dt_label_order <- setkey(unique(dt_plot[, c(var_top, "CSU_RANK"), with=FALSE]), CSU_RANK)
      dt_plot[[var_top]] <- factor(dt_plot[[var_top]],levels = dt_label_order[[var_top]]) 
      color_trend <- NULL
      
    }
    
    
    if (!is.null(var_age_label_list)) {
      age_label_list <- unique(dt_plot[[var_age_label_list]])
    } else {
      age_label_list <- NULL
    }
    
    if (is.null(plot_subtitle)) {
      subtitle_temp <- label_group
    } else {
      subtitle_temp <- paste0(plot_subtitle,"\n",label_group)
    }
    
   
    
    temp <- core.csu_ageSpecific(
      dt_plot,
      var_age=var_age,
      var_cases= var_cases,
      var_py=var_py,
      group_by = "dummy_top",
      missing_age = missing_age,
      db_rate= db_rate,
      plot_title = plot_title,
      plot_subtitle = subtitle_temp,
      plot_caption = plot_caption,
      color_trend = color_trend,
      logscale = logscale,
      log_point=FALSE,
      age_label_list = age_label_list,
      xtitle = xtitle,
      ytitle = ytitle
      )
    
    dt_temp <- temp$dt_data
    dt_temp[[group_by]] <- i

    
    setnames(dt_temp, "CSU_BY", "dummy_top")
    
    

    
    plotlist[[j]] <- temp$csu_plot
    datalist[[j]] <- dt_temp
    
    j <- j+1
  }
  
  df_data <- unique(df_data[,c("dummy_top",group_by, var_top, "CSU_RANK"), with=FALSE])
  
  dt_final <- NULL
  for (i in 1:(j-1)) {
    dt_final <- rbind(dt_final, datalist[[i]])
  }
  
  dt_final <-  merge(dt_final,df_data,by=c("dummy_top",group_by),all.x=TRUE, all.y=FALSE)
  dt_final[, dummy_top :=NULL]
  setnames(dt_final, "CSU_A", var_age)
  setnames(dt_final, "CSU_C", var_cases)
  setnames(dt_final, "CSU_P", var_py)
  
  setkeyv(dt_final, c(group_by,"CSU_RANK",var_age))
  
  if (bool_dum_by) {
    dt_final[, CSU_dum_by:=NULL]
  }
  
  return(list(plotlist=plotlist, dt_data=dt_final))
  
}



core.csu_time_trend <- function (
  df_data,
  var_trend = "asr",
  var_year = "year",
  group_by = NULL,
  logscale = TRUE,
  smoothing = NULL,
  legend = csu_trend_legend(),
  color_trend = NULL,
  ytitle = "Age standardized rate per 100000",
  xtitle = "Year",
  plot_title = "test",
  linesize = 0.5,
  plot_subtitle = NULL,
  plot_caption = NULL) {
  
  
  if (!is.null(smoothing)) {
    if (smoothing == 0) {
      smoothing <- NULL
    }
  }
  
  
  
  dt_data <- data.table(df_data, key = group_by)
  setnames(dt_data, var_year, "CSU_Y")
  setnames(dt_data, var_trend, "CSU_T")
  setnames(dt_data, group_by, "CSU_BY")
  
  bool_dum_by <- FALSE
  if (is.null(group_by)) {
    
    dt_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    bool_dum_by <- TRUE
  }
  
  #change by to factor
  dt_data$CSU_BY <- factor(dt_data$CSU_BY)
  
  #smooth with loess  fonction
  if (!is.null(smoothing))
  {
    dt_data[,CSU_smooth:= loess( CSU_T ~ CSU_Y, span=smoothing)$fitted, by=CSU_BY]
  } else {
    dt_data[,CSU_smooth:= CSU_T]
  }
  
  dt_data[, max_year:=max(CSU_Y), by=CSU_BY]
  
  # to calcul y axes breaks
  
  if (logscale) {
    min_tick_value <- min(dt_data[CSU_smooth > 0,]$CSU_smooth)
  } else {
    min_tick_value <- 0
  }
  
  
  tick <- core.csu_tick_generator(max = max(dt_data$CSU_smooth), min=min_tick_value, logscale = logscale )
  tick_space <- tick$tick_list[length(tick$tick_list)] - tick$tick_list[length(tick$tick_list)-1]
  
  
  #to calcul year axes break
  
  year_tick <-  core.csu_year_tick_generator(min(dt_data$CSU_Y),max(dt_data$CSU_Y))
  
  
  
  
  temp_top <- ceiling(max(dt_data$CSU_smooth)/tick_space)*tick_space
  temp_expand_y <- max(dt_data$CSU_smooth)/35
  temp_expand_y_up <- max(dt_data$CSU_smooth)+temp_expand_y
  if (temp_expand_y_up > temp_top-(tick_space/2)) {
    temp_expand_y_up <- temp_top+temp_expand_y
  }
  
  th_legend <- list(theme(legend.position="none"))
  
  if (!bool_dum_by & legend$position == "bottom") {
    
    th_legend <- list(theme(
      legend.key = element_rect(fill="transparent"),
      legend.position = "bottom",
      legend.text = element_text(size = 12),
      legend.title = element_text(size = 12),
      legend.key.size=unit(1,"cm"),
      legend.margin = margin(0, 0, 0, 0)
    ))
  }
  
  
  xlim_inf <- min(c(year_tick$tick_list, year_tick$tick_minor_list))
  xlim_sup <- max(c(year_tick$tick_list, year_tick$tick_minor_list))
  
  ylim_inf <- min(c(tick$tick_list, tick$tick_minor_list))
  ylim_sup <- max(c(tick$tick_list, tick$tick_minor_list))

  
  #csu_plot
  if (logscale) {
    base_plot <- ggplot(dt_data[, CSU_smooth := ifelse(CSU_smooth==0,NA, CSU_smooth )], aes(CSU_Y, CSU_smooth))
  } else {
    base_plot <- ggplot(dt_data, aes(CSU_Y, CSU_smooth))
  }
  
  csu_plot <- base_plot+
    geom_line(aes(color=CSU_BY), linewidth = 0.75,na.rm=TRUE)+
    guides(color = guide_legend(override.aes = list(size=0.75)))+
    labs(title = plot_title, 
         subtitle = plot_subtitle,
         caption = plot_caption)+
    scale_x_continuous(name = xtitle,
                       breaks=year_tick$tick_list,
                       limits=c(xlim_inf,xlim_sup),
                       minor_breaks = year_tick$tick_minor_list,
                       expand = c(0.015,0.015)
    )
  
  
  
  if (logscale){
    
    
    csu_plot <- csu_plot +
      scale_y_continuous(name = ytitle,
                         breaks=tick$tick_list,
                         minor_breaks = tick$tick_minor_list,
                         limits=c(ylim_inf,ylim_sup),
                         labels=core.csu_axes_label,
                         trans = "log10"
      )
  } else {
    
    csu_plot <- csu_plot +
      coord_cartesian( ylim=c(-temp_expand_y, temp_expand_y_up),  expand = TRUE)+
      scale_y_continuous(name = ytitle,
                         breaks=tick$tick_list,
                         labels=core.csu_axes_label,
                         expand = c(0,0)
      )
  } 
  
  if (is.null(color_trend)) {
    csu_plot <- csu_plot +scale_colour_discrete(name=legend$title)
  } 
  else {
    csu_plot <- csu_plot +scale_colour_manual(name=NULL, values=color_trend)
  }
  
  
  csu_plot <- csu_plot +
    theme(
      plot.background= element_blank(),
      panel.background = element_blank(),
      panel.grid.major= element_line(colour = "grey70"),
      panel.grid.minor= element_line(colour = "grey70"),
      plot.title = element_text(size=16, margin=margin(0,0,15,0),hjust = 0.5),
      plot.subtitle = element_text(size=15, margin=margin(0,0,15,0),hjust = 0.5),
      plot.caption = element_text(size=10, margin=margin(15,0,0,0)),
      axis.title = element_text(size=12),
      axis.title.y = element_text(margin=margin(0,15,0,0)),
      axis.title.x = element_text(margin=margin(15,0,0,0)),
      plot.margin=margin(20,20,20,20),
      axis.text = element_text(size=12, colour = "black"),
      axis.text.x = element_text(size=12,  hjust = 0.5),
      axis.ticks= element_line(colour = "black", linewidth = linesize),
      axis.ticks.length = unit(0.2, "cm"),
      axis.line.x = element_line(colour = "black", 
                                 linewidth = linesize, linetype = "solid"),
      axis.line.y = element_line(colour = "black", 
                                 linewidth = linesize, linetype = "solid")
    )+
    th_legend
  
  
  
  if (!bool_dum_by & legend$position=="right") {
    
    csu_plot <- csu_plot + 
      geom_text(data = dt_data[CSU_Y == max_year, ],
                aes(label = CSU_BY),
                hjust=0,
                nudge_x=0.5)+
      theme(plot.margin = unit(c(0.5, legend$right_space_margin, 0.5, 0.5), "lines"))
    
  } else {
    
    csu_plot <- csu_plot +
      guides(color = guide_legend(nrow=legend$nrow))
  }
  
  return(list(dt_data=dt_data, csu_plot = csu_plot, legend_position=legend$position,bool_dum_by = bool_dum_by))
  
  
}


core.csu_year_extract <- function(year_list) {

  temp <- gsub("[^\\d]", "", year_list, perl=TRUE)
  if (unique(nchar(temp)) == 4 ) {
    return(as.numeric(year_list))
  }
  else if (unique(nchar(temp)) == 6 ) {
    
      test <- all(grepl("(18|19|20)\\d{2}\\d{2}",temp))
      if (test) {
        return(as.numeric(gsub("(^\\d{4}).+", "\\1", temp, perl=TRUE))) 
      }
      else {
        return(as.numeric(gsub(".+(\\d{4}$)", "\\1", temp, perl=TRUE))) 
      }

  }
  else if (unique(nchar(temp)) == 8 ) {
     
     test <- all(grepl("(18|19|20)\\d{2}\\d{4}",temp))
     if (test) {
        return(as.numeric(gsub("(^\\d{4}).+", "\\1", temp, perl=TRUE))) 
     }
     else {
        return(as.numeric(gsub(".+(\\d{4}$)", "\\1", temp, perl=TRUE))) 
     }
  }
}


core.csu_icd_group <- function(icd_list) {

  bool_follow <- FALSE 
  icd_first <- icd_list[1]
  icd_long <- icd_first

  code_active = as.numeric(sub(".+?(\\d+)", "\\1", icd_first))

  for (code in icd_list[-1]) {

    code_new = as.numeric(sub(".+?(\\d+)", "\\1", code))
    bool_follow <- code_new == code_active + 1

    if (bool_follow) {
      icd_long <- paste0(icd_first, '-', sprintf("%02d", code_new))

    }
    else {
      icd_long <- paste0(icd_long, ',', as.character(code))
      icd_first <- icd_long
    }
    code_active <- code_new

  }
  return(icd_long)
}


core.csu_icd_ungroup <- function(icd_group) {

  icd_group <- gsub("\\s", "", icd_group)
  
  icd_list <- NULL
  ICD_reg <-"([A-Za-z]\\d+)(\\W?)(.+)?"

  while (nchar(icd_group)>=3) {

    icd_start <- sub(ICD_reg, "\\1", icd_group)
    icd_mark <- sub(ICD_reg, "\\2", icd_group)
    icd_group <- sub(ICD_reg, "\\3", icd_group)


    if (icd_mark == "-") {

      letter_start <- sub("([A-Za-z])(\\d+)", "\\1", icd_start)
      code_start <- sub("([A-Za-z])(\\d+)", "\\2", icd_start)
      code_nchar <- nchar(code_start)
      code_start <- as.numeric(code_start)
      code_end <- as.numeric(sub("[A-Za-z]?(\\d+)(.+)?", "\\1", icd_group))

      for (code in code_start:code_end) {
        icd_list <- c(icd_list, paste0(letter_start, sprintf(paste0("%0",code_nchar,"d"), code)))
      }
      icd_group <- sub("([A-Za-z]?\\d+)(\\W?)(.+)?", "\\3", icd_group) 
    }
    else  {
      icd_list <- c(icd_list, sub("([A-Za-z]\\d+)", "\\1", icd_start))
    }
  }

  return(icd_list)
}


core.csu_group_cases <- function(df_data, var_age ,group_by=NULL,var_cases = NULL,df_ICD = NULL,var_ICD=NULL,var_year = NULL, all_cancer=FALSE) {


  label_by <- NULL
  dt_data <- data.table(df_data)

  bool_dum_by <- FALSE
  if (is.null(group_by)) {
      
    dt_data$CSU_dum_by <- "dummy_by"
    group_by <- "CSU_dum_by"
    bool_dum_by <- TRUE
  }
  

  if (is.null(var_cases)) {
    var_cases <- "CSU_C"
    dt_data[, CSU_C:= 1]
  } else {
    setnames(dt_data, var_cases, "CSU_C")
  }

  dt_data <- dt_data[, unique(c("CSU_C", var_age,group_by,var_ICD,var_year)), with = FALSE]

  if (!is.null(var_year)) {
    dt_data$year <-  core.csu_year_extract(dt_data[[var_year]])
    if (var_year != "year") {
      dt_data[, (var_year) := NULL]  
    }
    group_by <- c(group_by, "year")

  }


  if (!is.null(df_ICD)) {
  # merge with ICD 
    dt_ICD <- data.table(df_ICD)
    setkeyv(dt_ICD,c("LABEL", "ICD")) 
    dt_ICD <- unique(dt_ICD)

    #ungroup ICD code 
    dt_table <- data.table()
    for (row in 1:nrow(dt_ICD)) {

      icd_group <- as.character(dt_ICD[row]$ICD)
      temp <- core.csu_icd_ungroup(paste(icd_group, collapse=","))
      temp <- data.table(ICD_ungroup = temp, ICD =icd_group )
      
      dt_table <- rbind(dt_table, temp)

    }

    dt_ICD <- merge(dt_ICD, dt_table, by="ICD")
    dt_ICD[, ICD:=NULL]
    dt_ICD <- unique(dt_ICD)

    dt_ICD_unique <- setDT(dt_ICD)[, .N, keyby=ICD_ungroup][N>1,]  

    if (nrow(dt_ICD_unique) > 0) {

      dt_ICD_unique <- merge(dt_ICD_unique,dt_ICD,by="ICD_ungroup", all.x=TRUE) 
      dt_ICD_unique <- merge(dt_ICD_unique, df_ICD, by="LABEL")
      dt_ICD_unique <- dt_ICD_unique[,c("LABEL", "ICD") , with=FALSE]
      cat("-----\n")
      print(as.data.frame(dt_ICD_unique))
      stop(paste0("There is different label with the same ICD code in the ICD file defined")) 

    }

    setnames(dt_ICD, "ICD_ungroup", "ICD")

    dt_ICD[, ICD_group:= sapply(LABEL, function(x) {core.csu_icd_group(as.vector(dt_ICD[LABEL == x, ]$ICD))})]
    list_ICD <- dt_ICD$ICD

    dt_data$temp <- as.character(dt_data[[var_ICD]])
    dt_data[, ICD := list_ICD[match(dt_data$temp, list_ICD)]]
    dt_data[!is.na(ICD), temp := NA]
    dt_data[, temp:= substr(temp, 1, 3)] 
    dt_data[!is.na(temp), ICD:=list_ICD[match(dt_data[!is.na(temp)]$temp, list_ICD)]]
    dt_data <- dt_data[!is.na(ICD), ]
    dt_data <- merge(dt_data, dt_ICD, by=c("ICD"))
    dt_data[,c("temp", var_ICD,"ICD") := list(NULL, NULL,NULL)]
    dt_data[, LABEL := factor(LABEL)]
    dt_data[, ICD_group := factor(ICD_group)]

    label_by <- c(label_by,"LABEL")
    group_by <- c(group_by, "ICD_group")

    dt_ICD <- NULL

  }

  # create age group
  dt_data[, age_group:= ((get(var_age) - (get(var_age) %% 5))/5)+1]
  dt_data[, age_group :=  ifelse(age_group > 18 & age_group <31,18,age_group)]
  dt_data[, age_group :=  ifelse(age_group > 18 ,19,age_group)]
  dt_data[, c(var_age) := NULL]

  dt_data <-  dt_data[,list(CSU_C = sum(CSU_C)),by=eval(colnames(dt_data)[!colnames(dt_data) %in% c("CSU_C")])]

  #complete missing age group
  if (max(dt_data$age_group) == 19) {
    temp <- c(1:19)
  }
  else {
     temp <- c(1:18)
  }

  dt_CJ = dt_data[, do.call(CJ, c(.SD,list(age_group=temp), unique=TRUE)), .SDcols=group_by]
  temp <- copy(colnames(dt_CJ))

  # add age_group label
  dt_CJ[, temp1 := sprintf("%02d",(age_group-1)*5)]
  dt_CJ[, temp2 := (age_group*5)-1]
  dt_CJ[, age_group_label := ifelse(temp2 == 89, paste0(temp1,"+"), paste0(temp1,"-",  sprintf("%02d", temp2)))] 
  dt_CJ[age_group == 19 , age_group_label :=  "Unknown"]
  dt_CJ[,c("temp1","temp2") := list(NULL, NULL)]

  group_by  <- c(group_by, "age_group")

  ##add ICD group label (but #dad is almost pink)
  if (!is.null(df_ICD)) {
    dt_temp <- unique(dt_data[, c("ICD_group","LABEL"), with=FALSE])
    dt_CJ <- merge(dt_CJ, dt_temp, by="ICD_group", all.x=TRUE)
    temp <- c(temp, "LABEL")
  }

  dt_data <- merge(dt_CJ, dt_data,by=temp, all.x=TRUE)[, CSU_C := ifelse(is.na(CSU_C),0, CSU_C )]
  keep_by <- group_by[!group_by %in% c("year", "age_group", "ICD_group")]

  setkeyv(dt_data, keep_by)
  dt_data[ , group_id := .GRP, by = key(dt_data)]
  dt_data[,temp := sum(CSU_C),by=group_id]
  dt_data <- dt_data[temp>0,]
  dt_data[, c("temp","group_id") := NULL]
  setkeyv(dt_data, group_by)



  if (var_cases=="CSU_C") {
    setnames(dt_data, "CSU_C","cases")
    var_cases <- NULL
  }
  else {
    setnames(dt_data, "CSU_C",var_cases)
  }

  if (bool_dum_by) {
    dt_data$CSU_dum_by <- NULL
  }

  #manage data.table input and output
  if (is.data.table(df_data)) 
  {
    df_data <- dt_data
  } else {
    df_data <- data.frame(dt_data)
  }


  return (df_data)
}