WIP/frg_wilcox.R

#' @title FRG_Comp_Sig_Matrix
#' 
#' @description Function used to perform the Wilcoxon test of significant differences, using differnt number of "before fire years" as reference \cr
#' The function creates and saves matrixes of p-values obtained using differnt number of before fire years. Matrix is saved within the "p-matrixes" subfolder \cr
#' of the folder containing the final output file. Additional info necessary to plot the time series (i.e., boxplots, objectids of fires, fireyears, ecc) \ cr
#' are also saved in the same file (plot_stat data frame). 
#' 
#' @details
#'
#' @param In_File string File of scaled VI Time Series extracted for single-fire areas
#' @param out_file_pmat string Output file name where the p-values matrix is saved
#' @param min_pix numeric Minimum number of CORE pixels for a bunt area to be considered
#' @param perc_diff numeric Minimum percentage difference checked for significant reduction 
#' @param MedWdt numeric Number of pre-fire years considered as reference
#' @param sub_zones flag If 1 , consider ENV_ZONES differences when analyzing
#' @param sig_level numeric Significance level used for Wilcoxon test 
#' @param erode flag If 1, perform analysis only on NON CORE burnt pixels 
#'
#' @return Saves a p_matrix.RData file containing the p-values matrix and the "plot_stats" data frame in the "p_matrixes" subfolder
#'
#' @author Lorenzo Busetto - email: lorenzo.busetto@@jrc.ec.europa.eu
#' Created Date: Feb 16, 2012
#' @export
#' @importFrom data.table rbindlist setkey 
#' @importFrom abind abind
#' @importFrom dplyr summarise filter select group_by
#' @importFrom gdata drop.levels
#' 
frg_wilcox = function(In_File   = In_File,   out_file_pmat = out_file_pmat, min_pix = min_pix, 
                      perc_diff = perc_diff, MedWdt = MedWdt, sub_zones = sub_zones, erode = erode) 
{
  
  median_analysis = T
  perc_difference = perc_diff
  
  Out_File = out_file_pmat
  # Load data and define processing options.
  
  Out_File_R = paste(Out_File,'.RData', sep = '')
  Out_File_csv = paste(Out_File,'.csv', sep = '')
  
  #- ---------------------------------------------------- - 
  # Get time series data from the input file ----
  #- ---------------------------------------------------- - 
  
  print(paste('-> Performing Statistical Analysis on Post-Fire SVI Time Series from: ', basename(In_File)))   # Status Message
  get(load(In_File))
  Index    = attributes(ts_data)$Index				
  SVI_File = attributes(ts_data)$SVI_File   
  Shape_File = attributes(ts_data)$Shape_File
  CSV_File = attributes(ts_data)$CSV_File
  CLC_File = attributes(ts_data)$CLC_File
  
  
  # Retrieve time series data
  # Retrieve time series data
  # if(erode == 0) { #  Get data in the case that erode = 0
  #   Data = Data [,1:(8+N_Years)]
  # } else { #  Get data in the case that erode = 1
  #   names = names(Data)[1:(8+N_Years)]    
  #   Data = Data [,c(1:7,(8+N_Years+1):(8+2*N_Years+1))]
  #   names(Data)= names
  # }
  
  # Initialization and Preliminary pre elaborations
  counter = 1
  ptm <- proc.time()
  # Select "Interesting" CLC Classes
  sel_levels = c('Schlerophyllus Vegetation', 'Broadleaved Forests', 'Coniferous Forests',	'Mixed Forests','Transitional Vegetation')  
  ts_data = gdata::drop.levels(subset(ts_data,CLC_Class %in% sel_levels))		# Remove unneeded CLC classes
  ts_data$N_PIX = as.factor(paste('p', ts_data$N_PIX, sep = '_'))	  	# Convert N_PIX to factor and add a "p" before the number
  n_fires = length(unique(ts_data$OBJECTID))												# Number of records to be analyzed (total number of FIRES in both burned areas shapefile and MODIS ROI )
  
  # st_ind = 8   ;    end_ind = length(names(Data))		;  								# Start and end indexes of "years" columns
  
  # Get ancillary data regarding fires from the EFFIS shape file
  
  Data_Shape  = droplevels(subset(Data_Shape,OBJECTID %in% unique(ts_data$OBJECTID)))			# Remove fires not "present" in the MODIS dataset					
  recs = unique(Data_Shape$OBJECTID)							# Count the remaining fires
  n_recs =length(recs)														# Number of fires to process
  browser()
  # Reshape data structure to facilitate the analyis (melting) and get info about the data (e.g., number of years, ecc)
  ts_data = ts_data[complete.cases(ts_data),]
  ts_data_Melted = ts_data
  # Data_Melted = melt(Data, id.vars = c(1:(st_ind-1)))
  # names(Data_Melted)[st_ind:(st_ind+1)] = c('Year', 'Index')
  attributes(ts_data_Melted)$Index = Index
  # 	Data_Melted = drop.levels(subset(Data_Melted,Year != '2000'))			# If present, remove the data regarding year 2000 (Strong uncertainties in MODIS data of this year !!!!
  
  # Avail_Years = as.numeric(levels(Data_Melted$Year))				# Get list of available years
  # Number of years in the time series	
  Start_Year = min(ts_data_Melted$Year)															# Starting year of the time serie
  End_Year = max(ts_data_Melted$Year)  	                            # Ending year of the time serie
  N_Years  =  1 + as.numeric(End_Year) - as.numeric(Start_Year)
  Avail_Years = seq(Start_Year, End_Year, 1)
  #- --------------------------------------------------------------------------- -#
  #- #Initialize output matrixes
  #- --------------------------------------------------------------------------- -#
  
  plot_stat_names = c('CASE_ID','OBJECTID', 'FireYear', 'YearFromFire','Area_All','Area_Forest','Area_CLC','ENV_ZONE','CLC_Class','N_PIX','Time_Signif','Year','YearDiff','low_ci', 'mean','hi_ci','std_dev','low_box', 'X25th','median','X75th','up_box')
  
  # Define the output matrixes that will contain information on p-values of wilcoxon rank sum tests. 
  
  max_plus_years  =  N_Years - 3						#To compute the limits of the p-values full matrixes: 
  max_minus_years = -(N_Years - 1)				#To compute the limits of the p-values full matrixes
  
  p_matrix_median = NULL ; plot_stat = NULL    # Initialize output variables
  
  p_matrix_median_empty =  array(ts_data = NA, dim = c(max_plus_years-max_minus_years, max_plus_years-max_minus_years))		#Define an empty matrix with correct dimensions to store the results of 
  colnames(p_matrix_median_empty) = seq(max_minus_years,max_plus_years-1, 1 )																		    #the wilcoxon comparisons - using medians 
  rownames(p_matrix_median_empty) = seq(max_minus_years + 1,max_plus_years, 1 )		
  
  #- ---------------------------------------------------- - #
  # Start Analysis on single fires. FirePix is a counter on fires. ? Fires are analyzed ordered by total area according to the EFFIS shapefile ?
  #- ---------------------------------------------------- - #
  
  case_ID = 0 
  
  ts_data_Melted$YearDiff = (as.numeric(as.character(ts_data_Melted$Year)) - as.numeric(ts_data_Melted$FireYear))		# Compute the "Years from fire" variable.
  ts_data_Melted = data.table(ts_data_Melted)																														# cONVERT TO A DATA.TABLE to improve speed !!!
  setkey(ts_data_Melted ,OBJECTID,CLC_Class,ENV_ZONE)																									# Set the "keys" for the table. 
  
  for (FirePix in 1:n_fires){
    # for (FirePix in 1:200){
    
    FireCode = recs[FirePix]																				    # Getrr OBJECTID of the fire to be analyzed
    Data_Fire =droplevels(ts_data_Melted[J(FireCode)])												# Get Data of the selected fire
    Data_Fire_Shape = subset(Data_Shape, OBJECTID == FireCode)
    
    YearDiffs = (unique(Data_Fire$YearDiff)	)															# get levels of "Difference" with fire year - full and before fire
    YearDiffs_Before = YearDiffs[which(YearDiffs < 0)]
    
    if (length(Data_Fire$FireYear) != 0 & is.finite(Data_Fire$FireYear[1])) {   #control on number of records for selected fire
      FireYear =  Data_Fire$FireYear[1]																	# Get year of occurrence of the selected fire		
      Area_All = Data_Fire$Area_All[1]                                   # Get total burnt Area
      Area_Forest =  Data_Fire$Area_Forest[1]                           # Get total in forest CLC types
      YearFromFire = Start_Year - FireYear														  # Compute years from fire at the beginning of the time serie for the selected fire 	
      
      # Limit the analysis to fires occurred at least three years later than the start of the time series AND at least one year before its end 
      if ((FireYear - Start_Year >= 3) & (FireYear < End_Year )) {		
        
        CLC_classes = c(levels(Data_Fire$CLC_Class),'All')						# Get array of CLC classes present in the selected fire Fire
        ENV_Zones = c(levels(Data_Fire$ENV_ZONE), 'All')					# Get Array of ENV_ZONES present in the selected fire
        
        if (sub_zones == 0) {ENV_Zones = 'All'} else {ENV_Zones = c(levels(Data_Fire$ENV_ZONE), 'All')}
        # Start cyclying on ENV_Zones "available" in the selected fire
        
        for (Zone in ENV_Zones) {	
          if (Zone != 'All') {Data_Zone =droplevels(Data_Fire[ENV_ZONE == Zone])	} else  {Data_Zone = Data_Fire}
          
          # Start cyclying on CLC classes "available" in the selected fire 
          
          for (Class in CLC_classes) {
            
            if (Class != 'All') {Data_Class =droplevels(Data_Zone[CLC_Class == Class])	} else  {Data_Class = Data_Zone} 
            
            Data_Class_check <- Data_Class %>% 
              group_by(N_PIX) %>% 
              summarise(lgtyy = length(Index)) %>% 
              filter(lgtyy == N_Years) %>% 
              select(N_PIX)
            Data_Class = subset(Data_Class, Data_Class$N_PIX %in% Data_Class_check$N_PIX) 
            n_pix= length(unique(Data_Class$N_PIX))		# Check to see if at least min_pix pixels are "available" for the selected CLC class and Zone in the selected fire
            case_ID = case_ID + 1
            if (n_pix >= min_pix) { # if number of pixels grater than minimum , perform analysis 
              
              # Compute the "Area_CLC" class, i.e., the area burnt in each CLC class for the analyzed fire
              if (Class =='All' ) {Area_CLC = Area_Forest}
              if (Class =='Broadleaved Forests' ) {Area_CLC = Data_Fire_Shape[['BroadLeave']]}
              if (Class =='Coniferous Forests' )  {Area_CLC = Data_Fire_Shape[['Coniferous']]}
              if (Class =='Mixed Forests' )       {Area_CLC = Data_Fire_Shape[['MixedFores']]}
              if (Class =='Transitional Vegetation' ) {Area_CLC = Data_Fire_Shape[['Transition']]}
              if (Class =='Schlerophyllus Vegetation' ) {Area_CLC = Data_Fire_Shape[['Sclerophyl']]}
              
              if (median_analysis == T) {
                
                #- ----------------------------------------------
                # - Creation of the "means" matrix. Replace data of the "before" years with medians computed considering 1 year, 2 years, etcetera. 
                #- ----------------------------------------------	
                
                Data_Class_med = Data_Class
                
                for (Before_Year in YearDiffs_Before) {				# Start of cycle on: 
                  
                  Data_Before = droplevels(Data_Class[YearDiff >= Before_Year & YearDiff < 0])								# Get before fire data up to "Before_Year" CHECK !!!!!!!!!!!
                  mean_before = as.integer(Data_Before[,list(mean=mean(Index, na.rm = T)), by = list(N_PIX)]$mean)      # Compute means per pixels of the before years
                  Data_Class_med[YearDiff == Before_Year]$Index = mean_before    														# Replace the original values with the means of before years
                }    # End of cycle on: 
                
                #- ----------------------------------------------
                # - Statistical analysis - apply wilcoxon test to compute the p-values matrixes - check differences with respect to means of before years
                #- ----------------------------------------------
                p_matrix_median_tmp = p_matrix_median_empty		#Define an empty matrix with correct dimensions to store the results
     
                
                
                paired_wtest_median = try(pairwise.wilcox.test(Data_Class_med$Index, Data_Class_med$YearDiff, 
                                                               paired = TRUE, alternative = 'less', 
                                                               p.adjust.method = 'none', 
                                                               mu = -perc_diff))  # Compute p-values
           
                if(class(paired_wtest_median) == "try-error") {
                  # browser()
                } 
                
                #- ----------------------------------------------
                # - Then put results in the cumulated matrix. Assign the "CASE_ID" as dimname of the 3rd dimension 
                # for easy referencing
                #- ----------------------------------------------
                
                p_matrix_median_tmp[attributes(paired_wtest_median$p.value)[2]$dimnames[[1]], 
                                    attributes(paired_wtest_median$p.value)[2]$dimnames[[2]]] = paired_wtest_median$p.value
                
                p_matrix_median = try(abind(p_matrix_median, p_matrix_median_tmp, along = 3))
                if(class(p_matrix_median) == "try-error") {
                  # browser()
                } 
                dimnames(p_matrix_median)[[3]][dim(p_matrix_median)[3]] = case_ID
                
              }
              
              #- --------------------------------------------------------------------------- -#
              #- Compute data to be saved to allow plotting (CIs + boxplot thresholds
              #- --------------------------------------------------------------------------- -#
              
              # Compute confidence intervals using the method from Morey (2008) (http://wiki.stdout.org/rcookbook/Graphs/Plotting%20means%20and%20error%20bars%20%28ggplot2%29/)
              #							ci_data = summarySEwithin(Data_Class, measurevar ='Index', withinvars = 'Year', idvar = 'N_PIX', na.rm = FALSE, conf.interval = .95)
              #							ci_data = c(NA,NA,NA)
              #							int_data = cbind(ci_data$Index - ci_data$ci,ci_data$Index, ci_data$Index+ci_data$ci) 
              int_data = array(999, dim =c(length(Avail_Years),3) )
              # Compute quantiles of distribution for different years
              
              box_data = t(boxplot(Index~YearDiff, data = Data_Class, plot = FALSE)$stats)
              sdev_data = Data_Class[, list(sd= sd(Index, na.rm=TRUE)), by = list(YearDiff)]
              
              # Fill-in the output matrix for the analyzed fire.
              
              plot_stat_tmp= as.data.frame(cbind(case_ID, FireCode,FireYear, YearFromFire,Area_All,Area_Forest,Area_CLC, as.character(Zone),as.character(Class),n_pix,  Time_Signif = 'Yes', Avail_Years,YearDiffs, int_data,sdev_data$sd, box_data), stringsAsFactors = FALSE)
              names(plot_stat_tmp) = plot_stat_names
              plot_stat [[counter]]= plot_stat_tmp # Add the results for the selected fire to the full output matrix
              counter = counter + 1
            } 	else {	# If n_pix too low, the fire is skipped. Neither "plot_stat" nor the p-values matrixes are filled
              
            } 
            
          }   # End of cycle on ENV_Zones  for a single fire
          
        }   # End of cycle on CLC classes for a single fire
        
      } else case_ID = case_ID + 1	# End of check for sufficient number of years before and after fire
    }		
    
    if ((FirePix/100)-floor(FirePix/100) == 0) {message('-> Analysing Burnt Area: ',  FirePix, ' of: ', n_fires)}
    
  }   # End of Cycle on Fires
  
  plot_stat <- rbindlist(plot_stat)
  # browser()
  #Convert the columns to the correct formats
  # for (cc_df in c(5,6,7,10, 14:22)) plot_stat[, cc_df, with = FALSE] = as.numeric(plot_stat[,cc_df, with = FALSE])
  # for (cc_df in c(2,3,8,9,11)) plot_stat [,cc_df, with = FALSE] = as.factor(plot_stat [,cc_df, with = FALSE])
  # for (cc_df in c(1,4,12,13)) plot_stat [,cc_df, with = FALSE] = as.ordered(as.numeric(plot_stat [,cc_df, with = FALSE]))  	
  
  plot_stat <- plot_stat[, CASE_ID      := as.ordered(as.numeric(CASE_ID))]
  plot_stat <- plot_stat[, OBJECTID     := as.factor(OBJECTID)]
  plot_stat <- plot_stat[, FireYear     := as.factor(FireYear)]
  plot_stat <- plot_stat[, YearFromFire := as.ordered(as.numeric(YearFromFire))]
  plot_stat <- plot_stat[, Area_All     := as.numeric(Area_All)]
  plot_stat <- plot_stat[, Area_Forest  := as.numeric(Area_Forest)]
  plot_stat <- plot_stat[, Area_CLC     := as.numeric(Area_CLC)]
  plot_stat <- plot_stat[, ENV_ZONE     := as.factor(ENV_ZONE)]
  plot_stat <- plot_stat[, CLC_Class    := as.factor(CLC_Class)]
  plot_stat <- plot_stat[, N_PIX        := as.numeric(N_PIX)]
  plot_stat <- plot_stat[, Time_Signif  := as.factor(Time_Signif)]
  plot_stat <- plot_stat[, Year         := as.ordered(as.numeric(Year))]
  plot_stat <- plot_stat[, YearDiff     := as.ordered(as.numeric(YearDiff))]
  plot_stat <- plot_stat[, low_ci       := as.numeric(low_ci)]
  plot_stat <- plot_stat[, mean         := as.numeric(mean)]
  plot_stat <- plot_stat[, hi_ci        := as.numeric(hi_ci)]
  plot_stat <- plot_stat[, std_dev      := as.numeric(std_dev)]
  plot_stat <- plot_stat[, low_box      := as.numeric(low_box)]
  plot_stat <- plot_stat[, X25th        := as.numeric(X25th)]
  plot_stat <- plot_stat[, median       := as.numeric(median)]
  plot_stat <- plot_stat[, X75th        := as.numeric(X75th)]
  plot_stat <- plot_stat[, up_box       := as.numeric(up_box)]
  
  # Add attributes to the output Data Frame (Useful to keep track of processing !)
  
  attr(plot_stat, "RData_File") = In_File
  attr(plot_stat, "SVI_File") =SVI_File
  attr(plot_stat, "Shape_File") = Shape_File	
  attr(plot_stat, "CSV_File") = CSV_File
  attr(plot_stat, "CLC_File") = CLC_File
  attr(plot_stat, "Index") = Index
  attr(plot_stat, "Min_Pix") = min_pix   
  attr(plot_stat, "Min_Percentage") = perc_diff   
  attr(plot_stat, "Processing_Date") = Sys.Date()   
  attr(plot_stat, "Eroded") = erode  
  
  if (median_analysis == T) {
    # Add attributes to the output Data Frame (Useful to keep track of processing !)
    attr(p_matrix_median, "RData_File") = In_File
    attr(p_matrix_median, "SVI_File") =SVI_File
    attr(p_matrix_median, "Shape_File") = Shape_File	
    attr(p_matrix_median, "CSV_File") = CSV_File
    attr(p_matrix_median, "CLC_File") = CLC_File
    attr(p_matrix_median, "Index") = Index
    attr(p_matrix_median, "Min_Pix") = min_pix   
    attr(p_matrix_median, "Min_Percentage") =perc_diff
    attr(p_matrix_median, "Processing_Date") = Sys.Date()  
    attr(p_matrix_median, "Eroded") = erode  
  }
  
  #save the output in RData format	
  save(plot_stat, p_matrix_median, file = Out_File_R)
  gc(verbose = F)				#Garbage collection
  
  return('DONE')
  
}
lbusett/frgtool documentation built on May 20, 2019, 11:27 p.m.