In lbusett/phenoriceR: Routines for analysis of PhenoRice outputs

knitr::opts_chunk$set(echo = TRUE)
library(sprawl)
library(ggplot2)
library(ggjoy)
library(tibble)
library(raster)
library(phenoriceR)
library(dplyr)
library(mapview)
library(RColorBrewer)
library(knitr)
library(sf)

mapviewOptions(basemaps = c("Esri.WorldImagery", "OpenStreetMap.DE"),
               raster.palette = colorRampPalette(brewer.pal(9, "RdYlGn")),
               vector.palette = colorRampPalette(brewer.pal(9, "YlGnBu")),
               na.color = "transparent",
               layers.control.pos = "topright")

Here I am going to document the passages used to reshuffle the RiceAtlas dataset, and correct inconsistencies, needed to allow an easier comparison with PhenoRice results.

Preprocessing - changes on data organization

The original "RiceAtlas" attribute table is organized in "wide" format, with one row for "subregion", and many columns containing the data for the different seasons (e.g., PLANT_ST_1, PLANT_ST_2, ....).

main_folder <- "/home/lb/my_data/prasia/Data/"

in_riceatlas_shp = read_vect(file.path(main_folder,
                                       "vector/Ricetlas/riceatlas_asia.shp"))
# print(head(in_riceatlas_shp, 2), width = 800)

This makes working with the dataset difficult. First thing we (Bhogendra) did, was transforming the dataset in a more "manageable" long format:

in_data <- data.table::fread(file.path(main_folder,
                                       "/vector/NormalizedDB.txt"))
head(in_data)

Now, we have multiple rows per subregion, each corresponding to a possible "Season" and to a Phenological "indicator" (e.g., Planting or Harvesting). We can do some additional reshuffling for ease of use:

in_atlas <- in_data %>%
  # split the "Pheno_stage" column
  tidyr::separate(Pheno_stage, c("Stage", "season"), "_st") %>%
  # remove empty rows
  dplyr::filter((Start != 0 | Peak != 0 | End != 0)) %>%
  # create a unique name for each region and a unique identifier ysung ISO + OBJID
  dplyr::mutate(ID_name = paste(ISO, Region, Sub_region, sep = "_")) %>%
  dplyr::mutate(ID_name = stringr::str_replace(ID_name, "NA", ""))%>%
  dplyr::mutate(ID = paste(ISO, Objectid, sep = "_")) %>%
  #remove whitespaces from "Region"
  dplyr::mutate(Region = gsub(" ", "_", Region) ) %>%
  dplyr::mutate(ID_name = gsub(" ", "_", ID_name) ) %>%
  # Rename and reorder columns
  dplyr::select(ID, ID_name, N_seasons, season,
                Stage, Start, Peak, End, Total_area, Seasonal_area,
                Seasonal_production, Season, Country, Region, Sub_region,
                Objectid, ISO) %>%
  dplyr::rename(cty = ISO, N_seas = N_seasons, Seas_name = Season, Season = season,
                Seas_area = Seasonal_area , Seas_prod = Seasonal_production,
                Tot_area = Total_area, OBJECTID = Objectid) %>%
  dplyr::mutate(Stage = factor(Stage,
                               levels = c("plant", "harv"),
                               labels = c("Planting","Harvesting"))) %>% 
  tibble::as_tibble()

head(in_atlas)

Checks on "consistency of the dataset"

A quick check on the "consistency" of the dataset can be made by computing and plotting the differences between the reported "Start", "Peak" and "End" DOYs, for each region.

in_atlas <- in_atlas %>% 
  dplyr::mutate(endstart_lgt  = (End - Start), 
                peakstart_lgt = (Peak - Start),
                endpeak_lgt   = (End - Peak)) %>% 
  dplyr::select(1:9, endstart_lgt, peakstart_lgt, endpeak_lgt, everything())
print(head(in_atlas), width = 700)

in_atlas <- in_atlas %>% 
  dplyr::mutate(endstart_lgt  = (End - Start), 
                peakstart_lgt = (Peak - Start),
                endpeak_lgt   = (End - Peak)) %>% 
  dplyr::select(1:9, endstart_lgt, peakstart_lgt, endpeak_lgt, everything())

print(ggplot(in_atlas) + 
        geom_histogram(aes(x = endstart_lgt), binwidth = 7)  + 
        facet_wrap(~Stage) + theme_bw() +
        ggtitle("Differences between End and Start in riceatlas"))

print(ggplot(in_atlas) + 
        geom_histogram(aes(x = peakstart_lgt), binwidth = 7) + 
        facet_wrap(~Stage) + theme_bw() + 
        ggtitle("Differences between Peak and Start in riceatlas"))

print(ggplot(in_atlas) + 
        geom_histogram(aes(x = endpeak_lgt), binwidth = 7) + 
        facet_wrap(~Stage) + theme_bw() + 
        ggtitle("Differences between End and Peak in riceatlas"))

From this, it is obvious that the dataset is usually coherent, but also that it has some problems. In several cases Start/Peak/End values do not account for "circularity".

Therefore we can have cases such as:

Start = 354,Peak = 3, End = 18 --> End <=Start and Peak <= Start --> leading to negative outliers
Start = 4, Peak = 292, End = 34 --> End <=Start and Peak <= Start --> leading to positive outliers

, plus other less common problems (e.g., Start = 32 ,Peak = 32, End = 0, Start = 148, Peak = 148, End = 155 Start = 213 ,Peak = 227, End = 213, Start = 277, Peak = 19, End = 307), as well as obvious mistakes (e.g., Start = 74, Peak = 74, End = 196, Start = 1, Peak = 4, End = 66)

We also have a very "anomalous" high frequency at Peak - Start = 30, but we can not do anything about that.

All this makes an automatic comparison with PhenoRice results difficult.

Initial situation - PHL

For example, see what we have as Start - Peak - End ranges for some provinces in PHL:

A mix of correct data, some missing data (e.g, Tarlac Harvesting Start), and inconsistent data (e.g., Nueva Ecjia Planting; Bataan Harvesting). Most of the problems are obviously related to "circularization" of DOYS in Season 2, which crosses the year, and for which Start/Peak/End are evidently not coherent. In addition, we have the already mentioned problems of "switched" seasons, in some cases (though in the end this is the lesser problem)

in_phl <- in_atlas %>% dplyr::filter(cty == "PHL") 
in_phl <- in_phl   %>% dplyr::filter(Region %in% unique(in_phl$Region)[2:9])

print(ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
        geom_point(aes(y = Peak, color = Stage), position = position_dodge(.3)) + 
        geom_errorbar(aes(ymax = End, ymin = Start, color = Stage), 
                      position = position_dodge(width = .3)) +
        facet_wrap(~Season) + coord_flip()  +
        ylab("DOY") + xlab("RiceAtlas polygon") + 
        ggtitle("Start/Peak/End ranges vs Season - Original"))

print(ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
        geom_point(aes(y = Peak, color = factor(Season)), position = position_dodge(.3)) + 
        geom_errorbar(aes(ymax = End, ymin = Start, color = factor(Season)), 
                      position = position_dodge(width = .3)) +
        facet_wrap(~Stage) + coord_flip()  +
        ylab("DOY") + xlab("RiceAtlas polygon") +
        ggtitle("Start/Peak/End ranges vs Season - Original"))

Reshufflling of RiceAtlas data to solve the problem

To solve the problem, we need to filter out errors and reshuffle the DOYS in a consistent way.

out_atlas <- in_atlas %>% 
  # add 365 to End/Peak where differences are negative
  pr_mutate_when((endstart_lgt < 0 & endstart_lgt < 0), 
                 list(End = End + 365)) %>%
  # re-create the diffs for test purposes
  dplyr::mutate(endstart_lgt = (End - Start),
                peakstart_lgt = (Peak - Start), 
                endpeak_lgt   = (End - Peak)) %>% 
  # Second fix where we still have prblems
  pr_mutate_when((endpeak_lgt > 250 & peakstart_lgt < -250 & !is.na(Start)), 
                 list(Start = Start - 365)) %>% 
  pr_mutate_when((endpeak_lgt > 250 & peakstart_lgt < -250 & !is.na(End)), 
                 list(End   = End - 365)) %>% 
  # re-create the diffs for test purposes
  dplyr::mutate(endstart_lgt = (End - Start),
                peakstart_lgt = (Peak - Start), 
                endpeak_lgt   = (End - Peak)) %>% 
  # Third fix where we still have prblems
  pr_mutate_when(endpeak_lgt > 300, 
                 list(End   = End - 365)) %>% 
  # re-create the diffs for test purposes
  dplyr::mutate(endstart_lgt = (End - Start),
                peakstart_lgt = (Peak - Start), 
                endpeak_lgt   = (End - Peak)) %>%
  # Remove Start/End when they are equal to Peak
  pr_mutate_when((Start == Peak), list(Start = NA)) %>%
  pr_mutate_when((End == Peak),   list(End   = NA)) 

# Now manually fixErros and corner cases
out_atlas <- out_atlas %>% 
  #manually fix corner cases
  #"IND Puducherry Yanam", which has very strange data on the original table
  #( Harvesting:  Start  = 4  Peak = 292  End = 34;  Planting: Start  = 277   Peak = 19   End = 307).
  #The obnly reasonable way IMO is to remove Start and set End to 365 + 34 (Planting)
  #and remove Start and End (Harvesting)
  pr_mutate_when((ID_name == "IND Puducherry Yanam" & Stage == "Harvesting" & Season == 2),
                 list(Start = NA, End = 365 + End)) %>%
  pr_mutate_when((ID_name == "IND Puducherry Yanam" & Stage == "Planting" & Season == 2),
                 list(Start = NA, End = NA)) %>% 
  #"MMR Tanintharyi"
  pr_mutate_when((ID_name == "MMR Tanintharyi " & Stage == "Planting" & Season == 2),
                 list(Start = NA, End   = NA)) %>% 
  #"severla places with "213   227   213"" --> Remove End
  pr_mutate_when((Start == 213 & Peak == 227 & End == 213), list(End = NA)) %>% 
  #IND Orissa Nuapada
  pr_mutate_when((ID_name == "IND Orissa Nuapada" & Stage == "Planting" & Season == 3),
                 list(End = NA)) %>% 
  #IND Uttaranchal Naini Tal
  pr_mutate_when((ID_name == "IND Uttaranchal Naini Tal" & Stage == "Planting" & Season == 1),
                 list(End = NA)) %>% 
  #IND Orissa Malkangiri: Peak == 0. Resete to midpoint S/E
  pr_mutate_when((ID_name == "IND Orissa Malkangiri" & Stage == "Planting" & Season == 2),
                 list(Peak = 258)) %>% 
  #several places with "319   305   334" --> Remove Start
  pr_mutate_when((peakstart_lgt == -14), list(Start = NA)) %>% 
  #several places with "30   -5   35" --> Remove Start
  pr_mutate_when((peakstart_lgt == -5),                         list(Start   = NA)) %>% 
  # re-create the diffs for test purposes
  dplyr::mutate(endstart_lgt = (End - Start),
                peakstart_lgt = (Peak - Start), 
                endpeak_lgt   = (End - Peak))

Let's see again the histograms:

print(ggplot(out_atlas) + 
        geom_histogram(aes(x = endstart_lgt), binwidth = 7)  + 
        facet_wrap(~Stage) + theme_bw() +
        ggtitle("Differences between End and Start in riceatlas"))

print(ggplot(out_atlas) + 
        geom_histogram(aes(x = peakstart_lgt), binwidth = 7) + 
        facet_wrap(~Stage) + theme_bw() + 
        ggtitle("Differences between Peak and Start in riceatlas"))

print(ggplot(out_atlas) + 
        geom_histogram(aes(x = endpeak_lgt), binwidth = 7) + 
        facet_wrap(~Stage) + theme_bw() + 
        ggtitle("Differences between End and Peak in riceatlas"))

Ok. Now we are in much better shape (note that I never touched the "Peak" date, which we take as "reference"). End - Start is always positive, as well as End - Peak. Still some polygons with "anomalously" large ranges, but we can live with those.

Let's see the PHL region again:

# in_phl <- out_atlas %>% 
#   dplyr::filter(cty == "PHL" & Region == "Region 3 - Central Luzon")
in_phl <- out_atlas %>% dplyr::filter(cty == "PHL") 
in_phl <- in_phl   %>% dplyr::filter(Region %in% unique(in_phl$Region)[2:9])

ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
  geom_point(aes(y = Peak, color = Stage), position = position_dodge(.3)) + 
  geom_errorbar(aes(ymax = End, ymin = Start, color = Stage), 
                position = position_dodge(width = .3)) +
  facet_wrap(~Season) + coord_flip()  +
  ylab("DOY") + xlab("RiceAtlas polygon")

Much better but still not there. Now I'll have to reconcile the planting and the harvesting. If Harvesting after reshuffling is still before planting, subtract 365:

reorganize <- function(sub_atlas) {
  for (seas in unique(sub_atlas$Season)) {
    data_in <- sub_atlas %>% 
      dplyr::filter(Season == seas)
    if (data_in$Peak[1] > data_in$Peak[2]) {
      sub_atlas[sub_atlas$Season == seas,]$Start[1] <- data_in$Start[1] - 365
      sub_atlas[sub_atlas$Season == seas,]$Peak[1]  <- data_in$Peak[1] - 365
      sub_atlas[sub_atlas$Season == seas,]$End[1]   <- data_in$End[1] - 365
    }
  }
  sub_atlas
}

final_out <- list()
regions <- unique(out_atlas$ID_name)
for (regind in seq_along(regions)) {
  indata <- out_atlas %>% 
    dplyr::filter(ID_name == regions[regind])
  regions[regind]
  final_out[[regind]] <- reorganize(indata)
}
final_out <- data.table::rbindlist(final_out)

Let's see the result:

in_phl <- final_out %>% dplyr::filter(cty == "PHL") 
in_phl <- in_phl   %>% dplyr::filter(Region %in% unique(in_phl$Region)[2:9])

ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
  geom_point(aes(y = Peak, color = Stage), position = position_dodge(.3)) + 
  geom_errorbar(aes(ymax = End, ymin = Start, color = Stage), 
                position = position_dodge(width = .3)) +
  facet_wrap(~Season) + coord_flip()  +
  ylab("DOY") + xlab("RiceAtlas polygon")

Almost ALL SET ! Now reorder the seasons so that 1 - 2 - 3 is always in order (not really needed, but while we are at it....)

final_out2 <- final_out %>% 
  group_by(ID_name, Stage) %>% 
  dplyr::mutate(Seas_ord = rank(Peak, ties.method = "first"))

Final situation - PHL

Here the final result:

in_phl <- final_out2 %>% dplyr::filter(cty == "PHL") 
in_phl <- in_phl   %>% dplyr::filter(Region %in% unique(in_phl$Region)[2:9])

print(ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
        geom_point(aes(y = Peak, color = Stage), position = position_dodge(.3)) + 
        geom_errorbar(aes(ymax = End, ymin = Start, color = Stage), 
                      position = position_dodge(width = .3)) +
        facet_wrap(~Seas_ord) + coord_flip()  +
        ylab("DOY") + xlab("RiceAtlas polygon") + 
        ggtitle("Start/Peak/End ranges vs Season - Final"))

print(ggplot(data = in_phl, aes(x = Sub_region)) + theme_bw() +
        geom_point(aes(y = Peak, color = factor(Seas_ord)), position = position_dodge(.3)) + 
        geom_errorbar(aes(ymax = End, ymin = Start, color = factor(Seas_ord)), 
                      position = position_dodge(width = .3)) +
        facet_wrap(~Stage) + coord_flip()  +
        ylab("DOY") + xlab("RiceAtlas polygon") + 
        ggtitle("Start/Peak/End ranges vs Season - Final"))

DONE! Now we have a coherent dataset. We can save the reshuflled dataset, after re-joining the geometry.

in_shp <- in_riceatlas_shp %>% 
  select(OBJECTID, ISO) %>% 
  rename(cty = ISO) %>% 
  arrange(cty, OBJECTID)

out_shp <- in_shp %>% 
  left_join(final_out2)

write_shape(out_shp, file.path(main_folder, 
                               "/vector/Ricetlas/riceatlas_asia_reshuffled.shp"), 
                               overwrite = TRUE)
# out_phl <- out_shp %>% 
#   dplyr::filter(cty == "PHL", Seas_ord == 2, Stage == "Planting")
# 
# plot_vect(out_phl, fill_var = "Peak", palette_name = "RdYlGn")