mrfable: FABLE project data

Documented in readIndiaAPY

#' @title readIndiaAPY
#' This function reads foodcrops data for India taken from here: https://eands.dacnet.nic.in/APY_96_To_07.htm
#' @author Anastasis Giannousakis
#' @param subtype Area, Yield, or Production
#' @importFrom readxl read_excel excel_sheets
#' @importFrom tidyr gather
#' @importFrom dplyr filter %>%
#' @importFrom magclass as.magpie getRegions<-
#' @importFrom madrat downloadSource
#' @examples
#' \dontrun{ a <- madrat::readSource(type="IndiaAPY",subtype="Rice",convert="onlycorrect") }
#' @return magpie object containing Area, Yield, and Production data.


readIndiaAPY <- function(subtype=NA){

  # helper function to convert data from wide to long format and specify variable and unit
  .gather <- function(x, varunit, season=FALSE){
    fct<- 1
    if(season) fct<-2
    d <- gather(x, "year", "value", colnames(x)[-(1:fct)], factor_key = TRUE) # convert to long format
#    d <- d[-c(grep("State", d[,1]), which(is.na(d[, 1]))), ] # remove irrelevant rows
    varname <- strsplit(varunit, " \\(")[[1]][[1]] # split variable-unit
    unit <- gsub(" |\\)|\\'", "", strsplit(varunit, " \\(")[[1]][[2]])
    unit <- sub("000", "k", unit)
    out <- cbind("variable"=varname, "unit"=unit, "season"=NA, d)
    if (season) out <- cbind("variable"=varname, "unit"=unit, d)
    return(out)
  }

  # helper function to prepare data from excel files for conversion to data.frame
  .fixdata <- function(a){
    # are the data of type R (with seasons in the 2nd column) or type W (without)?
    if (grepl("Summer|Kharif|Rabi|Total",a[,2]))  {
      type1 <- "R"
    } else {
      type1 <- "W"
    }

    if (type1 == "W") {
      if (any(grepl("Kharif|Winter|Autumn|Rabi|Summer",names(a)))) {
        season<-tolower(strsplit(
          strsplit(
            grep("Kharif|Winter|Autumn|Rabi|Summer",names(a),value = TRUE),
            split="\\(")[[1]][2],
          split="\\)")[[1]][1])
      } else {
        season <- NA
      }
      if (!grepl(".*.[0-9]-[0-9].*.",a[3,3])) a <- a[-1,]
      colnames(a)<-sub("-.*","",a[3, ]) # use row with years as column names
      a <- a[-which(is.na(a[, 1])),]
      colnames(a)[1] <- "state"
      # remove remaining rows that do not contain data
      if (grepl("1",a[,1])) a <- a[-which(a[, 1]==1),]
      if (length(grep("State",a[,1]))==2) a <- a[-grep("State",a[,1])[[2]],]
    } else {
      if (!grepl(".*.[0-9]-[0-9].*.",a[2,3])) a <- a[-1,]
      if (!grepl(".*.[0-9]-[0-9].*.",a[2,3])) a <- a[-1,]
      colnames(a)<-sub("-.*","",a[2,]) # use row with years as column names
      a<-a[-intersect(which(is.na(a[,2])),which(is.na(a[,1]))),]
      colnames(a)[1:2]<-c("state", "season")
      for (i in 1:length(t(a[, 1]))){
        if(is.na(a[[i, 1]])) a[[i, 1]]<-a[[i-1, 1]]
      }
      if (grepl("1",a[,1])) a <- a[-which(a[, 1]==1),] # remove remaining rows that do not contain data
    }
    ind <- length(which(grepl("[0-9]", colnames(a)))) / 3 # find length of each data table
    if (type1 == "R") {
      a <- as.data.frame(a)
      # remove remaining rows that do not contain data
      if (length(grep("State",a[,1]))==2) a <- a[-grep("State",a[,1])[[2]],]
      if (length(grep("Estimates",a[,1]))>0) a <- a[-grep("Estimates",a[,1]),]
      a[,"season"] <- tolower(a[,"season"])
      # remove rows with no data that begin with a season name (generated by readxl in some files)
      if (any(grepl("Kharif|Winter|Autumn|Rabi|Summer",a[,1]))) a <- a[-which(is.na(a[, 2])), ]
      out<-rbind(
        .gather(cbind(a[-1,1:2], a[-1,(3+0*ind):(2+1*ind)]), "Area ( 000 Hektares)", TRUE),
        .gather(cbind(a[-1,1:2], a[-1,(3+1*ind):(2+2*ind)]), "Production ( 000 Tonnes)", TRUE)
      )
    } else {
        out <- rbind(
          .gather(cbind(a[-1,1], a[-1,(2+0*ind):(1+1*ind)]), "Area ( 000 Hektares)"),
          .gather(cbind(a[-1,1], a[-1,(2+1*ind):(1+2*ind)]), "Production ( 000 Tonnes)")
        )
      out[,"season"] <- season
    }
    return(out)

  }


  crop <- NULL
  if (file.exists("crops.txt")) {
    crops <- readLines("crops.txt")
  } else {
    stop("please download the data with madrat::downloadSource first")
  }
  excelfiles <- dir()[grep("xls", dir())] # read-in only excel files
  out <- NULL

  for (i in crops) {
    for (j in grep(i, excelfiles, value = TRUE)) {
      suppressMessages(a <- read_excel(j))
      tmp <- cbind("crop" = i, .fixdata(a))
      out <- rbind(out, tmp)
    }
  }

  dtfile <- "allfood1996-2013.xls"
  if (!file.exists(dtfile)) downloadSource("IndiaAPY" , overwrite = TRUE)
  for (i in crops) {
    suppressMessages(a <- read_excel(dtfile, sheet = grep(i, excel_sheets(dtfile), value = TRUE, ignore.case = TRUE)[1]))
    a[,(grep("State",a)[[2]]-1):length(a[1,])] <- NULL # remove section with 5-year average
    out <- rbind(out, cbind("crop" = i, .fixdata(a)))
  }

  dtfile <- "allfood2014-2018.xls"
  if (!file.exists(dtfile)) downloadSource("IndiaAPY" , overwrite = TRUE)
  for (i in crops) {
    suppressMessages(a <- read_excel(dtfile, sheet = grep(i, excel_sheets(dtfile), value = TRUE, ignore.case = TRUE)[1]))
    out <- rbind(out, cbind("crop" = i, .fixdata(a)))
  }
  out <- as.data.frame(out)


  # Convert data column to numeric
  suppressWarnings(out[["value"]] <- as.numeric(out[["value"]]))

  # If applicable, filter out specific crops
  if (!is.na(subtype)) out <- filter(out,`crop`==subtype)

  out <- as.magpie(out, spatial="state")


  return(out)

}