R/readInXlsxRda.R

Defines functions readInXlsxRda

Documented in readInXlsxRda

#'readInXlsxRda
#' @param file.filePath a character string indicating a the WHOLE PATH leading to a xlsx file. The file should contain the corresponding sheetsfor data and variables.
#' @param dataEnd.lastRow integer indicating the end row until where the data should be read. If not given reads till the end of the excel file. Default value is \code{NULL}

#' @param loadPar string values "data" or "data-var". If invoked with "data", the data is loaded into the global environment and if invoked with "data-var" loads both the Data and Variables in the global environment. Default value is \code{data}
#' @param data string indicating the name of the Data sheet in the excel file containing the values of the variables described in \code{variables}. Default value is \code{data} (See below)
#' @param variables string indicating the name of the variable Description sheet in the excel file. Default value is \code{variables}. description, and range of values, of data columns
#' @param varClass.dfColClass string indicating the name of the column in the sheet variables providing the classes of the variables . Default value is \code{varClass}."varClass"
#' @param varValues.dfColValues string indicating the name of the column in the sheet variables providing the values and IMPORTANTLY factor levels of the variables. Default value is \code{varValues}."varValues"
#' @param skip.deprecated boolean (TRUE  or FALSE)  indicating to skip the depricated values or not.Default value is \code{TRUE}.
#' @param idCol.dfIdCol string indicating the name of the column in the "data" sheet containing numeric values from 1 to n, where n= number of famers/plots.No default value available.
#' @param restParRow numeric. A number between 0 and 1 to indicate the percentage of NA values allowed in a Row in the Date sheed in the excel file. Default value is \code{0.975}.
#' @param restParCol numeric. A number between 0 and 1 to indicate the percentage of NA values allowed in a Column in the Date sheed in the excel file. Default value is \code{0.975}.
#' @param gcTabs boolean (TRUE or FALSE) for R and java garbage collection. Useful when large excel files are read in. The default value is  \code{F}.
#' @param outputSubdir character string indicating the sub-directory inside the \code{directory} where all produced files are saved.
#' @return As per the value given for the argument \code{loadPar} "data" or "variables" or both will be loaded in the global environment. If the data column names mismatch with the variable name list, an excel file indicating the location and name of the column names that mismatch. If there are any blank or NA values in the data, an excel file indicating the location of the Na values.
#' @description ReadInXlsxRda is used to read and perform different checks in an xlsx file. It uses the native function loc.col from the package. The first check is performed on the Data columns on the data sheet and the Variable names in the variable sheet for any mismatch and aborts the run if there is any mismatch. An excel report is saved in the file's directory or outputSubdir sub-directory. The excel file consists of 2 sheets "Data-sheet" and "Var-sheet". The "Data-sheet" consists of the DataName and DataLoc with the name of the column and location respectively that doesnt match with the corresponding variable sheet. The "Var-sheet" consists of VarNames and VarLoc with the name of the variable and location respectively that doesnt match with the corresponding data sheet. Loads the "data" or "variable" or both in the global environment based on the value given for the parameter \code{loadPar}.Outputs an excel file in the file path (or outputSubdir)  with the row number, column number and the column name of the NA or wrong values.
#' @examples
#' readInXlsxRda(file,9,"data-var","data-1","variables-1",2,TRUE,"PATH")
#' readInXlsxRda(file)
#' readInXlsxRda(file,10)
#' readInXlsxRda(file,data="data-sheet",variables="variable-sheet")
#' @export
readInXlsxRda<-function(file,dataEnd=NULL,loadPar="data",data="data",variables="variables",
                        varClass="varClass",varValues="varValues",skip.deprecated=TRUE,
                        idCol,restParRow=0.975,restParCol=0.975,gcTabs=T,outputSubdir=NULL)
{


  # code with parameter for garbage collection if excel files or tables are large
  if(gcTabs==T)
  {
    jgc<-function()
    {
      gc()
      .jcall("java/lang/System", method = "gc")
    }
  }

  #local environment directory
  baseDir<-getwd()
  directory<-sub("(.+/).+\\..+", "\\1",file)
  
  #checking for directories
  
  #output directory
  if(!is.null(outputSubdir))
  {
    if(!dir.exists(file.path(directory, outputSubdir)))
    {
      dir.create(file.path(directory,outputSubdir))
    }
     outputDir<-paste(directory,"/",outputSubdir,sep="")
  }else
  {
    outputDir<-directory
  }

  filename <- sub('.*/', '', file)

  fname <- sub("([^.]+)\\.xlsx", "\\1", filename)
  fnamevar <- paste(fname,"var",sep = "")

  
  #Read fist row (HEADER) of Data and first col of var and compare!
  datInitial <- read.xlsx(file.path(directory,filename), sheetName=data,
                          endRow = 2)
  datVarNames <-colnames(datInitial)

  varInitial<-read.xlsx(file.path(directory,filename), sheetName=variables,
                       colIndex = 1)
  varNames <- as.vector(varInitial[,1])

  if(gcTabs==T){jgc()}


  #Vars Check
  if(!identical(datVarNames,varNames))
  {

    missVar<-missValVec(varNames,datVarNames)

    missDat<-missValVec(datVarNames,varNames)

    DataLoc <- loc.col(datInitial,missDat)
    if(length(DataLoc)==0){DataLoc<-data.frame(DataName="NA",DataLoc="NA")}
    colnames(DataLoc)<-c("DataName","DataLoc")

    VarLoc <- which(!is.na(match(varNames,missVar)))

    if(length(VarLoc)==0)
    {
      varsDF<-data.frame(VarNames="NA",VarLoc="NA")
    }
    else
    varsDF <- data.frame(VarNames = varNames[VarLoc],VarLoc)

    varCheckTxt<-paste(outputDir,"/",fname,"_var_check.xlsx",sep="")
    write.xlsx(DataLoc, file = varCheckTxt,row.names=FALSE,sheetName = "Data-Sheet")
    write.xlsx(varsDF, file = varCheckTxt,append=TRUE, row.names=FALSE,sheetName = "Var-Sheet")
    stop("Variable names mismatch, please check the log-file in ",outputDir,"/",varCheckTxt,"\n ATTENTION delete or rename  the log-file for the next run!")

    if(gcTabs==T){jgc()}
  }

  var <- read.xlsx(file.path(directory,filename), sheetName=variables,
                    colClasses="character", stringsAsFactors=FALSE)

   if(gcTabs==T){jgc()}

  classes <- var[,varClass]
  classes[classes == "factor"] <- "character"
  classes[classes == "deprecated"] <- "character"

  Data <- read.xlsx(file.path(directory,filename), sheetName=data,colIndex=1:length(var[,1]), 
                    colClasses=classes, stringsAsFactors=FALSE, endRow = dataEnd)

  if(gcTabs==T){jgc()}

  #formatting factors
  classes <- grep("factor", var[,varClass]) #CHANGE I decided to make a parameter out of it
  for(i in classes) {
    Data[,i] <- factor(Data[,i], levels=strsplit(var[i,varValues],",")[[1]])
  }

  # eliminating deprecated variables
  if(skip.deprecated) {
    classes <- grep("deprecated", var[,varClass])
    if(length(classes) > 0) {
      var <- var[-classes,]
      Data <- Data[,-classes]
    }
  }

  #Removing columns and rows that have a % equal or higher than  restParRow or restParCol of NA values

  Data <- Data[,colSums(is.na(Data)) < nrow(Data)*restParCol] 
  Data <- Data[rowSums(is.na(Data)) < ncol(Data)*restParRow,]
  
  if(missing(idCol)==F)
  {
    if(length(Data[,1])!=Data[length(Data[,1]),idCol])
      {
      message("ATTENTION! some farmers/plots were eliminated due to high proportion of missing data")
    }
  }
  
  var <- var[rowSums(is.na(var)) < ncol(var),]
  var <- var[,colSums(is.na(var)) < nrow(var)]
  if (length(names(Data))!=length(var[,1]))
  {
    message("Following variables are deprecated due to a high count of NA values:\n",
            "--->",paste(var[which(is.na(match(var[,1],names(Data)))),1],collapse=" ; "))
    var <- var[-(which(is.na(match(var[,1],names(Data))))),]
  }

  #Creating an RDA with Data and var using filename
  #Loading data/variables in the gobal environment
  
  ########################################################################################################
  
  rdaFile<-paste(outputDir,"/",fname,".rda",sep="")
  
  assign(fname, Data)
  assign(fnamevar,var)

  if(loadPar=="data"){
    message("Data loaded in the current session environment")
    assign(fname, Data,envir=.GlobalEnv)
    save(list=ls(pattern=fname),file=rdaFile)
    }
  else if(loadPar=="data-var"){
    message("Data and Variables loaded in the current session environment")
    assign(fnamevar, var,envir=.GlobalEnv )
    assign(fname, Data,envir=.GlobalEnv)
    save(list=ls(pattern=fname),file=rdaFile)
  }
  else{
    message("Data and Variables were NOT loaded in the current session environment\n They were saved in your given file directory")
    save(list=ls(pattern=fname),file=rdaFile)
  }

  #Creating errorReport of the data and
  #Output the row and column of NA values
  NAlocation <- as.data.frame(which(is.na(Data),arr.ind = TRUE))
  NAlocation[,"colName"]<-names(Data)[NAlocation[,2]]

  if(length(NAlocation[,1])>0){
    reportTxt = paste(outputDir,"/",fname,"_report.xlsx",sep="")
    message("NA report stored in location ",directory,"/",reportTxt,"\n  !ATTENTION delete or rename  the report file for the next run!")
    write.xlsx(NAlocation, file = reportTxt,
               row.names=F,sheetName = "NAlocation")
  }
  setwd(baseDir)
}
lwTools/agriTrf documentation built on March 26, 2020, 12:09 a.m.