R/DfReadDataFile.R
In RJafroc: Artificial Intelligence Systems and Observer Performance

#' Read a data file 
#' 
#' @description Read a disk file and create a ROC, FROC or LROC dataset object 
#'    from it.
#' 
#' @param fileName A string specifying the name of the file. 
#'    The file-extension must match the format specified below.
#'    
#' @param format A string specifying the format of the data file. 
#'    It can be \code{"JAFROC"}, the default, which requires a \code{.xlsx} Excel file
#'    (\bold{not \code{.xls}}), \code{"MRMC"} or \code{"iMRMC"}. 
#'    For \code{"MRMC"} the format is determined by the data file extension 
#'    (\code{.csv} or \code{.txt} or \code{.lrc}) 
#'    as specified in \url{https://perception.lab.uiowa.edu/}. For \code{"iMRMC"} the 
#'    file extension is \code{.imrmc} and the format is described in 
#'    \url{https://code.google.com/archive/p/imrmc/}. \bold{See following note for
#'    important information about deprecation of the \code{"MRMC"} format}.
#'    
#'    
#' @param newExcelFileFormat Logical. Must be true to read LROC data. 
#'    This argument only applies to the \code{"JAFROC"} format. 
#'    The default is \code{FALSE}. If \code{TRUE} the function accommodates 3 
#'    additional columns
#'    in the \code{Truth} worksheet. If \code{FALSE}, the original function (as in version 
#'    1.2.0) is used and the three extra columns, if present, throws an error.  
#'    
#' @param lrocForcedMark Logical: For LROC dataset only: is a forced mark required 
#'    on every image? The default is \code{NA}. If a mark is not required, set 
#'    it to \code{FALSE} otherwise to \code{TRUE}.  
#'    
#' @param delimiter The string delimiter to be used for the \code{"MRMC"} 
#'    format ("," is the default), see \url{https://perception.lab.uiowa.edu/}.
#'    This parameter is not used when reading \code{"JAFROC"} 
#'    or \code{"iMRMC"} data files.
#'    
#' @param sequentialNames A logical variable: if \code{TRUE}, consecutive integers 
#'    (starting from 1) will be used as the 
#'    treatment and reader IDs (i.e., names). Otherwise, treatment 
#'    and reader IDs in the original data file will be used.
#' 
#' @note The \code{"MRMC"} format is deprecated. For non-JAFROC formats four file 
#'    extensions (\code{.csv}, \code{.txt}, \code{.lrc} and \code{.imrmc}) are possible, 
#'    all of which are restricted to ROC data. Only the \code{iMRMC} format is actively 
#'    supported, i.e, files with extension \code{.imrmc}. Other formats (\code{.csv}, 
#'    \code{.txt}, \code{.lrc}) are deprecated. Such files can still be read by this 
#'    function and then saved to a JAFROC format file for further analysis within this 
#'    package. \bold{For non-JAFROC data file formats, the \code{readerID} and 
#'    \code{modalityID} fields  must be unique integers}.
#' 
#' @return A dataset with the structure specified in \code{\link{RJafroc-package}}.
#' 
#' @examples
#' fileName <- system.file("extdata", "toyFiles/ROC/rocCr.xlsx", 
#' package = "RJafroc", mustWork = TRUE)
#' rdrArr1D <- DfReadDataFile(fileName, newExcelFileFormat = TRUE)
#'
#' 
#' \donttest{
#' fileName <- system.file("extdata", "Roc.xlsx", 
#' package = "RJafroc", mustWork = TRUE)
#' RocDataXlsx <- DfReadDataFile(fileName)
#' 
#' fileName <- system.file("extdata", "RocData.csv", 
#' package = "RJafroc", mustWork = TRUE)
#' RocDataCsv<- DfReadDataFile(fileName, format = "MRMC")
#' 
#' fileName <- system.file("extdata", "RocData.imrmc", 
#' package = "RJafroc", mustWork = TRUE)
#' RocDataImrmc<- DfReadDataFile(fileName, format = "iMRMC")
#' 
#' fileName <- system.file("extdata", "Froc.xlsx", 
#' package = "RJafroc", mustWork = TRUE)
#' FrocDataXlsx <- DfReadDataFile(fileName, sequentialNames = TRUE)
#' }
#' 
#' @importFrom tools file_ext
#' @importFrom stringr str_trim str_length
#' @export

DfReadDataFile <- function (fileName, format = "JAFROC", 
                            newExcelFileFormat = FALSE, 
                            lrocForcedMark = NA,
                            delimiter = ",", 
                            sequentialNames = FALSE) 
{
  
  if (format == "JAFROC") {
    # handle JAFROC format Excel files
    if (!(file_ext(fileName) == "xlsx")) 
      stop("The extension of JAFROC data file must be .xlsx, NOT .xls.\n")
    if (!newExcelFileFormat) { 
      if (!is.na(lrocForcedMark)) stop("Attempt to read possibly LROC dataset with newExcelFileFormat flag set to FALSE") 
      return((ReadJAFROCOldFormat(fileName, sequentialNames)))
    } else {
      return(ReadJAFROCNewFormat(fileName, lrocForcedMark, sequentialNames))
    }
  } else {
    # handle non-JAFROC format text files
    if (format == "iMRMC") {
      return(ReadImrmc(fileName, sequentialNames))
    } else if (format == "MRMC") {
      if (file_ext(fileName) == "lrc") {
        return(ReadLrc(fileName, sequentialNames))
      } else {
        return(ReadOrDbmMrmc(fileName, delimiter, sequentialNames))
      }
    } else {
      errMsg <- sprintf("%s is not an available file format.", format)
      stop(errMsg)
    }
  }
} 



preCheck4BadEntries <- function(truthTable) {
  
  # START not sure what this does
  # for (i in 1:3){
  #   truthTable[grep("^\\s*$", truthTable[ , i]), i] <- NA
  # }
  # 
  # naRows <- colSums(is.na(truthTable))
  # if (max(naRows) > 0) {
  #   if (max(naRows) == min(naRows)) {
  #     truthTable <- truthTable[1:(nrow(truthTable) - max(naRows)), ]
  #   }
  # }
  # END not sure what this does
  
  # check for blank cells in Truth worksheet
  errorMsg <- ""
  for (i in 1:5) {
    if (any(is.na(truthTable[, i]))) {
      # each blank Excel cell is returned as NA
      # blank lines in Excel sheet are ignored i.e. skipped, as if they were not there
      naLines <- which(is.na(truthTable[, i])) + 1
      errorMsg <- paste0(errorMsg, "\nThere are empty cells at line(s) ", 
                         paste(naLines, collapse = ", "), " in the TRUTH table.")
    }
  }
  if (errorMsg != "") stop(errorMsg)
  
  for (i in 1:3)
    if (any(is.na(suppressWarnings(as.numeric(as.character(truthTable[, i])))))) {
      suppressWarnings({naLines <- which(is.na(as.numeric(as.character(truthTable[, i])))) + 1})
      if (i == 1) errorMsg <- paste0(errorMsg, 
                                     "\nThere are non-integer values(s) for caseID at line(s) ", paste(naLines, collapse = ", "), " in the TRUTH table.")
      if (i == 2) errorMsg <- paste0(errorMsg, 
                                     "\nThere are non-integer values(s) for LessionID at line(s) ", paste(naLines, collapse = ", "), " in the TRUTH table.")
      if (i == 3) errorMsg <- paste0(errorMsg, 
                                     "\nThere are non-numeric values(s) for Weights at line(s) ", paste(naLines, collapse = ", "), " in the TRUTH table.")
    }
  if (errorMsg != "") stop(errorMsg)
  
  if (any(!is.wholenumber(as.numeric(truthTable[[1]])))) stop("Non-integer values in Truth worksheet column 1")
  if (any(!is.wholenumber(as.numeric(truthTable[[2]])))) stop("Non-integer values in Truth worksheet column 2")
  if (any(!is.double(as.numeric(truthTable[[3]])))) stop("Non-floating point values in Truth worksheet column 3")
  
  # code to check for sequential lesionIDs in Truth sheet: 0,0,1,2,0,1,2,3,0,1 etc
  # normal case lesionIDS are all 0
  # for each abnormal case, the lesionID starts from 1 and works up, sequentially, to number of lesions on the case
  # a case can start abruptly wiht lesionID = 0 or 1, but not with lesionID = 2 or more
  # if it starts with lesionID = 2 or more, the previous one must be one less, i.e., sequential
  t <- as.numeric(truthTable$LesionID) # at this stage the cells in truthTable could be characters, 
  # which would break the following code; hence we convert to numerics; the lesionID field is convertible
  # to integers, even if entered as characters; if not there is an error in the data file
  for (k in 1:length(t)) {
    if (t[k] %in% c(0,1)) next else {
      if (t[k] != (t[k-1] + 1)) {
        errorMsg <- paste0(errorMsg, "\nNon-sequential lesionID encountered at line(s) ",
                           paste(k + 1, collapse = ", "), " in the TRUTH table.")
      }
    }
  }
  if (errorMsg != "") stop(errorMsg)
  
}



# SPLIT-PLOT-A: Reader nested within test; Hillis 2014 Table VII part (a)
# SPLIT-PLOT-C: Case nested within reader; Hillis 2014 Table VII part (c)
checkTruthTable <- function (truthTable, lrocForcedMark) 
{
  
  preCheck4BadEntries (truthTable)
  
  type <- (toupper(truthTable[,6][which(!is.na(truthTable[,6]))]))[1]
  design <- (toupper(truthTable[,6][which(!is.na(truthTable[,6]))]))[2]
  if (design == "CROSSED") design <- "FCTRL"
  if (!(type %in% c("FROC", "ROC", "LROC"))) stop("Unsupported data type: must be ROC, FROC or LROC.\n")
  if (!(design %in% c("FCTRL", "SPLIT-PLOT-A", "SPLIT-PLOT-C"))) stop("Study design must be FCTRL, SPLIT-PLOT-A or SPLIT-PLOT-C\n")
  
  if (type == "LROC") {
    if (is.na(lrocForcedMark)) stop("For LROC dataset one must set the lrocForcedMark flag to a logical") 
  } else {
    if (!is.na(lrocForcedMark)) stop("For non-LROC dataset one cannot set the lrocForcedMark flag to a logical") 
  }
  
  df <- truthTable[1:5]
  df["caseLevelTruth"] <- (truthTable$LesionID > 0)
  ########################################################
  # sort the TRUTH sheet of the Excel worksheet on the lesionID field
  # this puts normal cases first, regardless of how they are entered
  ########################################################
  truthTableSort <- df[order(df$caseLevelTruth),]
  
  caseIDCol <- as.integer(truthTable$CaseID)
  # TBA need a note on use of indx, why it is not used for readerID, etc.
  lesionIDCol <- as.integer(truthTable$LesionID)
  weightsCol <- as.numeric(truthTable$Weight)
  # readerIDCol <- truthTable$ReaderID # temp for testing non-character input
  # modalityIDCol <- truthTable$ModalityID# do:
  # bug non-character input error for HUGE dataset
  readerIDCol <- as.character(truthTable$ReaderID) # bug fix to avoid non-character input error below
  modalityIDCol <- as.character(truthTable$ModalityID) # do:
  # 
  L <- length(truthTable$CaseID) # length in the Excel sheet
  for (i in 1:5) if ((length(truthTable[[i]])) != L) stop("Cols of unequal length in Truth Excel worksheet")  
  
  normalCases <- sort(unique(caseIDCol[lesionIDCol == 0]))
  abnormalCases <- sort(unique(caseIDCol[lesionIDCol > 0]))
  K1 <- length(normalCases)
  K2 <- length(abnormalCases)
  K <- (K1 + K2)
  
  if (design == "SPLIT-PLOT-A") {
    # for this design the length is twice what it needs to be
    caseIDCol <- as.integer(truthTable$CaseID)[1:(L/2)]
    # lesionIDCol <- as.integer(truthTable$LesionID)[1:(L/2)]
    weightsCol <- truthTable$Weight[1:(L/2)]
    # preserve the strings; DO NOT convert to integers
    J <-  0 # find max number of readers, given that his data has 3 readers in one group and 4 in the other group
    for (el in 1:length(readerIDCol)) {
      if (length(strsplit(readerIDCol[el], split = ",")[[1]]) > J) J <- length(strsplit(readerIDCol[el], split = ",")[[1]])
    }
    rdrArr <- array(dim = c(L,J))
    for (l in 1:L) {
      val <- strsplit(readerIDCol[l], split = ",|\\s")[[1]]
      val <- val[val != ""]
      for (i in 1:length(val)) {
        rdrArr[l,i] <- val[i]
      }
    }
    # preserve the strings; DO NOT convert to integers
    I <- length(strsplit(modalityIDCol[1], split = ",")[[1]])
    trtArr <- array(dim = c(L,I))
    for (l in 1:L) {
      if (grep("^\\(.\\)", modalityIDCol[l]) == 1) { # match found to something like (1), i.e., one nested factor
        val <- grep("^\\(.\\)", modalityIDCol[l], value = T)
        val <- strsplit(val, split = "\\(|\\)")[[1]]
        val <- val[val != ""]
        for (i in 1:length(val)) {
          trtArr[l] <- val[i]
        }
      } else stop("Was expecting nested notation, using () brackets ...")
    }
  } else if (design == "SPLIT-PLOT-C") {
    # preserve the strings; DO NOT convert to integers
    J <- length(strsplit(readerIDCol[1], split = ",")[[1]])
    rdrArr <- array(dim = c(L,J))
    for (l in 1:L) {
      if (grep("^\\(.\\)", readerIDCol[l]) == 1) { # match found to something like (1), i.e., one nested factor
        val <- grep("^\\(.\\)", readerIDCol[l], value = T)
        val <- strsplit(val, split = "\\(|\\)")[[1]]
        val <- val[val != ""]
        for (i in 1:length(val)) {
          rdrArr[l] <- val[i]
        }
      } else stop("Was expecting nested notation, using () brackets ...")
      # preserve the strings; DO NOT convert to integers
      I <- length(strsplit(modalityIDCol[1], split = ",")[[1]])
      trtArr <- array(dim = c(L,I))
      for (l in 1:L) {
        val <- strsplit(modalityIDCol[l], split = ",|\\s")[[1]]
        val <- val[val != ""]
        for (i in 1:length(val)) {
          trtArr[l,i] <- val[i]
        }
      }
    }
  } else if (design == "FCTRL") {
    # preserve the strings; DO NOT convert to integers
    J <- length(strsplit(readerIDCol[1], split = ",")[[1]]) # bug non-character input error for HUGE dataset
    rdrArr <- array(dim = c(L,J))
    for (l in 1:L) {
      val <- strsplit(readerIDCol[l], split = ",|\\s")[[1]]
      val <- val[val != ""]
      for (i in 1:length(val)) {
        rdrArr[l,i] <- val[i]
      }
      # preserve the strings; DO NOT convert to integers
      I <- length(strsplit(modalityIDCol[1], split = ",")[[1]])
      trtArr <- array(dim = c(L,I))
      for (l in 1:L) {
        val <- strsplit(modalityIDCol[l], split = ",|\\s")[[1]]
        val <- val[val != ""]
        for (i in 1:length(val)) {
          trtArr[l,i] <- val[i]
        }
      }
    }
  } else stop("incorrect design value")
  
  if (design == "SPLIT-PLOT-A") {
    rdrArr1D <- t(unique(rdrArr)) # rdrArr is 2-dimensional; rdrArr1D is a one-dimensional array of all the readers in the study
    rdrArr1D <- rdrArr1D[!is.na(rdrArr1D)] # this modification is needed for HYK dataset with 3 readers in one group and 4 in the other
  } else {
    if (any(is.na(rdrArr))) stop("Illegal value in ReaderID column in Truth sheet")
    rdrArr1D <- as.vector(unique(rdrArr)) # rdrArr is 2-dimensional; rdrArr1D is a one-dimensional array of all the readers in the study
  }
  if (any(is.na(trtArr))) stop("Illegal value in ModalityID column in Truth sheet")
  trtArr1D <- as.vector(unique(trtArr))
  
  I <- length(trtArr1D)
  J <- length(rdrArr1D)
  
  truthTableStr <- array(dim = c(I, J, K, max(lesionIDCol)+1)) 
  for (l in 1:L) {
    k <- which(unique(truthTableSort$CaseID) == truthTable$CaseID[l])
    el <- lesionIDCol[l] + 1
    if (design == "SPLIT-PLOT-A") {
      i <- which(unique(trtArr) == trtArr[l])
      for (j1 in 1:length(rdrArr[l,])) {
        j <- which(rdrArr1D == rdrArr[l,j1])
        truthTableStr[i, j, k, el] <- 1
      }
    }
    else if (design == "SPLIT-PLOT-C") {
      i <- which(unique(trtArr) == trtArr[l,])
      j <- which(rdrArr1D == rdrArr[l])
      truthTableStr[i, j, k, el] <- 1
    } else if (design == "FCTRL") {
      i <- which(unique(trtArr) == trtArr[l,])
      j <- which(rdrArr1D == rdrArr[l,])
      truthTableStr[i, j, k, el] <- 1
    } else stop("incorrect study design")
  }
  
  perCase <- as.vector(table(caseIDCol[caseIDCol %in% abnormalCases]))
  weights <- array(dim = c(K2, max(perCase)))
  IDs <- array(dim = c(K2, max(perCase)))
  
  UNINITIALIZED <- RJafrocEnv$UNINITIALIZED
  for (k2 in 1:K2) {
    k <- which(caseIDCol == abnormalCases[k2])
    IDs[k2, ] <- c(sort(lesionIDCol[k]), 
                   rep(UNINITIALIZED, max(perCase) - length(k)))
    if (all(weightsCol[k] == 0)) {
      weights[k2, 1:length(k)] <- 1/perCase[k2]
    } else {
      weights[k2, ] <- as.numeric(c(weightsCol[k][order(lesionIDCol[k])], 
                                    rep(UNINITIALIZED, max(perCase) - length(k))))
      sumWeight <- sum(weights[k2, weights[k2, ] != UNINITIALIZED])
      if (sumWeight != 1){
        if (sumWeight <= 1.01 && sumWeight >= 0.99){
          weights[k2, ] <- weights[k2, ] / sumWeight
        }else{
          errorMsg <- paste0("The sum of the weights of Case ", k2, " is not 1.")
          stop(errorMsg)
        }
      }
    }
  }
  
  return (list(
    rdrArr1D = rdrArr1D,
    trtArr1D = trtArr1D,
    truthTableSort = truthTableSort,
    truthTableStr = truthTableStr,
    type = type,
    design = design,
    caseID = caseIDCol,
    perCase = perCase,
    lesionIDCol = lesionIDCol,
    IDs = IDs,
    weights = weights,
    normalCases = normalCases,
    abnormalCases = abnormalCases
  ))
  
}