MLWIC2: Machine Learning for Wildlife Image Classification

Documented in make_input

#' Create an input file to run \code{classify} or \code{train} in \code{MLWIC}
#'
#' \code{make_input} will make a csv with the specifications necessary to either classify 
#' images or to train a new model. See `details` below for using the different options
#' 
#' @param input_file The absolute path to your input csv. It must contain a column called "filename"
#'  and unless you are using the built in model, a column called "class" (which would be your species or group of species).
#'  Required for every case except `find_file_names = TRUE` (option 4).
#' @param option A value of 1-5 describing how you want to supply data and how you want the function to 
#'  create an input file. See `details` below for what each option means. Setting an option value overrides 
#'  inputs for `find_file_names`, `usingBuiltIn`, `images_classified`, `find_class_IDs`, and `trainTest`.
#'  If `NULL` (the default), or any value other than 1-5, those arguments are used as supplied.
#' @param output_dir The absolute path where you would like to store your new csv. It can be anywhere on your computer,
#'  but you'll want to be able to find it in the next step, so you might want to store it in your MLWIC2_helper_files folder. 
#' @param find_file_names logical. If TRUE, this function will find all image files within a 
#'  specified directory. You must specify the directory (`path_prefix`) for this to work.
#'  If you already have a spreadsheet (eg. a `.csv`) with the names of files and their classifications,
#'  this is not the option for you. 
#' @param path_prefix Path to where your images are stored. You need to specify this if 
#'  you want MLWIC2 to `find_file_names` (or if you are using option 4). 
#' @param image_file_suffixes The suffix for your image files. Only specify this if you are 
#'  using the `find_file_names` option. The default is .jpg files. This is case-sensitive.
#' @param recursive logical. Only necessary if you are using the `find_file_names` option. 
#'  If TRUE, the function will find all relevant image files in all subdirectories from the 
#'  path you specify. If FALSE, it will only find images in the folder that you provide as your 
#'  `path_prefix`.
#' @param usingBuiltIn logical. If TRUE, you are setting up a data file to classify images using
#'  the built in model. 
#' @param model_type If usingBuiltIn=TRUE, you can specify `species_model` or `empty_animal` so that 
#'  your class_ID's will match those of the model
#' @param images_classified logical. If TRUE, you have classifications to go along with these images
#'  (and you want to test how the model performs on these images).
#' @param find_class_IDs logical. If TRUE, and you have images_classified, MLWIC2 will try to match up
#'  your text classifications with the values from the trained model. If FALSE and you have images classified,
#'  you need to have a column in your input file called `class_ID`. 
#' @param trainTest logical. Do you want to create separate csvs for training and testing
#' @param file_prefix What you want to appear as the filename before the suffix. If you are
#'  only creating a file to test the model, you could specify "test_" and your output file name
#'  would be "test_image_labels.csv". If you specify `trainTest = TRUE`, your suffixes will automatically be
#'  "_train.csv" and "_test.csv". The prefix is not applied when `find_file_names = TRUE`.
#' @param shiny logical. If TRUE, output files are written to a subdirectory of `output_dir`
#'  called `MLWIC2_inputFile_dir` (created if it does not exist) instead of `output_dir` itself.
#' @param propTrain proportion of images you want for training. `1-propTrain` is the proportion
#'  that will be used for testing the model. Must be between 0 and 1.
#'  
#' @details
#' \itemize{
#'  \item Use \code{option=1} if you have labels for your images and you want to test the model on your images, you need to have an `input_file` csv that has at least two columns and one of these must be 'filename' and the other must be 'class_ID'. The 'class_ID' column must contain the number associated with each class.
#'  \item \code{option=2}: This is the same as Option 1, except instead of having a number for each class, you have a column called `class` containing your classifications as words (e.g., 'dog' or 'cattle', 'empty'), the function will find the appropriate `class_ID` associated with these words.
#'  \item Use \code{option=3} if you do not have your images classified, but you have all of the filenames for the images you want to classify, you can have an `input_file` csv with a column called 'filename' and whatever other columns you would like.
#'  \item \code{option=4}: MLWIC2 will find the filenames of all of your images and create your input file. For this option, you need to specify your `path_prefix` which is the parent directory of your images.
#'  \item \code{option=5}: If you are planning to train a model, you will want training and testing sets of images. This function will set up these files also. Your `input_file` must contain the columns 'filename' and 'class'.
#' }
#' @return When `images_classified = TRUE` and `usingBuiltIn = FALSE` (e.g. option 5), a data frame 
#'  with columns `class_ID` and `group_name` mapping each of your classes to the numeric ID written 
#'  to the output csv(s). Otherwise `NULL`, invisibly. The function is called mainly for its side
#'  effect of writing csv file(s).
#' @export

make_input <- function(
  input_file = NULL,
  output_dir = getwd(), 
  option = NULL, 
  find_file_names = FALSE,
  path_prefix = getwd(),
  image_file_suffixes = c(".jpg", ".JPG"),
  recursive = TRUE,
  usingBuiltIn = TRUE, 
  model_type = "species_model",
  images_classified = FALSE,
  find_class_IDs = FALSE,
  trainTest = FALSE, 
  file_prefix = "",
  shiny=FALSE, 
  propTrain = 0.9
){
  
  # incorporate options: a recognised option (1-5) overrides the individual
  # flags. NULL or an unrecognised value leaves the flags as supplied.
  option_flags <- NULL
  if(!is.null(option) && length(option) == 1L && !is.na(option)){
    option_flags <- switch(
      as.character(option),
      # 1: images labeled, class_ID supplied by the user
      "1" = c(images_classified = TRUE, find_file_names = FALSE,
              usingBuiltIn = TRUE, find_class_IDs = FALSE, trainTest = FALSE),
      # 2: images labeled with words; look up the class_IDs
      "2" = c(images_classified = TRUE, find_file_names = FALSE,
              usingBuiltIn = TRUE, find_class_IDs = TRUE, trainTest = FALSE),
      # 3: images not labeled; file names come from input_file
      "3" = c(images_classified = FALSE, find_file_names = FALSE,
              usingBuiltIn = TRUE, find_class_IDs = FALSE, trainTest = FALSE),
      # 4: images not labeled; file names are found under path_prefix
      "4" = c(images_classified = FALSE, find_file_names = TRUE,
              usingBuiltIn = TRUE, find_class_IDs = FALSE, trainTest = FALSE),
      # 5: training a new model. usingBuiltIn must be FALSE here, because
      # trainTest together with usingBuiltIn is rejected below.
      "5" = c(images_classified = TRUE, find_file_names = FALSE,
              usingBuiltIn = FALSE, find_class_IDs = FALSE, trainTest = TRUE)
    )
  }
  if(!is.null(option_flags)){
    images_classified <- option_flags[["images_classified"]]
    find_file_names <- option_flags[["find_file_names"]]
    usingBuiltIn <- option_flags[["usingBuiltIn"]]
    find_class_IDs <- option_flags[["find_class_IDs"]]
    trainTest <- option_flags[["trainTest"]]
  }
  
  # make sure there is not overlapping logic
  if(usingBuiltIn && trainTest){
    stop("You have specified trainTest == TRUE and usingBuiltIn == TRUE. \n
         This does not make sense because you do not want to make separate train and \n
         test files if you are using the built in model. trainTest is only used if \n
         you are building a model. ")
  }
  if(trainTest && !images_classified){
    stop("You have specified trainTest == TRUE and images_classified == FALSE. \n
         This does not make sense because you cannot train a model if you do not \n
         have classified images.")
  }
  if(find_file_names && images_classified){
    stop("You have specified find_file_names==TRUE and images_classified==TRUE. \n
         When MLWIC2 executes the find_file_names option it cannot accept image \n
         classifications associated with each image. If you want to supply \n
         image classifications, you need to supply an input_file. ")
  }
  if(find_file_names && is.null(path_prefix)){
    stop("You have specified find_file_names==TRUE and but you have not specified the \n
         directory where your image files are located on your computer.")
  }

  # set directory to output location; on.exit guarantees the user is returned
  # to their previous wd on every exit path (errors and early returns too)
  wd1 <- setwd(output_dir)
  on.exit(setwd(wd1), add = TRUE)
  
  # if using shiny, files go in a dedicated subdirectory of output_dir
  shinyDir <- "MLWIC2_inputFile_dir"
  if(shiny && !dir.exists(shinyDir)){
    dir.create(shinyDir)
  }
  
  # Write a two-column data frame as a headerless csv and return the path to
  # report to the user. The file is written relative to the current wd (which
  # is output_dir at this point). The connection is opened in "wb" mode as in
  # the original code; presumably this is to get identical line endings on
  # every platform.
  write_labels <- function(df, file_name){
    rel_path <- if(shiny) file.path(shinyDir, file_name) else file_name
    con <- file(rel_path, "wb")
    on.exit(close(con), add = TRUE)
    utils::write.table(df, file = con, row.names = FALSE, col.names = FALSE,
                       quote = FALSE, sep = ",")
    if(shiny){
      paste0(output_dir, "/", shinyDir, "/", file_name)
    } else {
      paste0(output_dir, "/", file_name)
    }
  }
  
  # make input file using only the path
  if(find_file_names){ #** Option 4
    # make a pattern argument for list.files because it cannot take a vector.
    # NOTE(review): the suffixes are used as an unanchored regular expression,
    # so "." matches any character and the match can occur anywhere in the name.
    pattern <- paste0(image_file_suffixes, collapse="|")
    
    # find file names in directory
    file_names <- list.files(path = path_prefix,  
                             pattern = pattern,
                             full.names = FALSE, recursive = recursive)
    
    # class_ID is a placeholder 0 because these images are not classified.
    # file_prefix is intentionally not applied here (unchanged behavior).
    labels <- data.frame(file_names, rep(0, length(file_names)))
    out_path <- write_labels(labels, "image_labels.csv")
    cat(paste0("Your file is located at '", out_path, "'."))
    return(invisible(NULL))
  }
  
  # every remaining case reads the user's spreadsheet
  if(is.null(input_file)){
    stop("You must supply an `input_file` unless you are using find_file_names (option 4).")
  }
  inFile <- utils::read.csv(input_file)
  cnames <- colnames(inFile)
  labels_name <- paste0(file_prefix, "image_labels.csv")
  
  if(usingBuiltIn){
    if(images_classified && find_class_IDs){ #** Option 2
      if(!all(c("class", "filename") %in% cnames)){
        stop("You have specified that you want MLWIC2 to find_class_IDs. In order to do this,\n
                 your inFile must contain a column called `class` and a column called `filename`")
      }
      
      # `speciesID` is not defined in this function; presumably it is a lookup
      # table shipped with the package. The code relies on column 1 holding the
      # class ID and columns 2:19 holding names that map to that ID.
      speciesID <- speciesID
      contains <- (data.frame(lapply(speciesID[, 2:19], as.character), stringsAsFactors = FALSE))
      contains <- sapply(contains, FUN=tolower)
      
      # get the classID of a given class name (case-insensitive exact match);
      # the first matching row wins and NA is returned when nothing matches
      findClassID <- function(x){
        rowOfClass <- which(contains == tolower(x), arr.ind=TRUE)[1]
        speciesID[rowOfClass, 1]
      }
      inFile$class_ID <- sapply(as.character(inFile$class), findClassID, USE.NAMES = FALSE)
      
      if(model_type == "empty_animal"){
        # if we're using the empty animal model, change to either 0 or 1.
        # 27 is presumably the ID of the empty class in speciesID.
        inFile$class_ID <- ifelse(inFile$class_ID == 27, 0, 1)
      } # if species_model, leave as is. 
      
      # remove rows from input file where there is no matching classID
      matched <- inFile[!is.na(inFile$class_ID), ]
      labels <- data.frame(matched$filename, matched$class_ID)
      
    } else if(images_classified){ #** Option 1: user is supplying class_ID
      if(!all(c("class_ID", "filename") %in% cnames)){
        stop("You have specified that you want MLWIC2 to make an input file using your class_IDs and \n
                 filenames. Your input file must contain a column called `class_ID` and a column called `filename`")
      }
      # here we are just essentially reading and writing the file
      labels <- data.frame(inFile$filename, inFile$class_ID)
      
    } else { #** Option 3: images not classified, but using builtin
      if(!("filename" %in% cnames)){
        stop("Your input_file does not contain a column called 'filename'")
      } 
      labels <- data.frame(inFile$filename, rep(0, nrow(inFile)))
    }
    
    out_path <- write_labels(labels, labels_name)
    print(paste0("Your file is located at ", out_path, "."))
    return(invisible(NULL))
  }
  
  # ---- not using builtin ----
  if(images_classified){
    cnames_shouldBe <- c("class", "filename")
  } else{
    cnames_shouldBe <- c("filename")
  }
  if(!all(cnames_shouldBe %in% cnames)){
    stop("The column names in your input_file must include 'class' and 'filename'. \n
           The 'class' column contains the names of the species in each image. ")
  }
  
  tblLu <- NULL
  if(images_classified){
    # create a lookup table: classes are numbered in order of first appearance
    group_name <- unique(inFile$class) 
    class_ID <- seq_along(group_name)
    tblLu <- data.frame(class_ID, group_name)
    
    # match() keeps the rows in input order and, unlike merge(), still works
    # when the input file already has a column called class_ID
    df2 <- data.frame(inFile$filename, class_ID[match(inFile$class, group_name)])
  } else{
    df2 <- data.frame(inFile$filename, rep(0, nrow(inFile)))
  }
  
  # write out data frame
  if(!trainTest){ 
    out_path <- write_labels(df2, labels_name)
    print(paste0("Your file is located at ", out_path, "."))
  } else { #** Option 5
    if(!is.numeric(propTrain) || length(propTrain) != 1L || is.na(propTrain) ||
       propTrain < 0 || propTrain > 1){
      stop("propTrain must be a single number between 0 and 1.")
    }
    
    # set up training and testing datasets
    n_rows <- nrow(df2)
    ntrain <- floor(n_rows * propTrain)
    train_rows <- sample.int(n_rows, ntrain, replace=FALSE)
    # a logical mask is used for the test set because negative indexing with
    # an empty `train_rows` would select no rows instead of all rows
    is_train <- seq_len(n_rows) %in% train_rows
    df.train <- df2[train_rows, , drop = FALSE]
    df.test <- df2[!is_train, , drop = FALSE]
    
    # write it out
    train_path <- write_labels(df.train, paste0(file_prefix, "_train.csv"))
    write_labels(df.test, paste0(file_prefix, "_test.csv"))
    
    # print information
    print(paste0("Your files are located in ", dirname(train_path), 
                 " with the file names: ", file_prefix, "_train.csv and ", 
                 file_prefix, "_test.csv."))
  }
  
  if(images_classified){
    # return the lookup table
    return(tblLu)
  }
  invisible(NULL)
}

 # make_input(output_dir = "/Users/mikeytabak/MLWIC_examples/",
 #   path_prefix="/Users/mikeytabak/MLWIC_examples/images/", 
 #            option=4,
 #   image_file_suffixes = c(".JPG", ".jpg"
 #                           ))
# make_input(input_file ="/Users/mikeytabak/MLWIC_examples/image_labels_headers.csv", 
#                       option=1)