R/process_image_db.r

#' Batch process all data in the IFPRI database using the
#' automatic ROI detection methodology
#'
#' @param database STATA (.dta) or RDS file as provided by IFPRI
#' @param local_database filename of a previously generated local RDS copy
#' of the database, used to skip entries which were already processed
#' @param path path to the local IFPRI database images; images will be
#' downloaded to the local computer if no local copy exists
#' @param server URL of the server hosting the report images
#' @param thumbnails TRUE / FALSE (render thumbnails visualizing the
#' estimated horizon and region of interest?)
#' @param force force regeneration of all indices
#' @param force_all force regeneration of all data in the database,
#' including estimating the horizon and ROI, as well as indices / features
#' @param ncores number of cores to use for parallel processing
#' @keywords gcc calculation, QA/GC
#' @importFrom foreach foreach %dopar%
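#' @examples
#'
#' \dontrun{
#' # a minimal usage sketch; the database file name and paths
#' # shown here are illustrative
#' process_image_db(database = "~/cropmonitor/cropmonitor.dta",
#'                  path = "~/cropmonitor/",
#'                  thumbnails = TRUE,
#'                  ncores = 4)
#' }
#'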
#' @export

process_image_db = 
  function(database = NULL,
           local_database = "cropmonitor.rds",
           path = "~/cropmonitor/",
           server = "http://cdn.wheatcam.ifpri.org/ReportImages",
           thumbnails = FALSE,
           force = TRUE,
           force_all = FALSE,
           ncores = 6){
  
  # for consistency with list.files() use tilde
  # expansion if applicable
  path = path.expand(path = path)
  
  # check if parameters are available
  if ( is.null(database) ){
    stop("please provide a valid input file with experiment meta-data.")
  }
  
  # check that the directory holding all the data exists
  if (!dir.exists(path)){
    stop('data directory does not exist')
  }
  
  # create image and thumb paths
  img_path = sprintf("%s/images",path)
  thumb_path = sprintf("%s/thumbs",path)
  
  # check that the subdirectory holding the
  # original images exists
  if (!dir.exists(img_path) ){
    stop('image directory does not exist')
  }
  
  # move into the directory
  setwd(path)
  
  # determine the database file format from the file extension
  file_format = tools::file_ext(database)
  
  # read STATA or RDS data
  if (file_format == "dta"){
    df = readstata13::read.dta13(database)
  } else {
    df = readRDS(database)
  }
  
  # split out some variables for clarity
  userid = df$old_uniqueuserid
  reportid = df$reportid
  cropsite = df$old_uniquecropsiteid
  epoch_date = df$pic_timestamp
  
  user_img_path = paste(img_path, userid, sep = "/")
  user_thumb_path = paste(thumb_path, userid, sep = "/")
  site_user_img_path = paste(user_img_path, cropsite, sep = "/")
  site_user_thumb_path = paste(user_thumb_path, cropsite, sep = "/")
  
  # create image location string
  filename = paste(userid,"-",
                   cropsite,"-",
                   epoch_date,".jpg", sep = "")
  
  online_image_location = paste(server,
                                userid,
                                cropsite,
                                filename,
                                sep = "/")
  
  local_image_location = paste(site_user_img_path,
                               filename,
                               sep = "/")
  
  thumbnail_location = paste(site_user_thumb_path,
                             filename,
                             sep = "/")
  
  current_image_location = list.files(img_path,
                                      pattern = "\\.jpg$",
                                      recursive = TRUE,
                                      full.names = TRUE)
  
  # list the indices of entries whose images exist in the local image db
  files_to_process = which(local_image_location %in% current_image_location)
  
  # compare the current local database with the one provided
  # as a parameter and skip previously processed entries
  # (unless forced)
  if (file.exists(local_database) && !force){
    
    # read local database
    previous_database = readRDS(local_database)
    
    # if the headers do not match, recalculate everything;
    # otherwise subset the entries to process
    if (identical(names(df), names(previous_database))){
      
      # create unique identifiers to compare files
      old_file_id = sprintf("%s-%s-%s",
                            previous_database$old_uniqueuserid,
                            previous_database$old_uniquecropsiteid,
                            previous_database$pic_timestamp)
      
      new_file_id = sprintf("%s-%s-%s",
                            df$old_uniqueuserid,
                            df$old_uniquecropsiteid,
                            df$pic_timestamp)
      
      # figure out which lines to process:
      # those in the DTA file but not in the existing local database
      duplicates = which(new_file_id %in% old_file_id)
      files_to_process = setdiff(files_to_process, duplicates)
    }
  }
  
  # now loop over all new images in the database and fill in the
  # missing values. Report progress
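  # (the estimate assumes ~13 s of processing time per image)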
  cat("Processing database entries\n")
  cat(sprintf("will finish in ~ %s days on a %s core machine.\n",
              round((length(files_to_process) * 13) / (3600 * 24 * ncores),2),
              ncores))

  # set up the parallel cluster
  cl = parallel::makeCluster(ncores)
  doSNOW::registerDoSNOW(cl)
  
  # set progress bar
  pb = txtProgressBar(max = length(files_to_process),
                       style = 3)
  progress = function(n) setTxtProgressBar(pb, n)
  opts = list(progress = progress)
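  # doSNOW calls the progress function after each
  # completed iteration via the .options.snow argument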
  
  # loop over all images to process in parallel;
  # each iteration returns one row of index values
  # which rbind combines into a matrix
  crop_index_details = foreach(j = seq_along(files_to_process),
                               .combine = rbind,
                               .options.snow = opts) %dopar% {
    
    # grab location of the file to process
    i = files_to_process[j]
                                 
    # if there is no picture time stamp this is
    # a free format comment, not a picture,
    # so skip it
    if (is.na(df$pic_timestamp[i])){
      return(rep(NA,12))
    }
    
    # load image
    img = raster::brick(local_image_location[i])
    
    # estimate an ROI (based upon the location of the horizon)
    roi = try(estimate_roi(img = img, plot = FALSE), silent = TRUE)
    
    if(inherits(roi, "try-error")){
      return(rep(NA, 12))
    }

    # calculate the gcc / grvi
    greenness_values = calculate_gcc(img = img,
                                     roi = roi)
    
    # calculate the glcm
    glcm_values = calculate_glcm(img = img,
                                 roi = roi)
    
    # calculate the sobel data
    sobel_values = calculate_sobel(img = img,
                                   roi = roi)
    
    # visualize the region of interest and horizon
    # in an image thumbnail for review (requires the
    # thumbnails flag)
    if (thumbnails){
      jpeg(thumbnail_location[i], 374, 280, bg = "black")
      raster::plotRGB(img)
      lines(1:ncol(img),
            roi$horizon,
            lwd = 2,
            col='red')
      lines(roi$roi,
            lwd = 2,
            lty = 2,
            col = 'yellow')
      dev.off()
    }
    
    # the original ROI polygon descriptor
    #roi = paste(as.vector(greenness_values$roi@polygons[[1]]@Polygons[[1]]@coords),collapse = ',')

    # garbage collection cleanup
    raster::removeTmpFiles()

    # return values
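    # (12 values: r / g / b digital numbers, gcc, grvi,
    # six glcm texture metrics and the sobel value)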
    return(c(greenness_values$r_dn,
             greenness_values$g_dn,
             greenness_values$b_dn,
             greenness_values$gcc,
             greenness_values$grvi,
             glcm_values$glcm,
             sobel_values$sobel))
  }
  
  # output matrix
  output = matrix(NA, nrow(df), 12)
  
  output[files_to_process,] = crop_index_details
  colnames(output) = c(
                    "r_dn",
                    "g_dn",
                    "b_dn",
                    "gcc_90",
                    "grvi",
                    "glcm_variance",
                    "glcm_homogeneity",
                    "glcm_contrast",
                    "glcm_dissimilarity",
                    "glcm_entropy",
                    "glcm_second_moment",
                    "sobel")
  
  # combine the first 17 (meta-data) columns of the
  # original database with the new index values
  df = data.frame(df[,1:17], output)
  
  # write new data to file; use rds rather than json, as json
  # is easily corrupted by additions in the ingested dta file
  # (the use of {} returns nested lists instead of data frames)
  saveRDS(df, file = paste0(tools::file_path_sans_ext(basename(database)),
                            ".rds"))
  
  # stop cluster
  parallel::stopCluster(cl)
  
  # close progress bar
  close(pb)
}