R/exiftool.R


exiftool <- function(fname, output="list", attr_filter=NULL, dedup=TRUE, verbose=FALSE) {
  # Run 'exiftool' command line program on a file and return the results
  # Arguments:
  #   fname {char or char vec} -- filepath or vector of filepaths to run exiftool on
  #   output {char} -- format to return data as
  #     options:
  #       list: return as list in format:
  #             list(
  #               fname1 = exif,
  #               fname2 = exif,
  #                ...)
  #             where 'exif' is a named character vector
  #       df: regular dataframe where columns are every possible EXIF attribute, one row per fname
  #             data.frame(fname=..., key1=..., key2=..., ...)
  #       df_melt: melted dataframe with only id variable, key and value columns
  #             data.frame(fname=..., key=..., val=...)
  #       json: json string version of df_melt
  #             [
  #               {
  #                 "fname":"...",
  #                 "key":"...",
  #                 "val":"..."
  #               },
  #               ...
  #             ]
  #       list_json: list of json objects
  #             list(
  #               fname1=[{"key1":"val1","key2":"val2",...}],
  #               fname2=[{"key1":"val1","key2":"val2",...}],
  #               ...
  #             )
  #   attr_filter {char} -- attribute or attributes to filter exiftool output for
  #   dedup {logical} -- if TRUE, duplicate EXIF attribute names are made unique by appending a
  #                      numeric suffix ("_2", "_3", ...)
  #   verbose {logical} -- if TRUE, print verbose output with timestamps to the console
  #
  # Returns:
  #   exiftool data in desired output format: {list}, {dataframe}, {json}
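  #
  # Example (illustrative only; the filepaths below are hypothetical and the 'exiftool'
  # binary must be available on the system PATH):
  #   exif <- exiftool("photos/IMG_0001.jpg")
  #   df   <- exiftool(c("photos/IMG_0001.jpg", "photos/IMG_0002.jpg"),
  #                    output="df_melt",
  #                    attr_filter=c("focal_length", "create_date"))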
  
  # Program functions
  splitAt <- function(x, pos) unname(split(x, cumsum(seq_along(x) %in% pos)))
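  # e.g. splitAt(letters[1:5], c(3, 5)) returns list(c("a", "b"), c("c", "d"), "e")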
  split_fnames_char_limit <- function(fnames, char_limit) {
    # Split a vector of filepaths into batches that fit within a command-line character limit
  
    # Arguments:
    #   fnames {char or char vec} -- filepath or vector of filepaths
    #   char_limit {numeric} -- maximum allowed character length of a command line command
    #
    # Returns:
    #   {list} of character vectors, each a batch of filepaths
    split_fname = c()
    count = 0
    str_lengths = nchar(fnames)
    for (i in seq_along(str_lengths)) {
      val = str_lengths[i] + 3  # Account for two double quotes and a space
      count = count + val
      if (count > char_limit - nchar("exiftool ")) {
        # File i starts a new batch, so its length seeds the new running total
        split_fname = append(split_fname, i)
        count = val
      }
    }
    return(splitAt(fnames, split_fname))
  }
  get_exif_list <- function(fname_batches, attr_filter, dedup, verbose) {
    # Run exiftool on each batch of filepaths and parse the output into EXIF keys and values
    # Arguments:
    #   fname_batches {list} -- batches of filepaths, as returned by split_fnames_char_limit()
    #   attr_filter {char} -- attribute or attributes to filter exiftool output for
    #   dedup {logical} -- if TRUE, de-duplicate EXIF attribute names with numeric suffixes
    #   verbose {logical} -- if TRUE, print verbose output with timestamps to the console
    #
    # Returns:
    #   {list} of named character vectors, one element per filepath

    suppressPackageStartupMessages(require(stringr))
    if (verbose) echo("Running exiftool...", timestamp=TRUE, fn_name="exiftool")
    # out = lapply(fname_batches, function(fname_vec, attr_filter) {
    out = lapply(seq_along(fname_batches), function(i, attr_filter, dedup) {
      fname_vec = fname_batches[[i]]
      fname_string = paste0(sprintf('"%s"', fname_vec), collapse = " ")
      cmd = sprintf('exiftool %s', fname_string)
      res = system(cmd, intern=TRUE)
      split_loc = grep("^========", res)
      res[split_loc] = NA
      res = splitAt(res, split_loc)
      res = lapply(res, function(x) x[!is.na(x)])
      keys = lapply(res, function(x) tolower(gsub(" ", "_", trimws(gsub("^(.*?)(:)(.*)$", "\\1", x)))))
      vals = lapply(res, function(x) trimws(gsub("^(.*?)(:)(.*)$", "\\3", x)))
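      # e.g. the line "Focal Length                    : 4.2 mm" parses to key "focal_length", val "4.2 mm"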
      # Filter result if any attributes specified
      if (!is.null(attr_filter)) {
        vals = lapply(1:length(vals), function(i) vals[[i]][keys[[i]] %in% attr_filter])
        keys = lapply(keys, function(x) x[x %in% attr_filter])
      }
      # Apply names to each char vector
      vals = lapply(1:length(vals), function(i) {
        names(vals[[i]]) = keys[[i]]
        return(vals[[i]])
      })
      # Mark any duplicate keys with a _\d suffix to ensure no duplicate keys
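      # e.g. names c("focal_length", "focal_length") become c("focal_length", "focal_length_2")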
      if (dedup == TRUE) {
        vals = lapply(vals, function(x) {
          suffix = 2
          while (any(duplicated(names(x)))) {
            names(x)[duplicated(names(x))] = paste0(names(x)[duplicated(names(x))], "_", suffix)
            suffix = suffix + 1
          }
          # Note: with more than 2 duplicates the while loop above stacks suffixes, e.g.
          # c("focal_length", "focal_length", "focal_length") becomes
          # c("focal_length", "focal_length_2", "focal_length_2_3")
          # names(x) = gsub("\\_(\\d)\\_", "_", names(x))
          # The gsub above would collapse those stacked suffixes, but it is commented out because
          # it also changes a vector like c("face_1_position", "face_2_position") to
          # c("face_position", "face_position")
          return(x)
        })
      }

      if (verbose) {
        echo("Batch %s of %s complete (size: %s files)",
          str_pad(i, width=nchar(length(fname_batches)), side="left", pad="0"),
          length(fname_batches),
          length(fname_vec),
          indent=1, timestamp=TRUE, fn_name="exiftool")
      }
      return(vals)
    }, attr_filter, dedup)
    # Flatten the per-batch results into one list and name each element by its filepath
    out = do.call(c, out)
    names(out) = unlist(fname_batches)
    return(out)
  }
  format_output <- function(out, output) {
    # Format results as desired output format
    # Arguments:
    #   out {list} -- named list of EXIF vectors to format, as returned by get_exif_list()
    #   output {char} -- desired output format, one of "list", "df", "df_melt", "json", "list_json"
    #
    # Returns:
    #   out in output format

    if (output == "list") {
      return(out)

    } else if (output == "df") {
      seq_max = seq_len(max(sapply(out, length)))
      mat = t(sapply(out, "[", i=seq_max))
      mat = as.data.frame(cbind(rownames(mat), mat))
      mat = data.frame(lapply(mat, as.character), stringsAsFactors=FALSE)
      rownames(mat) = NULL
      colnames(mat)[1] = "fname"
      return(mat)

    } else if (output == "list_json") {
      require(jsonlite)
      list_json = lapply(out, function(vec) {
        df_json = data.frame(key=names(vec), val=unname(vec), stringsAsFactors=FALSE)
        df_json = as.data.frame(t(df_json))
        df_json = data.frame(lapply(df_json, as.character), stringsAsFactors=FALSE)
        colnames(df_json) = as.character(df_json[1,])
        rownames(df_json) = NULL
        df_json = df_json[2,]
        return(toJSON(df_json))
      })
      return(list_json)

    } else if (output %in% c("df_melt", "json")) {
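      # Stack one (fname, key, val) row per EXIF attribute across all files (long/melted format)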
      df = do.call(rbind, lapply(names(out), function(name) {
        exif = out[name][[1]]
        data.frame(
          fname = name,
          key = names(exif),
          val = unname(exif),
          stringsAsFactors=FALSE)
      }))

      if (output == "df_melt") {
        return(df)

      } else if (output == "json") {
        require(jsonlite)
        return(toJSON(df))
      }

    }
  }

  # Check parameters
  stopifnot(is.character(fname) && length(fname) > 0)
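  # Validate the requested output format against the options documented above
  stopifnot(output %in% c("list", "df", "df_melt", "json", "list_json"))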
  
  # Run exiftool on each file, format as specified in function parameters
  if (verbose) echo("Number of files detected           : %s", length(fname), timestamp=TRUE, fn_name="exiftool")
  est_time = fmt_seconds(0.06079523*length(fname), round_digits=0)
  if (verbose) echo("Expected program time              : %s %s", est_time$value, est_time$units, timestamp=TRUE, fn_name="exiftool")
  char_limit = as.numeric(system("getconf ARG_MAX", intern=TRUE)) - 25000
  if (verbose) echo("Using command-line character limit : %s", char_limit, timestamp=TRUE, fn_name="exiftool")
  fname_batches = split_fnames_char_limit(fname, char_limit)
  if (verbose) echo("Number of batches to run           : %s", length(fname_batches), timestamp=TRUE, fn_name="exiftool")
  out = get_exif_list(fname_batches, attr_filter, dedup, verbose)
  out = format_output(out, output)

  # If only run on a single file, and output is a list, return only named chr
  if (inherits(out, "list") && length(out) == 1) {
    return(out[[1]])
  } else {
    return(out)
  }
}