# R/extract-MIME-level-and-filenames.R

# Defines functions extract_MIME_level_and_filenames

#' Extract specific attachments Mime levels and the filenames of a message
#'
#' Scans a BODYSTRUCTURE response for \code{("attachment" ("filename" "..."))}
#' and \code{("inline" ("filename" "..."))} dispositions (matched
#' case-insensitively) and reports, for each one, the cleaned filename, the
#' index of the top-level MIME part that contains it, and the disposition.
#' @param meta A single \code{character} string containing the BODYSTRUCTURE
#'   of a message previously fetched via \code{fetch_metadata}.
#' @param use_uid Default is \code{FALSE}. In this case, results will be
#'   presented as message's sequence numbers. A message sequence number is a
#'   message's relative position to the oldest message in the mailbox. It may
#'   change after deleting or moving messages. If a message is deleted,
#'   sequence numbers are reordered to fill the gap. If \code{TRUE}, the
#'   command will be performed using the \code{"UID"} or unique identifier,
#'   and results are presented as such. UIDs are always the same during the
#'   life cycle of a message.
#' @return \code{NULL} when no attachment/inline filename is found (or when
#'   \code{meta} is empty or \code{NA}); otherwise a \code{data.frame} with one
#'   row per file and the columns \code{filenames} (character),
#'   \code{MIME_level} (numeric) and \code{content_disposition}
#'   (\code{"attachment"} or \code{"inline"}).
#' @noRd
extract_MIME_level_and_filenames <- function(meta, use_uid) {

  # Nothing to parse: behave like "no attachments" instead of crashing.
  if (length(meta) == 0L) {
    return(NULL)
  }
  if (length(meta) > 1L) {
    stop("'meta' must be a single character string.", call. = FALSE)
  }
  if (is.na(meta)) {
    return(NULL)
  }

  # Strip the server-specific FETCH/BODYSTRUCTURE prefix together with the
  # outermost opening parenthesis. Several variants are needed because
  # servers differ (v0.9.1: Office 365 changed behavior; Gmail sends
  # "UID n BODYSTRUCTURE (").
  prefix_patterns <- if (isTRUE(use_uid)) {
    c("FETCH \\(UID \\d+ BODYSTRUCTURE \\(",
      "\\* \\d+ FETCH \\(BODYSTRUCTURE \\(",
      "^BODYSTRUCTURE \\(",
      "^UID \\d+ BODYSTRUCTURE \\(")
  } else {
    c("FETCH \\(BODYSTRUCTURE \\(",
      "^BODYSTRUCTURE \\(")
  }
  for (prefix_pattern in prefix_patterns) {
    meta <- gsub(prefix_pattern, "", meta, ignore.case = TRUE)
  }

  # Drop one trailing closing parenthesis.
  meta <- gsub("\\)$", "", meta)

  # A single case-insensitive pattern drives filenames, dispositions AND MIME
  # levels, so the three vectors always have the same length. Only the
  # (disposition (filename ...)) pair is treated as a file (v0.9.1 fix for
  # multiple attachments inside the same boundary). Servers may return the
  # tokens in lower, upper (Gmail) or mixed case.
  disposition_prefix <- '\\("(attachment|inline)" \\("filename" "'
  found <- gregexpr(paste0(disposition_prefix, '(.*?)"'), meta,
                    ignore.case = TRUE)
  match_starts <- as.integer(found[[1L]])

  if (match_starts[1L] == -1L) {
    return(NULL)
  }

  matched <- regmatches(meta, found)[[1L]]

  content_type <- ifelse(
    grepl('^\\("attachment"', matched, ignore.case = TRUE),
    "attachment",
    "inline"
  )

  # MIME level = number of top-level parenthesised groups already closed
  # before the match, plus one. Parentheses are counted wherever they occur,
  # including inside quoted strings.
  chars <- strsplit(meta, "", fixed = TRUE)[[1L]]
  is_close <- chars == ")"
  depth <- cumsum((chars == "(") - is_close)
  parts_closed <- cumsum(is_close & depth == 0)
  attachments_MIME_level <- parts_closed[match_starts] + 1

  # Keep only the filename text of each match.
  filenames <- sub(paste0("^", disposition_prefix), "", matched,
                   ignore.case = TRUE)
  filenames <- gsub('"', "", filenames, fixed = TRUE)
  filenames <- gsub("\r\n", "", filenames, fixed = TRUE)

  # Decode MIME encoded-words via the package helper.
  filenames <- decode_mime_header(filenames)

  forbiden_chars <- "[\\/:*?\"<>|]"
  filenames <- gsub(forbiden_chars, "", filenames)

  # Standard URL decoding; on any warning or error the name is kept as is.
  filenames <- vapply(filenames, function(name) {
    tryCatch(
      utils::URLdecode(name),
      warning = function(w) name,
      error = function(e) name
    )
  }, character(1), USE.NAMES = FALSE)

  # URL decoding can re-introduce forbidden characters (e.g. "%2F" -> "/"),
  # so sanitize once more; the names come from untrusted message content.
  filenames <- gsub(forbiden_chars, "", filenames)

  # Callers can later filter this data frame by disposition (inline or
  # attachment), depending on what the user wants to download.
  data.frame(filenames = filenames,
             MIME_level = attachments_MIME_level,
             content_disposition = content_type,
             row.names = NULL,
             stringsAsFactors = FALSE)
}
# allanvc/mRpostman documentation built on Jan. 26, 2024, 5:22 p.m.