# R/malwarehash.R

# Defines functions .malware_hash

# documented, memoised versions of these are at the end

# Look up file hashes in the Team Cymru Malware Hash Registry (bulk mode).
#
# Sends every hash in a single "begin ... end" request to hash.cymru.com on
# TCP port 43 and parses the reply.
#
# Arguments:
#   hashes  - hashes to look up; flattened with unlist() before being sent,
#             one per line.
#   timeout - socket timeout, forwarded to `sock()`.
#
# Returns a data frame (classed "tbl_df", "tbl", "data.frame") with columns
#   sha1_md5             - first field of each response line
#   last_known_timestamp - POSIXct in GMT, converted from seconds since epoch
#   detection_pct        - reported value divided by 100
# Fields the service reports as "NO_DATA" become NA. If the connection cannot
# be opened, or the reply has no data lines, a message is emitted and a frame
# of all-NA columns (one row per hash) is returned instead.
.malware_hash <- function(hashes, timeout=getOption("timeout")) {

  host <- "hash.cymru.com"
  port <- 43

  # flatten once so the query and the fallback frame agree on the hash count
  hash_vec <- unlist(hashes)

  # all-NA placeholder returned when the service cannot be queried
  na_result <- function() {
    n <- length(hash_vec)
    out <- data.frame(
      sha1_md5 = rep(NA, n),
      last_known_timestamp = rep(NA, n),
      detection_pct = rep(NA, n)
    )
    class(out) <- c("tbl_df", "tbl", "data.frame")
    out
  }

  # setup query
  cmd <- sprintf("begin\n%s\nend\n", paste(hash_vec, collapse = "\n"))

  # setup connection and post query
  # NOTE(review): `sock` is defined elsewhere; it is used here as returning a
  # list whose `$result` is NULL when the connection could not be opened.
  con <- sock(host=host, port=port, blocking=TRUE, open="r+", timeout=timeout)
  if (is.null(con$result)) {
    message("Error opening connection to hash.cymru.com")
    return(na_result())
  }

  con <- con$result
  # close the socket on every exit path, including errors while writing/reading
  on.exit(close(con), add = TRUE)
  cat(cmd, file=con)
  response <- readLines(con)

  # the first two lines are treated as a header (dropped below); with nothing
  # after them there is no data to parse and read.table() would stop with an
  # error on empty input
  if (length(response) <= 2) {
    message("Error reading from connection to hash.cymru.com")
    return(na_result())
  }

  # trim header, split fields and convert results
  # `text =` lets read.table() open and close its own text connection, so no
  # connection is left dangling
  response <- trim_df(
    read.table(
      text = tail(response, -2),
      stringsAsFactors = FALSE,
      header = FALSE,
      na.strings = "NO_DATA"
    )
  )
  names(response) <- c("sha1_md5", "last_known_timestamp", "detection_pct")
  response$last_known_timestamp <- as.POSIXct(
    response$last_known_timestamp, origin = "1970-01-01 00:00:00", tz = "GMT"
  )
  response$detection_pct <- as.numeric(response$detection_pct) / 100
  class(response) <- c("tbl_df", "tbl", "data.frame")
  response

}

#' Retrieves malware hash metadata from the Malware Hash Registry
#'
#' The Malware Hash Registry (MHR) project is a look-up service similar to the
#' Team Cymru IP address to ASN mapping project. This project differs, however,
#' in that you can query the service for a computed MD5 or SHA-1 hash of a file
#' and, if it is malware and the service knows about it, it returns the last
#' time it has seen it along with an approximate anti-virus detection
#' percentage.
#'
#' This function is a memoised wrapper (via \code{memoise::memoise()}) around
#' the internal \code{.malware_hash()} lookup.
#'
#' @param hashes character vector of MD5 and/or SHA-1 file hashes to look up
#' @param timeout numeric: the timeout (in seconds) to be used for this connection.
#'        Beware that some OSes may treat very large values as zero: however the
#'        POSIX standard requires values up to 31 days to be supported.
#' @return data frame of Malware Hash Registry lookup results
#'   \itemize{
#'     \item \code{sha1_md5} - hash queried for
#'     \item \code{last_known_timestamp} - last known GMT timestamp associated
#'           with that hash (\code{POSIXct})
#'     \item \code{detection_pct} - detection rate across a mix of AV packages,
#'           expressed as a proportion (the reported percentage divided by 100)
#'   }
#'   Fields the service reports as \code{NO_DATA} are returned as \code{NA}.
#'   If a socket connection cannot be made (i.e. a network problem on your
#'   end or a service/network problem on their end), all columns will be
#'   \code{NA}.
#' @note Attempting to enumerate the malware registry via the public service
#'       interface is not only impractical, it is also strictly prohibited.
#'       Contact Team Cymru if the public interface is insufficient for your
#'       needs and they may be able to come up with an alternative arrangement.
#'       Also, a direct connection to TCP Port 43 (WHOIS) is required for most
#'       of these API functions to work properly.
#' @seealso \url{http://www.team-cymru.org/IP-ASN-mapping.html}
#' @export
#' @examples \dontrun{
#' malware_hash(c("1250ac278944a0737707cf40a0fbecd4b5a17c9d",
#'                "7697561ccbbdd1661c25c86762117613",
#'                "cbed16069043a0bf3c92fff9a99cccdc",
#'                "e6dc4f4d5061299bc5e76f5cd8d16610",
#'                "e1112134b6dcc8bed54e0e34d8ac272795e73d74"))
#' }
malware_hash <- memoise::memoise(.malware_hash)

# Try the cymruservices package in your browser

# Any scripts or data that you put into this service are public.

# cymruservices documentation built on May 2, 2019, 2:59 p.m.