# R/patterns2021.R
#
# Defines functions: add_patterns.data.table add_patterns.default add_patterns models_to_centroids pattern_current_to_1.3.5 pattern_1.3.4_to_1.3.5 pattern_1.3.6_to_1.3.5 pattern_1.3.7_to_1.3.5 pattern_no_change
#
# Documented in: add_patterns models_to_centroids pattern_1.3.4_to_1.3.5 pattern_1.3.6_to_1.3.5 pattern_current_to_1.3.5 pattern_no_change

#' Groom data for patterns without changing anything
#'
#' Because VERIS changes, columns come and go, which makes the clustering
#' function unhappy.  This is the identity grooming function: use it when a
#' grooming function has to be supplied but neither the data nor the centroids
#' need to be adjusted.
#'
#' @param veris veris object to cluster
#' @param centroids skmeans cluster prototypes
#' @return A list with the elements `veris` and `centroids`, both exactly as
#'   they were passed in.
#' @export
#'
pattern_no_change <- function(veris, centroids) {
  groomed <- list(veris = veris, centroids = centroids)
  groomed
}

#' Groom data for patterns from VERIS 1.3.7 to 1.3.5
#'
#' Because VERIS changes, columns come and go.  This makes the clustering
#' function unhappy.  We need to make sure the data we're clustering matches
#' the matrix we're using to cluster (based on veris 1.3.5).
#'
#' Centroid columns whose enumeration was renamed are renamed, and centroid
#' columns listed as removed from VERIS are dropped.  No columns are currently
#' duplicated in `veris` for this version pair, so `veris` is returned as is.
#'
#' TODO: eventually add the ability to chain the updates, instead of copying code
#'
#' @param veris veris object to cluster
#' @param centroids skmeans cluster prototypes
#' @return A list with the elements `veris` and `centroids`
#' @export
pattern_1.3.7_to_1.3.5 <- function(veris, centroids) {
  ### Because VERIS changes, the columns used to cluster may change.
  ### Renamed enumerations: each name is a centroid column name to look for,
  ### each value is the name it is replaced with.  This is a static lookup
  ### rather than inline CSV parsed on every call, so the function is quiet
  ### and needs nothing beyond base R.
  veris_name_map <- c(
    "action.hacking.variety.Footprinting" = "action.hacking.variety.Profile host",
    "action.hacking.variety.HTTP Response Splitting" = "action.hacking.variety.HTTP response splitting",
    "action.hacking.variety.Use of backdoor or C2" = "action.hacking.vector.Backdoor",
    "action.social.vector.Website" = "action.social.vector.Web application",
    "action.hacking.vector.Backdoor or C2" = "action.hacking.vector.Backdoor"
  )
  ### For columns that are combined during a VERIS update (i.e. 2 columns in
  ###  centroids are now 1 in veris) we duplicate the column in VERIS under
  ###  the other name to allow it to be used during clustering.
  ### Names are the veris columns to copy from, values the columns to create.
  ### There are none for this version pair.
  veris_col_dup <- character(0)
  ### It's possible for columns to be removed from VERIS.  Those must be removed
  ### from the cluster
  veris_name_rem <- c(
    "actor.Multiple", "victim.industry2.31_33", "victim.industry2.44_45", "victim.industry2.48_49",
    "action.malware.variety.SQL injection", "action.error.variety.Carelessness",
    "asset.assets.variety.S - web application", "asset.assets.variety.S - File server",
    "asset.assets.variety.M -Documents", "asset.assets.variety.S - Web Application",
    "attribute.confidentiality.data.variety.Persoal", "action.error.variety.Omisssion"
  )
  ### New columns added to veris cannot be added without re-clustering.

  ### Apply the rules to account for VERIS updates
  ### Handle joined columns.  A source column that is absent is skipped:
  ### assigning NULL would delete an existing target column.
  for (src in names(veris_col_dup)) {
    if (src %in% names(veris)) {
      veris[[veris_col_dup[[src]]]] <- veris[[src]]
    }
  }
  ### Update columns
  centroid_names <- colnames(centroids)
  renamed <- match(centroid_names, names(veris_name_map))
  centroid_names[!is.na(renamed)] <- unname(veris_name_map[renamed[!is.na(renamed)]])
  colnames(centroids) <- centroid_names
  ### Remove obsolete columns.  drop = FALSE keeps `centroids` two-dimensional
  ### even if a single column survives.
  ### NOTE(review): two old names map to "action.hacking.vector.Backdoor"; when
  ### both are present, selecting by name keeps only the first of the
  ### duplicates.  Confirm that this is intended.
  centroids <- centroids[, setdiff(colnames(centroids), veris_name_rem), drop = FALSE]

  list("veris" = veris, "centroids" = centroids)
}



#' Groom data for patterns from VERIS 1.3.6 to 1.3.5
#'
#' Because VERIS changes, columns come and go.  This makes the clustering
#' function unhappy.  We need to make sure the data we're clustering matches
#' the matrix we're using to cluster (based on veris 1.3.5).
#'
#' Centroid columns whose enumeration was renamed are renamed, and centroid
#' columns listed as removed from VERIS are dropped.  No columns are currently
#' duplicated in `veris` for this version pair, so `veris` is returned as is.
#'
#' @param veris veris object to cluster
#' @param centroids skmeans cluster prototypes
#' @return A list with the elements `veris` and `centroids`
#' @export
pattern_1.3.6_to_1.3.5 <- function(veris, centroids) {
  ### Because VERIS changes, the columns used to cluster may change.
  ### Renamed enumerations: each name is a centroid column name to look for,
  ### each value is the name it is replaced with.  This is a static lookup
  ### rather than inline CSV parsed on every call, so the function is quiet
  ### and needs nothing beyond base R.
  veris_name_map <- c(
    "action.hacking.variety.Footprinting" = "action.hacking.variety.Profile host",
    "action.hacking.variety.HTTP Response Splitting" = "action.hacking.variety.HTTP response splitting",
    "action.hacking.variety.Use of backdoor or C2" = "action.hacking.vector.Backdoor",
    "action.social.vector.Website" = "action.social.vector.Web application",
    "action.hacking.vector.Backdoor or C2" = "action.hacking.vector.Backdoor"
  )
  ### For columns that are combined during a VERIS update (i.e. 2 columns in
  ###  centroids are now 1 in veris) we duplicate the column in VERIS under
  ###  the other name to allow it to be used during clustering.
  ### Names are the veris columns to copy from, values the columns to create.
  ### There are none for this version pair.
  veris_col_dup <- character(0)
  ### It's possible for columns to be removed from VERIS.  Those must be removed
  ### from the cluster
  veris_name_rem <- c(
    "actor.Multiple", "victim.industry2.31_33", "victim.industry2.44_45", "victim.industry2.48_49",
    "action.malware.variety.SQL injection", "action.error.variety.Carelessness",
    "asset.assets.variety.S - web application", "asset.assets.variety.S - File server",
    "asset.assets.variety.M -Documents", "asset.assets.variety.S - Web Application",
    "attribute.confidentiality.data.variety.Persoal"
  )
  ### New columns added to veris cannot be added without re-clustering.

  ### Apply the rules to account for VERIS updates
  ### Handle joined columns.  A source column that is absent is skipped:
  ### assigning NULL would delete an existing target column.
  for (src in names(veris_col_dup)) {
    if (src %in% names(veris)) {
      veris[[veris_col_dup[[src]]]] <- veris[[src]]
    }
  }
  ### Update columns
  centroid_names <- colnames(centroids)
  renamed <- match(centroid_names, names(veris_name_map))
  centroid_names[!is.na(renamed)] <- unname(veris_name_map[renamed[!is.na(renamed)]])
  colnames(centroids) <- centroid_names
  ### Remove obsolete columns.  drop = FALSE keeps `centroids` two-dimensional
  ### even if a single column survives.
  ### NOTE(review): two old names map to "action.hacking.vector.Backdoor"; when
  ### both are present, selecting by name keeps only the first of the
  ### duplicates.  Confirm that this is intended.
  centroids <- centroids[, setdiff(colnames(centroids), veris_name_rem), drop = FALSE]

  list("veris" = veris, "centroids" = centroids)
}

#' Groom data for patterns from VERIS 1.3.4 to 1.3.5
#'
#' Because VERIS changes, columns come and go.  This makes the clustering
#' function unhappy.  We need to make sure the data we're clustering matches
#' the matrix we're using to cluster first.
#'
#' Centroid columns whose enumeration was renamed are renamed, centroid columns
#' listed as removed from VERIS are dropped, and the combined physical-vector
#' columns are copied in `veris` under their second name.
#'
#' @param veris veris object to cluster
#' @param centroids skmeans cluster prototypes
#' @return A list with the elements `veris` and `centroids`
#' @export
pattern_1.3.4_to_1.3.5 <- function(veris, centroids) {
  ### Because VERIS changes, the columns used to cluster may change.
  ### Renamed enumerations: each name is a centroid column name to look for,
  ### each value is the name it is replaced with.  This is a static lookup
  ### rather than inline CSV parsed on every call, so the function is quiet
  ### and needs nothing beyond base R.
  veris_name_map <- c(
    "action.malware.vector.Web download" = "action.malware.vector.Web application - download",
    "action.malware.vector.Web drive-by" = "action.malware.vector.Web application - drive-by"
  )
  ### For columns that are combined during a VERIS update (i.e. 2 columns in
  ###  centroids are now 1 in veris) we duplicate the column in VERIS under
  ###  the other name to allow it to be used during clustering.
  ### Names are the veris columns to copy from, values the columns to create.
  veris_col_dup <- c(
    "action.physical.vector.Victim work area" = "action.physical.vector.Visitor privileges",
    "action.physical.vector.Public facility" = "action.physical.vector.Uncontrolled location"
  )
  ### It's possible for columns to be removed from VERIS.  Those must be removed
  ### from the cluster
  veris_name_rem <- c("discovery_method.external.variety.Monitoring service")
  ### New columns added to veris cannot be added without re-clustering.

  ### Apply the rules to account for VERIS updates
  ### Handle joined columns.  A source column that is absent is skipped:
  ### assigning NULL would silently delete an existing target column.
  for (src in names(veris_col_dup)) {
    if (src %in% names(veris)) {
      veris[[veris_col_dup[[src]]]] <- veris[[src]]
    }
  }
  ### Update columns
  centroid_names <- colnames(centroids)
  renamed <- match(centroid_names, names(veris_name_map))
  centroid_names[!is.na(renamed)] <- unname(veris_name_map[renamed[!is.na(renamed)]])
  colnames(centroids) <- centroid_names
  ### Remove obsolete columns.  drop = FALSE keeps `centroids` two-dimensional
  ### even if a single column survives.
  centroids <- centroids[, setdiff(colnames(centroids), veris_name_rem), drop = FALSE]

  list("veris" = veris, "centroids" = centroids)
}


#' Currently a reference to verisr::pattern_1.3.7_to_1.3.5()
#'
#' Forwards every argument unchanged to the grooming function for the most
#' recent supported VERIS version and returns its result.
#'
#' @inheritParams pattern_1.3.7_to_1.3.5
#' @param ... Arguments passed on to `pattern_1.3.7_to_1.3.5()`
#' @export
pattern_current_to_1.3.5 <- function(...) {
  pattern_1.3.7_to_1.3.5(...)
}


#' Helper function to retrieve prototypes from skmeans models for add_patterns
#'
#' Loads the `models` object from `data/2021_pattern_models.rda` of the
#' installed verisr package and stacks the prototypes of its `non_breach` and
#' `breach` models.  Row names are prefixed with `cluster.I` (non-breach) and
#' `cluster.B` (breach).
#'
#' @return A matrix of skmeans centroids with one row per centroid
models_to_centroids <- function() {
  #data("models", envir=environment())
  models_path <- system.file("data", "2021_pattern_models.rda", package = "verisr")
  # system.file() returns "" when the file (or the package) cannot be found;
  # fail with a readable message instead of letting load("") fail cryptically.
  if (!nzchar(models_path)) {
    stop(
      "Could not find 'data/2021_pattern_models.rda' in the installed verisr package.",
      call. = FALSE
    )
  }

  # Load into a private environment and fetch `models` explicitly, so that a
  # file without that object cannot silently fall back to an unrelated
  # `models` found elsewhere on the search path.
  model_env <- new.env(parent = emptyenv())
  load(models_path, envir = model_env, verbose = FALSE)
  models <- get("models", envir = model_env, inherits = FALSE)

  non_breach_proto <- models$non_breach$prototypes
  breach_proto <- models$breach$prototypes
  rownames(non_breach_proto) <- paste0("cluster.I", rownames(non_breach_proto))
  rownames(breach_proto) <- paste0("cluster.B", rownames(breach_proto))

  rbind(non_breach_proto, breach_proto)
}


#' A function to add patterns to a verisr dataframe
#'
#' This function works by scoring the incidents according to the skmeans
#' clusters.  Note that it can be rather slow on large data sets.
#'
#' @param veris  A verisr data.table or data.frame like veris object
#' @param centroids  A matrix of skmeans centroids with one row per centroid.
#'   If NULL (the default), the 2021 DBIR pattern centroids will be used.
#' @param prefix  The predicate of the column name to use for the patterns
#' @param replace  Whether to remove previously existing columns with the same
#'   predicate before adding the patterns
#' @param clusters  If TRUE, will add the clusters to the returned veris object
#'   as 'cluster.X' with a value of the cosine distance to the cluster
#' @param threshold The ratio of the difference of cluster-to-incident
#'   distances and the smallest cluster-to-incident distance. Defaults to
#'   1/10th (i.e. the difference must be 1/10th the distance to the incident.
#'   This results in two percent of clusters kept in 2020 data)
#' @param veris_update_f A function to apply to centroids and veris to handle
#'   updates to veris after the clusters are defined.  It must take a veris
#'   object and a centroid and return a list of a veris object and centroid.
#'   Because veris adds, removes, and changes enumerations each year, this
#'   function modifies the data and centroids, (currently based on veris 1.3.5)
#'   to be compatible with the current version of VERIS.
#' @return veris object with the columns added
#' @export
add_patterns <- function(
    veris,
    centroids = NULL,
    prefix = "pattern",
    replace = TRUE,
    clusters = FALSE,
    threshold = 0.1,
    veris_update_f = verisr::pattern_current_to_1.3.5
) {
  # S3 generic: the work is done by the method for the class of `veris`.
  UseMethod("add_patterns")
}
  
# add_patterns() method for data.frame-like veris objects (anything that is
# not a data.table).  See add_patterns() for the meaning of the arguments.
#
# Steps: load/groom the centroids, flatten veris to a logical matrix over the
# centroid columns, compute the distance of every incident to every centroid,
# translate close-enough clusters to pattern columns, join them onto veris,
# apply per-pattern sanity checks and finally add "<prefix>.Everything Else".
#
# The defaults are repeated here on purpose: UseMethod() re-dispatches with
# only the arguments the caller actually supplied, so the defaults declared on
# the generic are not visible inside the method.
add_patterns.default <- function(veris,
                                 centroids = NULL,
                                 prefix = "pattern",
                                 replace = TRUE,
                                 clusters = FALSE,
                                 threshold = 0.1,
                                 veris_update_f = verisr::pattern_current_to_1.3.5) {
  if (is.null(centroids)) {
    centroids <- verisr:::models_to_centroids()
  }
  
  ### 2021 V6
  cluster_map <- readr::read_csv(
    "B1,System Intrusion
    B2,Lost and Stolen Assets
    B3,Social Engineering
    B4,Basic Web Application Attacks
    B5,Privilege Misuse
    B6,Miscellaneous Errors
    B7,Miscellaneous Errors
    B8,System Intrusion
    I1,Denial of Service
    I2,Miscellaneous Errors
    I3,Lost and Stolen Assets
    I4,System Intrusion
    I5,Privilege Misuse
    I6,Basic Web Application Attacks
    I7,System Intrusion
    I8,Privilege Misuse
    I9,System Intrusion
    I10,Social Engineering", 
    comment = "#",
    quote = "'",
    col_names = c("cluster", "pattern")
  )
  cluster_map$cluster <- paste0("cluster.", cluster_map$cluster)
  cluster_map$pattern <- paste0(prefix, ".", cluster_map$pattern)
  
  #glimpse(cutoffs)
  #glimpse(cluster_map)
  
  ### This applies any veris updates necessary to match the veris version of the
  ###  data to the veris version of the clusters.
  if (!is.null(veris_update_f)) {
    veris_update <- veris_update_f(veris, centroids)
    veris <- veris_update[[1]]
    centroids <- veris_update[[2]]
  }
  ### As a last resort, we remove columns not in the veris dataframe, but
  ###  we generate a warning first
  if (length(setdiff(colnames(centroids), names(veris))) > 0) {
    warning(paste0("There are columns in the clusters not in the data: (", 
            paste(setdiff(colnames(centroids), names(veris)), collapse = ", "),
            "). Removing them from clusters so that clustering can proceed, ",
            "but you should check your data and if these are significant ",
            "columns, you shoudl apply a veris update function with ",
            "veris_update_f to properly handle them."))
    # drop = FALSE: keep a matrix even if a single column is left
    centroids <- centroids[ , intersect(names(veris), colnames(centroids)), drop = FALSE]
  }
  
  
  ## get columns to use for clustering
  #breach_cols_to_cluster <- colnames(models$breach$prototypes)
  #non_breach_cols_to_cluster <- colnames(models$non_breach$prototypes)
  cols_to_cluster <- colnames(centroids)
  
  
  ###  Predict clusters
  
  chunk <- suppressWarnings(verisr::flatten_verisr(veris)) # unfortunately slow
  
  ## save master_ids
  master_ids <- chunk$plus.master_id
  
  ## convert verisr to matrix
  chunk <- chunk[, cols_to_cluster, drop = FALSE]
  col_names <- names(chunk)
  chunk <- as.matrix(chunk)
  colnames(chunk) <- col_names
  rownames(chunk) <- master_ids
  
  ## It's possible for columns that are not in the schema to be in both chunk and the centroids
  ## In this case, the column will contain 'NAs' where the value wasn't specified
  ## If we leave them, they will cause clustering issues.
  ## Implicitly those should all be FALSE so we fill that in here. - GDB 210723
  chunk[is.na(chunk)] <- FALSE
  
  ## calculate cosine distance. (Small is better. Took me a while to figure that out)
  ## Only incidents with at least one TRUE clustering column are scored.
  ## drop = FALSE keeps a matrix when exactly one incident qualifies.
  all_pred <- skmeans::skmeans_xdist(
    x=chunk[rowSums(chunk, na.rm=TRUE) > 0, , drop = FALSE], 
    y=centroids)
  
  ### create a single dataframe of cluster distances
  all_pred <- tibble::as_tibble(all_pred, rownames="plus.master_id")
  ### Removed above and replaced with below 2 because rownames don't seem to be mained by skmeans
  #all_pred <- tibble::as_tibble(all_pred)
  #all_pred[["plus.master_id"]] <- rownames(chunk[rowSums(chunk) > 0, ])
  all_pred <- tidyr::pivot_longer(all_pred, -plus.master_id, names_to="cluster", values_to="value")
  
  
  ### Join patterns with dataset
    # map the clusters to patterns
  patterns <- dplyr::left_join(all_pred, cluster_map, by=c("cluster" = "cluster")) 
    # remove unmapped clusters (before scoring so unmapped clusters don't disadvantage other clusters)
    # the other clusters already have a max distance cutoff
  patterns <- dplyr::filter(patterns, pattern != paste0(prefix, ".NA")) 
  # DDoS is the only incident that can be included below the quality threshold. Because of that, it's picking up low-quality 
  #   Incidents that should be in other clusters.  Here we remove DDoS w/o DDoS so and let them go into the next likely pattern
  patterns <- dplyr::left_join(patterns, 
                               # Because 'veris' isn't flattened, we use chunk for the DoS columns.  However chunk is a matrix and
                               #    lacks 'plus.master_id', so we convert to a tibble and add plus.master_id.
                               # Little worried about performance, but chunk should be far smaller due to fewer columns and all but 1 are logical.
                               # GDB 210723
                               # drop = FALSE keeps the two named columns when chunk has a single row.
                               dplyr::mutate(
                                 tibble::as_tibble(chunk[ , c("action.hacking.variety.DoS", "action.malware.variety.DoS"), drop = FALSE]),
                                 `plus.master_id` = rownames(chunk)
                               ),
                               by=c("plus.master_id"="plus.master_id")
              )

  patterns <- dplyr::mutate(patterns, value = ifelse(
    pattern == paste0(prefix, ".", "Denial of Service") & !(action.hacking.variety.DoS | action.malware.variety.DoS), # if it's DoS w/o DOS
    1, # Make DoS the longest possible distance
    value)) # otherwise leave it alone
  patterns <- dplyr::select(patterns, -action.hacking.variety.DoS, -action.malware.variety.DoS)
  # 'cut' ends up being the TRUE/FALSE for the pattern. The closest is always in, but others below the threshold can also make it
  # first check that the value is above the minimum distance from the centroid for the given centroid
  # this is determined by looking at the distribution of distances where the centroid is the top assignment
  # if we don't do this, everything will get a pattern since it'll have a closest centroid no matter how far away
  #        dplyr::left_join(cutoffs, by=c("cluster" = "cluster")) %>%
  #        dplyr::mutate(cut = value <= cutoff) %>%
  # We look for centroids similar distance to the top centroid for each incident
  # This ensures that if an incident is equally close to two centroids it gets counted in both clusters
  # We compare the difference in distances between the clusters and incident and minimum-distance cluster to the incident.
  # We keep the incident assignment if the distance is below the threshold. (10% results in 2% of clusters kept in addition to the closest cluster in 2020 data)
  # The explicit `value == min(value)` term keeps the closest cluster when the
  # minimum distance is exactly 0, where the ratio would be 0/0 = NaN and the
  # incident would otherwise end up without any pattern.
  patterns <- dplyr::group_by(patterns, plus.master_id) 
  patterns <- dplyr::mutate(
    patterns,
    cut = (value == min(value)) | ((value - min(value)) / min(value) <= threshold)
  )
  patterns <- dplyr::ungroup(patterns)
    #dplyr::glimpse() %>% # DEBUG
  patterns <- dplyr::select(patterns, plus.master_id, pattern, cut) 
  patterns <- dplyr::arrange(patterns, dplyr::desc(cut)) # arrange true first so it comes before false in distinct
  patterns <- dplyr::distinct(patterns, plus.master_id, pattern, .keep_all = TRUE)
  patterns <- tidyr::pivot_wider(patterns, names_from=pattern, values_from=cut, values_fill=FALSE)
  
  
  ### If replace, remove the existing prefix columns
  if (replace) {
      veris <- veris[, !grepl(paste0("^", prefix), names(veris)), drop = FALSE]
  }
  
  
  ### Join the patterns to the veris dataframe
  ## (Where the magic happens)
  ## Done by plus.master_id to support future veris where one line is not necessarily one incident
  veris <- dplyr::left_join(
    veris,
    patterns,
    by=c("plus.master_id"="plus.master_id")
  )
  
  
  ### Need to remove "NA's" before the sanity checks
  veris[grepl(paste0("^", prefix), names(veris))][is.na(veris[grepl(paste0("^", prefix), names(veris))])] <- FALSE
  
  ### Sanity checks
  ## Ensure the core tenant of the pattern is met
  ## Each check is a plain subset assignment.  Do NOT prefix these with
  ## `veris <-`: `veris <- veris[i, j] <- FALSE` replaces the whole data frame
  ## with the scalar FALSE.
  veris[
    veris[[paste0(prefix, ".Privilege Misuse")]]  & !veris[["action.Misuse"]], # misuse without misuse
    paste0(prefix, ".Privilege Misuse")] <- FALSE
  veris[
    veris[[paste0(prefix, ".Miscellaneous Errors")]]  & !veris[["action.Error"]], # error without error
    paste0(prefix, ".Miscellaneous Errors")] <- FALSE
  veris[
    veris[[paste0(prefix, ".Social Engineering")]]  & !veris[["action.Social"]], # phishing without Social action
    paste0(prefix, ".Social Engineering")] <-  FALSE
  veris[
    veris[[paste0(prefix, ".System Intrusion")]]  & !(veris[["action.Hacking"]] | veris[["action.Malware"]]), # Intrusion without hacking/malware
    paste0(prefix, ".System Intrusion")] <- FALSE
  #veris[
  #  veris[[paste0(prefix, ".Crimeware")]]  & !veris$action.Malware, # Crimeware without malware
  #  paste0(prefix, ".Crimeware")] <- FALSE
  veris[
    veris[[paste0(prefix, ".Basic Web Application Attacks")]]  & 
      !unlist(apply(veris[,intersect(names(veris), 
                             c( # this list accounts for old and new 'web application'
                               "asset.assets.variety.S - Web application",
                               "action.hacking.vector.Web application",
                               "action.malware.vector.Web download",
                               "action.malware.vector.Web drive-by",
                               "action.malware.vector.Web application - drive-by",
                               "action.malware.vector.Web application - download",
                               "action.malware.vector.Web application",
                               "action.social.vector.Website",
                               "action.social.vector.Web application",
                               "action.error.vector.Web application"
                             )), drop = FALSE], # keep 2 dimensions for apply() if only one column matches
            MARGIN=1,
            any)),
    paste0(prefix, ".Basic Web Application Attacks")] <- FALSE
  veris[
    veris[[paste0(prefix, ".Lost and Stolen Assets")]]  & !(veris$action.Physical | veris$action.error.variety.Loss), # loss w/o physical or loss
    paste0(prefix, ".Lost and Stolen Assets")] <- FALSE
### NOTE:   Commenting out the  Denial of service sanity check:
###         This is forced above, but in such a way that the incident can be reclassified rather than being classified 'EE'
###         The reason is the training data used Quality incidents where DoS can bypass the quality filter, drawing non-DoS low quality incidents
###         to it.  To remedy that, we allow re-assignment of Non-DOS DoS-cendroid incidents to the next closest cluster.
#  veris[
#    veris[[paste0(prefix, ".Denial of Service")]]  & !(veris$action.hacking.variety.DoS | veris$action.malware.variety.DoS), # DoS w/o DoS
#    paste0(prefix, ".Denial of Service")] <- FALSE
  
  
  # If including clusters, join the cluster distances
  if (clusters) {
    all_clust <- dplyr::select(all_pred, plus.master_id, cluster, value)
    all_clust <- tidyr::pivot_wider(all_clust, names_from=cluster, values_from=value, values_fill=NA)
    veris <- dplyr::left_join(
      veris,
      all_clust,
      by=c("plus.master_id"="plus.master_id")
    )
  }
  
  # code anything uncoded as everything else
  # (drop = FALSE keeps 2 dimensions for apply() if only one pattern column exists)
  veris[[paste0(prefix, ".Everything Else")]] <- !unlist(apply(veris[, grep(paste0("^", prefix), names(veris)), drop = FALSE], MARGIN=1, any, na.rm=TRUE))
  
  return(veris)
}


# add_patterns() method for data.table veris objects.  See add_patterns() for
# the meaning of the arguments.
#
# Steps: load/groom the centroids, flatten veris to a logical matrix over the
# centroid columns, compute the distance of every incident to every centroid,
# translate close-enough clusters to pattern columns, join them onto veris,
# apply per-pattern sanity checks (by reference with data.table::set) and
# finally add "<prefix>.Everything Else".
#
# The defaults are repeated here on purpose: UseMethod() re-dispatches with
# only the arguments the caller actually supplied, so the defaults declared on
# the generic are not visible inside the method.
add_patterns.data.table <- function(veris,
                                    centroids = NULL,
                                    prefix = "pattern",
                                    replace = TRUE,
                                    clusters = FALSE,
                                    threshold = 0.1,
                                    veris_update_f = verisr::pattern_current_to_1.3.5) {
  if (is.null(centroids)) {
    centroids <- verisr:::models_to_centroids()
  }
  
  ### 2021 V6
  cluster_map <- readr::read_csv(
    "B1,System Intrusion
    B2,Lost and Stolen Assets
    B3,Social Engineering
    B4,Basic Web Application Attacks
    B5,Privilege Misuse
    B6,Miscellaneous Errors
    B7,Miscellaneous Errors
    B8,System Intrusion
    I1,Denial of Service
    I2,Miscellaneous Errors
    I3,Lost and Stolen Assets
    I4,System Intrusion
    I5,Privilege Misuse
    I6,Basic Web Application Attacks
    I7,System Intrusion
    I8,Privilege Misuse
    I9,System Intrusion
    I10,Social Engineering", 
    comment = "#",
    quote = "'",
    col_names = c("cluster", "pattern")
  )
  cluster_map$cluster <- paste0("cluster.", cluster_map$cluster)
  cluster_map$pattern <- paste0(prefix, ".", cluster_map$pattern)
  
  #glimpse(cutoffs)
  #glimpse(cluster_map)
  
  ### This applies any veris updates necessary to match the veris version of the
  ###  data to the veris version of the clusters.
  if (!is.null(veris_update_f)) {
    veris_update <- veris_update_f(veris, centroids)
    veris <- veris_update[[1]]
    centroids <- veris_update[[2]]
  }
  ### As a last resort, we remove columns not in the veris dataframe, but
  ###  we generate a warning first
  if (length(setdiff(colnames(centroids), names(veris))) > 0) {
    warning(paste0("There are columns in the clusters not in the data: (", 
                   paste(setdiff(colnames(centroids), names(veris)), collapse = ", "),
                   "). Removing them from clusters so that clustering can proceed, ",
                   "but you should check your data and if these are significant ",
                   "columns, you should apply a veris update function with ",
                   "veris_update_f to properly handle them.  ",
                   "This error is likely due to using a schema newer than 1.3.5."))
    # drop = FALSE: keep a matrix even if a single column is left
    centroids <- centroids[ , intersect(names(veris), colnames(centroids)), drop = FALSE]
  }
  
  
  ## get columns to use for clustering
  #breach_cols_to_cluster <- colnames(models$breach$prototypes)
  #non_breach_cols_to_cluster <- colnames(models$non_breach$prototypes)
  cols_to_cluster <- colnames(centroids)
  
  
  ###  Predict clusters
  
  chunk <- suppressWarnings(verisr::flatten_verisr(veris)) # unfortunately slow
  
  ## save master_ids
  master_ids <- chunk$plus.master_id
  
  ## convert verisr to matrix
  chunk <- chunk[, ..cols_to_cluster]
  col_names <- names(chunk)
  chunk <- as.matrix(chunk)
  colnames(chunk) <- col_names
  rownames(chunk) <- master_ids
  
  ## It's possible for columns that are not in the schema to be in both chunk and the centroids
  ## In this case, the column will contain 'NAs' where the value wasn't specified
  ## If we leave them, they will cause clustering issues.
  ## Implicitly those should all be FALSE so we fill that in here. - GDB 210723
  chunk[is.na(chunk)] <- FALSE
  
  ## calculate cosine distance. (Small is better. Took me a while to figure that out)
  ## Only incidents with at least one TRUE clustering column are scored.
  ## drop = FALSE keeps a matrix when exactly one incident qualifies.
  all_pred <- skmeans::skmeans_xdist(
    x=chunk[rowSums(chunk, na.rm=TRUE) > 0, , drop = FALSE], 
    y=centroids)
  
  ### create a single dataframe of cluster distances
  all_pred <- tibble::as_tibble(all_pred, rownames="plus.master_id")
  ### Removed above and replaced with below 2 because rownames don't seem to be mained by skmeans
  #all_pred <- tibble::as_tibble(all_pred)
  #all_pred[["plus.master_id"]] <- rownames(chunk[rowSums(chunk) > 0, ])
  all_pred <- tidyr::pivot_longer(all_pred, -plus.master_id, names_to="cluster", values_to="value")
  
  
  ### Join patterns with dataset
  # map the clusters to patterns
  patterns <- dplyr::left_join(all_pred, cluster_map, by=c("cluster" = "cluster")) 
  # remove unmapped clusters (before scoring so unmapped clusters don't disadvantage other clusters)
  # the other clusters already have a max distance cutoff
  patterns <- dplyr::filter(patterns, pattern != paste0(prefix, ".NA")) 
  # DDoS is the only incident that can be included below the quality threshold. Because of that, it's picking up low-quality 
  #   Incidents that should be in other clusters.  Here we remove DDoS w/o DDoS so and let them go into the next likely pattern
  patterns <- dplyr::left_join(patterns, 
                               # Because 'veris' isn't flattened, we use chunk for the DoS columns.  However chunk is a matrix and
                               #    lacks 'plus.master_id', so we convert to a tibble and add plus.master_id.
                               # Little worried about performance, but chunk should be far smaller due to fewer columns and all but 1 are logical.
                               # GDB 210723
                               # drop = FALSE keeps the two named columns when chunk has a single row.
                               dplyr::mutate(
                                 tibble::as_tibble(chunk[ , c("action.hacking.variety.DoS", "action.malware.variety.DoS"), drop = FALSE]),
                                 `plus.master_id` = rownames(chunk)
                               ),
                               by=c("plus.master_id"="plus.master_id")
  )
  patterns <- dplyr::mutate(patterns, value = ifelse(
    pattern == paste0(prefix, ".", "Denial of Service") & !(action.hacking.variety.DoS | action.malware.variety.DoS), # if it's DoS w/o DOS
    1, # Make DoS the longest possible distance
    value)) # otherwise leave it alone
  patterns <- dplyr::select(patterns, -action.hacking.variety.DoS, -action.malware.variety.DoS)
  # 'cut' ends up being the TRUE/FALSE for the pattern. The closest is always in, but others below the threshold can also make it
  # first check that the value is above the minimum distance from the centroid for the given centroid
  # this is determined by looking at the distribution of distances where the centroid is the top assignment
  # if we don't do this, everything will get a pattern since it'll have a closest centroid no matter how far away
  #        dplyr::left_join(cutoffs, by=c("cluster" = "cluster")) %>%
  #        dplyr::mutate(cut = value <= cutoff) %>%
  # We look for centroids similar distance to the top centroid for each incident
  # This ensures that if an incident is equally close to two centroids it gets counted in both clusters
  # We compare the difference in distances between the clusters and incident and minimum-distance cluster to the incident.
  # We keep the incident assignment if the distance is below the threshold. (10% results in 2% of clusters kept in addition to the closest cluster in 2020 data)
  # The explicit `value == min(value)` term keeps the closest cluster when the
  # minimum distance is exactly 0, where the ratio would be 0/0 = NaN and the
  # incident would otherwise end up without any pattern.
  patterns <- dplyr::group_by(patterns, plus.master_id) 
  patterns <- dplyr::mutate(
    patterns,
    cut = (value == min(value)) | ((value - min(value)) / min(value) <= threshold)
  )
  patterns <- dplyr::ungroup(patterns)
  #dplyr::glimpse() %>% # DEBUG
  patterns <- dplyr::select(patterns, plus.master_id, pattern, cut) 
  patterns <- dplyr::arrange(patterns, dplyr::desc(cut)) # arrange true first so it comes before false in distinct
  patterns <- dplyr::distinct(patterns, plus.master_id, pattern, .keep_all = TRUE)
  patterns <- tidyr::pivot_wider(patterns, names_from=pattern, values_from=cut, values_fill=FALSE)
  
  
  ### If replace, remove the existing prefix columns
  if (replace) {
    veris <- veris[, !grepl(paste0("^", prefix), names(veris)), with=FALSE]
  }
  
  
  ### Join the patterns to the veris dataframe
  ## (Where the magic happens)
  ## Done by plus.master_id to support future veris where one line is not necessarily one incident
  veris <- dplyr::left_join(
    veris,
    patterns,
    by=c("plus.master_id"="plus.master_id")
  )
  
  
  ### Need to remove "NA's" before the sanity checks
  for (j in grep(paste0("^", prefix), names(veris), value=TRUE)) {
    data.table::set(veris,which(is.na(veris[[j]])),j,FALSE)
  }
  
  ## TODO: Need to convert lines to be data table compatible
  
  ### Sanity checks
  ## Ensure the core tenant of the pattern is met
  ## Each call sets the pattern column to FALSE on every row lacking the
  ## required action; rows where the pattern is already FALSE are unaffected.
  data.table::set(veris, which(!veris$action.Misuse), paste0(prefix, ".Privilege Misuse"), FALSE) # misuse without misuse
  data.table::set(veris, which(!veris$action.Error), paste0(prefix, ".Miscellaneous Errors"), FALSE) # error without error
  data.table::set(veris, which(!veris$action.Social), paste0(prefix, ".Social Engineering"), FALSE) # phishing without Social action
  data.table::set(veris, which(!(veris$action.Hacking | veris$action.Malware)), paste0(prefix, ".System Intrusion"), FALSE) # Intrusion without hacking/malware
  data.table::set(veris, # webapp without web application
                  which(!unlist(apply(veris[,intersect(names(veris), 
                                                c( # this list accounts for old and new 'web application'
                                                 "asset.assets.variety.S - Web application",
                                                 "action.hacking.vector.Web application",
                                                 "action.malware.vector.Web download",
                                                 "action.malware.vector.Web drive-by",
                                                 "action.malware.vector.Web application - drive-by",
                                                 "action.malware.vector.Web application - download",
                                                 "action.malware.vector.Web application",
                                                 "action.social.vector.Website",
                                                 "action.social.vector.Web application",
                                                 "action.error.vector.Web application"
                                               )), 
                                     with = FALSE],
                               MARGIN=1,
                               any))
                        ),
                  paste0(prefix, ".Basic Web Application Attacks"),
                  FALSE
  )
  data.table::set(veris, which(!(veris$action.Physical | veris$action.error.variety.Loss)), paste0(prefix, ".Lost and Stolen Assets"), FALSE) # loss w/o physical or loss
  ### NOTE:   Commenting out the  Denial of service sanity check:
  ###         This is forced above, but in such a way that the incident can be reclassified rather than being classified 'EE'
  ###         The reason is the training data used Quality incidents where DoS can bypass the quality filter, drawing non-DoS low quality incidents
  ###         to it.  To remedy that, we allow re-assignment of Non-DOS DoS-cendroid incidents to the next closest cluster.
  # data.table::set(veris, which(!(veris$action.hacking.variety.DoS | veris$action.malware.variety.DoS)), paste0(prefix, ".Denial of Service"), FALSE) # DoS w/o DoS
  
  
  # If including clusters, join the cluster distances
  if (clusters) {
    all_clust <- dplyr::select(all_pred, plus.master_id, cluster, value)
    all_clust <- tidyr::pivot_wider(all_clust, names_from=cluster, values_from=value, values_fill=NA)
    veris <- dplyr::left_join(
      veris,
      all_clust,
      by=c("plus.master_id"="plus.master_id")
    )
  }
  
  # Required for the creation of everything else column below
  data.table::setDT(veris)
  
  # code anything uncoded as everything else
  data.table::set(veris, i = NULL, paste0(prefix, ".Everything Else"), !unlist(apply(veris[, grep(paste0("^", prefix), names(veris)), with=FALSE], MARGIN=1, any, na.rm=TRUE)))
  
  return(veris)
}
# vz-risk/verisr documentation built on Aug. 5, 2023, 4:34 a.m.