R/plot_anomaly.R

Defines functions investigate_cluster investigate_neighbours investigate_feature_importance investigate_scores_decline investigate_regression_tree plot.stranger plot.fortifiedanomaly plot.anomalies plot.singular

Documented in plot.anomalies plot.fortifiedanomaly plot.singular plot.stranger

## CODE -- Set of visualization functions --------------
## THIS FILE CONTAINS FUNCTIONS TO MAKE SOME VISUALIZATIONS
# TO UNDERSTAND SCORE LOCALLY AROUND A SINGLE RECORD           # Author : WeLoveDataScience                                   # License: xxxxx


# Helper (sub) functions-----------

investigate_cluster <- function(data, id, score, anomaly_id, n.anom = 100, n.cluster = 4, ...){

  # Step1: Select records
  ## Take top n-anomaly
  data <- data[order(data[, score], decreasing = TRUE), ]
  nbre_observation <- min(n.anom, nrow(data))
  top_score <- data[1: nbre_observation, ]

  ## Add the selected Id (if not yet in) in the Top n-anomly selection
  if(!(anomaly_id %in% top_score[, id])){
    top_score <- rbind(top_score, data[data[, id] == anomaly_id,])

  }

  rownames(top_score) <- top_score[[id]]
  # Step 2: Compute hierachical clustering
  distance_matrix <- dist(scale(top_score[, !colnames(top_score) == id]), method = "euclidean")
  hc <- hclust(distance_matrix, method = "ward.D2")

  # Step 3: Plot Dendrogram
  ## Convert hclust into a dendrogram and plot
  hcd <- as.dendrogram(hc)

  ## Makes plots (Full tree + Zooming-in cutted trees )
  par(mfrow = c(2, 1), mar = c(1,1,1,1))

  ### Plot Full Tree
  plot(hcd,
       main = "Full Tree",
       type = "rectangle",
       ylab = "",
       nodePar = list(lab.cex = 0.001, pch = c(NA, 5),cex = 0.2, col = "steelblue"),
       edgePar = list(col = "steelblue"),
       axes = FALSE,
       cex.main = 1
  )
  rect.hclust(hc, k= n.cluster)

  ### Plot Tree with the cluster of the specific ID

  #### find the height corresponding to the number of cluster
  cutting_height = (max(hc$height) - min(hc$height))  / 2
  for(i in 1:nbre_observation){
    clust <- length(cut(hcd, h = cutting_height)$lower )
    if (clust > n.cluster){
      cutting_height = cutting_height + 0.01 * (max(hc$height) - min(hc$height))
    }else if (clust < n.cluster){
      cutting_height = cutting_height - 0.01 * (max(hc$height) - min(hc$height))
    }else{ break }
  }

  #### Find the cluster of the specific anomaly_id
  #https://stackoverflow.com/questions/25452472/extract-labels-membership-classification-from-a-cut-dendrogram-in-r-i-e-a-c

  cut_hcd <- cut(hcd, h = cutting_height)$lower

  for(i in 1:length(cut_hcd)){
    member_list <- unlist(dendrapply(cut_hcd[[i]],function(n) {
      labels<-c()
      if (is.leaf(n)) {
        a <- attributes(n)
        labels<-c(labels,a$label)
      }
      labels
    }))
    anomaly_id_cluster <- i
    if(as.character(anomaly_id) %in% member_list) {break}
  }

  # cf. https://rpubs.com/gaston/dendrograms



  chcd <- cut(hcd, h = cutting_height)$lower[[anomaly_id_cluster]]
  colLab <- function(n) {
    if (is.leaf(n)) {
      a <- attributes(n)
      labCol <- ifelse(a$label %in% as.character(anomaly_id),"orange","darkgray")
      attr(n, "nodePar") <- c(a$nodePar, list(lab.col = labCol,lab.cex = 0.6, pch = c(NA, 5),cex = 0.2))
    }
    n
  }
  # using dendrapply
  chcd = dendrapply(chcd, colLab)

  #### Make the plot
  plot(chcd,
       main = paste("Zooming-in Cluster ",anomaly_id_cluster,sep =""),
       type = "rectangle",
       ylab = "",
       edgePar = list(col = "steelblue"),
       axes = FALSE,
       cex.main = 0.8)
}


investigate_neighbours <- function(data, id, score, anomaly_id, k = NULL, ...){


  if (is.null(k)) k <- min(1000,nrow(data)-1)
  ## Load library and data
  Vectorize(require)(package = c("FNN","tidyr"),character.only = TRUE)

  if (inherits(data,"data.table")){ data=as.data.frame(data) }

    knn <- get.knn(data[, !colnames(data) %in% c(id,score)], k, algorithm="cover_tree")



  ## Filter data to keep only the K neighbours

  rownames(knn$nn.index) <- data[[id]]
  knn_id <- data[[id]][as.vector(knn$nn.index[anomaly_id, ])]
  # print(anomaly_id)
  knn_data <- data[data[, id] %in% c(anomaly_id, knn_id), ]

  ## Compute the number of columns for the faced_wrap to obtain a almost square plots
  x <- dim(knn_data)[2]
  column_number <- round(sqrt(x),0)

  ## Reshape
  knn_data <- gather(knn_data, key = indicator, value = val, - one_of(c(id,score)))

  ## plot
  ggplot(data=knn_data, aes(x = val, y = get(score))) +
    geom_smooth(color = "steelblue", method="loess", se=FALSE, size = 0.3) +
    geom_point(color = "gray", alpha = 0.7, size = 1) +
    geom_point(data = knn_data[knn_data[, id] == anomaly_id, ],aes(x = val, y = get(score)), color = "orange", alpha = 1, size = 3) +
    labs(y="Anomaly Score") +
    facet_wrap( ~indicator, nrow = column_number, scales = "free_x") +
    theme_bw() +
    theme(strip.background = element_rect(fill = "gray80", colour = "black",size = 0.5, linetype = "solid"),
          strip.text = element_text(face = "bold"),
          axis.title.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          legend.position="none")
}


investigate_feature_importance <- function(data, id, score, anomaly_id, num.trees=NULL, ...){
  if (is.null(num.trees)) num.trees <- min(300,2*sqrt(ncol(data)))
  Vectorize(require)(package = c("ranger"),character.only = TRUE)

  data$anomalyRF <- as.factor(data[[id]] %in% anomaly_id)
  remove=c(score,id)
  dataRF <- data[,!colnames(data) %in% remove,drop=FALSE]
  rf <- ranger::ranger(formula=anomalyRF~. , data=dataRF, importance="impurity", num.trees=num.trees, write.forest = FALSE)
  varimp <- data.frame(Importance=ranger::importance(rf))
  varimp[,"Variable"] <- rownames(varimp)


  # create a theme for dot plots, which can be reused
  theme_dotplot <- theme_bw(14) +
    theme(axis.text.y = element_text(size = rel(.75)),
          axis.ticks.y = element_blank(),
          axis.title.x = element_text(size = rel(.75)),
          panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_line(size = 0.5),
          panel.grid.minor.x = element_blank())


  ggplot(varimp, aes(x = Importance, y = reorder(Variable, Importance))) +
    geom_point(color = "orange") +
   theme_dotplot +
    xlab("Importance") +
    ylab("Variable") + ggtitle(paste("Variable importance for anomalies with IDs:", paste(anomaly_id, collapse=",")))
}





investigate_scores_decline <- function(data, id, score, anomaly_id, k = NULL, n_label = 15, ...){
  if (is.null(k)) k <- min(1000,nrow(data)-1)
  #Load library and data
  Vectorize(require)(package = c("FNN"),character.only = TRUE)

    # step 0: Filter data to keep only the K-neighbours
  knn <- suppressMessages(get.knn(data[, !colnames(data) %in% c(id,score)], k, algorithm="cover_tree"))

  rownames(knn$nn.index) <- data[[id]]
  knn_id <- data[[id]][as.vector(knn$nn.index[anomaly_id, ])]
  # print(anomaly_id)
  knn_data <- data[data[, id] %in% c(anomaly_id, knn_id), ]

  # Step 1: Decreasing order
  knn_data <- knn_data[order(knn_data[, score], decreasing = TRUE), ]

  # Plot
  ggplot(data = knn_data, aes(x = 1:(nrow(knn_data)), y = get(score))) +
    geom_point(color = "steelblue", size = 0.7, alpha = 0.8) +
    labs( x = "k-closest neighbours", y="Anomaly Score") +
    geom_text(data = knn_data[1:n_label, ], aes(label = get(id), y = get(score), x = 1:n_label),
              hjust=-1.1, size=2.5, color = "orange") +
    theme_bw() +
    theme(strip.background = element_rect(fill = "gray80", colour = "black",size = 0.5, linetype = "solid"),
          strip.text = element_text(face = "bold"),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank())
}


investigate_regression_tree <- function(data, id, score, anomaly_id, k=NULL, ...){

  # Load library and data
  Vectorize(require)(package = c("FNN", "rpart"),character.only = TRUE)
  if (is.null(k)) k <- min(1000,round(nrow(data)*2/3))
  # step 0: Filter data to keep only the K-neighbours
  knn <- suppressMessages(get.knn(data[, !colnames(data) %in% c(id,score)], k, algorithm="cover_tree"))

  rownames(knn$nn.index) <- data[[id]]
  knn_id <- data[[id]][as.vector(knn$nn.index[anomaly_id, ])]
  # print(anomaly_id)
  knn_data <- data[data[, id] %in% c(anomaly_id, knn_id), ]

    # Step1: Build a large regression tree (with a small cp).
    tree <- rpart::rpart(knn_data[,score] ~ ., data=knn_data[, !colnames(knn_data) %in% c(score, id)] , method = "anova", control = rpart.control(cp = 0.0001))

  # Step2: Pick the tree size that minimizes prediction error.
  # Prediction error rate in cross-validation = Root node error * xerror * 100%
  # Hence we want the cp value (with a simpler tree) that minimizes the xerror.
  best_cp <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]

  # Step3: Prune the tree using the best cp.
  tree_pruned <- prune(tree, cp = best_cp)

  plot(tree_pruned, cex = .8, compress = FALSE, minbranch = 10, nspace = 0.01)
  text(tree_pruned,
       use.n=TRUE, # use.n = TRUE adds number of observations at each node
       xpd = TRUE,  # xpd = TRUE keeps the labels from exteding outside the plot
       #minlength = 1, # length of label
       all=FALSE,
       pretty = TRUE,
       cex=.7,
       digits = 2,
       fancy = FALSE)
}



#' Data visualizations of anomaly score locally around a specific data point
#' @details
#' Function that produces visualizations to understand the anomaly score locally around a specific data point.
#' We believe this should help people to trust scores a made by models even if they don’t fully understand them.
#' Today, 5 visualisazions are implemented;
#' (1) A hierarchical clustering, named "cluster", showing among the top n-anomaly which records belongs to the same cluster
#' a specific record. Finding the commun pattern amoung the cluster may lead to the orign of of the specifi record score.
#' (2) A dots plot, named "neighbours", showing the relationship between the anomly score and each feature for the
#' k nearest neighbours of a specific record.
#' (3) A bar chart, named "feature_importance", showing how sensitive is the anomaly score of a specific record to each of feature.
#' This may help to identify the features behind the score.
#' (4) A dots plot, names "score_decline", showing the decrease in anomaly score among the k nearest neighbours of a specific record.
#' The shape indicates how extrem and how frequent is the anomaly score of a speicif record amoung its neighbours.
#' (5) A Regression tree, named "regression_tree", showing the roots to high score around a specific record.
#'
#' @param x is either of class dataframe, stranger or anomaly. It contains the observations; each row represents an observation
#'  and each variable is stored in one column. It must have at least one column with IDs and one column with the anomaly
#'  score for each ID.
#' @param type is the name of the visualization;
#' (1) A hierarchical clustering, named "cluster", showing among the top n-anomaly which records belongs to the same cluster
#' a specific record. Finding the commun pattern amoung the cluster may lead to the orign of of the specifi record score.
#' (2) A dots plot, named "neighbours", showing the relationship between the anomly score and each feature for the
#' k nearest neighbours of a specific record.
#' (3) A bar chart, named "feature_importance", showing how sensitive is the anomaly score of a specific record to each of feature.
#' This may help to identify the features behind the score.
#' (4) A dots plot, names "score_decline", showing the decrease in anomaly score among the k nearest neighbours of a specific record.
#' The shape indicates how extrem and how frequent is the anomaly score of a speicif record amoung its neighbours.
#' (5) A Regression tree, named "regression_tree", showing the roots to high score around a specific record.
#' @param id is the colname with records IDs
#' @param score is the colname which contains the anomaly score
#' @param anomaly_id is the record ID you want to investigate
#' @param \dots Additional parameters to pass 
#'
#' @details 
#' Extra parameters that can be used in \dots:
#' \itemize{
#' \item check logical indicating if object data should be checked for validity. The default is TRUE, this check is not necessary
#' when data is known to be valid such as when it is the direct result of stranger().
#' \item  keep character vector: names of columns to keep (filter)
#' \item  drop character vector: names of columns to drop (filter)
#' \item  n.cluster is the number of cluster groups to emphasis.
#' This parameter must only be specified with type ="cluster".
#' \item  n.anom is the number of top anomalies to be considered.
#' This parameter must only be specified with type ="cluster".
#' \item  k is the number of neighbours to be considered. This parameter must always be specified,
#' except with type = "cluster".
#' \item  n_label specifies the number of data point to be labelled in the plot.
#' This parameter must only be specified with type ="scores_decline".
#' }
#' 
#' @return A plot
#' @rdname plot
plot.stranger <- function(x,
                          type="cluster",
                          id = ".id",
                          score = NULL,
                          anomaly_id = NULL,...){

  s <- singularize(x,...)


  plot.singular(s, type=type,id=id,score=score,anomaly_id=anomaly_id,...)
}


#' @rdname plot
plot.fortifiedanomaly <- function(x,
                          type="feature_importance",
                          id = ".id",
                          anomaly_id = NULL,score=NULL,
                          ...){

  type <- match.arg(type, c( "feature_importance","neighbours", "regression_tree") ) #"neighbours"

  # anoflag <- attr(x,"meta")$anomalyflag
  # score <- anoflag
  # if (type=="neighbours"){
  #   assertthat::assert_that(!is.null(anomaly_id),msg="Anomaly ID required")
  #   return(investigate_neighbours(data=x, id=id,
  #                   anomaly_id=anomaly_id,
  #                   score=x$meta$anomalyflag))
  # }

  if (type=="feature_importance" & is.null(score)) {
    score  <- names(x)[1]

    if (is.null(anomaly_id)) warning(paste0("No score given to determine anomaly; data have been sorted by ",score,". You should explicitely provide a score (anomaly metric) or will have unexpected results."))
  }



  ## Well-defined ID
  if (is.null(id))    stop("The function required an ID to be defined")
  if  ( anyDuplicated(x$id) > 0 )   stop("The function required an ID to be defined")


  id = match.arg(id, colnames(x))


  ## Well-defined score
  if (is.null(score) | is.numeric(x$score)) {
    stop("You need to specify a numeric column populated with the anomaly score")
  }
  else {
    score = match.arg(score, colnames(x))
  }

  # Select desired feature
  if (hasArg("keep") & hasArg("drop")){
    if (any(keep%in% drop)) stop("Can't assign variables  both in keep and drop arguments")
  }

  if (hasArg("keep")) {
    if (any(keep%in% c(id, score))) stop("Can't remove ID or anomaly score")
    if (!is.null(keep)) {x <- x[, (colnames(x)%in% keep),drop=FALSE]}
  }

  if (hasArg("keep") & hasArg("drop")){
    if (!is.null(drop)) x <- x[,!(colnames(x)%in% drop),drop=FALSE]
  }
  assertthat::assert_that(ncol(x)>0,msg="No more column remaining for investigation; please change keep/drop.")


  # select top anomaly if argument is NULL
  if (is.null(anomaly_id)){
    anodist <- x[[score]]
    sortdorder <- attr(anodist,"sort")
    anomaly_id <- x[[id]][order(sortdorder*x[[score]])][1]
    print(anomaly_id)
  }

data <- as.data.frame(x)

  # Call the plot function
  function_arguments <- list(data = data, id = id, score = score, anomaly_id = anomaly_id, ... )
  do.call(paste("investigate",type , sep="_"), function_arguments)


}


#' @rdname plot
plot.anomalies <- function(x,
                                  type="feature_importance",
                                  id = ".id",
                                  anomaly_id = NULL,
                                  ...){

  if (is.null(anomaly_id)) anomaly_id <- x[1]
  x <- fortify.anomalies(x,colname="flag_anomaly")
  plot.fortifiedanomaly(x,type=type,id=id,anomaly_id=anomaly_id,score="flag_anomaly",...)
}


#' @rdname plot
plot.singular <- function(x,
                          type="cluster",
                          id = ".id",
                          score = NULL,
                          anomaly_id = NULL,
                          ...){

  # Check parameters for validity
  ## Type is one of the options


  type <- match.arg(type, c("cluster","neighbours", "feature_importance", "scores_decline", "regression_tree") )


  if (!(type %in% c( "feature_importance", "regression_tree") )
      & (length(anomaly_id)>1)){
    anomaly_id <- anomaly_id[1]
    warning(paste("Method",type,"works with only one anomaly; first one considered."))
  }


  if (type=="feature_importance" & is.null(score)) {
    score  <- names(x)[1]

    if (is.null(anomaly_id)) warning(paste0("No score given to determine anomaly; data have been sorted by ",score,". You should explicitely provide a score (anomaly metric) or will have unexpected results."))
  }



  ## Well-defined ID
  if (is.null(id))    stop("The function required an ID to be defined")
  if  ( anyDuplicated(x$id) > 0 )   stop("The function required an ID to be defined")


  id = match.arg(id, colnames(x))


  ## Well-defined score
  if (is.null(score) | is.numeric(x$score)) {
    stop("You need to specify a numeric column populated with the anomaly score")
  }
  else {
    score = match.arg(score, colnames(x))
  }

  # Select desired feature
  if (hasArg("keep") & hasArg("drop")){
    if (any(keep%in% drop)) stop("Can't assign variables  both in keep and drop arguments")
  }

  if (hasArg("keep")) {
    if (any(keep%in% c(id, score))) stop("Can't remove ID or anomaly score")
    if (!is.null(keep)) {x <- x[, (colnames(x)%in% keep),drop=FALSE]}
  }

  if (hasArg("keep") & hasArg("drop")){
    if (!is.null(drop)) x <- x[,!(colnames(x)%in% drop),drop=FALSE]
  }
  assertthat::assert_that(ncol(x)>0,msg="No more column remaining for investigation; please change keep/drop.")


  # select top anomaly if argument is NULL
  if (is.null(anomaly_id)){
    anodist <- x[[score]]
    sortdorder <- attr(anodist,"sort")
    anomaly_id <- x[[id]][order(sortdorder*x[[score]])][1]
    print(anomaly_id)
  }



  # keep id and score
  # todo: checks presence of id (+manage) and score
  x <- x[,c(id,score),with=FALSE]
  data <- fortify.singular(x)


  ## Data is Dataframe
  if(!inherits(data, "data.frame")){
    data <- as.data.frame(data)
    message("Your data has been converted to a dataframe to be compatible with ggplot function.")}
  if(inherits(x, "data.table")){
    data <- as.data.frame(data)
    message("Your data has been converted to a dataframe to be compatible with ggplot function.")}


  # Call the plot function
  function_arguments <- list(data = data, id = id, score = score, anomaly_id = anomaly_id, ... )
  do.call(paste("investigate",type , sep="_"), function_arguments)
  #
  # investigate(
  #   data,
  #   type=type,
  #   id=id,
  #   score=score,
  #   anomaly_id=anomaly_id,...)
}

Try the stranger package in your browser

Any scripts or data that you put into this service are public.

stranger documentation built on March 18, 2018, 2:01 p.m.