#' fig_clean
#' Removes color values close to white, also possible to remove text on plot, useful for gridlines on KM curves
#' @param fig.hsl array of pixels in HSL format (hue/saturation/lightness)
#' @param bg_lightness a lightness threshold value between 0 and 1; every pixel with lightness > bg_lightness is considered background and removed (default: 0.1).
#' @param attempt_OCR a logical indicating whether to attempt text recognition (default: False)
#' @param word_sensitivity A sensitivity percentage value for word recognition, potential pixel patterns are considered words if confidence > word_sensitivity (default: 30)
#'
#'
#' @return fig.df: a dataframe with x,y and h,s,l values for each remaining pixel
#' @export
#'
#' @examples # fig_clean(fig.hsl = figure, bg_lightness = 0.1, attempt_OCR = T, word_sensitivity = 30)
#'
fig_clean <- function(fig.hsl, bg_lightness = 0.1, attempt_OCR = F, word_sensitivity = 30){
if(attempt_OCR == T){
# flip arrays for OCR
fig.l <- fig.hsl[,,3]
fig.l_flip <- fig.l[dim(fig.l)[1]:1,]
fig.hsl_flip <- fig.hsl[dim(fig.hsl)[1]:1,,]
# Taken from a stack exchange basically takes xml output into a tibblefunc
k1<- tesseract::ocr( magick::image_read(jpeg::writeJPEG(fig.l_flip)),HOCR = T)
doc <- xml2::read_xml(k1)
nodes <- xml2::xml_find_all(doc, ".//span[@class='ocrx_word']")
words <- xml2::xml_text(nodes)
meta <- xml2::xml_attr(nodes, 'title')
bbox <- stringr::str_replace(stringr::str_extract(meta, "bbox [\\d ]+"), "bbox ", "")
conf <- as.numeric(stringr::str_replace(stringr::str_extract(meta, "x_wconf.*"), "x_wconf ", ""))
if(any(conf > word_sensitivity)){
res_ocr <- tibble::tibble(confidence = conf, word = words, bbox = bbox)
bbox_mat <- apply(stringr::str_split(res_ocr$bbox, pattern = " ",simplify = T), 2, as.numeric)
bbox_mat <- as.data.frame(bbox_mat)
colnames(bbox_mat) <- c("x0","y0","x1","y1")
res_ocr <- dplyr::bind_cols(res_ocr,bbox_mat)
res_ocr <- dplyr::filter(res_ocr, confidence > word_sensitivity)
# Removing identified words
for(i in 1:dim(res_ocr)[1] ){
fig.hsl_flip[res_ocr$y0[i]:res_ocr$y1[i],res_ocr$x0[i]:res_ocr$x1[i],] <- c(0,0,1)
}
}
# flip array back
fig.hsl <- fig.hsl_flip[dim(fig.hsl_flip )[1]:1, ,]
}
# removing based on white sensitivity
fig.l <- fig.hsl[,,3]
non_bg.mat <- which(1 - fig.l > bg_lightness, arr.ind=T)
non_bg_h <- fig.hsl[cbind(non_bg.mat, rep(1, dim(non_bg.mat)[1]))]
non_bg_s <- fig.hsl[cbind(non_bg.mat, rep(2, dim(non_bg.mat)[1]))]
non_bg_l <- fig.hsl[cbind(non_bg.mat, rep(3, dim(non_bg.mat)[1]))]
# convert remaining points to dataframe
fig.df <- data.frame(non_bg.mat, as.numeric(non_bg_h), non_bg_s, non_bg_l)
colnames(fig.df) <- c("y", "x", "h", "s", "l")
if(attempt_OCR){
fig.df <- fig.df[fig.df$l > 0.1,]
}
return(fig.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.