R/process_checkboxes.R

Defines functions get_chkbox_wrapper get_chkbox_questions get_chkbox_options identify_chkboxes_by_parts identify_chkboxes

Documented in get_chkbox_options get_chkbox_questions get_chkbox_wrapper identify_chkboxes identify_chkboxes_by_parts

#'@title get checkboxes
#'@param img_file file of image or np array of image
#'@export
identify_chkboxes <- function(img_file){
  # Read the image
  #tictoc::tic()
  if (!"numpy.ndarray" %in% class(img_file)) {
    gray <-  cv2$imread(normalizePath(img_file), 0L) %>% 
      reticulate::np_array(dtype = "uint8")
  } else {
    gray <- img_file
  }

  gray <-  cv2$GaussianBlur(gray, reticulate::tuple(7L, 7L), 0L) %>%
    reticulate::np_array(dtype = "uint8")

  # threshold the image
  c(ret, thresh1) %<-% cv2$threshold(gray ,128,255,
                                     bitwOr(cv2$THRESH_BINARY, cv2$THRESH_OTSU))
  thresh1 <- 255-thresh1
  # find contours in the image
  c(cnts, hirachy) %<-% cv2$findContours(thresh1 %>%
                                           reticulate::np_array(dtype = "uint8"),
                                         cv2$RETR_EXTERNAL, cv2$CHAIN_APPROX_NONE)
  c(cnts, boundingBoxes) %<-% sort_contours(cnts, "top-to-bottom", hmax = 300)
  #tictoc::toc()
  orig <-  gray
  i <-  0
  threshold_max_area <- 3000;
  threshold_min_area <- 200
  #tictoc::tic()
  checkboxes_cnts <- cnts %>% purrr::map(
    function(c) {
      area <-  cv2$contourArea(c)
      if (area > threshold_max_area | area < threshold_min_area) return(NULL)
      peri <- cv2$arcLength(c, T)
      approx <- cv2$approxPolyDP(c, 0.05 * peri, T)
      c(x, y, w, h) %<-% cv2$boundingRect(approx)
      aspect_ratio <- w / h
      if (aspect_ratio < 0.90 | aspect_ratio > 1.1) return(NULL)
      sqaure_chk <- ptwise_chk_approx(approx)
      if (sqaure_chk$flag){
        return(c)
      } else {
        return(NULL)
      }
    }
  )
  #tictoc::toc()

  if (length(checkboxes_cnts) > 0) {
    checkboxes_cnts[sapply(checkboxes_cnts, is.null)] <- NULL
    if (length(checkboxes_cnts) == 0) return(NULL)
    checkboxes_df <- checkboxes_cnts %>%
      purrr::map(
        function(c) {
          mat <- c %>% cv2$boundingRect() %>%
            unlist() %>% t() %>% tibble::as_tibble()
          colnames(mat) <- c('x', 'y', 'w', 'h')
          return(mat)
        }) %>% dplyr::bind_rows()
    return(checkboxes_df %>%
             dplyr::mutate(chkbox_id = seq(1, dplyr::n(), 1)))
  }
  # 1:length(cnts) %>% purrr::map(
  #   function(i) {
  #     c(x, y, w, h) %<-% cv2$boundingRect(cnts[[i]])
  #     new_img <- orig %>% reticulate::py_to_r() %>% .[y:(y+h), x:(x+w)]
  #     cv2$imwrite(file.path('tmp1', paste0(i,'.png')), new_img)
  #   }
  # )
}

#'@title get checkboxes by parts
#'@param bounds_df identify bounds data frame
#'@param color_img colored image
#'@export
identify_chkboxes_by_parts <- function(bounds_df, color_img) {
  removed_img <- remove_color(color_img) %>% reticulate::np_array('uint8')
  res_main <- removed_img %>% identify_chkboxes() #%>% arrange(y, x)
  chkbox_cnts <- 1:nrow(bounds_df) %>%
    purrr::map(function(l) {
      row <- bounds_df[l, ]
      part_img <- quick_img_chk(row, removed_img, NULL)
      res <- part_img %>% identify_chkboxes()
      if (!is.null(res)) res <- res %>%
        dplyr::mutate(x = x + row$x, y = y + row$y)
    }) %>% dplyr::bind_rows() %>%
    dplyr::mutate(chkbox_id = seq(1, dplyr::n(), 1))
  
  if (is.null(res_main)) return(chkbox_cnts)
  
  res_ids <- purrr::cross_df(list(id1 = res_main$chkbox_id, id2 = chkbox_cnts$chkbox_id)) %>% 
    dplyr::inner_join(res_main %>% dplyr::select(id1:=chkbox_id, x, y), by = 'id1') %>%
    dplyr::inner_join(chkbox_cnts %>% dplyr::select(id2:=chkbox_id, x, y), by = 'id2') %>% 
    dplyr::mutate(dist = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>% 
    dplyr::group_by(id1) %>% dplyr::summarise(dist = min(dist)) %>% 
    dplyr::filter(dist > 50) %>% dplyr::pull(id1)
  
  chkbox_cnts1 <- dplyr::bind_rows(
    chkbox_cnts, 
    res_main %>% dplyr::filter(chkbox_id %in% res_ids)
  )%>% dplyr::mutate(chkbox_id = seq(1, dplyr::n(), 1))
  
  return(chkbox_cnts1)
}

#'@title get checkbox options
#'@param chkbox_df data frame with checkbox information
#'@param words_df data frame with all the words info from azure
#'@param question_df data frame with all the questions
#'@export
get_chkbox_options <- function(chkbox_df, words_df, question_df) {
  chkbox_df1 <- chkbox_df %>% dplyr::inner_join(question_df, by = 'chkbox_id')%>%
    dplyr::group_by(line_text) %>% dplyr::mutate(row = floor(mean(y))) %>%
    dplyr::arrange(line_text, x) %>%
    dplyr::mutate(leadx = dplyr::lead(x, 1, default = Inf), 
                  lagx = dplyr::lag(x, 1, default = 0)) %>%
    dplyr::ungroup() %>%
    dplyr::select(-line_text)

  joint_df <- purrr::cross_df(list(combo_id = words_df$combo_id,
                                            chkbox_id = chkbox_df1$chkbox_id)) %>%
    dplyr::left_join(words_df, by = 'combo_id') %>%
    dplyr::left_join(chkbox_df1, by = 'chkbox_id') %>%
    dplyr::mutate(
      #diffx = x.x  - x.y - w.y, 
      diffx = x.x + w.x - x.y,
      diffy = y.x - y.y,
      dist = sqrt(diffx^2 + diffy^2)
    ) %>%
    dplyr::mutate(text = gsub("\\[|\\]|\\|", "", text) %>%
                    stringr::str_squish()) %>%
    dplyr::filter(text != "") %>%
    dplyr::distinct() 
  
  preceding_word_df  <- joint_df %>%
    dplyr::filter(x.x < x.y, abs(diffy) <= 25) %>% 
    dplyr::group_by(chkbox_id) %>%
    dplyr::mutate(min_diffx = diffx[abs(diffx) == min(abs(diffx))]) %>% 
    dplyr::filter(diffx <= min_diffx, x.x > lagx) %>% 
    #dplyr::filter(abs(diffy) <= 25, diffx > -25, x.x <= leadx)%>%
    dplyr::summarise(pre_text = paste(text, collapse = " "))
  
  following_word_df <- joint_df %>%
    dplyr::filter(x.x > (x.y + w.y), abs(diffy) <= 25) %>% 
    dplyr::group_by(chkbox_id) %>%
    dplyr::mutate(min_diffx = diffx[abs(diffx) == min(abs(diffx))]) %>% 
    dplyr::filter(diffx >= min_diffx, x.x < leadx) %>% 
    #dplyr::filter(abs(diffy) <= 25, diffx > -25, x.x <= leadx)%>%
    dplyr::summarise(follow_text = paste(text, collapse = " "))

  options_df <- preceding_word_df %>% 
    dplyr::left_join(following_word_df)
  return(options_df)
}

#'@title get checkbox questions
#'@param chkbox_df data frame with checkbox information
#'@param lines_df data frame with all the lines info from azure
#'@export
get_chkbox_questions <- function(chkbox_df, lines_df) {
  question_df <- purrr::cross_df(list(line_id = lines_df$line_id,
                                      chkbox_id = chkbox_df$chkbox_id)) %>%
    dplyr::left_join(lines_df, by = 'line_id') %>%
    dplyr::left_join(chkbox_df, by = 'chkbox_id') %>%
    dplyr::mutate(
      diffx = x.x  - x.y - w.y, diffy = y.x - y.y,
      dist = sqrt(diffx^2 + diffy^2)
    ) %>%
    dplyr::mutate(text = gsub("\\[|\\]", "", text) %>%
                    stringr::str_squish()) %>%
    dplyr::filter(text != "")  %>%
    dplyr::group_by(chkbox_id) %>%
    dplyr::filter(diffy >= -25, diffy <= 25) %>%
    #dplyr::filter(!grepl('yes no', ignore.case = T, text)) %>%
    dplyr::arrange(x.x) %>%
    dplyr::distinct() %>%
    dplyr::summarise(line_text = paste(text, collapse = " "))
  return(question_df)
}

#'@title get checkboxes questions, options, selections
#'@param chkbox_df data frame with checkbox information
#'@param words_df  data frame with all the words info from azure
#'@param lines_df  data frame with all the lines info from azure
#'@param img       the image np array of the page
#'@export
get_chkbox_wrapper <- function(chkbox_df, words_df, lines_df, img) {

  ##find out the question
  question_df <- get_chkbox_questions(chkbox_df = chkbox_df,
                                      lines_df = lines_df)
  ##find choice options
  preceding_word_df <- get_chkbox_options(chkbox_df = chkbox_df,
                  words_df = words_df, question_df = question_df)

  question_df1 <- question_df %>%
    dplyr::left_join(preceding_word_df %>% dplyr::select(chkbox_id, text),
                     by = "chkbox_id") %>%
    dplyr::left_join(chkbox_df, by = "chkbox_id")

  question_df1$box <- 1:nrow(question_df1) %>%
    purrr::map(~quick_img_chk(question_df1[., ], img, NULL))

  question_df1$box_mu <- 1:nrow(question_df1) %>%
    purrr::map_dbl(~quick_img_chk(question_df1[., ], img, NULL)$mean() %>%
                     reticulate::py_to_r() %>% as.numeric())

  question_df1 <- question_df1 %>%
    dplyr::mutate(selected = ifelse(box_mu < mean(box_mu), T, F))

  return(question_df1)
}
Atan1988/alvision documentation built on Dec. 15, 2020, 7:03 a.m.