# R/functions.R


# Ad-hoc debugging helper: prints `esse`; the first argument is ignored.
debuga <- function(x, esse){
  print(esse)
}


#' Assemble the command to execute PMD
#'
#' @param pmd_path path to PMD
#' @param code_path path to code file
#' @param rule_path path to rule
#' @param output name of the output xml file, without extension
#' 
#' @import stringr
#'
#' @return PMD command to generate the alerts
#' @export
#'
#' @examples
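#' # Illustrative sketch: the paths below are hypothetical; the command is only
#' # assembled here, not executed.
#' assemble_pmd_command(
#'   pmd_path = "pmd/bin/pmd.bat",
#'   code_path = "old/code.java",
#'   rule_path = "rulesets/java/quickstart.xml",
#'   output = "old_alerts"
#' )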
assemble_pmd_command <- function(pmd_path, code_path, rule_path, output){
  command <- str_glue("{pmd_path} -d {code_path} -f xml -R {rule_path} -reportfile {output}.xml")
  command
}


#' Make lines of code nice in a markdown document
#'
#' @param strings lines of code
#' @param size_line_of_code number of columns of the lines of code shown
#'
#' @return a string to be used in a markdown file
#' @export
#' 
#' @import stringr
#' 
#' @import tibble
#' @import dplyr
#'
#' @examples
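#' # Numbers two short lines of Java code for display in markdown.
#' decorate_code(c("int a = 1;", "int b = a + 1;"), size_line_of_code = 40)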
decorate_code <- function(strings, size_line_of_code = 80) {
  strings %>%
    enframe(name = "line", value = "code") %>%
    mutate(
      line = as.character(.data$line) %>%  str_pad(width = 3, side = "left"),
      code = .data$code %>%  str_trunc(width = size_line_of_code, ellipsis = "..."),
      final_code = str_glue("/*{line}*/{code}")
    ) %>%
    pull(.data$final_code)
}



#' Read and format XML file generated by PMD
#'
#'
#'
#' @param file file to be read
#' @param include_file_in_output should the name of the analysed file be included as a column in the output?
#'
#' @return alerts from PMD file
#' @export
#' 
#' @import purrr
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang .data
#' @examples
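#' # Illustrative sketch: "old_alerts.xml" is a hypothetical report produced by
#' # running the command assembled by assemble_pmd_command().
#' \dontrun{
#' alerts <- read_pmd_xml("old_alerts.xml")
#' }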
read_pmd_xml <- function(
  file,
  include_file_in_output = FALSE
){
  
  #file <- "internal.xml"

  empty <- tibble(
    linha  = numeric(),
    beginline = integer(),
    endline   = integer(),
    begincolumn = integer(),
    endcolumn = integer(),
    rule = character(),
    ruleset = character(),
    package = character(),
    class = character(),
    priority= integer(),
    variable = character(),
    method= character(),
    id_alert  = integer()
  )

  content_xml <- xml2::read_xml(file)
  
  alerts_initial <- content_xml %>% 
    xml2::xml_children() %>% 
    xml2::xml_children() %>% 
    xml2::xml_attrs() %>% 
    map_df(.f = ~enframe(x = .x ))
  

  if (nrow(alerts_initial) != 0){
    alerts <- alerts_initial %>% 
      mutate(primeiro_campo = if_else(.data$name == "beginline", 1, 0)  ) %>% 
      mutate(linha = cumsum(.data$primeiro_campo) ) %>% 
      select(-.data$primeiro_campo) %>% 
      pivot_wider(names_from = .data$name, values_from = .data$value) %>% 
      mutate(id_alert = row_number()) %>% 
      mutate_at(
        vars(one_of(c("beginline", "endline", "begincolumn", "endcolumn", "priority"))),
        as.integer
      ) %>% 
      bind_rows(empty)
    
    
    if(include_file_in_output){
      
      files_count <- content_xml %>% 
        xml2::xml_children() %>% 
        xml2::xml_length() %>% 
        map_df(.f = ~enframe(x = .x )) %>% 
        filter(
          value != 0
        ) %>% 
        select(
          count_items = value
        )
      
      files <- content_xml %>% 
        xml2::xml_children() %>% 
        xml2::xml_attrs() %>% 
        map_df(.f = ~enframe(x = .x )) %>% 
        filter(
          name == "name"
        ) %>% 
        select(
          file = value
        )
      
      files_with_count <- bind_cols(files, files_count) %>% 
        rowwise() %>% 
        mutate(
          items = list(1:count_items)
        ) %>% 
        ungroup() %>% 
        unnest(
          items
        ) %>% 
        select(
          file
        )
      
      alerts <- bind_cols(
        alerts, files_with_count
      )     
      

    }
    
  }
  else{
    alerts <- empty
  }
  
  write_rds(alerts, "alerts.rds")
  
  alerts
}


#' Assembles the diff command between two code files
#'
#' @param code_path_left path to "left" file
#' @param code_path_right path to "right" file 
#' @param output_left piece of output file name representing left
#' @param output_right piece of output file name representing right 
#'
#' @return git diff command
#' @export
#' 
#' @import stringr
#'
#'
#' @examples
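#' # Illustrative sketch: the paths below are hypothetical; the command is only
#' # assembled here, not executed.
#' assemble_diff_command(
#'   code_path_left = "old/code.java",
#'   code_path_right = "new/code.java",
#'   output_left = "old",
#'   output_right = "new"
#' )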
assemble_diff_command <- function(code_path_left, code_path_right,  output_left, output_right){
  saida <- str_glue("git diff -U0 --patience --numstat --summary --output={output_left}_{output_right}.diff --no-index {code_path_left} {code_path_right}")
  saida
}

#' Return number of lines of the file
#'
#' @param file file to be read
#'
#' @return number of lines
#' @export
#'
#' @importFrom readr read_table
#' 
#' @examples
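#' # A small self-contained example using a temporary file.
#' tmp <- tempfile(fileext = ".txt")
#' writeLines(c("first line", "second line"), tmp)
#' read_number_of_lines(tmp)  # 2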
read_number_of_lines <- function(file){
  
  saida <- readr::read_table(file = file, col_names = FALSE, skip_empty_rows = FALSE) %>% 
    nrow() 
  saida
}

#' Map lines from two files based on the diff result on them
#'
#' @param file diff file
#' @param lines_prev_param number of lines of the left (prev) file
#' @param lines_post_param number of lines of the right (post) file
#' @param diff_content diff output supplied directly as a character vector of lines; when provided, it is used instead of `file`
#'
#' @return a tibble with the corresponding lines of both files
#' @export
#' 
#' @importFrom rlang .data
#'
#' @examples
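#' # Illustrative sketch: "old_new.diff" is a hypothetical file produced by the
#' # command assembled by assemble_diff_command(); the line counts would come
#' # from read_number_of_lines().
#' \dontrun{
#' map_lines(
#'   file = "old_new.diff",
#'   lines_prev_param = 120,
#'   lines_post_param = 125
#' )
#' }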
map_lines <- function(
  file = NA, 
  lines_prev_param, 
  lines_post_param,
  diff_content = NA
){

  
  
  # file <- "C:\\doutorado\\AnaliseTwitter4j\\match_algorithm_description\\old_original_new_1.diff"

  lines_prev_param <- as.integer(lines_prev_param)
  
  lines_post_param <- as.integer(lines_post_param)
  
  if(!is.na(file)){
    file_size <- file.size(file)
  } else {
    file_size <- 1
  }

  if(file_size != 0){
    
    if(length(diff_content) == 1 && is.na(diff_content)){
      
      diff_content_prepared <- read_table(file, col_names = FALSE ) 
      
    } else {
      diff_content_prepared <- enframe(diff_content, value = "X1") %>% 
        select(X1)
    }
    

    diff_marks <- diff_content_prepared %>% 
      rename(text = 1) %>% 
      mutate(
        marca_inicio_diff = str_detect(.data$text, "diff --git"),
        id_diff = cumsum(.data$marca_inicio_diff),
        diff_title = if_else(.data$marca_inicio_diff, .data$text, NA_character_)
      ) %>% 
      select(-.data$marca_inicio_diff) %>% 
      fill(.data$diff_title, .direction = "down") %>% 
      filter(str_starts(.data$text, "@@")) %>% 
      separate(.data$text, sep = " ", into = c("mark", "minus", "plus"), extra = "drop" ) %>% 
      select(-.data$mark) %>% 
      separate(.data$minus, into = c("line_remove", "n_remove"), sep = ",") %>% 
      separate(.data$plus, into = c("line_add", "n_add"), sep = ",") %>% 
      mutate(
        n_remove = if_else(is.na(.data$n_remove),"1",.data$n_remove),
        n_add = if_else(is.na(.data$n_add),"1",.data$n_add)
      ) %>% 
      mutate(
        line_remove = str_remove(.data$line_remove,"\\-") %>% str_trim(),
        line_add = str_remove(.data$line_add,"\\+") %>% str_trim()
      ) %>% 
      separate(
        .data$diff_title, 
        sep = " ", 
        into = 
          c(
            "diff", 
            "git", 
            "file_prev",
            "file_post"
          ), 
        extra = "drop" 
      ) %>% 
      select(c(-.data$diff,-.data$git)) %>% 
      mutate(
        file_post = str_replace(.data$file_post, "b/",""),
        file_prev = str_replace(.data$file_prev, "a/","")
      ) %>% 
      mutate(
        lines_prev = lines_prev_param,
        lines_post = lines_post_param
      ) %>%    
      mutate_at(
        vars(ends_with("_add")),
        as.integer
      ) %>% 
      mutate_at(
        vars(ends_with("_remove")),
        as.integer
      ) %>% 
      mutate(
        line_add = if_else(.data$n_add == 0, .data$line_add + 1L, .data$line_add)
      ) %>% 
      mutate(
        end_remove = .data$line_remove + .data$n_remove - 1L,
        end_add = .data$line_add + .data$n_add - 1L
      ) %>% 
      mutate(
        line_remove = if_else(.data$n_remove == 0 | is.na(.data$n_remove) , .data$line_remove+1L, .data$line_remove ),
        end_remove = if_else(.data$n_remove == 0 | is.na(.data$n_remove) , .data$end_remove+1L, .data$end_remove )
      ) %>%
      group_by(.data$id_diff) %>% 
      mutate(
        id_diff_id = row_number(),
        n_diff = n()
      ) %>% 
      ungroup() 
    
    
    last_diff <- diff_marks %>% 
      group_by(.data$id_diff) %>% 
      summarise(
        line_remove = first(.data$lines_prev) + 1L ,
        n_remove = NA,
        line_add = first(.data$lines_post) + 1L ,
        n_add = NA,
        file_prev = first(.data$file_prev),
        file_post = first(.data$file_post),
        lines_prev = first(.data$lines_prev),
        lines_post = first(.data$lines_post),
        end_remove = NA,
        end_add = NA,
        id_diff_id = last(.data$id_diff_id) + 1L,
        n_diff = first(.data$n_diff)
      ) %>% 
      ungroup()
    
    map <- diff_marks %>% 
      bind_rows(last_diff) %>% 
      arrange(.data$id_diff, .data$id_diff_id) %>% 
      mutate(
        end_remove_prev = lag(.data$end_remove),
        end_add_prev = lag(.data$end_add)
      ) %>% 
      mutate(
        end_remove_prev = if_else(is.na(.data$end_remove_prev),0L, .data$end_remove_prev),
        end_add_prev = if_else(is.na(.data$end_add_prev),0L, .data$end_add_prev)
      ) %>% 
      mutate(
        line_add = if_else(is.na(.data$line_add),0L, .data$line_add)
      ) %>% 
      filter(!is.na(.data$line_remove )) %>% 
      mutate(
        map_remove = map2(.x = (.data$end_remove_prev + 1L), .y = (.data$line_remove - 1L),.f = function(x, y) x:y),
        map_add = map2(.x = (.data$end_add_prev+1L), .y = (.data$line_add - 1L),.f = function(x, y) x:y)
      ) %>%
      filter(!is.na(.data$lines_post)) %>%
      unnest(cols = c(.data$map_remove, .data$map_add )) %>% 
      select(
        .data$lines_post,
        .data$lines_prev,
        .data$file_prev,
        .data$file_post,
        .data$map_remove,
        .data$map_add
      ) 
    
    
    post_sem_prev <- diff_marks %>% 
      select(.data$lines_post, .data$file_post, .data$file_prev) %>% 
      distinct() %>% 
      replace_na(list(lines_post = 1)) %>% 
      mutate( lines =  map(.x = .data$lines_post, .f = function(x){tibble(map_add = 1:x)} )) %>% 
      unnest(.data$lines) %>% 
      anti_join(map, by = c("file_post","map_add" )) 
    
    prev_sem_post <- diff_marks %>% 
      select(.data$lines_prev, .data$file_prev, .data$file_post) %>% 
      distinct() %>% 
      replace_na(list(lines_prev = 1)) %>% 
      mutate( lines =  map(.x = .data$lines_prev, .f = function(x){tibble(map_remove = 1:x)} )) %>% 
      unnest(.data$lines) %>% 
      anti_join(map, by = c("file_post","map_remove" )) 
    
    
    
    final_map <- map %>%
      bind_rows(post_sem_prev) %>%
      bind_rows(prev_sem_post) %>% 
      mutate(
        changed = sum((is.na(.data$map_remove) | is.na(.data$map_add) ))
      ) %>% 
      rowwise() %>% 
      mutate(
        min_map = max(c(.data$map_remove, .data$map_add), na.rm = TRUE)
      ) %>%
      arrange(.data$min_map)   
    
    
    info_map_lines <- list(
      file = file,
      lines_prev_param = lines_prev_param,
      lines_post_param = lines_post_param ,
      output_function = final_map
    )
    
    final_map <- final_map %>% 
      mutate(equal = FALSE)
    
  }
  else{
    final_map <- tibble(map_remove = 1:lines_prev_param, map_add = 1:lines_post_param) %>% 
      mutate(equal = TRUE)
  }
  
  
  final_map
  
 
}


# Aggregate alerts by line: one row per (beginline, rule) pair, appending the
# number of occurrences to the (possibly truncated or mnemonic) rule name.
aggregate_alerts_by_line <-
  function(
    alerts, 
    trunc_rule_length_param = NA_real_,
    use_mnemonic = FALSE
    ) {
    

    if(use_mnemonic){
      trunc_rule_length <- 10000
    }
    else{
      trunc_rule_length <-
        if_else(is.na(trunc_rule_length_param),
                10,
                trunc_rule_length_param)
    }
    
    output <- alerts %>%
      mutate(beginline = as.integer(.data$beginline)) %>%
      group_by(.data$beginline,
               .data$rule) %>%
      summarise(n = n()) %>%
      mutate(
        rule_mnemonic = extract_mnemonic(.data$rule)        
      ) %>% 
      mutate(rule = if_else(
        is.na(trunc_rule_length_param),
        .data$rule,
        str_trunc(
          .data$rule,
          width = trunc_rule_length,
          side = "right",
          ellipsis = ""
        ) %>% as.character()
      )) %>%
      mutate(
        rule = if_else(n == 1, .data$rule, str_glue("{.data$rule}({n})") %>% as.character()),
        rule_mnemonic = if_else(n == 1, .data$rule_mnemonic, str_glue("{.data$rule_mnemonic}({n})") %>% as.character())
      )
    
    output
  }

#' Make lines of code look nice in markdown, with their alerts 
#'
#' It is called from read_and_decorate_code_and_alerts.
#'
#' @param strings lines of code
#' @param alerts alerts 
#' @param region_only include only regions near alerts?
#' @param region_size size of the region near alerts contained in the output
#' @param size_line_of_code size of lines of code in the markdown output
#' @param length_alert_name size of lines of alert names in the markdown output
#' @param use_mnemonic use short rule mnemonics instead of full rule names?
#'
#' @import tibble
#' @import dplyr
#' @return a string to be used in a markdown file
#'
#' @examples
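#' # Illustrative sketch: the inputs follow the debug notes inside this function
#' # and are hypothetical.
#' \dontrun{
#' decorate_code_and_alerts(
#'   strings = readr::read_lines("old/code.java"),
#'   alerts = examples_executed$pmd_output[[1]],
#'   region_only = TRUE,
#'   region_size = 3
#' )
#' }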
decorate_code_and_alerts <-
  function(strings,
           alerts,
           region_only = FALSE,
           region_size = 3,
           size_line_of_code = 160,
           length_alert_name = 35,
           use_mnemonic = FALSE
                 
    ) {
    

    alert <- alerts %>%
      as_tibble() %>%
      select(.data$beginline, .data$rule) %>%
      aggregate_alerts_by_line() 

    if(use_mnemonic){
      length_alert_name_with_mnemonic <- alert$rule_mnemonic %>% str_length() %>% max(na.rm = TRUE)
    }else{
      length_alert_name_with_mnemonic = length_alert_name
    }
    

    max_rule <- alert %>%
      mutate(
        field_to_be_considered = 
          if_else(
            use_mnemonic,
            .data$rule_mnemonic,
            .data$rule
          )
      ) %>% 
      pull(.data$field_to_be_considered) %>%
      str_length() %>%
      max(na.rm = TRUE)
    
    
    
    output <- strings %>%
      enframe(name = "line", value = "code") %>%
      left_join(alert,
                by = c("line" = "beginline")) %>%
      mutate(
        alert_mark_up = if_else(is.na(.data$rule), NA_integer_, .data$line ),  
        alert_mark_down = if_else(is.na(.data$rule), NA_integer_, .data$line )  
      ) %>%
      fill(
        .data$alert_mark_up,
        .direction = "up"
      ) %>% 
      fill(
        .data$alert_mark_down,
        .direction = "down"
      ) %>% 
      replace_na(
        list(
          alert_mark_down = 0,
          alert_mark_up = length(strings)
        )
      ) %>% 
      mutate(
        dist_up = .data$line - .data$alert_mark_down,
        dist_down = .data$alert_mark_up - .data$line
      ) %>% 
      rowwise() %>%
      mutate(
        min_dist = min(.data$dist_up, .data$dist_down)
      ) %>% 
      mutate(
        code = if_else(.data$min_dist == region_size + 1, "/* ...  */", .data$code )
      ) %>% 
      filter(
        .data$min_dist <= region_size + 1
      ) %>% 
      ungroup() %>% 
      replace_na(
        list(
          rule = "",
          rule_mnemonic = ""
        )
      ) %>%
      mutate(
        line = as.character(.data$line) %>%  str_pad(width = 3, side = "left"),
        code = .data$code %>%  str_trunc(width = size_line_of_code - length_alert_name_with_mnemonic, ellipsis = "..."),
        final_rule =
          if_else(
            replicate(n = .data$rule_mnemonic %>% length(),use_mnemonic),
            .data$rule_mnemonic %>% str_pad(width = length_alert_name_with_mnemonic, side = "right"),
            .data$rule %>% str_pad(width = length_alert_name_with_mnemonic, side = "right")
          )
        ,
        final_code = str_glue("/*{.data$line}-{.data$final_rule}*/{code}")
      ) %>%
      pull(.data$final_code) %>%
      identity()

    if(use_mnemonic){

      complement <- alert %>% 
        mutate(
          legend = str_glue("{rule_mnemonic}: {rule}")
        ) %>% 
        pull(legend) %>% 
        unique() %>% 
        str_flatten(collapse = "\n")

      output <- c(output,"\n", complement)
    }
    
    output
    
  }

#' Make two versions of lines of code look nice, side by side, in markdown, with their alerts
#'
#' @param strings_old_param source code of the old version
#' @param alerts_old_param alerts of the old version
#' @param strings_new_param source code of the new version
#' @param alerts_new_param alerts of the new version
#' @param map_param line map between new and old versions
#' @param region_only show only the regions where alerts are?
#' @param region_size size of the region near alerts to show
#' @param length_alert_name_side_by_side length of the alert name in the markdown output
#' @param size_line_of_code_side_by_side size of each line of code in the markdown output
#' @param use_mnemonic use short rule mnemonics instead of full rule names?
#'
#' @import tibble
#' @import dplyr
#'
#' @return code that can be put on markdown document
#'
#' @examples
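#' # Illustrative sketch: the inputs follow the debug notes inside this function
#' # and are hypothetical.
#' \dontrun{
#' decorate_code_alerts_mapped(
#'   strings_old_param = readr::read_lines("old/code.java"),
#'   alerts_old_param = examples_executed$pmd_output[[1]],
#'   strings_new_param = readr::read_lines("new/code.java"),
#'   alerts_new_param = examples_executed$pmd_output[[2]],
#'   map_param = examples_crossed$lines_map[[1]]
#' )
#' }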
decorate_code_alerts_mapped <-
  function(strings_old_param,
           alerts_old_param,
           strings_new_param,
           alerts_new_param,
           map_param,
           region_only = FALSE,
           region_size = 3,
           length_alert_name_side_by_side = 14,
           size_line_of_code_side_by_side = 77,           
           use_mnemonic = FALSE
           ) {
    # for debug
    # strings_old_param <-  read_lines("old/code.java")
    # strings_new_param <-  read_lines("new/code.java")
    # alerts_old_param <-  examples_executed$pmd_output[[1]]
    # alerts_new_param <-  examples_executed$pmd_output[[2]]
    # map_param <-  examples_crossed$lines_map[[1]]
    # region_only = FALSE
    # region_size = 3      
    

    map <- map_param %>%
      select(line_old = .data$map_remove,
             line_new = .data$map_add)
    
    strings_old <-
      strings_old_param %>% enframe(name = "line_old", value = "code_old") %>% replace_na(list(code_old = ""))
    
    alerts_old <- alerts_old_param %>%
      aggregate_alerts_by_line(trunc_rule_length_param = length_alert_name_side_by_side, use_mnemonic = TRUE) %>%
      select(
        line_old = .data$beginline, 
        rule_old = .data$rule,
        rule_mnemonic_old = .data$rule_mnemonic,
      )
    
    strings_new <-
      strings_new_param %>% enframe(name = "line_new", value = "code_new") %>% replace_na(list(code_new = ""))
    
    alerts_new <- alerts_new_param %>%
      aggregate_alerts_by_line(trunc_rule_length_param = length_alert_name_side_by_side, use_mnemonic = TRUE) %>%
      select(
        line_new = .data$beginline, 
        rule_new = .data$rule,
        rule_mnemonic_new = .data$rule_mnemonic,
      )
    
    max_mnemonic_length <- c(alerts_new$rule_mnemonic_new, alerts_old$rule_mnemonic_old) %>% 
      str_length() %>% 
      max(na.rm = TRUE)

    if(use_mnemonic){
      length_alert_name_side_by_side <- max_mnemonic_length
    }
    
    
    saida <- map %>%
      left_join(strings_old,
                by = c("line_old")) %>%
      left_join(alerts_old,
                by = c("line_old")) %>%
      left_join(strings_new,
                by = c("line_new")) %>%
      left_join(alerts_new,
                by = c("line_new")) %>%
      ungroup() %>% 
      mutate(
        id_line = row_number(), 
        alert_mark_up = if_else(is.na(.data$rule_new) & is.na(.data$rule_old) , NA_integer_, .data$id_line ),  
        alert_mark_down = if_else(is.na(.data$rule_new) & is.na(.data$rule_old), NA_integer_, .data$id_line )  
      ) %>%
      fill(
        .data$alert_mark_up,
        .direction = "up"
      ) %>% 
      fill(
        .data$alert_mark_down,
        .direction = "down"
      ) %>% 
      replace_na(
        list(
          alert_mark_down = 0,
          alert_mark_up = nrow(map)
        )
      ) %>% 
      mutate(
        dist_up = .data$id_line - .data$alert_mark_down,
        dist_down = .data$alert_mark_up - .data$id_line
      ) %>% 
      rowwise() %>%
      mutate(
        min_dist = min(.data$dist_up, .data$dist_down)
      ) %>% 
      mutate(
        code_new = if_else(.data$min_dist == region_size + 1, "/* ...  */", .data$code_new ),
        code_old = if_else(.data$min_dist == region_size + 1, "/* ...  */", .data$code_old )
      ) %>% 
      filter(
        .data$min_dist <= region_size + 1
      ) %>% 
      ungroup() %>% 
      select(
        -.data$id_line
      ) %>% 
      mutate(line_old = as.character(.data$line_old),
             line_new = as.character(.data$line_new)) %>%
      replace_na(
        list(
          code_new = "/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX*/",
          code_old = "/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX*/",
          line_old = "",
          line_new = "",
          code_old = "",
          code_new = "",
          rule_old = "",
          rule_new = "",
          rule_mnemonic_old = str_dup(' ', times = length_alert_name_side_by_side),
          rule_mnemonic_new = str_dup(' ', times = length_alert_name_side_by_side)
          
        )
      ) %>%
      mutate(
        line_old = .data$line_old %>%  str_pad(width = 3, side = "left"),
        line_new = .data$line_new %>%  str_pad(width = 3, side = "left"),
        code_old = .data$code_old %>%
          str_trunc(
            width = size_line_of_code_side_by_side - length_alert_name_side_by_side,
            ellipsis = ""
          ) %>%
          str_pad(
            width = size_line_of_code_side_by_side - length_alert_name_side_by_side,
            side = "right"
          ),
        
        code_new = .data$code_new %>%
          str_trunc(
            width = size_line_of_code_side_by_side - length_alert_name_side_by_side,
            ellipsis = ""
          ) %>%
          str_pad(
            width = size_line_of_code_side_by_side - length_alert_name_side_by_side,
            side = "right"
          ),
        
        rule_old = .data$rule_old %>% str_pad(width = length_alert_name_side_by_side + 3, side = "right"),
        rule_new = .data$rule_new %>% str_pad(width = length_alert_name_side_by_side + 3, side = "right"),
        
        rule_old_final = 
          if_else(
            replicate(n = .data$rule_mnemonic_old %>% length() , expr = use_mnemonic),
            rule_mnemonic_old,
            rule_old
          ),
        
        rule_new_final = 
          if_else(
            replicate(n = .data$rule_mnemonic_new %>% length() , expr = use_mnemonic),
            rule_mnemonic_new,
            rule_new
          ),
        

        final_code = str_glue(
          "/*{.data$line_old}-{.data$rule_old_final}*/{.data$code_old}/*{.data$line_new}-{.data$rule_new_final}*/{.data$code_new}"
        )
      ) %>%
      pull(.data$final_code) 
    
    
    if(use_mnemonic){
      
      complement_new <- alerts_new %>% 
        rename_with(
          .fn = ~str_remove(string = .x, pattern = "_new")
        ) %>% 
        mutate(
          legend = str_glue("{rule_mnemonic}: {rule}")
        )  
        
      complement_old <- alerts_old %>% 
        rename_with(
          .fn = ~str_remove(string = .x, pattern = "_old")
        ) %>% 
        mutate(
          legend = str_glue("{rule_mnemonic}: {rule}")
        )  
      
        
      complement <- complement_new %>% 
        bind_rows(
          complement_old
        ) %>% 
        pull(legend) %>% 
        unique() %>% 
        str_flatten(collapse = "\n")
      
      saida <- c(saida,"\n", complement)
    }
    

    saida
    
  }

#' Reads a code file, decorates it, and returns a string for an RMarkdown document
#'
#' @param file file to be read
#'
#'
#' @importFrom readr read_lines
#'
#' @return a string for a RMarkdown document
#' @export
#'
#' @examples
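#' # Illustrative sketch: the path is hypothetical.
#' \dontrun{
#' read_and_decorate_code("old/code.java")
#' }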
read_and_decorate_code <-  function(file) {
  readr::read_lines(file) %>%
    decorate_code() %>%
    as.character()
}

#' Read a source code file and, together with its alerts, format it for a markdown document
#'
#' @param file source code file which matches the alerts
#' @param alerts alerts from an output of calculate_features, output$versions_executed$pmd_output\[\[i\]\]
#' @param region_only must the output contain only the regions with alerts?
#' @param region_size how many lines around each alert are kept in the output?
#' @param use_mnemonic use short rule mnemonics instead of full rule names?
#'
#'
#' @importFrom readr read_lines
#' @return a nice markdown content to be put on a markdown document
#' @export
#'
#' @examples
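#' # Illustrative sketch: the inputs follow the debug notes inside this function
#' # and are hypothetical.
#' \dontrun{
#' read_and_decorate_code_and_alerts(
#'   file = "old/code.java",
#'   alerts = examples_executed$pmd_output[[1]],
#'   region_only = TRUE,
#'   region_size = 3
#' )
#' }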
read_and_decorate_code_and_alerts <-  function(
  file, 
  alerts,  
  region_only = FALSE,
  region_size = 3,
  use_mnemonic = FALSE
  
) {
  #for debug
  # file <-  "old/code.java"
  # alerts <- examples_executed$pmd_output[[1]]
  
  readr::read_lines(file) %>%
    decorate_code_and_alerts(alerts = alerts, region_only = region_only, region_size = region_size, use_mnemonic = use_mnemonic) %>%
    as.character()
}



#' Reads an old version and a new version of a source code and returns a string for an RMarkdown document comparing them
#'
#' @param file_old file containing the old version of a source code
#' @param alerts_old alerts for the old version of a source code
#' @param file_new file containing the new version of a source code
#' @param alerts_new alerts for the new version of a source code
#' @param map line map between new and old versions
#' @param region_only show only the region near the alerts?
#' @param region_size size of the region contained in the output
#' @param use_mnemonic use short rule mnemonics instead of full rule names?
#' @param size_line_of_code_side_by_side size of each line of code in the markdown output
#'
#' @importFrom readr read_lines
#' 
#' 
#' @return string ready for a RMarkdown document
#' @export
#'
#' @examples
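#' # Illustrative sketch: the inputs follow the debug notes inside this function
#' # and are hypothetical.
#' \dontrun{
#' read_and_decorate_code_and_alerts_mapped(
#'   file_old = "old/code.java",
#'   alerts_old = examples_executed$pmd_output[[1]],
#'   file_new = "new/code.java",
#'   alerts_new = examples_executed$pmd_output[[2]],
#'   map = examples_crossed$lines_map[[1]]
#' )
#' }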
read_and_decorate_code_and_alerts_mapped <-
  function(file_old,
           alerts_old,
           file_new,
           alerts_new,
           map,             
           region_only = FALSE,
           region_size = 3,
           use_mnemonic = FALSE,
           size_line_of_code_side_by_side = 77
    ) {
    # For debug
    # file_old <- "old/code.java"
    # alerts_old <- examples_executed$pmd_output[[1]]
    # file_new <-  "new/code.java"
    # alerts_new <- examples_executed$pmd_output[[2]]
    # map <- examples_crossed$lines_map[[1]]
    
    strings_new <- read_lines(file_new)
    strings_old <- read_lines(file_old)
    
    decorate_code_alerts_mapped(
      strings_old_param = strings_old,
      strings_new_param = strings_new,
      alerts_old_param = alerts_old,
      alerts_new_param = alerts_new,
      map_param = map,
      region_only = region_only,
      region_size = region_size,
      use_mnemonic = use_mnemonic,
      size_line_of_code_side_by_side = size_line_of_code_side_by_side
    ) %>%
      as.character()
    
  }



#' Extract piece of code
#'
#' @param strings_param code to extract piece from
#' @param begin_line line where the piece begins
#' @param end_line  line where the piece ends
#' @param begin_column column where the piece begins
#' @param end_column column where the piece ends
#' @import tibble
#' @import dplyr
#'
#' @return piece of code
#' @export
#'
#' @examples
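#' # Extracts the call on line 2, columns 9 to 14 (1-based, inclusive).
#' extract_piece_of_code(
#'   strings_param = c("int a = 1;", "int b = foo(a);"),
#'   begin_line = 2, end_line = 2,
#'   begin_column = 9, end_column = 14
#' )
#' # returns "foo(a)"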
extract_piece_of_code <-  function(strings_param, begin_line, end_line, begin_column, end_column){
  
  
  # #for debug
  # strings_param <- read_lines("data/caso1_extract_piece_of_code/code.java")
  # begin_line <- 9
  # end_line <- 9
  # begin_column <- 9
  # end_column <- 44
  
  
 output <- strings_param[begin_line:end_line]
 
 output[1] <- str_sub(output[1], start = begin_column)
 if(begin_line == end_line){
   end_column <- end_column - begin_column + 1 
 }
 output[end_line-begin_line + 1] <- str_sub(output[end_line-begin_line + 1], 1, end_column)
 
 output %>% str_flatten(collapse = "\n")

}


#' Raw abstract syntax tree from a source code file
#' 
#' Reads the source code file, runs PMD Source Analyzer and returns the raw Abstract Syntax Tree
#'
#' @param code_location location of a file containing the source code
#' @param output_location temporary output where xml from PMD will be written
#' @param pmd_location location of PMD.bat
#' @param blockrules_location location of xml containing the blockrules
#' @param alerts optional pre-computed alerts; if NA, PMD is executed to generate them
#' @param include_file_in_output should the name of the analysed file be included as a column in the output?
#'
#' @importFrom readr read_lines
#' @import tibble
#' @import dplyr
#' @return a dataframe containing the raw AST, with all the nodes captured from source code
#' @export
#'
#' @examples
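#' # Illustrative sketch: the paths are hypothetical and PMD must be available
#' # under pmd_location.
#' \dontrun{
#' nodes <- read_raw_ast_nodes(
#'   code_location = "old/code.java",
#'   output_location = "old/code.xml",
#'   pmd_location = "pmd/bin",
#'   blockrules_location = "data/blockrules/blockrules.xml"
#' )
#' }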
read_raw_ast_nodes <-  function(
  code_location, 
  output_location,
  pmd_location,
  blockrules_location,
  alerts = NA,
  include_file_in_output = FALSE
){
  
  # code_location <- code_file_old
  # output_location <- output_old
  
  if(length(alerts) == 1 && is.na(alerts)){
    system(str_glue("{pmd_location}/pmd.bat -d {code_location} -f xml -R {blockrules_location} -reportfile {output_location}"), show.output.on.console =  FALSE, invisible = TRUE)
    alerts <- read_pmd_xml(output_location, include_file_in_output = include_file_in_output)
    delete_file <- TRUE
  } else {
    
    empty <- tibble(
      linha  = numeric(),
      beginline = integer(),
      endline   = integer(),
      begincolumn = integer(),
      endcolumn = integer(),
      rule = character(),
      ruleset = character(),
      package = character(),
      class = character(),
      priority= integer(),
      variable = character(),
      method= character(),
      id_alert  = integer()
    )
    

    alerts <- alerts %>% 
      mutate(
        id_alert = row_number(),
        linha = row_number() %>% as.numeric()
      ) %>% 
      mutate_at(
        vars(one_of(c("beginline", "endline", "begincolumn", "endcolumn", "priority"))),
        as.integer
      ) %>% 
      bind_rows(empty)
    

    delete_file <- FALSE
  }
  
  

  code_all_lines <- read_lines(code_location)
  
  
  returned_value <- alerts %>% 
    replace_na(
      list(
        method = "No method"
      )
    ) %>% 
    left_join(
      map_rule_small,
      by = c("rule")
    ) %>% 
    mutate(
      code = pmap(
        .l =  list(
          begin_line = .data$beginline, 
          end_line = .data$endline, 
          begin_column = .data$begincolumn, 
          end_column = .data$endcolumn                
        ),
        .f = extract_piece_of_code ,
        strings_param = code_all_lines
      )
    )
  
  if(delete_file){
    file.remove(output_location)
  }
  
  returned_value
}


#' Generate Abstract Syntax Tree from raw nodes
#'
#' @param nodes raw nodes from read_raw_ast_nodes
#' @import tidygraph
#' @import fuzzyjoin
#' @return the Abstract Syntax Tree, a graph
#' @export
#'
#' @examples
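#' # Illustrative sketch: `nodes` would be the output of read_raw_ast_nodes()
#' # for a hypothetical source file.
#' \dontrun{
#' ast <- generate_ast_tree_from_raw_nodes(nodes)
#' }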
generate_ast_tree_from_raw_nodes <-  function(nodes){
  
  #nodes <- read_rds(    
  max_column <- max(c(nodes$endcolumn,nodes$begincolumn))
  
  nodes_from <- nodes %>%  rename_all(.funs = ~str_glue("{.x}_from")) %>% 
    mutate(
      location_begin_from = .data$beginline_from * max_column + .data$begincolumn_from,
      location_end_from = .data$endline_from * max_column + .data$endcolumn_from
    )
  
  nodes_to <- nodes %>%  rename_all(.funs = ~str_glue("{.x}_to")) %>% 
    mutate(
      location_begin_to = .data$beginline_to * max_column + .data$begincolumn_to,
      location_end_to = .data$endline_to * max_column + .data$endcolumn_to 
    )
  
  
  all_edges <- fuzzyjoin::interval_inner_join(nodes_to, nodes_from,
                               by = c("location_begin_to" = "location_begin_from", "location_end_to" = "location_end_from"),
                               type = "within"
    ) %>% 
    filter(.data$id_alert_from != .data$id_alert_to) %>% 
    select(
      from = .data$id_alert_from,
      to = .data$id_alert_to
    ) 
  
    
  # 
  # all_edges <- nodes_from %>% 
  #   crossing(nodes_to) %>% 
  #   mutate(
  #     location_begin_from = .data$beginline_from * max_column + .data$begincolumn_from,
  #     location_begin_to = .data$beginline_to * max_column + .data$begincolumn_to,
  #     location_end_from = .data$endline_from * max_column + .data$endcolumn_from,
  #     location_end_to = .data$endline_to * max_column + .data$endcolumn_to
  #   ) %>% 
  #   filter(.data$id_alert_from != .data$id_alert_to) %>% 
  #   filter(
  #     .data$location_begin_from <= .data$location_begin_to & .data$location_end_from >= .data$location_end_to
  #   ) %>% 
  #   select(
  #     from = .data$id_alert_from,
  #     to = .data$id_alert_to
  #   ) 
  # 
  # 
  descendents <- all_edges %>% 
    group_by(.data$from) %>% 
    summarise(n_descendents = n()) 
  
  nodes_sorted <- nodes %>% 
    left_join(
      descendents,
      by = c("id_alert" = "from")
    ) %>% 
    replace_na(
      list(n_descendents = 0 )
    ) %>% 
    arrange(
      desc(.data$n_descendents)
    ) %>% 
    mutate(
      id_alert_old = .data$id_alert,
      id_alert = row_number()
    ) %>%  
    mutate(
      name = case_when(
        .data$small_rule %in% c("name", "class_type","var_id" ) ~ str_glue('{.data$id_alert}:line:{.data$beginline},{.data$small_rule}:{.data$code}'),
        TRUE ~ str_glue("{.data$id_alert}:line:{.data$beginline},{.data$small_rule}")
      )
    )
  
  map_new_id_alert <- nodes_sorted %>% 
    select(
      .data$id_alert_old,
      .data$id_alert
    )
  
  all_edges_new_id <-  all_edges %>% 
    left_join(
      map_new_id_alert,
      c("from" = "id_alert_old")
    ) %>% 
    mutate(
      from = .data$id_alert
    ) %>% 
    select(-.data$id_alert) %>% 
    left_join(
      map_new_id_alert,
      c("to" = "id_alert_old")
    ) %>% 
    mutate(
      to = .data$id_alert
    ) %>% 
    select(-.data$id_alert) 
  
  nodes_sorted <-  nodes_sorted %>% 
    select(-.data$id_alert_old)
  
  
  complete_graph <- create_empty(n = 0, directed = TRUE) %>% 
    bind_nodes(nodes_sorted ) %>% 
    bind_edges(all_edges_new_id) 
  
  
  output <- complete_graph %>% 
    convert(to_dfs_tree , root = 1, mode = "out" )
  

  output
  
}




#' Generate a plot from graph with AST
#' 
#' 
#'
#' @param graph_dfs_tree a graph contained in the output from calculate_features_from_versions 
#' @param size_label size of the label attached to the node
#' @param alpha_label opacity of the label attached to the node
#' @param node_text_field name of a column in the output from calculate_features_from_versions used in the label attached to the node
#' @param name_field name of a column in the output from calculate_features_from_versions used inside the node
#' @param show_label are the labels shown?
#' @param title title of the plot
#' @param aspect aspect ratio applied to the plot theme
#' @param nudge_x horizontal nudge applied to the node labels
#' @param nudge_y vertical nudge applied to the node labels
#' 
#' @import tidygraph
#' @import ggplot2
#' @import ggraph
#'
#' @return a plot
#' @export
#'
#' @examples
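#' # Illustrative sketch: assumes `result` is an output of
#' # calculate_features_from_versions().
#' \dontrun{
#' show_ast(result$graph_new_with_alert, title = "AST of the new version")
#' }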
show_ast <-  function(
  graph_dfs_tree, 
  size_label = 5.5, 
  alpha_label = 1,
  node_text_field = "id_alert", 
  name_field = "name", 
  show_label = TRUE,
  title = "",
  aspect = 1.3,
  nudge_x = 0,
  nudge_y = 0
){
  
  #graph_dfs_tree <- grafo
  
  # graph_selected <- graph_dfs_tree %>% 
  #     activate(nodes) %>% 
  #     filter(
  #         id_alert %in% c(9, 10, 42, 41, 15, 16)
  #     ) %>% 
  #     as_tibble()
  
  # size_label = 5.5 
  # alpha_label = 1
  # node_text_field = "id_alert" 
  # name_field = "name" 
  # show_label = TRUE
  # title = ""
  
  
  
  if(show_label){
    
    if(is.numeric(alpha_label)){
      layer <- geom_node_label(
        aes(label = .data[[name_field]]),
        label.size = 0.3,
        repel = TRUE,
        size = size_label,
        label.padding =  0.4,
        alpha = alpha_label,
        stroke = 4,
        hjust = "left",
        nudge_x = nudge_x,
        nudge_y = nudge_y
      )
    }
    else{
      layer <- geom_node_label(
        aes(
          label = .data[[name_field]],
          alpha = .data[[alpha_label]]
        ),
        label.size = 0.3,
        repel = TRUE,
        size = size_label,
        label.padding = 0.3,
        stroke = 4,
        nudge_x = nudge_x,
        nudge_y = nudge_y
      )
    }
    
  }
  else{
    layer <- NULL
  }
  

  ggraph(graph_dfs_tree, layout = "tree" ) +
    geom_edge_link(arrow = arrow(length = unit(2, 'mm')), 
                   end_cap = circle(3, 'mm'), start_cap = circle(3, 'mm')) +    
    layer +
    geom_node_point(
      aes(color = .data$method),
      size = 8,
      shape = 21
    ) +
    geom_node_text(
      aes(label = .data[[node_text_field]]),
      size = 4
    ) +
    theme_void() +
    theme(
      aspect.ratio = aspect  ,
      legend.position = "top" 
    ) +
    guides(
      alpha = FALSE
    ) +
    ggtitle(title)
  
  
}


#' Adds the line map to the dataframe used inside calculate_features_from_versions
#'
#' @param examples_executed dataframe with the versions and their pmd output
#' @param diff_content diff output supplied directly as a character vector of lines; if NA, git diff is executed
#' @return dataframe with line maps info
#'
#' @examples
cross_versions <- function(
  examples_executed,
  diff_content = NA
  
  
){
  
  # examples_executed <- examples_sec2_executed
  

  examples_executed_selected_fields_left <-
    examples_executed %>% select(.data$id, .data$name, .data$path, .data$output) %>%
    rename_all(
      .funs = function(x) {
        str_glue("{x}_left")
      }
    )
  
  examples_executed_selected_fields_right <-
    examples_executed %>% select(.data$id, .data$name, .data$path, .data$output) %>%
    rename_all(
      .funs = function(x) {
        str_glue("{x}_right")
      }
    )
  

  if(length(diff_content) == 1 && is.na(diff_content)){
  
    saida <- examples_executed_selected_fields_left %>%
      crossing(examples_executed_selected_fields_right) %>% 
      filter(.data$id_left < .data$id_right) %>% 
      mutate(diff_command =
               map2(
                 .x = .data$path_left,
                 .y = .data$path_right,
                 ~assemble_diff_command(
                   code_path_left = .x,
                   code_path_right = .y,
                   output_left = .data$output_left,
                   output_right = .data$output_right
                 )
               )) %>% 
      mutate(lines_left = read_number_of_lines(.data$path_left),
             lines_right = read_number_of_lines(.data$path_right)) %>%
      mutate(
        output_diff_command = map(
          .x = .data$diff_command,
          .f = ~ system(command =  .x, show.output.on.console = FALSE)
        ),
        file_diff = str_glue("{output_left}_{output_right}.diff")
      ) %>%
      mutate(lines_map = pmap(
        .l = list(
          file = .data$file_diff  ,
          lines_prev_param = .data$lines_left,
          lines_post_param = .data$lines_right
        ),
        .f = map_lines
      ))
    
  } else{
    
    diff_content_flat <- diff_content %>% str_flatten(collapse = "\n")


    saida <- examples_executed_selected_fields_left %>%
      crossing(examples_executed_selected_fields_right) %>% 
      filter(.data$id_left < .data$id_right) %>% 
      mutate(lines_left = read_number_of_lines(.data$path_left),
             lines_right = read_number_of_lines(.data$path_right)) %>%
      mutate(lines_map = pmap(
        .l = list(
          lines_prev_param = .data$lines_left,
          lines_post_param = .data$lines_right,
          # wrap in list() so pmap() passes the whole diff vector to each row
          diff_content = list(diff_content)
        ),
        .f = map_lines
      ))
    
    
  }


  saida
   
}



#' Calculate the features from the combination of the alerts in the new and the old version
#' 
#' It is called from calculate_features_from_versions.
#'
#' @param graph_old graph related to the old version
#' @param graph_new graph related to the new version
#' @param coordinates line map
#' 
#' @import tidygraph
#'
#' @return calculated features
#'
#' @examples
calculate_features <-  function(graph_old, graph_new, coordinates){
  
  # graph_new <- graphs_from_alerts_new$graph_new[[2]]
  # graph_old <- graphs_from_alerts_old$graph_old[[2]]
  

    alert_old <- graph_old %>% 
      activate("nodes")  %>% 
      as_tibble() %>% 
      select(
        .data$beginline,
        .data$endline,
        .data$rule,
        .data$id_group,
        .data$method,
        .data$rule_alert,
        .data$code
      ) %>% 
      rowwise() %>%  
      mutate( code = str_flatten(.data$code, collapse = "\n") ) %>% 
      ungroup() %>% 
      left_join(
        coordinates %>% select(-.data$new),
        by = c("beginline" = "old")
      ) %>% 
      rename(
        begin_common_line = .data$common_line
      ) %>% 
      left_join(
        coordinates %>%  select(-.data$new),
        by = c("endline" = "old")
      ) %>% 
      rename(
        end_common_line = .data$common_line
      ) %>% 
      mutate(
        node = row_number()
      ) %>% 
      rename_all(
        .funs = ~str_glue("{.x}_old")
      )
    
    alert_new <- graph_new %>% 
      activate("nodes") %>% 
      as_tibble() %>%
      select(
        .data$beginline,
        .data$endline,
        .data$rule,
        .data$id_group,
        .data$method,
        .data$rule_alert,
        .data$code
      ) %>% 
      rowwise() %>% 
      mutate( code = str_flatten(.data$code, collapse = "\n") ) %>%
      ungroup() %>% 
      left_join(
        coordinates %>% select(-.data$old),
        by = c("beginline" = "new")
      ) %>% 
      rename(
        begin_common_line = .data$common_line
      ) %>% 
      left_join(
        coordinates %>%  select(-.data$old),
        by = c("endline" = "new")
      ) %>% 
      rename(
        end_common_line = .data$common_line
      ) %>% 
      mutate(
        node = row_number()
      ) %>% 
      rename_all(
        .funs = ~str_glue("{.x}_new")
      )
    

    
    match_path <- alert_old %>% 
      full_join(
        alert_new,
        by = c("node_old" = "node_new")
      ) 
    
    features_match_path <- match_path %>% 
      mutate(
        last_method_id_old = if_else(
          .data$rule_old %in% c(
            "compilation_unit", 
            "constructor_declaration", 
            "method"
          ), 
          .data$id_group_old, 
          NA_integer_
        ),
        last_method_id_new = if_else(.data$rule_new %in% c(
          "compilation_unit", 
          "constructor_declaration", 
          "method"
        ), 
        .data$id_group_new, 
        NA_integer_
        ),
        last_method_begin_line_old = if_else(
          .data$rule_old %in% c(
            "compilation_unit", 
            "constructor_declaration", 
            "method"
          ), 
          .data$begin_common_line_old, 
          NA_integer_
        ),
        last_method_begin_line_new = if_else(.data$rule_new %in% c(
          "compilation_unit", 
          "constructor_declaration", 
          "method"
        ), 
        .data$begin_common_line_new, 
        NA_integer_
        ),
        last_method_end_line_old = if_else(
          .data$rule_old %in% c(   
            "compilation_unit", 
            "constructor_declaration", 
            "method"
          ), 
          .data$end_common_line_old, 
          NA_integer_
        ),
        last_method_end_line_new = if_else(.data$rule_new %in% c(
          "compilation_unit", 
          "constructor_declaration", 
          "method"
        ),    
        .data$end_common_line_new, 
        NA_integer_
        ),
        
        last_method_code_old = if_else(
          .data$rule_old %in% c(
            "compilation_unit", 
            "constructor_declaration", 
            "method"
          ), 
          .data$code_old, 
          NA_character_
        ),
        
        
        last_method_code_new = if_else(
          .data$rule_new %in% c(
            "compilation_unit", 
            "constructor_declaration", 
            "method"
          ), 
          .data$code_new, 
          NA_character_
        ),
        
        last_code_new = .data$code_new,
        last_code_old = .data$code_old,
        
        
        last_block_id_old = if_else(.data$rule_old %in% c("compilation_unit","block"), .data$id_group_old, NA_integer_),
        last_block_id_new = if_else(.data$rule_new %in% c("compilation_unit","block"), .data$id_group_new, NA_integer_),
        
        last_block_begin_line_old = if_else(.data$rule_old %in% c("compilation_unit","block"), .data$begin_common_line_old, NA_integer_),
        last_block_begin_line_new = if_else(.data$rule_new %in% c("compilation_unit","block"), .data$begin_common_line_new, NA_integer_),
        
        last_block_end_line_old = if_else(.data$rule_old %in% c("compilation_unit","block"), .data$end_common_line_old, NA_integer_),
        last_block_end_line_new = if_else(.data$rule_new %in% c("compilation_unit","block"), .data$end_common_line_new, NA_integer_),
        
        last_class_begin_line_old = if_else(.data$rule_old %in% c("compilation_unit"), .data$begin_common_line_old, NA_integer_),
        last_class_begin_line_new = if_else(.data$rule_new %in% c("compilation_unit"), .data$begin_common_line_new, NA_integer_),
        
        last_class_end_line_old = if_else(.data$rule_old %in% c("compilation_unit"), .data$end_common_line_old, NA_integer_),
        last_class_end_line_new = if_else(.data$rule_new %in% c("compilation_unit"), .data$end_common_line_new, NA_integer_),
        
        last_block_begin_line_old = if_else(.data$rule_old %in% c("compilation_unit","block"), .data$begin_common_line_old, NA_integer_),
        last_block_begin_line_new = if_else(.data$rule_new %in% c("compilation_unit","block"), .data$begin_common_line_new, NA_integer_),
        
        
        last_id_group_old = .data$id_group_old,
        last_id_group_new = .data$id_group_new,
        
        last_common_group_begin_line = if_else(
          .data$id_group_new == .data$id_group_old, 
          .data$begin_common_line_old, 
          NA_integer_
        ),
        
        last_common_group_end_line = if_else(
          .data$id_group_new == .data$id_group_old, 
          .data$end_common_line_old, 
          NA_integer_
        ),
        
        last_method_name_old = .data$method_old,
        
        last_method_name_new = .data$method_new
        
        
      ) %>% 
      fill(
        .data$last_method_id_old,
        .data$last_method_id_new,
        .data$last_method_begin_line_new,
        .data$last_method_begin_line_old,
        .data$last_method_end_line_new,
        .data$last_method_end_line_old,
        .data$last_id_group_old,
        .data$last_id_group_new,
        .data$last_block_id_old,
        .data$last_block_id_new,
        .data$last_block_begin_line_old,
        .data$last_block_begin_line_new,
        .data$last_block_end_line_old,
        .data$last_block_end_line_new,
        .data$begin_common_line_new,
        .data$begin_common_line_old,
        .data$end_common_line_new,
        .data$end_common_line_old,
        .data$last_common_group_begin_line,
        .data$last_common_group_end_line,
        .data$id_group_new,
        .data$id_group_old,
        .data$last_class_begin_line_old,
        .data$last_class_begin_line_new,
        .data$last_class_end_line_old,
        .data$last_class_end_line_new,
        .data$rule_alert_new, 
        .data$rule_alert_old,
        .data$last_method_name_old,
        .data$last_method_name_new,
        .data$last_method_code_new,
        .data$last_method_code_old,
        .data$last_code_new,
        .data$last_code_old
        
      ) %>%
      mutate(
        same_rule = .data$rule_alert_new == .data$rule_alert_old,
        same_id_group = .data$id_group_new == .data$id_group_old,  
        same_method_group = .data$last_method_id_new == .data$last_method_id_old,
        same_method_name = .data$last_method_name_old == .data$last_method_name_new,
        same_block = .data$last_block_id_new == .data$last_block_id_old,
        last_common_group_mean_line = (.data$last_common_group_begin_line + .data$last_common_group_end_line)/2,
        mean_line_new = (.data$begin_common_line_new + .data$end_common_line_new)/2,
        mean_line_old = (.data$begin_common_line_old + .data$end_common_line_old)/2,
        mean_line_last_common_group = (.data$last_common_group_begin_line + .data$last_common_group_end_line)/2,
        dist_line = abs(.data$mean_line_new - .data$mean_line_old),
        size_last_block = .data$last_common_group_end_line - .data$last_common_group_begin_line,
        dist_line_normalized_block = .data$dist_line/if_else(.data$size_last_block == 0, 1L , .data$size_last_block ),
        size_unit = .data$last_class_end_line_new - .data$last_class_begin_line_new,
        size_method = if_else(
          .data$same_method_group,
          .data$last_method_end_line_new - .data$last_method_begin_line_new,
          .data$size_unit
        ),  
        dist_line_normalized_method = .data$dist_line/.data$size_method,
        dist_line_normalized_unit = .data$dist_line/.data$size_unit,
        same_code = str_trim(.data$last_code_old) == str_trim(.data$last_code_new),
        same_method_code = str_trim(.data$last_method_code_old) == str_trim(.data$last_method_code_new)
      ) %>% 
      select(
        .data$same_rule,
        .data$same_id_group,
        .data$same_method_group,
        .data$same_method_name,
        .data$same_block,
        .data$same_code,
        .data$same_method_code,
        .data$dist_line,
        .data$dist_line_normalized_block,
        .data$dist_line_normalized_method,
        .data$dist_line_normalized_unit
      ) %>% 
      slice_tail(n = 1) 



  features_match_path
  
}

calculate_features_when_equal <- function(){
  tibble(
    same_rule = TRUE,
    same_id_group = TRUE,
    same_method_group = TRUE,
    same_method_name = TRUE,
    same_block = TRUE,
    same_code = TRUE,
    same_method_code = TRUE,
    dist_line = 0,
    dist_line_normalized_block = 0,
    dist_line_normalized_method = 0,
    dist_line_normalized_unit = 0
  )
}

nodes_path_from_alert <- function(graph, id_node){
  output <- graph %>% 
    convert(to_shortest_path , from = id_node, to = 1  , mode = "out" ) %>%
    activate("nodes") %>% 
    as_tibble() %>% 
    select(.data$id_group)
}

graph_path_from_alert <- function(graph, id_node){
  output <- graph %>% 
    convert(to_shortest_path , from = id_node, to = 1  , mode = "out" ) 
}




#' Calculate features from combinations of PMD alerts of a new and an old version of source code
#' 
#' Reads or receives as input two versions of a source code, a new one and an old one, and returns information about the alerts and the combinations between them.
#' The main output is the set of features computed for each combination of alerts: one alert from the new version and one alert from the old version. 
#' All combinations between an old alert and a new alert are considered.
#' The main goal is to differentiate between Fixed, Old and New alerts.
#' If a combination of alerts from the new and the old versions has many features indicating that they play the same role in the code, the alert in the new version may be considered an old alert.
#'     
#'
#' @param code_file_new file containing the source code of the new version. If this parameter is empty, the parameter code_new is used
#' @param code_file_old file containing the source code of the old version. If this parameter is empty, the parameter code_old is used
#' @param code_new source code of the new version. If this parameter is empty, the parameter code_file_new is used
#' @param code_old source code of the old version. If this parameter is empty, the parameter code_file_old is used
#' @param pmd_path complete path to pmd.bat including the name of the file 
#' @param mostra_new which nodes of the new version must appear?
#' @param mostra_old which nodes of the old version must appear?
#' @param glue_string template of a string that will be passed to str_glue and will be available in the returned info
#' @param rule_path path to the rules used for the PMD alerts
#' @param blockrules_location path to the rules for the Abstract Syntax Tree
#' @param optimize_feature_calculation if true, the features are not calculated for all the combinations of new and old alerts if there is a perfect match
#' @param alerts_new optional pre-computed PMD alerts for the new version; if NA, PMD is executed
#' @param alerts_old optional pre-computed PMD alerts for the old version; if NA, PMD is executed
#' @param ast_new optional pre-computed raw AST nodes for the new version; if NA, PMD is executed
#' @param ast_old optional pre-computed raw AST nodes for the old version; if NA, PMD is executed
#' @param diff optional diff content between the two versions; if NA, git diff is executed
#' 
#' @import readr
#'
#' @return a list with information about the comparison between old and new versions.
#' The list contains the following items:
#' 
#' * versions_executed: a dataframe containing the PMD alerts generated for each version
#' * versions_crossed: a dataframe containing information about the combination of versions including a map between the lines of code in the new and in the old version
#' * graph_old_with_alert: a tidygraph containing the abstract syntax tree of the old version and information about PMD alerts
#' * graph_new_with_alert: a tidygraph containing the abstract syntax tree of the new version and information about PMD alerts
#' * graph_old_with_group: a tidygraph containing the abstract syntax tree of the old version and information about PMD alerts.
#' Here, the nodes of the new and the old versions are categorized in groups
#' * graph_new_with_group: a tidygraph containing the abstract syntax tree of the new version and information about PMD alerts.
#' Here, the nodes of the new and the old versions are categorized in groups
#' * graphs_from_alerts_old: a dataframe containing, for each alert of the old version, the path from the node related to the alert to the root node (compilation unit), in a tidygraph
#' * graphs_from_alerts_new: a dataframe containing, for each alert of the new version, the path from the node related to the alert to the root node (compilation unit), in a tidygraph
#' * features: a dataframe containing, for each combination of old and new alert, the features related to the combinations
#' * categorised_alerts: dataframe with old and new alerts categorised in fixed, open and new
#' 
#' @export
#'
#'@importFrom magrittr %T>% 
#'
#' @examples
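#' # Illustrative sketch: the paths below are hypothetical and PMD must be
#' # installed at pmd_path.
#' \dontrun{
#' result <- calculate_features_from_versions(
#'   code_file_new = "new/code.java",
#'   code_file_old = "old/code.java",
#'   pmd_path = "pmd/bin/pmd.bat",
#'   rule_path = "rulesets/java/quickstart.xml",
#'   blockrules_location = "data/blockrules/blockrules.xml"
#' )
#' result$features
#' }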
calculate_features_from_versions <- function(
  code_file_new = "", 
  code_file_old = "", 
  code_new = "", 
  code_old = "",
  pmd_path,
  rule_path = "rulesets/java/quickstart.xml",
  blockrules_location = "data/blockrules/blockrules.xml",
  mostra_new = c(10, 43, 17, 15, 18, 16, 45, 44),
  mostra_old = c(10, 42, 41, 15, 16, 43),
  glue_string = "",
  optimize_feature_calculation = TRUE,
  alerts_new = NA,
  alerts_old = NA,
  ast_new = NA,
  ast_old = NA,
  diff = NA,
  only_categorised = FALSE
  
){
  



  
  start <- Sys.time()

  uid <- uuid::UUIDgenerate()

  feather::write_feather(
    tibble(
      file_new = code_file_new,
      file_old = code_file_old,
      time = start
    ),
    
    str_glue("logexec/{uid}.feather")
  )
  
  if(code_new != ""){
    
    write_lines(code_new, "code_files_new/new.java")
    write_lines(code_old, "code_files_old/old.java")
    code_file_new <- "code_files_new/new.java"
    code_file_old <- "code_files_old/old.java"
  }
  
  
  path_code_file_old <- code_file_old %>% 
    str_remove("/[^/]*$") 
  
  path_code_file_new <- code_file_new %>% 
    str_remove("/[^/]*$") 
  
  output_code_file_old <- uuid::UUIDgenerate()
  
  output_code_file_new <- uuid::UUIDgenerate()
  
  
  examples_sec2 <- tribble(
    
    ~name,                  ~path,      ~output,          
    "Simple old",  code_file_old ,  output_code_file_old %>% as.character(),
    "Simple new",  code_file_new ,  output_code_file_new %>%  as.character(),
  ) %>% 
    mutate(id = row_number()) 
  

  if(length(alerts_new) == 1 && is.na(alerts_new)){

    examples_sec2_executed <- examples_sec2 %>%
      mutate(pmd_command =
               map2(
                 .x = .data$path,
                 .y = .data$output,
                 ~assemble_pmd_command(
                   pmd_path = pmd_path,
                   code_path = .x ,
                   rule_path = rule_path,
                   output = .y
                 )
               )) %>%
      mutate(pmd_command_output = map(
        .x = .data$pmd_command,
        .f =  ~ system(command =  .x, show.output.on.console = FALSE)
      )) %>% 
      mutate(pmd_output = map(.x = str_glue("{.data$output}.xml"), .f = read_pmd_xml))
  } else{
    
    
    tib_pmd_output <- tribble(
      ~name,             ~pmd_output,
      "Simple old",     alerts_old,
      "Simple new",     alerts_new,
      
    )
    
    examples_sec2_executed <- examples_sec2 %>% 
      left_join(
        tib_pmd_output, 
        by = c("name")
      )

  }
  


  examples_sec2_crossed <- cross_versions(examples_sec2_executed, diff_content = diff) 

  
  map <- examples_sec2_crossed$lines_map[[1]] %>% 
    select(   
      old = .data$map_remove,
      new = .data$map_add,
      equal
    )

  

  output_old <-  code_file_old %>% 
    str_replace(".java", ".xml") 
  
  output_new <-  code_file_new %>% 
    str_replace(".java", ".xml") 
  

  nodes_old <- read_raw_ast_nodes(
    code_location = code_file_old,
    output_location =  output_old,
    pmd_location = str_remove(pmd_path, "/pmd.bat"),
    blockrules_location = blockrules_location,
    alerts = ast_old 
  )
  


  graph_old <- generate_ast_tree_from_raw_nodes(nodes_old)
  
  nodes_new <- read_raw_ast_nodes(
    code_location = code_file_new,
    output_location = output_new,
    pmd_location = str_remove(pmd_path, "/pmd.bat"),
    blockrules_location = blockrules_location,
    alerts = ast_new
  )
  

  graph_new <- generate_ast_tree_from_raw_nodes(nodes_new)
  
  nodes_new <- graph_new %>% 
    activate("nodes") %>% 
    as_tibble() %>% 
    rename_all(
      ~str_glue("{.x}_new")
    )
  
  nodes_old <- graph_old %>% 
    activate("nodes") %>% 
    as_tibble() %>% 
    rename_all(
      ~str_glue("{.x}_old")
    ) 
  

  map_begin <- map %>% 
    rename_all(
      ~str_glue("{.x}_begin")
    )
  
  
  map_end <- map %>% 
    rename_all(
      ~str_glue("{.x}_end")
    )
  

  match_nodes <- nodes_old %>% 
    left_join(
      map_begin,
      by = c("beginline_old" = "old_begin")
    ) %>% 
    left_join(
      map_end,
      by = c("endline_old" = "old_end")
    ) %>% 
    left_join(
      nodes_new,
      by = c( 
        "new_begin" = "beginline_new",
        "new_end" = "endline_new",
        "rule_old" = "rule_new"
      )
    ) %>%
    group_by(
      .data$id_alert_old
    ) %>% 
    mutate(
      n_old = n()
    ) %>%   
    group_by(
      .data$id_alert_new
    ) %>% 
    mutate(
      n_new = n()
    ) %>% 
    ungroup() %>% 
    filter(
      (.data$n_old == 1 & .data$n_new == 1) | (.data$begincolumn_new == .data$begincolumn_old)
    ) %>% 
    group_by(
      .data$id_alert_old
    ) %>% 
    mutate(
      n_old = n()
    ) %>% 
    group_by(
      .data$id_alert_new
    ) %>% 
    mutate(
      n_new = n()
    ) %>% 
    ungroup() %>% 
    select(
      .data$id_alert_new,
      .data$id_alert_old
    ) %>% 
    mutate(
      id_group = row_number()
    )
  


  offset_id_group_na <- 0L
  
  graph_new_with_group <- graph_new %>% 
    activate("nodes") %>% 
    left_join(
      match_nodes,
      by = c("id_alert" = "id_alert_new" )
    ) %>% 
    select(
      -.data$id_alert_old 
    ) %>% 
    mutate(
      mostra = case_when(
        id_alert %in% mostra_new ~ 1,
        TRUE ~ -1 
      )
    ) %>% 
    mutate(
      id_group = if_else(is.na(.data$id_group), -row_number()-offset_id_group_na, .data$id_group)
    )
  
  offset_id_group_na <- nrow(graph_new_with_group %>% activate("nodes") %>%  as_tibble())
  
  graph_old_with_group <- graph_old %>% 
    activate("nodes") %>% 
    left_join(
      match_nodes,
      by = c("id_alert" = "id_alert_old" )
    ) %>% 
    select(
      -.data$id_alert_new 
    ) %>% 
    mutate(
      mostra = case_when(
        .data$id_alert %in% mostra_old ~ 1,
        TRUE ~ -1 
      )
    ) %>% 
    mutate(
      id_group = if_else(is.na(.data$id_group), -row_number()-offset_id_group_na, .data$id_group)
    )        
  

  alerts_old <- examples_sec2_executed$pmd_output[[1]] %>% 
    rename_all(
      ~str_glue("{.x}_alert")
    ) %>% 
    mutate(
      one = 1
    )
  
  alerts_new <- examples_sec2_executed$pmd_output[[2]] %>% 
    rename_all(
      ~str_glue("{.x}_alert")
    ) %>% 
    mutate(
      one = 1
    )
    

  graph_old_with_alert <- graph_old_with_group %>% 
    activate("nodes") %>% 
    mutate(
      one = 1
    ) %>% 
    join_ast_alerts(alerts_old) %>% 
    mutate(
      text_alert = if_else(is.na(.data$id_alert_alert),
                           "",
                           str_glue("{.data$id_group}-{.data$rule_alert}") %>%  as.character()
      ),
      
      text_alert_id_node = if_else(is.na(.data$id_alert_alert),
                                   "",
                                   str_glue("{.data$id_alert}-{.data$rule_alert}") %>%  as.character()
      ),
      
      text_line_rule = if_else(is.na(.data$id_alert_alert),
                               "",
                               str_glue("{.data$id_alert}-{.data$rule_alert}") %>%  as.character()
                               
      ),
      
      glue = str_glue(glue_string)
      
      
    ) 
  

  
  
  graph_new_with_alert <- graph_new_with_group %>% 
    activate("nodes") %>% 
    mutate(
      one = 1
    ) %>% 
    join_ast_alerts(alerts_new) %>% 
    mutate(
      text_alert = if_else(is.na(.data$id_alert_alert),
                           "",
                           str_glue("{.data$id_group}-{.data$rule_alert}") %>%  as.character()
      ),
      
      text_alert_id_node = if_else(is.na(.data$id_alert_alert),
                                   "",
                                   str_glue("{.data$id_alert}-{.data$rule_alert}") %>%  as.character()
      ),
      
      glue = str_glue(glue_string)
      
    ) 
  
  
  graph_old_reverted <- graph_old_with_alert %>% 
    activate("edges") %>% 
    reroute(from = .data$to, to = .data$from)
  
  nodes_alerts_old <- graph_old_reverted %>% 
    activate("nodes") %>% 
    filter(!is.na(.data$id_alert_alert)) %>% 
    as_tibble() %>% 
    select(
      .data$id_alert,
      .data$id_group,
      .data$rule_alert
    ) 
  

  graphs_from_alerts_old <- nodes_alerts_old %>% 
    mutate(graph = map(.x = .data$id_alert, .f = ~graph_path_from_alert(graph = graph_old_reverted, id_node = .x )   )) 
  
  
  graphs_from_alerts_old <- graphs_from_alerts_old %>% rename(
    id_alert_old = .data$id_alert,
    id_group_old = .data$id_group,
    rule_alert_old = .data$rule_alert,
    graph_old = .data$graph
  ) 
  
  
  graph_new_reverted <- graph_new_with_alert %>% 
    activate("edges") %>% 
    reroute(from = .data$to, to = .data$from)
  
  nodes_alerts_new <- graph_new_reverted %>% 
    activate("nodes") %>% 
    filter(!is.na(.data$id_alert_alert)) %>% 
    as_tibble() %>% 
    select(
      .data$id_alert,
      .data$id_group,
      .data$rule_alert
    ) 
    
  
  graphs_from_alerts_new <- nodes_alerts_new %>% 
    mutate(graph = map(.x = .data$id_alert, .f = ~graph_path_from_alert(graph = graph_new_reverted, id_node = .x )   )) 
  

  graphs_from_alerts_new <- graphs_from_alerts_new %>%  rename(
    id_alert_new = .data$id_alert,
    graph_new = .data$graph,
    id_group_new = .data$id_group,
    rule_alert_new = .data$rule_alert
  ) 
  
  

  coordinates <- map %>% 
    ungroup() %>% 
    mutate(common_line = row_number()) 
  

  

  
  if (nrow(graphs_from_alerts_new) == 0 && nrow(graphs_from_alerts_old) == 0){
    match_alerts_alg2 <-  tibble(id_alert_new = integer(), id_alert_old = integer())
    categorised_alerts <- tibble(version = character())
    saida <- list(
      versions_executed = examples_sec2_executed,
      versions_crossed = examples_sec2_crossed,
      graph_old_with_alert =  graph_old_with_alert,
      graph_new_with_alert = graph_new_with_alert,
      graph_old_with_group = graph_old_with_group,
      graph_new_with_group = graph_new_with_group,
      graphs_from_alerts_old = graphs_from_alerts_old,
      graphs_from_alerts_new = graphs_from_alerts_new,
      features = match_alerts_alg2
    )
    
  }else{
    if(xor(nrow(graphs_from_alerts_new) == 0, nrow(graphs_from_alerts_old) == 0)){
      match_alerts_alg2 <-  tibble(id_alert_new = integer(), id_alert_old = integer())
      saida <- list(
        versions_executed = examples_sec2_executed,
        versions_crossed = examples_sec2_crossed,
        graph_old_with_alert =  graph_old_with_alert,
        graph_new_with_alert = graph_new_with_alert,
        graph_old_with_group = graph_old_with_group,
        graph_new_with_group = graph_new_with_group,
        graphs_from_alerts_old = graphs_from_alerts_old,
        graphs_from_alerts_new = graphs_from_alerts_new,
        features = match_alerts_alg2
      )


      alerts_old <- saida$graph_old_with_alert %>%
        activate("nodes") %>%
        as_tibble() %>%
        filter(!is.na(.data$id_alert_alert)) %>%
        mutate(
          version = "old",
          category = "fixed"
        )
      
      alerts_new <- saida$graph_new_with_alert %>%
        activate("nodes") %>%
        as_tibble() %>%
        filter(!is.na(.data$id_alert_alert)) %>%
        mutate(
          version = "new",
          category = "new"
        )
    
      categorised_alerts <- bind_rows(alerts_new, alerts_old)  

    }else{
    


        if(sum(coordinates$equal) > 0){
          
          
          
          match_alerts_alg2 <- graphs_from_alerts_new %>%
            inner_join(
              graphs_from_alerts_old,
              by = c("id_group_new" = "id_group_old", "rule_alert_new" = "rule_alert_old")
            ) %>% 
            rowwise() %>% 
            mutate(
              features = calculate_features_when_equal() %>% list()
            )
          
          saida <- list(
            versions_executed = examples_sec2_executed,
            versions_crossed = examples_sec2_crossed,
            graph_old_with_alert =  graph_old_with_alert,
            graph_new_with_alert = graph_new_with_alert,
            graph_old_with_group = graph_old_with_group,
            graph_new_with_group = graph_new_with_group,
            graphs_from_alerts_old = graphs_from_alerts_old,
            graphs_from_alerts_new = graphs_from_alerts_new,
            features = match_alerts_alg2
          )
          
          combinations_same_alerts <- extract_clean_features_from_calculated_features(
            calculated_features = saida
          ) %>%
            decide_heurist_if_same_alert() %>%
            bind_cols(saida$features) %>%
            filter(.data$same_alert) %>% 
            select(.data$id_alert_new, .data$id_alert_old, .data$same_alert)
          
          
          
        }else{
          
    
          if(optimize_feature_calculation){
            match_alerts_alg2 <- graphs_from_alerts_new %>%
            inner_join(
              graphs_from_alerts_old,
              by = c("rule_alert_new" = "rule_alert_old", "id_group_new" = "id_group_old")
            )
          }
          else{
          
            match_alerts_alg2 <- tibble(id_alert_new = integer())
            
          }
        
    
          if(nrow(match_alerts_alg2) > 0){
            

            match_alerts_alg2 <-  match_alerts_alg2 %>% 
              # rowwise() %>%
              mutate(
                features = map2(
                  .x = .data$graph_old, 
                  .y = .data$graph_new,
                  .f = ~calculate_features(graph_old = .x, graph_new = .y, coordinates = coordinates ) 
                )
              ) 
            
            saida <- list(
              versions_executed = examples_sec2_executed,
              versions_crossed = examples_sec2_crossed,
              graph_old_with_alert =  graph_old_with_alert,
              graph_new_with_alert = graph_new_with_alert,
              graph_old_with_group = graph_old_with_group,
              graph_new_with_group = graph_new_with_group,
              graphs_from_alerts_old = graphs_from_alerts_old,
              graphs_from_alerts_new = graphs_from_alerts_new,
              features = match_alerts_alg2
            )
            
    
            combinations_same_alerts <- extract_clean_features_from_calculated_features(
              calculated_features = saida
            ) %>%
              decide_heurist_if_same_alert() %>%
              bind_cols(saida$features) %>%
              filter(.data$same_alert) %>% 
              select(.data$id_alert_new, .data$id_alert_old, .data$same_alert)
            
          }else{
            
            match_alerts_alg2 <- tibble(id_alert_new = integer())
            
            combinations_same_alerts <- tibble(
              same_alert = logical(), 
              id_alert_new = integer(),
              id_alert_old = integer()
            )
            
          }
          
    
          
    
          graph_new_no_match <- graphs_from_alerts_new %>% 
            anti_join(
              combinations_same_alerts,
              by = c("id_alert_new")
            )
          
          graph_old_no_match <- graphs_from_alerts_old %>% 
            anti_join(
              combinations_same_alerts,
              by = c("id_alert_old")
            )
          

          match_alerts_rest <- graph_new_no_match %>%
            crossing(
              graph_old_no_match
            ) %>% 
            filter(
              rule_alert_new == rule_alert_old
            )
          
    
          if(nrow(match_alerts_rest) > 0){
            

            match_alerts_rest <- match_alerts_rest %>% 
              # rowwise() %>%
              mutate(
                features = map2(
                  .x = .data$graph_old, 
                  .y = .data$graph_new,
                  .f = ~calculate_features(graph_old = .x, graph_new = .y, coordinates = coordinates ) ,
                  .progress = TRUE
                )
              ) 
              
            saida <- list(
              versions_executed = examples_sec2_executed,
              versions_crossed = examples_sec2_crossed,
              graph_old_with_alert =  graph_old_with_alert,
              graph_new_with_alert = graph_new_with_alert,
              graph_old_with_group = graph_old_with_group,
              graph_new_with_group = graph_new_with_group,
              graphs_from_alerts_old = graphs_from_alerts_old,
              graphs_from_alerts_new = graphs_from_alerts_new,
              features = match_alerts_rest
            )
            
            combinations_same_alerts_rest <- extract_clean_features_from_calculated_features(
              calculated_features = saida
            ) %>%
              decide_heurist_if_same_alert() %>%
              bind_cols(saida$features) %>%
              filter(.data$same_alert) %>% 
              select(.data$id_alert_new, .data$id_alert_old, .data$same_alert)
            
            combinations_same_alerts <- combinations_same_alerts %>% 
              bind_rows(
                combinations_same_alerts_rest
              )
          } 
        }
        
    
        combinations_same_alerts_old <- combinations_same_alerts %>%
          select(.data$id_alert_old, .data$same_alert ) %>%
          distinct()
      
        combinations_same_alerts_new <- combinations_same_alerts %>%
          select(.data$id_alert_new, .data$same_alert ) %>%
          distinct()
      
        alerts_old <- graph_old_with_alert %>%
          activate("nodes") %>%
          as_tibble() %>%
          filter(!is.na(.data$id_alert_alert)) %>%
          left_join(
            combinations_same_alerts_old,
            by = c("id_alert" = "id_alert_old")
          ) %>%
          replace_na(list(same_alert = FALSE)) %>%
          mutate(
            version = "old",
            category = if_else(.data$same_alert, "open", "fixed")
          )
        
        alerts_new <- graph_new_with_alert %>%
          activate("nodes") %>%
          as_tibble() %>%
          filter(!is.na(.data$id_alert_alert)) %>%
          left_join(
            combinations_same_alerts_new,
            by = c("id_alert" = "id_alert_new")
          ) %>%
          replace_na(list(same_alert = FALSE)) %>%
          mutate(
            version = "new",
            category = if_else(.data$same_alert, "open", "new")
          )
        
        categorised_alerts <- bind_rows(alerts_new, alerts_old)
    }
    
  }

  if(!only_categorised){
    
    saida <- list(
      versions_executed = examples_sec2_executed,
      versions_crossed = examples_sec2_crossed,
      graph_old_with_alert =  graph_old_with_alert,
      graph_new_with_alert = graph_new_with_alert,
      graph_old_with_group = graph_old_with_group,
      graph_new_with_group = graph_new_with_group,
      graphs_from_alerts_old = graphs_from_alerts_old,
      graphs_from_alerts_new = graphs_from_alerts_new,
      features = saida$features,
      categorised_alerts = categorised_alerts
    ) 
  }
  else{
    
    saida <- categorised_alerts %>% 
      mutate(
        file_new = code_file_new,
        file_old = code_file_old,
        time = Sys.time() - start  
      )      
    
  }
  

  write_rds(saida, str_glue("log/{uid}.rds"))

    
  saida
  
}



#' Reports the features in a latex table
#'
#' @param features_df features info from calculate_features_from_versions
#' @param caption caption of the table
#' @param types_to_show names of the features to show in the table
#' @param return_raw_data if TRUE, return the underlying dataframe instead of the formatted latex table
#' 
#' @import tidyselect
#' @import knitr
#' @importFrom kableExtra collapse_rows
#' @importFrom kableExtra kable_styling
#' @importFrom scales number
#' 
#'
#' @return a table in latex
#' @export
#'
#' @examples
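#' # A sketch, not run: it reuses the output of calculate_features_from_versions();
#' # the file paths and the PMD location are illustrative.
#' \dontrun{
#' features_df <- calculate_features_from_versions(
#'   code_file_new = "data/caso1_calculate_features_from_versions_novo-v2/code.java",
#'   code_file_old = "data/caso1_calculate_features_from_versions_velho-v2/code.java",
#'   pmd_path = "pmd/bin/pmd.bat"
#' )
#' report_features(features_df, caption = "Features for each alert combination")
#' }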
report_features <- function(
  features_df, 
  caption,
  types_to_show = c(
    "same_rule",
    "same_id_group",
    "same_method_group",
    "same_method_name",
    "same_block",
    "same_code",
    "same_method_code",
    "dist_line",
    "dist_line_normalized_block",
    "dist_line_normalized_method",
    "dist_line_normalized_unit"
  ),
  return_raw_data = FALSE

){
  
  
  old_lines <- features_df$graph_old_with_alert %>%
    activate("nodes") %>% 
    as_tibble() %>% 
    filter(!is.na(.data$id_alert_alert)) %>% 
    select(.data$id_alert, .data$beginline) %>% 
    rename_with( ~str_glue("{.x}_old")) 
  
  new_lines <- features_df$graph_new_with_alert %>%
    activate("nodes") %>% 
    as_tibble() %>% 
    filter(!is.na(.data$id_alert_alert)) %>% 
    select(.data$id_alert, .data$beginline) %>% 
    rename_with( ~str_glue("{.x}_new")) 
  
  
  feature_names_translation <- tribble(
    ~feature,                       ~feature_display,
    "same_rule",                   "Same Rule",
    "same_id_group",               "Same Group ID",
    "same_method_group",           "Same Method Group ID",
    "same_method_name" ,           "Same Method Name",
    "same_block",                  "Same Block",
    "same_code",                   "Same Code",
    "same_method_code",            "Same Method Code",
    "dist_line",                   "Line Distance",
    "dist_line_normalized_block"  ,"Line Distance Normalized by Block Size",
    "dist_line_normalized_method" ,"Line Distance Normalized by Method Size",
    "dist_line_normalized_unit",   "Line Distance Normalized by Compilation Unit Size"
  ) %>% 
    filter(
      feature %in% types_to_show
    )
  
  
  saida_tabela <- features_df$features %>%
    unnest(.data$features, .sep = ".") %>%
    select(
      starts_with("id_alert") | starts_with("features")
    ) %>%
    left_join(
      old_lines,
      by = c("id_alert_old" = "id_alert_old")
    ) %>%
    left_join(
      new_lines,
      by = c("id_alert_new" = "id_alert_new")
    ) %>%
    select(
      -c(.data$id_alert_new, .data$id_alert_old)
    ) %>%
    mutate(
      across(
        where(is.numeric) & !starts_with("beginline_"),
        ~number(.x, accuracy = 0.01)
      )
    ) %>%
    mutate(
      across(
        where(is.logical),
        as.character
      )
    ) %>%
    relocate(
      .data$beginline_old, .data$beginline_new
    ) %>%
    pivot_longer(
      cols = c(-.data$beginline_old, -.data$beginline_new),
      names_to = "feature",
      values_to = "value"
    ) %>% 
    mutate(
      line_old_line_new = str_glue("Line (Old version):{beginline_old}, Line (New version):{beginline_new}")
    ) %>%
    select(
      c(-.data$beginline_new, -.data$beginline_old)
    ) %>% 
    relocate(
      .data$line_old_line_new
    ) %>% 
    mutate(
      feature = str_remove(.data$feature, "^features\\.")
    ) %>% 
    inner_join(
      feature_names_translation,
      by = c("feature")
    ) %>% 
    select(
      .data$line_old_line_new,
      feature = .data$feature_display,
      .data$value
    )
  
  
  if(return_raw_data){
    saida_tabela
  }
  else{
    kable(saida_tabela,
          format = "latex",
          caption = caption,
          escape = TRUE,
          # booktabs = TRUE,
          # align = "r",
          # linesep = "",
          col.names = c(
            "Alert combination",
            "Feature",
            "Value"
          )
    ) %>%
      kableExtra::collapse_rows(columns = 1, latex_hline = "major", valign =  "top") %>% 
      kableExtra::kable_styling(
        latex_options = c("HOLD_position", "striped")
      )
    
  }
  
  
  
  
}


#' Extract comments from java/C source code
#'
#' @param file_path path to the source code file
#'
#' @return dataframe with beginline, endline, begincolumn, endcolumn and comment
#' @export
#' @importFrom magrittr extract2
#'
#' @examples
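#' # A sketch, not run: the path below is the debug example referenced in the
#' # function body and may not exist in your checkout.
#' \dontrun{
#' extract_comments_from_code("data/caso1_extract_comments_from_code/code.java")
#' }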
extract_comments_from_code <- function(file_path){

  


  #for debug: file_path = "data/caso1_extract_comments_from_code/code.java"
  code <- read_lines(file_path) %>% 
    str_flatten("\n")
  
  line_breaks <- tibble(start = 0, end = 0) %>%  
    bind_rows(str_locate_all(code, "\n") %>% extract2(1) %>%  as_tibble())
  
  calculate_position_using_line_breaks <- function(begin_param, end_param){


    beginline <- line_breaks %>% 
      filter(.data$start <= begin_param) %>% 
      mutate(
        beginline = row_number(),
        begincolumn = begin_param - .data$start
      ) %>% 
      slice_tail() 
      
    endline <- line_breaks %>% 
      filter(.data$end <= end_param-1) %>% 
      mutate(
        endline = row_number(),
        endcolumn = end_param - .data$start
      ) %>% 
      slice_tail() 
    
    bind_cols(beginline, endline) %>% 
      select(
        .data$beginline,
        .data$begincolumn,
        .data$endline,
        .data$endcolumn
      )
    
  }
  
    
  positions_simple_comments <- str_locate_all(code, "//.+\n") %>% 
    extract2(1) %>%  as_tibble() %>% 
    rowwise() %>% 
    mutate(
      position = map2(.x = .data$start, .y = .data$end, .f = calculate_position_using_line_breaks  )
    ) %>% 
    ungroup() %>% 
    unnest(.data$position) %>% 
    mutate(endcolumn = .data$endcolumn - 1)
  
  simple_comments <- str_match_all(code, "//(.+)\n") %>% 
    extract2(1) %>% 
    as_tibble(.name_repair = "unique") %>% 
    select(
      comment = .data$...2
    ) %>% 
    bind_cols(positions_simple_comments)
  
  positions_multi_comments <- str_locate_all(code, "(?s)\\/\\*.+?\\*\\/") %>% 
    extract2(1   ) %>%  as_tibble() %>% 
    rowwise() %>% 
    mutate(
      position = map2(.x = .data$start, .y = .data$end, .f = calculate_position_using_line_breaks  )
    ) %>% 
    ungroup() %>% 
    unnest(.data$position) 

  multi_comments <- str_match_all(code, "(?s)\\/\\*.+?\\*\\/") %>% 
    extract2(1) %>% 
    as_tibble(.name_repair = "unique") %>% 
    select(
      comment = .data$...1
    ) %>% 
    mutate(
      comment = str_remove(.data$comment, "\\/\\*") %>% str_remove("\\*\\/")
    ) %>% 
    bind_cols(positions_multi_comments)
  
  
  output <- simple_comments %>% 
    bind_rows(multi_comments) %>% 
    arrange(.data$beginline) %>% 
    select(
      .data$beginline, 
      .data$endline, 
      .data$begincolumn, 
      .data$endcolumn, 
      .data$comment
    ) %>% 
    mutate(
      across(where(is.numeric), as.integer)
    )
  
  output
  
  
  
}
  
  
#' Extract only the features from the structure generated by calculate_features_from_versions
#'
#' @param calculated_features output from calculate_features_from_versions
#'
#' @return dataframe with a line for each combination of alerts only with features
#' @export
#'
#' @examples
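#' # A sketch, not run: `calculated` is assumed to hold the output of
#' # calculate_features_from_versions() (see its example).
#' \dontrun{
#' clean_features <- extract_clean_features_from_calculated_features(calculated)
#' }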
extract_clean_features_from_calculated_features <- function(calculated_features){
  
  calculated_features$features %>% 
    unnest(.data$features) %>%  
    select(
      starts_with("same_") | starts_with("dist_")
    )
  
}
  


#' Decide, with a heuristic based on the features, if a combination of alerts refers to the same alert
#'
#' Given a set of alert combinations with their features, decides whether the two alerts
#' in each combination refer to the same underlying alert.
#'
#' @param clean_calculated_features dataframe with one line for each combination of alerts. 
#' It may come from calculate_features_from_versions() %>% extract_clean_features_from_calculated_features()
#'
#' @return a dataframe with one column "same_alert" with the decision if the two alerts of the combination are the same
#' @export
#'
#' @examples
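#' # A sketch, not run, adapted from the debug snippet kept in the function body;
#' # the PMD location and the file paths are illustrative.
#' \dontrun{
#' calculate_features_from_versions(
#'   code_file_new = "data/caso1_calculate_features_from_versions_novo-v2/code.java",
#'   code_file_old = "data/caso1_calculate_features_from_versions_velho-v2/code.java",
#'   pmd_path = "pmd/bin/pmd.bat"
#' ) %>%
#'   extract_clean_features_from_calculated_features() %>%
#'   decide_heurist_if_same_alert()
#' }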
decide_heurist_if_same_alert <- function(clean_calculated_features){
  

  # clean_calculated_features <- calculate_features_from_versions(
  #     code_file_new = "data/caso1_calculate_features_from_versions_novo-v2/code.java",
  #     code_file_old = "data/caso1_calculate_features_from_versions_velho-v2/code.java",
  #     pmd_path = "pmd/bin/pmd.bat"
  #   )$features %>% 
  #   unnest(features) %>%  
  # select(
  #   starts_with("same_") | starts_with("dist_")
  # )
  
  output <-  clean_calculated_features %>% 
    mutate(
      same_alert = case_when(
        
        (.data$same_rule &
        .data$same_id_group &
        .data$same_method_group &
        .data$same_block &
        .data$same_method_name &
        .data$same_method_code &
        .data$same_code &
        .data$dist_line == 0) ~
          TRUE,
        # If the method code is the same, 
        # then I consider it the same method, even if the name or the group is not the same.
        # But if the method code is the same, then the alert code must also be the same
        (.data$same_rule &
           .data$same_method_code &
           .data$same_code) ~
          TRUE,
        # If it's the same id_group, then they are mapped to the same line and the kind of block is the same.
        # If it's the same rule, and one of the features about the method is the same
        (
          .data$same_rule &
          (.data$same_method_code | .data$same_method_group | .data$same_method_name) &
          .data$same_id_group
        ) ~ TRUE,
        # If it's not the same group, then we must have evidence that the method is the same,
        # the line distance must be less than 5 lines, and the rule must be the same
        (
        .data$same_rule &
          (.data$same_method_code | .data$same_method_group | .data$same_method_name) &
          .data$dist_line < 5
        ) ~ TRUE,
        
        TRUE ~ FALSE
      )
    ) %>% 
    select(
      .data$same_alert
    )

}



#' Calculate features from two versions of a file and extract the categorised alerts
#'
#' @param code_file_new path to the new version of the file
#' @param code_file_old path to the old version of the file
#' @param pmd_path path to pmd 
#' @param id identifier used to name the log files
#' @param log name of the log directory under data/
#' 
#' @importFrom pryr object_size
#' @importFrom pryr mem_used
#'
#' @return the categorised alerts produced by calculate_features_from_versions
#' @export
#'
#' @examples
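#' # A sketch, not run: the log sub-directories under data/ must already exist;
#' # the file paths reuse the ones written by calculate_features_from_versions().
#' \dontrun{
#' categorised <- calculate_features_from_versions_and_extract_categorised_alerts(
#'   code_file_new = "code_files_new/new.java",
#'   code_file_old = "code_files_old/old.java",
#'   pmd_path = "pmd/bin/pmd.bat",
#'   id = 1
#' )
#' }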
calculate_features_from_versions_and_extract_categorised_alerts <- function(
  code_file_new,
  code_file_old,
  pmd_path,
  id = 0,
  log = "log"
  ){

    
  inicio <- Sys.time()
  
  saida <- calculate_features_from_versions(
    code_file_new = code_file_new,
    code_file_old = code_file_old,
    pmd_path = pmd_path
  ) 
  
  readr::write_csv(tibble::tibble(id = id), "data/progress.rds", append = TRUE)
  
  fim <- Sys.time()
  
  
  write_rds(saida$versions_executed, str_glue("data/{log}/versions_executed/{id}.rds"))
  write_rds(saida$versions_crossed, str_glue("data/{log}/versions_crossed/{id}.rds"))
  write_rds(saida$graph_new_with_alert, str_glue("data/{log}/graph_new_with_alert/{id}.rds"))
  write_rds(saida$graph_old_with_alert, str_glue("data/{log}/graph_old_with_alert/{id}.rds"))
  # write_rds(saida$features, str_glue("data/{log}/features/{id}.rds"))
  write_rds(saida$categorised_alerts, str_glue("data/{log}/categorised_alerts/{id}.rds"))
  execution_tibble <- tibble(
    time = fim - inicio,
    object_size = pryr::object_size(saida),
    mem_used = pryr::mem_used()
  )
  
  write_rds(execution_tibble, str_glue("data/{log}/execution/{id}.rds"))
    
  saida %>%
    extract2("categorised_alerts")
}



#' Compare two versions of a source-code in terms of kludges
#' 
#' Lists all the java files, gets the PMD alerts and categorise the alerts in "open", "fixed" and "new"
#'
#' @param dir_old old version
#' @param dir_new new version
#' @param pmd_path path to pmd 
#' @param limit_executions must the number of files be limited?
#' @param n_limit if the files must be limited, how many?
#' @param parallel if TRUE, process the files with a multisession future plan
#' @param resume if TRUE, skip the files whose categorised alerts were already logged
#' @param log name of the log directory under data/
#'
#' @return alerts categorised in "new", "fixed" and "open"
#' @export
#'
#' @examples
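#' # A sketch, not run: the directories come from the debug comments in the
#' # function body and a PMD install is assumed under pmd/bin.
#' \dontrun{
#' categorised <- compare_versions(
#'   dir_old = "c:/doutorado/eclipse/eclipse-R4_3/eclipse-R4_3",
#'   dir_new = "c:/doutorado/eclipse/eclipse-R4_4/eclipse-R4_4",
#'   pmd_path = "pmd/bin/pmd.bat",
#'   limit_executions = TRUE,
#'   n_limit = 20
#' )
#' }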
compare_versions <- function(
  dir_old, 
  dir_new, 
  pmd_path, 
  limit_executions = FALSE, 
  n_limit = 20,
  parallel = FALSE,
  resume = FALSE,
  log = "log"
  ){
  
  # dir_old <- "c:/doutorado/eclipse/eclipse-R4_3/eclipse-R4_3"
  # dir_new <-  "c:/doutorado/eclipse/eclipse-R4_4/eclipse-R4_4"
  
  if(parallel){
    future::plan(future::multisession, workers = 4)
  }
  else{
    future::plan(future::sequential)
  }
  files_old <- list.files(path = dir_old, "\\.java$", recursive = TRUE) %>% 
    enframe(name = "id_old", value = "file_old") %>% 
    mutate(
      original_file_old = str_glue("{dir_old}/{file_old}")
    )
  
  files_new <- list.files(path = dir_new, "\\.java$", recursive = TRUE) %>% 
    enframe(name = "id_new", value = "file_new") %>% 
    mutate(
      original_file_new = str_glue("{dir_new}/{file_new}")
    )
  

  calculate_possibly <- possibly(
    .f =calculate_features_from_versions_and_extract_categorised_alerts,
    otherwise = tibble(category = "error")
  )
  
  if(resume){
    anti <- list.files("data/{log}/categorised_alerts" %>% str_glue()) %>% 
      str_match("([0-9]*)\\.rds") %>% 
      .[,2] %>% 
      enframe(name = "row", value = "id") %>% 
      select(id) %>% 
      mutate(id = as.integer(id)) %>% 
      filter(!is.na(id)) 
  }else{
    anti <- tibble(id = integer())
  }
  

  joined_files <- files_new %>% 
    inner_join(files_old, by = c("file_new" = "file_old")) %>% 
    filter( row_number() < n_limit | !limit_executions) %>% 
    mutate(
      id = row_number()
    ) %>% 
    anti_join(
      anti,
      by = c("id")
    ) %>% 
    mutate(
      # alerts = furrr::future_pmap(
      alerts = pmap(
        .l = list(
          code_file_new = original_file_new,
          code_file_old = original_file_old,
          id = id
        ),
        .f = calculate_possibly,
        # .progress = TRUE,
        #.options = furrr::future_options(packages = "kludgenudger"),
        pmd_path = pmd_path,
        log = log
      ) 
    ) %>% 
    unnest(.data$alerts)
  
  joined_files
  
}








#' Compare two versions of a source-code in terms of kludges
#' 
#' Lists all the java files, gets the PMD alerts and categorise the alerts in "open", "fixed" and "new"
#'
#' @param dir_old old version
#' @param dir_new new version
#' @param pmd_path path to pmd 
#' @param log name used for the rds file where the categorised alerts are saved
#'
#' 
#'
#' @return alerts categorised in "new", "fixed" and "open"
#' @export
#'
#' @examples
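#' # A sketch, not run: the version directories come from the debug comments in
#' # the function body; the log name is illustrative.
#' \dontrun{
#' categorised <- compare_versions_read_outside(
#'   dir_old = "C:/doutorado/ArgoUML/0_10",
#'   dir_new = "C:/doutorado/ArgoUML/0_11_1",
#'   pmd_path = "pmd/bin/pmd.bat",
#'   log = "argouml-0_10-0_11_1"
#' )
#' }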
compare_versions_read_outside <- function(
  dir_old, 
  dir_new, 
  pmd_path = "pmd/bin/pmd.bat", 
  log = "log"
){
  
  

  pmd_path = "pmd/bin/pmd.bat"
  
  # dir_old <- "C:/doutorado/ArgoUML/0_10"
  # 
  # dir_new <- "C:/doutorado/ArgoUML/0_11_1"
  
  uid <- uuid::UUIDgenerate()

  system(str_glue("{pmd_path} -d {dir_old} -f xml -R data/blockrules/blockrules_simple.xml -reportfile old_ast{uid}.xml"), show.output.on.console =  FALSE, invisible = TRUE)  

  system(str_glue("{pmd_path} -d {dir_new} -f xml -R data/blockrules/blockrules_simple.xml -reportfile new_ast{uid}.xml"), show.output.on.console =  FALSE, invisible = TRUE)  

  system(str_glue("{pmd_path} -d {dir_old} -f xml -R rulesets/java/quickstart.xml -reportfile old_alerts{uid}.xml"), show.output.on.console =  FALSE, invisible = TRUE)  
   
  system(str_glue("{pmd_path} -d {dir_new} -f xml -R rulesets/java/quickstart.xml -reportfile new_alerts{uid}.xml"), show.output.on.console =  FALSE, invisible = TRUE)  
  
  system(str_glue("git diff -U0 --patience --numstat --summary --output=old_new{uid}.diff --no-index {dir_old} {dir_new}"), show.output.on.console =  FALSE, invisible = TRUE)  


  diff_pairs <- extract_diff_pairs_from_diff_file(file = "old_new{uid}.diff" %>% str_glue() ) %>% 
    filter(
      str_detect(file_old, ".java$"),
      str_detect(file_new, ".java$"),
    )
  
  ast_from_pmd_old <- read_pmd_xml_all_files(file = "old_ast{uid}.xml" %>% str_glue()) %>% 
    mutate(
      file = str_replace_all(
      file,
      "\\\\",
      "/"
    )
  )
  
  ast_from_pmd_new <- read_pmd_xml_all_files(file = "new_ast{uid}.xml" %>% str_glue()) %>% 
    mutate(
      file = str_replace_all(
        file,
        "\\\\",
        "/"
      )
    )
  
  alerts_from_pmd_old <- read_pmd_xml_all_files(file = "old_alerts{uid}.xml" %>%  str_glue()) %>% 
    mutate(
      file = str_replace_all(
        file,
        "\\\\",
        "/"
      )
    )
  
  alerts_from_pmd_new <- read_pmd_xml_all_files(file = "new_alerts{uid}.xml" %>%  str_glue()) %>% 
    mutate(
      file = str_replace_all(
        file,
        "\\\\",
        "/"
      )
    )
  
  diff_pairs_info <- diff_pairs %>% 
    left_join(
      ast_from_pmd_old %>% rename_with(~str_glue("{.x}_ast_old")),
      by = c("file_old" = "file_ast_old") 
    ) %>% 
    left_join(
      ast_from_pmd_new %>% rename_with(~str_glue("{.x}_ast_new")),
      by = c("file_new" = "file_ast_new") 
    ) %>% 
    left_join(
      alerts_from_pmd_old %>% rename_with(~str_glue("{.x}_alert_old")),
      by = c("file_old" = "file_alert_old") 
    ) %>% 
    left_join(
      alerts_from_pmd_new %>% rename_with(~str_glue("{.x}_alert_new")),
      by = c("file_new" = "file_alert_new") 
    ) 


  future::plan(future::multisession)
  
  diff_pairs_changed <- diff_pairs_info %>% 
    filter(
      mode == "changed",
      similarity_index != 100 | is.na(similarity_index),
    ) %>% 
    rowwise() %>% 
    filter(
      !is.null(data_alert_old),
      !is.null(data_alert_new)
    ) %>% 
    ungroup() %>% 
    # semi_join(
    #   faltando,
    #   by = "file_new"
    # ) %>% 
    # filter(
    #   str_detect(file_new, "C:/doutorado/ArgoUML/0_17/src_new/org/argouml/uml/generator/ParserDisplay.java")
    # ) %>%
    sample_n(nrow(.), replace = FALSE) %>% 
    # rowwise() %>% 
    mutate(
      resultado = furrr::future_pmap(
        .l = list(
          code_file_new = file_new, 
          code_file_old = file_old, 
          pmd_path = pmd_path,
          alerts_new = data_alert_new,
          alerts_old = data_alert_old,
          ast_new = data_ast_new,
          ast_old = data_ast_old,
          diff = data,
          only_categorised = TRUE
          
        ),
        
        .f = calculate_features_from_versions,
        
        .progress = TRUE
        
      )
        
        
  ) 
    

  diff_pairs_changed_similarity_100 <- diff_pairs_info %>% 
    filter(
      mode == "changed",
      similarity_index == 100,
    )
  
  
  diff_pairs_changed_new <- diff_pairs_info %>% 
    filter(
      mode == "new",
    )
  
  diff_pairs_changed_deleted <- diff_pairs_info %>% 
    filter(
      mode == "deleted",
    )
  
  
  diff_pairs_changed_only_old <- diff_pairs_info %>% 
    filter(
      mode == "changed",
      similarity_index != 100 | is.na(similarity_index),
    ) %>% 
    rowwise() %>% 
    filter(
      !is.null(data_alert_old) & is.null(data_alert_new)
    )  
  
  diff_pairs_changed_only_new <- diff_pairs_info %>% 
    filter(
      mode == "changed",
      similarity_index != 100 | is.na(similarity_index),
    ) %>% 
    rowwise() %>% 
    filter(
      is.null(data_alert_old) & !is.null(data_alert_new)
    )  
  
  diff_pairs_changed_no_alerts <- diff_pairs_info %>% 
    filter(
      mode == "changed",
      similarity_index != 100 | is.na(similarity_index),
    ) %>% 
    rowwise() %>% 
    filter(
      is.null(data_alert_old) & is.null(data_alert_new)
    )  
  
  total <- 
    nrow(diff_pairs_changed) + 
    nrow(diff_pairs_changed_similarity_100) + 
    nrow(diff_pairs_changed_new) + 
    nrow(diff_pairs_changed_deleted) +
    nrow(diff_pairs_changed_only_old) +
    nrow(diff_pairs_changed_only_new) +
    nrow(diff_pairs_changed_no_alerts)
  
  

  categorized_alerts_changed <- diff_pairs_changed %>% 
    select(resultado) %>%    
    unnest(resultado) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
  categorized_alerts_changed_similarity_100_new <- diff_pairs_changed_similarity_100 %>% 
    select(data_alert_new) %>%    
    unnest(data_alert_new) %>% 
    mutate(
      version = "new",
      category = "open"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
  
  categorized_alerts_changed_similarity_100_old <- diff_pairs_changed_similarity_100 %>% 
    select(data_alert_old) %>%    
    unnest(data_alert_old) %>% 
    mutate(
      version = "old",
      category = "open"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
    
  categorized_alerts_changed_new <-  diff_pairs_changed_new %>% 
    select(data_alert_new) %>%    
    unnest(data_alert_new) %>% 
    mutate(
      version = "new",
      category = "new"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
   
  categorized_alerts_changed_old <-  diff_pairs_changed_deleted %>% 
    select(data_alert_old) %>%    
    unnest(data_alert_old) %>% 
    mutate(
      version = "old",
      category = "fixed"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
  
  categorized_alerts_changed_only_old <- diff_pairs_changed_only_old %>% 
    select(data_alert_old) %>%    
    unnest(data_alert_old) %>% 
    mutate(
      version = "old",
      category = "fixed"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
  
  categorized_alerts_changed_only_new <- diff_pairs_changed_only_new %>% 
    select(data_alert_new) %>%    
    unnest(data_alert_new) %>% 
    mutate(
      version = "new",
      category = "new"
    ) %>% 
    mutate(
      across(
        .cols = any_of(c("priority")),
        .fns = as.integer
      )
    )
  
  

  resultado <- bind_rows(
    categorized_alerts_changed, 
    categorized_alerts_changed_similarity_100_new,
    categorized_alerts_changed_similarity_100_old,
    categorized_alerts_changed_new,
    categorized_alerts_changed_old,
    categorized_alerts_changed_only_old,
    categorized_alerts_changed_only_new
  )
  

  write_rds(resultado, str_glue("{log}.rds"))
  
  resultado
  
}





#' Join a graph containing the AST and the alerts
#' 
#' In this join, the function must consider whether there is more than one alert for a node of the AST.
#' If that is the case, the function duplicates these nodes.
#'
#' @param ast tidygraph with the Abstract Syntax Tree
#' @param alerts dataframe containing the alerts
#'
#' @return the graph containing the ast and the related alerts 
#' @export
#'
#' @examples
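#' # A sketch, not run: inside calculate_features_from_versions() the function is
#' # applied to a tidygraph of the AST and a renamed alerts dataframe, e.g.:
#' \dontrun{
#' graph_old_with_alert <- graph_old_with_group %>%
#'   activate("nodes") %>%
#'   mutate(one = 1) %>%
#'   join_ast_alerts(alerts_old)
#' }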
join_ast_alerts <- function(ast, alerts){
  
  # info <- read_rds("data/info_join_ast_alerts.rds")
  # ast <- info$ast
  # alerts <- info$alerts
  
  # write_rds(ast, "data/info_join_ast_alerts_ast.rds")
  # write_rds(alerts, "data/info_join_ast_alerts_alerts.rds")
  # ast <- read_rds("data/info_join_ast_alerts_ast.rds")
  # alerts <- read_rds("data/info_join_ast_alerts_alerts.rds")
  

  nodes <- ast %>% activate("nodes") %>% as_tibble()
   
  max_column_nodes <- max(c(nodes$endcolumn, nodes$begincolumn ))
  max_column_alerts <- max(c(alerts$endcolumn_alert, alerts$begincolumn_alert))
  
  max_column <- max(c(max_column_nodes, max_column_alerts)) + 1
  
  alerts_position <- alerts %>% 
    mutate(
      beginposition_alert = .data$beginline_alert * max_column + .data$begincolumn_alert,
      endposition_alert = .data$endline_alert * max_column + .data$endcolumn_alert
    ) %>% 
    select(
      .data$id_alert_alert,
      .data$beginposition_alert,
      .data$endposition_alert,
      .data$beginline_alert,
      .data$endline_alert,
      .data$begincolumn_alert,
      .data$endcolumn_alert
      
    )
  
  nodes_position <- nodes %>% 
    mutate(
      beginposition = .data$beginline * max_column + .data$begincolumn,
      endposition =  .data$endline * max_column + .data$endcolumn
    ) %>% 
    select(
      .data$beginline,
      .data$endline,
      .data$begincolumn,
      .data$endcolumn,
      .data$.tidygraph_node_index,
      .data$beginposition,
      .data$endposition
    )
  
  elements_to_join <- alerts_position %>% 
    crossing(nodes_position) %>% 
    filter(
      .data$beginposition_alert >= .data$beginposition,
      .data$endposition_alert <= .data$endposition
    ) %>% 
    mutate(
      looseness = (.data$beginposition_alert - .data$beginposition) + (.data$endposition - .data$endposition_alert)
    ) %>% 
    group_by(
      .data$id_alert_alert
    ) %>% 
    slice_min(.data$looseness,n = 1, with_ties = FALSE) %>% 
    ungroup() %>% 
    select(
      .data$id_alert_alert,
      .data$.tidygraph_node_index
    ) %>% 
    group_by(
      .data$.tidygraph_node_index
    ) %>% 
    mutate(
      index_inside_original_node = row_number()
    )
    

  alerts_multi <- elements_to_join %>% 
    group_by(
      .data$.tidygraph_node_index
    ) %>% 
    summarise(
      n = n()
    ) %>% 
    filter(
      .data$n > 1
    ) %>% 
    ungroup()
  
  if(nrow(alerts_multi) > 0){

    n_nodes <- ast %>% activate("nodes") %>% as_tibble() %>% nrow() 
  
    nodes_must_be_included <- ast %>% 
      activate("nodes") %>% 
      as_tibble() %>% 
      right_join(alerts_multi,
                by = c(   
                  ".tidygraph_node_index"
                )
      )
      
     edges_must_be_included <- ast %>% 
       activate("edges") %>% 
       as_tibble() %>% 
       semi_join(
         nodes_must_be_included,
         by = c("to" = ".tidygraph_node_index")
       ) 
     
      
     nodes_must_be_included_multi <- nodes_must_be_included %>% 
       rowwise() %>% 
       mutate(
         temporario_id_multi = list(2:n)
       ) %>% 
       ungroup() %>% 
       unnest(.data$temporario_id_multi) %>% 
       mutate(
         temporario_id = row_number(),
         temporario_new_id = n_nodes + .data$temporario_id
       ) %>% 
       left_join(
         edges_must_be_included,
         by = c(".tidygraph_node_index" = "to")
       )
     
     edges_to_be_included <- nodes_must_be_included_multi %>% 
       select(
         to = .data$temporario_new_id,
         .data$from
       ) %>% 
       filter(!is.na(from))
     
     nodes_to_be_included <- nodes_must_be_included_multi %>% 
       select(
         !matches("temporario")
       ) %>% 
       select(
         -c(.data$from, .data$n)
       ) %>% 
       rename(
         .tidygraph_node_index_original = .data$.tidygraph_node_index
       ) %>% 
       group_by(.data$id_alert) %>% 
       mutate(index_inside_original_node = row_number() + 1) %>% 
       ungroup()
     

     ast_for_join <- ast %>% 
       activate("nodes") %>% 
       mutate(
         .tidygraph_node_index_original = .data$.tidygraph_node_index
       ) %>% 
       bind_nodes(nodes_to_be_included) %>% 
       bind_edges(edges_to_be_included) %>% 
       activate("nodes") %>% 
       mutate( 
         index_inside_original_node = if_else(
           is.na(.data$index_inside_original_node),
           1,
           .data$index_inside_original_node
         )
       ) 
     

    
  }else{
    ast_for_join <- ast %>% 
      activate("nodes") %>% 
      mutate(
        .tidygraph_node_index_original = .data$.tidygraph_node_index
      ) %>% 
      mutate(index_inside_original_node = 1)
  }
   
  alerts_indexed_inside_node <- alerts %>% 
    left_join(
      elements_to_join,
      by = c("id_alert_alert")
    ) %>% 
    group_by(
      .data$.tidygraph_node_index
    ) %>% 
    mutate(
      index_inside_original_node = row_number()
    ) 
   
   output <- ast_for_join %>% 
     left_join(
       elements_to_join,
       by = c(
         ".tidygraph_node_index_original" = ".tidygraph_node_index",
         "index_inside_original_node"
       ),
       suffix = c("",".y")
     ) %>% 
     left_join(
       alerts_indexed_inside_node,
       by =  c(
         "id_alert_alert" 
         ),
       suffix = c("",".z")
     ) %>% 
     mutate(
       one.x = one,
       one.y = one
     ) %>%
     select(
       any_of(
       c("linha", 
       "beginline", 
       "endline",
       "begincolumn", 
       "endcolumn", 
       "rule", 
       "ruleset", 
       "package", 
       "class", 
       "priority", 
       "variable", 
       "method", 
       "id_alert", 
       "small_rule", 
       "code", 
       "n_descendents", 
       "name", 
       ".tidygraph_node_index", 
       "id_group", 
       "mostra", 
       "one.x", 
       "linha_alert", 
       "rule_alert", 
       "ruleset_alert", 
       "package_alert", 
       "class_alert", 
       "method_alert", 
       "externalInfoUrl_alert", 
       "priority_alert", 
       "id_alert_alert",
       "one.y"))
     )  

   # info_join_ast_alerts <- list(
   #    ast = ast,
   #    alerts = alerts,
   #    output_function = output
   # )
   # 
   #write_rds(output, "data/info_join_ast_alerts_output.rds")
   
   output
   
   
}


extract_comments_from_directory <- function(dir, dest_file){
  
  # dir <- "C:/doutorado/ArgoUML/0_29"
  
  future::plan(future::multisession)
  
  extract_possibly <- possibly(
    .f =extract_comments_from_code,
    otherwise = tibble(comment = "error")
  )
  
  
  # comments <- list.files(path = dir, pattern = "\\.java$", recursive = TRUE, full.names = TRUE) %>% 
  #   enframe(
  #     name = "id",
  #     value = "file"
  #   ) %>% 
  #   mutate(
  #     comments = purrr::map(.x = file, .f = extract_comments_from_code )
  #   ) %>% 
  #   unnest(comments)
  # 
  

  comments <- list.files(path = dir, pattern = "\\.java$", recursive = TRUE, full.names = TRUE) %>%
    enframe(
      name = "id",
      value = "file"
    ) %>%
    mutate(
      comments = furrr::future_map(.x = file, .f = extract_possibly,.progress = TRUE )
    ) %>%
    unnest(comments)
  
  write_rds(comments, dest_file)
  
  
}


#' Show nodes as latex
#'
#' @param nodes nodes to read
#'
#' @return latex with the nodes
#' @export
#'
#' @examples
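#' # A sketch, not run: `nodes` is assumed to come from read_raw_ast_nodes(), as
#' # used inside calculate_features_from_versions(); the paths are illustrative.
#' \dontrun{
#' nodes <- read_raw_ast_nodes(
#'   code_location = "code_files_old/old.java",
#'   output_location = "code_files_old/old.xml",
#'   pmd_location = "pmd/bin",
#'   blockrules_location = "data/blockrules/blockrules.xml",
#'   alerts = NA
#' )
#' show_latex_raw_ast_nodes(nodes)
#' }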
show_latex_raw_ast_nodes <- function(nodes){
  
  nodes %>%
    select(
      -c(linha, ruleset, package, class, priority, variable, id_alert, small_rule)
    ) %>% 
    rename(
      line = beginline,
      endline = endline,
      col = begincolumn,
      endcol = endcolumn
      
    ) %>% 
    mutate(
      code = str_trunc(code,width = 30, ellipsis = "...")
    ) %>% 
    arrange(
      line, col
    ) %>% 
    kable(
      format = "latex",
      caption = "Elements captured in code\\label{elements_captured}",
      escape = TRUE
    ) %>%
    kable_styling(
      font_size = 8,
      latex_options = c("hold_position")
    )
  
  
}



#' Read the results logged by compare_versions from a log directory
#'
#' @param dir directory with the logged results (categorised_alerts, execution, versions_crossed, ...)
#' @param version_old label of the old version, added to the output
#' @param version_new label of the new version, added to the output
#'
#' @import dplyr
#' @return dataframe with the categorised alerts, execution info and crossed/executed versions
#' @export
#'
#' @examples
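#' # A sketch, not run: "data/log" is assumed to hold the sub-directories written
#' # by calculate_features_from_versions_and_extract_categorised_alerts();
#' # the version labels are illustrative.
#' \dontrun{
#' results <- read_results("data/log", version_old = "0_10", version_new = "0_11_1")
#' }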
read_results <-  function(dir, version_old, version_new){
  
  print(dir)

  categorised_alerts <- list.files(
    str_glue("{dir}/categorised_alerts"),
    full.names = TRUE
  ) %>%
    enframe(name = "id", value = "file") %>%
    mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
    mutate(categorised_alerts = furrr::future_map(
      .x = file,
      .f = read_rds,
      .progress = TRUE
    )) %>%
    select(
      id,
      categorised_alerts
    )
  
  
  executions <- list.files(
    str_glue("{dir}/execution"),
    full.names = TRUE
  ) %>%
    enframe(name = "id", value = "file") %>%
    mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
    mutate(executions = furrr::future_map(
      .x = file,
      .f = read_rds,
      .progress = TRUE
    )) %>%
    select(
      executions
    )
  
  
  # features <- list.files(
  #   str_glue("{dir}/features"),
  #   full.names = TRUE
  # ) %>%
  #   enframe(name = "id", value = "file") %>%
  #   mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
  #   mutate(features = furrr::future_map(
  #     .x = file,
  #     .f = read_rds,
  #     .progress = TRUE
  #   )) %>%
  #   select(features)
  
  
  # graph_old_with_alert <- list.files(
  #   str_glue("{dir}/graph_old_with_alert"),
  #   full.names = TRUE
  # ) %>%
  #   enframe(name = "id", value = "file") %>%
  #   mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
  #   mutate(graph_old_with_alert = furrr::future_map(
  #     .x = file,
  #     .f = read_rds,
  #     .progress = TRUE
  #   )) %>%
  #   select(
  #     graph_old_with_alert
  #   )
  # 
  # 
  # graph_new_with_alert <- list.files(
  #   str_glue("{dir}/graph_new_with_alert"),
  #   full.names = TRUE
  # ) %>%
  #   enframe(name = "id", value = "file") %>%
  #   mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
  #   mutate(graph_new_with_alert = furrr::future_map(
  #     .x = file,
  #     .f = read_rds,
  #     .progress = TRUE
  #   )) %>%
  #   select(graph_new_with_alert)
  
  versions_crossed <- list.files(
    str_glue("{dir}/versions_crossed"),
    full.names = TRUE
  ) %>%
    enframe(name = "id", value = "file") %>%
    mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
    mutate(versions_crossed = furrr::future_map(
      .x = file,
      .f = read_rds,
      .progress = TRUE
    )) %>%
    select(versions_crossed)
  
  versions_executed <- list.files(
    str_glue("{dir}/versions_executed"),
    full.names = TRUE
  ) %>%
    enframe(name = "id", value = "file") %>%
    mutate(number = str_match(file, pattern = "[0-9]*.rds") %>% str_remove(".rds")) %>%
    mutate(versions_executed = furrr::future_map(
      .x = file,
      .f = read_rds,
      .progress = TRUE
    )) %>%
    select(versions_executed)
  
  
  output <- bind_cols(
    categorised_alerts,
    executions,
    # features,
    # graph_new_with_alert,
    # graph_old_with_alert,
    versions_crossed,
    versions_executed
  ) %>% 
    mutate(
      version_old = version_old,
      version_new = version_new
    )
  
  
  output

}


extract_mnemonic <- function(big_name){

  output <- str_match_all(big_name, pattern = "[A-Z]") %>% 
    map(.f = str_flatten) %>% 
    unlist()
  
  output
  
}


get_file_pair <- function(diff_string){
  match <- str_match(
    diff_string, 
    "diff --git a\\/(.*) b\\/(.*)"
  ) %>% as_tibble() %>% 
    select(
      file_old = V2,
      file_new = V3
    )
}



#' Extract the pairs of changed files from a diff file
#'
#' @param file diff file generated by git diff
#'
#' @return dataframe with one row per file pair, with the change mode, similarity index and diff content
#' @export
#'
#' @examples
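#' # A sketch, not run: the diff file is assumed to have been produced by
#' # git diff --no-index, as in compare_versions_read_outside(); the directory
#' # names are illustrative.
#' \dontrun{
#' system("git diff -U0 --patience --numstat --summary --output=old_new.diff --no-index old_dir new_dir")
#' extract_diff_pairs_from_diff_file("old_new.diff")
#' }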
extract_diff_pairs_from_diff_file <- function(file){
  
  diff_content <- read_lines(file) %>% 
    enframe(
      name = "diff_line",
      value = "diff_string"
    ) %>% 
    mutate(
      begin_diff = str_detect(diff_string,"^[ ]*diff --git"),
      item = cumsum(begin_diff)
    ) %>% 
    filter(
      item > 0
    ) %>% 
    group_by(
      item
    ) %>% 
    mutate(
      line = row_number()
    ) %>% 
    ungroup() 
  

  file_pair <- diff_content %>%  
    filter(
      line == 1
    ) %>% 
    mutate(
      file_pair = get_file_pair(diff_string) 
    ) %>% 
    select(
      item,
      file_pair
    ) %>% 
    mutate(
      file_old = file_pair$file_old,
      file_new = file_pair$file_new
    ) %>% 
    select(
      -file_pair
    )
    
  
  
  file_pair_mode <- diff_content %>%  
    filter(
      line == 2
    ) %>% 
    mutate(
      mode = case_when(
        str_detect(diff_string, "^new file") ~ "new",
        str_detect(diff_string, "^deleted file") ~ "deleted",
        TRUE ~ "changed",
      ),
      similarity_index = str_match(string = diff_string, pattern =  "similarity index ([0-9]*)\\%" )[,2]
    )

  diff_just_content <- diff_content %>% 
    select(
      item, diff_string
    ) %>% 
    group_by(item) %>% 
    nest()
    
  output <- file_pair %>% 
    full_join(
      file_pair_mode,
      by = c("item")
    ) %>% 
    full_join(
      diff_just_content,
      by = c("item")
    )

  output %>% 
    mutate(
      similarity_index = as.integer(similarity_index)
    )
  
}




read_pmd_xml_all_files <- function(file){
  
  
  alerts <- xml2::read_xml(file) %>% 
    xml2::xml_children() %>% 
    xml2::xml_children() %>% 
    xml2::xml_attrs() %>% 
    map_df(.f = ~enframe(x = .x )) %>% 
    mutate(
      id = cumsum(name == "beginline")
    ) %>% 
    pivot_wider(
      names_from = name,
      values_from = value 
    ) %>% 
    mutate(
      across(
        c(beginline, endline, begincolumn, endcolumn),
        as.numeric
      )
    )
  
  
  alerts_guides  <- read_lines(file) %>% 
    enframe() %>% 
    mutate(
      file = str_detect(value, "^\\<file"),
      violation = str_detect(value,"^\\<violation"),
      id_file = cumsum(file),
      id_violation = cumsum(violation)
    ) %>% 
    filter(
      file
    ) %>% 
    mutate(
      begin_id = id_violation + 1,
      end_id = lead(id_violation)
    ) %>% 
    replace_na(
      list(end_id = nrow(alerts))
    ) %>% 
    fuzzyjoin::interval_inner_join(
      alerts,
      by = c(
        "begin_id" = "id", 
        "end_id" = "id"
      ),
      type = "any"    
    ) %>% 
    mutate(
      file = str_match(value, '^<file name=\\"(.*)\\">')[,2],
      id_alert = row_number()
    )

  alerts_guides %>% 
    mutate(
      variable = NA_character_
    ) %>% 
    select(
      file,
      beginline,
      endline,
      begincolumn,
      endcolumn,
      rule,
      ruleset,
      package,
      class,
      priority,
      method,
      variable,
      id_alert
    ) %>% 
    group_by(
      file
    ) %>% 
    nest()
    
}


read_results_from_outside_read <- function(
  dir = NULL,
  prefix = NULL,
  package_start = "org\\."
){


  # dplyr::if_else() does not accept a NULL branch, so use base if()
  dir <- if (is.null(dir)) here::here("tests/testthat") else dir
  prefix <- if (is.null(prefix)) here::here("tests/testthat") else prefix
      
  
  future::plan(future::multisession)
  
  results <- list.files(dir, pattern = "^{prefix}-[0-9_]*-[0-9_]*\\.rds$" %>% str_glue()   ) %>% 
    enframe(value = "file") %>% 
    mutate(
      major_version_old = str_match_all(file, "{prefix}-([0-9]*)[\\._-]" %>% str_glue() ),
      minor_version_old = str_match_all(file, "{prefix}-[0-9]*_([0-9]*)" %>% str_glue() ),
      major_version_new = str_match_all(file, "{prefix}-[0-9_]*-([0-9]*)[\\._-]" %>% str_glue() ),
      minor_version_new = str_match_all(file, "{prefix}-[0-9_]*-[0-9]*_([0-9]*)" %>% str_glue() )
    ) %>% 
    rowwise() %>% 
    mutate(
      major_version_old = major_version_old[2],
      minor_version_old = minor_version_old[2],
      major_version_new = major_version_new[2],
      minor_version_new = minor_version_new[2]
    ) %>% 
    ungroup() %>% 
    mutate(
      across(
        .cols = matches("(major|minor)_version_(old|new)"),
        .fns = as.integer
      )
    ) %>% 
    mutate(
      across(
        .cols = matches("(major|minor)_version_(old|new)"),
        .fns = ~if_else(is.na(.x),0L,.x)
      )
    ) %>% 
    mutate(
      file_complete = str_glue("{dir}/{file}")
    ) %>% 
    mutate(
      categorised = furrr::future_map(.x = file_complete, .f = read_rds, .progress = TRUE)
    ) %>% 
    mutate(
      versions = str_match_all(file, pattern = "^{prefix}-([0-9_]*)-([0-9_]*)\\.rds$" %>% str_glue() )
    ) %>% 
    unnest_wider(
      versions,
      names_sep = "_"
    ) %>% 
    select(
      categorised,
      version_old = versions_2,
      version_new = versions_3,
      matches("(major|minor)_version_(old|new)")
    ) %>% 
    unnest(categorised) %>% 
    filter(
      str_starts(package, package_start)
    )
  
    
  results

}
  

#' Save alerts from a directory with versions
#'
#' @param dir directory containing one sub-directory per version
#' @param pmd_path path to pmd
#' @param pattern_versions regex used to select the version directories
#' @param dest_dir directory where the xml files with the alerts are written
#' @param remove_str string removed from the version directory name before building the output name
#' @param string_to_replace string to be replaced in the version directory name
#' @param string_to_replace_with replacement string
#'
#' @return dataframe with one row per version and the exit status of each pmd call
#' @export
#'
#' @examples
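#' # A sketch, not run, using the defaults of the function: it assumes one
#' # sub-directory per version under ArgoUML/ and an existing alerts/ directory.
#' \dontrun{
#' save_alerts(
#'   dir = here::here("ArgoUML"),
#'   pmd_path = "pmd/bin/pmd.bat",
#'   pattern_versions = "^0_[0-9]*",
#'   dest_dir = "alerts"
#' )
#' }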
save_alerts <-  function(
  dir = here::here("ArgoUML"),
  pmd_path = "pmd/bin/pmd.bat",
  pattern_versions = "^0_[0-9]*",
  dest_dir = "alerts",
  remove_str = "",
  string_to_replace = "-",
  string_to_replace_with = "-"
  
){
  

  dirs <- list.files(dir, pattern = pattern_versions) %>% 
    enframe(
      value = "file"
    ) %>% 
    mutate(
      file_complete = str_glue("{dir}/{file}")
    ) %>%
    mutate(
      file = str_remove(file, remove_str)
    ) %>% 
    mutate(
      file = str_replace(file, string_to_replace, string_to_replace_with )
    ) %>% 
    mutate(
      saida = map2(
        .x = file ,
        .y = file_complete,
        .f = ~system(command = str_glue("{pmd_path} -d {.y} -f xml -R rulesets/java/quickstart.xml -reportfile {dest_dir}/{.x}.xml"), show.output.on.console =  FALSE, invisible = TRUE)  
      )
    )
    
}



read_all_alerts <- function(
  
  dir = here::here("tests/testthat/alerts"),
  pattern_major = "^0_([0-9]*)[\\._]",
  pattern_minor = "^0_[0-9]*_([0-9]*)",
  package_start = "org\\."
  
  ){
  
  
  future::plan(future::multiprocess)
  
  alerts <- list.files(dir) %>% 
    enframe(
      value = "file"
    ) %>%
    mutate(
      major_version = str_match_all(file, pattern_major ),
      minor_version = str_match_all(file, pattern_minor )
    ) %>% 
    rowwise() %>% 
    mutate(
      major_version = major_version[2],
      minor_version = minor_version[2]
    ) %>% 
    ungroup() %>% 
    mutate(
      across(
        .cols = c(major_version, minor_version),
        .fns = as.integer
      )
    ) %>% 
    replace_na(
      list(minor_version = 0)
    ) %>% 
    mutate(
      file_complete = str_glue("{dir}/{file}")
    ) %>% 
    mutate(
      alerts = furrr::future_map(
        .x = file_complete, 
        .f = read_pmd_xml_all_files, 
        .progress = TRUE
      )
    ) %>% 
    unnest(
      alerts,
      names_repair = "universal"
    ) %>% 
    unnest(
      data
    ) %>% 
    filter(
      str_starts(package, package_start)
    )
    

  alerts_info <- alerts %>%
    arrange(
      major_version,
      minor_version
    ) %>% 
    group_by(
      major_version,
      minor_version
    ) %>% 
    summarise(
      files = n_distinct(file...6),
      package = n_distinct(package),
      class = n_distinct(class),
      alerts = n()
    ) 
  
  alerts
  
    
}
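
# Usage sketch (not run), assuming one PMD XML report per version under
# tests/testthat/alerts, as written by save_alerts():
#
# all_alerts <- read_all_alerts(
#   dir = here::here("tests/testthat/alerts"),
#   pattern_major = "^0_([0-9]*)[\\._]",
#   pattern_minor = "^0_[0-9]*_([0-9]*)",
#   package_start = "org\\."
# )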


#' Analyse alerts and their categories across versions
#'
#' Combines the categorised results from `read_results_from_outside_read()`
#' with the raw alert counts from `read_all_alerts()` and returns one
#' summarised row per version with adjusted counts of new and fixed alerts.
#'
#' @param dir_outside_read,package_start_outside_read,prefix passed on to `read_results_from_outside_read()`
#' @param dir_read_all_alerts,pattern_major_read_all_alerts,pattern_minor_read_all_alerts,package_start_read_all_alerts passed on to `read_all_alerts()`
#'
#' @return summarised alert counts per version
#' @export
#' 
#' @import dplyr
#' @import tidyr
#'
#' @examples
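#' # Minimal sketch, not run. All values are assumptions that simply mirror the
#' # defaults of the two readers combined here (written with backslash-free but
#' # equivalent regular expressions).
#' \dontrun{
#' analyse_alerts_and_categories(
#'   dir_outside_read = here::here("tests/testthat"),
#'   package_start_outside_read = "org[.]",
#'   dir_read_all_alerts = here::here("tests/testthat/alerts"),
#'   pattern_major_read_all_alerts = "^0_([0-9]*)[._]",
#'   pattern_minor_read_all_alerts = "^0_[0-9]*_([0-9]*)",
#'   package_start_read_all_alerts = "org[.]",
#'   prefix = "log"
#' )
#' }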
analyse_alerts_and_categories <- function(
  dir_outside_read = NULL,
  package_start_outside_read = NULL,
  dir_read_all_alerts = NULL,
  pattern_major_read_all_alerts = NULL,
  pattern_minor_read_all_alerts = NULL,
  package_start_read_all_alerts = NULL,
  prefix = NULL
){




  results <- read_results_from_outside_read(
    dir = dir_outside_read,
    package_start = package_start_outside_read,
    prefix = prefix
  )
  
  alerts <- read_all_alerts(
    dir = dir_read_all_alerts,
    pattern_major = pattern_major_read_all_alerts,
    pattern_minor = pattern_minor_read_all_alerts,
    package_start = package_start_read_all_alerts
  )
  
  results_summarised <- results %>% 
    group_by(
      across(
        matches("(major|minor)_version_(old|new)") | category
      )
    ) %>% 
    summarise(
      n = n()
    ) %>% 
    pivot_wider(
      names_from = category,
      values_from = n
    )

  alerts_summarised <- alerts %>% 
    group_by(
      across(
        matches("(major|minor)_version") 
      )
    ) %>% 
    summarise(
      n = n()
    ) %>% 
    arrange(
      major_version,
      minor_version
    ) %>% 
    left_join(
      results_summarised,
      by = c("major_version" = "major_version_new", "minor_version" = "minor_version_new" ),
      keep = TRUE
    ) %>% 
    ungroup() %>% 
    replace_na(
      list(
        fixed = 0,
        new = 0,
        open = 0
      )
    ) %>% 
    mutate(
      theo_increase = new - fixed,
      real_increase = n - lag(n),
      diff = theo_increase - real_increase,
      inc_new = diff * new / (fixed - new),
      inc_fixed = diff + inc_new,
      fixed_adjusted = (fixed + inc_fixed) %>% round(),
      new_adjusted = (new + inc_new) %>% round()
    ) %>% 
    filter(
      !is.na(minor_version_new)
    ) %>% 
    mutate(
      new_adjusted =
        if_else(new_adjusted < 0 | fixed_adjusted < 0,
                if_else(diff > 0,
                        new - diff,
                        new
                ),
                new_adjusted
                )
      ,
      fixed_adjusted =
        if_else(fixed_adjusted < 0 | new_adjusted < 0,
                if_else(diff < 0,
                        fixed + diff,
                        fixed
                ),
                fixed_adjusted
        )
      ,
      
      ratio_before = new / (fixed + new), 
      ratio_after = new_adjusted / (fixed_adjusted + new_adjusted),
      increase_after = new_adjusted - fixed_adjusted

    ) %>% 
    select(
      matches("_version"), n, fixed = fixed_adjusted, new = new_adjusted, open 
    ) 


  alerts_summarised
  
}


extract_root_dir_from_log <-  function(log = "C:/doutorado/kludgenudger/tests/testthat/log-14-15.rds"){
  

  file <- read_rds(log)
    
  root_new <- str_match(string = file$file_new, pattern = "^(.*/ArgoUML/.+?)/") %>% 
    first()
   
  root_old <- str_match(string = file$file_old, pattern = "^(.*/ArgoUML/.+?)/") %>% 
    first()
   
  tibble(
    root_new = root_new,
    root_old = root_old
  )
   
 
 
 
}


generate_diffs_from_versions <- function(
  dir = here::here("tests/testthat"),
  
  complement = tribble(
    ~file,                ~root_new,                         ~root_old,
    "log-14-15.rds",      "C:/doutorado/ArgoUML/0_15/",      "C:/doutorado/ArgoUML/0_14/",
    "log-17-18.rds",      "C:/doutorado/ArgoUML/0_18/",      "C:/doutorado/ArgoUML/0_17/",
    "log-18-19.rds",      "C:/doutorado/ArgoUML/0_19/",      "C:/doutorado/ArgoUML/0_18/",
    "log-19-20.rds",      "C:/doutorado/ArgoUML/0_20/",      "C:/doutorado/ArgoUML/0_19/",
    "log-25-26.rds",      "C:/doutorado/ArgoUML/0_26/",      "C:/doutorado/ArgoUML/0_25/",
    "log-26-27.rds",      "C:/doutorado/ArgoUML/0_27/",      "C:/doutorado/ArgoUML/0_26/",
    "log-27-28.rds",      "C:/doutorado/ArgoUML/0_28/",      "C:/doutorado/ArgoUML/0_27/",
    "log-28-29.rds",      "C:/doutorado/ArgoUML/0_29/",      "C:/doutorado/ArgoUML/0_28/",
    "log-30-31.rds",      "C:/doutorado/ArgoUML/0_31/",      "C:/doutorado/ArgoUML/0_30/",
    "log-31-32.rds",      "C:/doutorado/ArgoUML/0_32/",      "C:/doutorado/ArgoUML/0_31/",
    "log-32-33.rds",      "C:/doutorado/ArgoUML/0_33/",      "C:/doutorado/ArgoUML/0_32/",
    "log-33-34.rds",      "C:/doutorado/ArgoUML/0_34/",      "C:/doutorado/ArgoUML/0_33/"
    
  ),
  
  prefix = "log"
  
  
){
  

  execute_system <- function(file, root_old, root_new ){

    system(str_glue("git diff -U0 --patience --numstat --summary --output={file}.diff --no-index {root_old} {root_new}"), show.output.on.console =  FALSE, invisible = TRUE)      
  }
    
  results <- list.files(dir, pattern = "^{prefix}-[0-9_]*-[0-9_]*\\.rds$" %>% str_glue() ) %>% 
    enframe(value = "file") %>% 
    mutate(
      major_version_old = str_match_all(file, "{prefix}-([0-9]*)[\\._-]" %>% str_glue() ),
      minor_version_old = str_match_all(file, "{prefix}-[0-9]*_([0-9]*)" %>% str_glue() ),
      major_version_new = str_match_all(file, "[0-9]-([0-9]*)[\\._-]" %>% str_glue() ),
      minor_version_new = str_match_all(file, "[0-9]-[0-9]*_([0-9]*)" %>% str_glue() )
    ) %>% 
    select(file) %>% 
    mutate(
      roots = map(
        .x = file, .f = extract_root_dir_from_log
      )
    ) %>% 
    unnest(
      roots
    ) %>% 
    mutate(
      across(
        starts_with("root_"),
        ~str_sub(string = .x, end = -2)
      )
    ) %>% 
    filter(
      !is.na(root_new)
    ) %>% 
    bind_rows(
      complement    
    ) %>% 
    mutate(
      saida = pmap(
        .l = list(
          root_new = root_new,
          root_old = root_old,
          file = file
        ), 
        
        .f = execute_system
      )
    )

}
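
# Usage sketch (not run). For every version pair logged in "log-<old>-<new>.rds"
# this shells out to a command of the form
#   git diff -U0 --patience --numstat --summary --output=<log>.diff --no-index <root_old> <root_new>
# The ArgoUML paths hard-coded in `complement` are assumed to exist:
#
# generate_diffs_from_versions(dir = here::here("tests/testthat"), prefix = "log")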


generate_ast_only_classes <- function(
  dir = "C:/doutorado/ArgoUML/", 
  pattern = "^0",
  pmd_path = "pmd/bin/pmd.bat",
  output_path = "only_classes",
  pattern_major_minor_versions = "^0_([0-9]*)\\_([0-9]*)"
  
){

  
  
  executions <- list.files(
    dir,
    pattern = pattern
  ) %>% 
    enframe(
      name = "id",
      value = "dir_incomplete"
    ) %>% 
    mutate(
      versions = str_match_all(string = dir_incomplete, pattern = pattern_major_minor_versions),
      dir_complete = str_glue("{dir}{dir_incomplete}"),
    ) %>% 
    rowwise() %>% 
    mutate(
      major_version = versions[2],
      minor_version = versions[3]
    ) %>% 
    ungroup() %>% 
    mutate(
      output = str_glue("{major_version}_{minor_version}") 
    ) %>% 
    mutate(
      saida = map2(
        .x = output, .y = dir_complete, .f =  ~system(str_glue("{pmd_path} -d {.y} -f xml -R data/blockrules/onlyclass.xml -reportfile {output_path}/{.x}"), show.output.on.console =  FALSE, invisible = TRUE)  
      ) 
    )
}
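
# Usage sketch (not run), assuming one ArgoUML checkout per version under `dir`
# and the onlyclass.xml rule set under data/blockrules:
#
# generate_ast_only_classes(
#   dir = "C:/doutorado/ArgoUML/",
#   pmd_path = "pmd/bin/pmd.bat",
#   output_path = "only_classes"
# )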



read_pmd_only_classes <- function(
  dir = "only_classes" ,
  pattern_major_minor_versions = "0_([0-9]*)_([0-9]*)"
){
  
  
  future::plan(future::multiprocess)
  
  saida <- list.files(dir) %>% 
    enframe(
      name = "id",
      value = "dir_incomplete"
    ) %>% 
    mutate(
      dir_complete = str_glue("{dir}/{dir_incomplete}")
    ) %>% 
    mutate(
      content = furrr::future_map(
        .x = dir_complete,
        .f = read_pmd_xml_all_files,
          .progress = TRUE
      )
    ) %>% 
    unnest(
      content
    ) %>% 
    unnest(data) %>% 
    select(
      dir_incomplete,
      file,
      package,
    ) %>% 
    distinct() %>% 
    filter(
      str_detect(package, pattern = "org\\.")
    ) %>% 
    mutate(
      versions = str_match_all(string = dir_incomplete, pattern = pattern_major_minor_versions),
      dir_complete = str_glue("{dir}/{dir_incomplete}")
    ) %>% 
    rowwise() %>% 
    mutate(
      major_version = versions[2],
      minor_version = versions[3]
    ) %>% 
    ungroup() %>% 
    mutate(
      across(
        .cols = ends_with(
          "version"
        ),
        .fns = as.integer
      )
    ) %>% 
    replace_na(
      list(
        minor_version = 0
      )
    ) 

}
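
# Usage sketch (not run), reading back the reports written by
# generate_ast_only_classes() into one row per (version, file, package):
#
# classes <- read_pmd_only_classes(
#   dir = "only_classes",
#   pattern_major_minor_versions = "0_([0-9]*)_([0-9]*)"
# )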
  
  



compare_comments <- function(comments, dir_diffs){
  
  # Incomplete helper: the comments and dir_diffs arguments are not used yet
  # and the diff file is hard-coded; only the Java file pairs of that diff are
  # returned for now.
  diff_pairs <- extract_diff_pairs_from_diff_file(file = "log-9_7-9_8.rds.diff") %>% 
    filter(
      str_detect(file_old, "\\.java$"),
      str_detect(file_new, "\\.java$")
    )

  diff_pairs
}



#' Kludge expressions
#'
#' @return a sorted character vector of regular expressions that flag kludge (self-admitted technical debt) comments
#' @export
#'
#' @examples
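#' # The vector is plain data, so this example runs as is:
#' expressions <- get_kludge_expressions()
#' length(expressions)
#' head(expressions)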
get_kludge_expressions <-  function(){
  expressions <- c(
    "hack",
    "retarded",
    "at a loss",
    "stupid",
    "remove this code",
    "ugly",
    "something ?[[:alnum:][:space:]]* gone wrong",
    "nuke",
    "is problematic",
    "problematic",
    "may cause problem",
    "hacky",
    "unknown why we ever experience this",
    "treat .*as a soft error",
    "silly",
    "workaround for bug",
    "workaround",
    "kludge",
    "fixme",
    "this isn't quite right",
    "trial and error",
    "this is wrong",
    "hang our heads in shame",
    "temporary solution",
    "temporary fix",
    "causes issue",
    "something bad is going on",
    "cause for issue",
    "this doesn't look right",
    "this does not look right",
    "is this next line safe",
    "temporary crutch",
    "this can be a mess",
    "this isn't very solid",
    "this is temporary and will go away",
    "is this line really safe",
    "there is a problem",
    "some fatal error",
    "something serious is wrong",
    "don't use this",
    "do not use this",
    "get rid of this",
    "doubt that this would work",
    "this is bs",
    "give up and go away",
    "risk of this blowing up",
    "just abandon it",
    "prolly a bug",
    "buggy",
    "probably a bug",
    "hope everything will work",
    "toss it",
    "barf",
    "something bad happened",
    "fix this crap",
    "yuck",
    "certainly buggy",
    "remove me before production",
    "remove this before production",
    "you can be unhappy now",
    "this is uncool",
    "bail out",
    "it doesn't work yet",
    "it does not work yet",
    "crap",
    "inconsistency",
    "abandon all hope",
    "kaboom",
    "is this .*needed",
    "shame",
    "nasty",
    "horrible",
    "purists would",
    "that's for a next refactoring",
    "(?:todo:|needs-more-work:)[ ]*find a way to",
    "would be better here",
    "necessary\\?",
    "this should use ?[[:alnum:][:space:]]* instead of",
    "not sure whether this belongs here",
    "needs to be updated",
    "this ?[[:alnum:][:space:]]* (?:is|seems) ?[[:alnum:][:space:]]* redundant",
    "needs to be updated",
    "(?:todo:|needs-more-work:)[ ]*what\\?",
    "a better implementation would be",
    "why isn't this done",
    "what is this trying to do\\?",
    "this will fail",
    "crappy",
    "we need a better",
    "should not be using ?[[:alnum:][:space:]]* here",
    "hardwired",
    "untested",
    "just a guess",
    "what do we want to use .*\\?",
    "check that this is correct",
    "why aren't we ?[[:alnum:][:space:]]* \\?",
    "(?:this|it) ?.* does not work",
    "this (?:is|seems) a temporary method",
    "should there really be ?[[:alnum:][:space:]]* \\?",
    "not exact, but close",
    "this (?:is|seems) ?[[:alnum:][:space:]]* expensive way to",
    "unused?",
    "is this a good way ?[[:alnum:][:space:]]*\\?",
    "this indicates a ?[[:alnum:][:space:]]* problem",
    "potential ?[[:alnum:][:space:]]* issue",
    "a ?[[:alnum:][:space:]]* better way of doing this would",
    "really should be",
    "not sure this is ?[[:alnum:][:space:]]* right",
    "fix this",
    "(?:todo:|needs-more-work:)[ ]*review",
    "(:?it|this) (?:is|seems) ?[[:alnum:][:space:]]* a part implementation",
    "(:?it|this) should ?[[:alnum:][:space:]]* be using",
    "(:?it|this) should ?[[:alnum:][:space:]]* be extended",
    "(:?it|this) (?:is|seems) ambiguous",
    "(:?it|this) is never executed ",
    "(:?it|this) ?[[:alnum:][:space:]]* shouldn't",
    "(:?it|this) ?[[:alnum:][:space:]]* should not",
    "(:?it|this) ?[[:alnum:][:space:]]* should be",
    "we ?[[:alnum:][:space:]]* don't want",
    "we ?[[:alnum:][:space:]]* do not want",
    "needs to be tidied up",
    "is (:?it|this) possible",
    "does this help",
    "brute force",
    "this error needs to be",
    "needs to be ?[[:alnum:][:space:]]* after stable release",
    "fragile",
    "(?:todo:|needs-more-work:)[ ]*check",
    "until code is reviewed",
    "test this",
    "magic numbers?",
    "needs to be fixed",
    "not the right location",
    "how come this happens",
    "enhance so that",
    "we shouldn't need this",
    "we should not need this",
    "(?:todo:|needs-more-work:)[ ]*document",
    "not strictly correct",
    "(:?it|this) should ?[[:alnum:][:space:]]* be ?[[:alnum:][:space:]]* the other way round",
    "we can remove",
    "this is already defined",
    "for a next refactoring",
    "hardcoded",
    "this creates a dependency",
    "the ?[[:alnum:][:space:]]* (?:is|seems) redundant",
    "it is planned to refactor",
    "(:?it|this) is planned to refactor",
    "why ?[[:alnum:][:space:]]* continue here as if nothing has gone wrong\\?",
    "bad smell",
    "(?:todo:|needs-more-work:)[ ]*constraints",
    "need to replace this",
    "do we ?[[:alnum:][:space:]]* need ?[[:alnum:][:space:]]* this",
    "once we go ?[[:alnum:][:space:]]* we won't need this",
    "does not work",
    "why (?:is|are) ?[[:alnum:][:space:]]* being ignored",
    "is done twice",
    "never get executed",
    "how to avoid",
    "(?:the|it|this) ?[[:alnum:][:space:]]* only works",
    "\\:\\-\\(",
    "double counted",
    "is (?:this|it) ?[[:alnum:][:space:]]* correct",
    "not sure ?[[:alnum:][:space:]]* (?:this|it) belongs here",
    "less elegant ?[[:alnum:][:space:]]* that works",
    "(?:this|it)?[[:alnum:][:space:]]* is in the wrong place",
    "what (?:shall|should) we do here",
    "please explain this",
    "this needs work",
    "prove that this works",
    "does not handle",
    "is ?[[:alnum:][:space:]]* the right thing here",
    "(?:todo:|needs-more-work:)[ ]*verify",
    "explain that this ?[[:alnum:][:space:]]* works also",
    "why there is not test",
    "(:?this|the|it) ?[[:alnum:][:space:]]* contains a ?[[:alnum:][:space:]]* error",
    "we ?[[:alnum:][:space:]]* (?:don't|do not) need (?:it|this) any more",
    "is (?:it|this|the) ?[[:alnum:][:space:]]* still useful",
    "is not so beautiful",
    "shouldn't we ?[[:alnum:][:space:]]* do something ?[[:alnum:][:space:]]* here",
    "(:?don't|do not) understand",
    "ouch",
    "(?:todo:|needs-more-work:)[ ]*complete this",
    "(:?doesn't|does not) take into account",
    "why is this code commented",
    "turn off after built",
    "in the future",
    "causes? ?[[:alnum:][:space:]]* errors?",
    "always check for",
    "make (?:it|this) configurable",
    "(?:todo|needs-more-work):[ ]*provide",
    "(?:todo|needs-more-work):[ ]*constructors",
    "shall not be",
    "necessary\\?",
    "dummy implementation",
    "at the moment",
    "wasteful",
    "pending",
    "awkward",
    "really should be",
    "how can this possibly be",
    "(?:todo|needs-more-work):[ ]*add",
    "(?:todo|needs-more-work):[ ]*handle",
    "(?:todo|needs-more-work):[ ]*should",
    "(?:todo|needs-more-work):[ ]*this ?[[:alnum:][:space:]]* should",
    "(?:todo|needs-more-work):[ ]*assumes",
    "(?:todo|needs-more-work):[ ]*remove",
    "(?:todo|needs-more-work):[ ]*make",
    "(?:todo|needs-more-work):[ ]*move",
    "(?:todo|needs-more-work):[ ]*fix",
    "(?:todo|needs-more-work):[ ]*not implemented",
    "(?:todo|needs-more-work):[ ]*define",
    "(?:todo|needs-more-work):[ ]*disable",
    "(?:todo|needs-more-work):[ ]*treat",
    "(?:todo|needs-more-work):[ ]*use",
    "(?:todo|needs-more-work):[ ]*what\\?",
    "(?:todo|needs-more-work):[ ]*why",
    "(?:todo|needs-more-work):[ ]*split",
    "(?:todo|needs-more-work):[ ]*replace",
    "(?:todo|needs-more-work):[ ]*improve",
    "(?:todo|needs-more-work):[ ]*this ?[[:alnum:][:space:]]* needs",
    "(?:todo|needs-more-work):[ ]*we ?[[:alnum:][:space:]]* need",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* algorithm",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* incomplete",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* unfinished",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* in some cases",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* not ?[[:alnum:][:space:]]* completely",
    "(?:todo|needs-more-work): ?[[:alnum:][:space:]]* this does",
    "what if we need to .*\\?",
    "don't think so",
    "no point in doing this",
    "it would be ?[[:alnum:][:space:]]* more efficient to",
    "i don't know",
    "i'd suggest that",
    "need to be reviewed",
    "need to be changed",
    "there ?[[:alnum:][:space:]]* ought to be a check",
    "there ?[[:alnum:][:space:]]* must be a check",
    "it works,? but",
    "it would be better to",
    "should match ?[[:alnum:][:space:]]* uml",
    "too primitive",
    "replace by ?[[:alnum:][:space:]]* elegant",
    "is it a bug",
    "it is unclear to me",
    "strange construction",
    "good enough",
    "this will ?[[:alnum:][:space:]]* not be perfect",
    "hopefully",
    "i [do not|don't] see ?[[:alnum:][:space:]]* reason for",
    "i [do not|don't] know",
    "can we remove",
    "we presume",
    "remove one of them",
    "this does exactly the same",
    "[do not|don't] ?[[:alnum:][:space:]]* understand the code",
    "this test fails",
    "not sure if its worth fixing",
    "take care",
    "too complex",
    "you should consider using",
    "does not do this right",
    "don't need to",
    "subclasses are encouraged",
    "trouble is when",
    "is it fair to use ?[[:alnum:][:space:]]* here?",
    "you decide\\.",
    "verify\\.",
    "however, it is included as",
    "it makes the assumption",
    "never print",
    "we strongly recommend that",
    "todo breaks",
    "as best we can",
    "todo confirm",
    "shouldn't happen",
    "this is ?[[:alnum:][:space:]]* dubious",
    "which is better?[[:space:]]?",
    "this don't work",
    "how do i test ?[[:alnum:][:space:]]*\\?",
    "at some point ?[[:alnum:][:space:]]* will need",
    "no ?[[:alnum:][:space:]]* test methods are provided",
    "figure out some sane way",
    "test by eyeball",
    "lots of inner classes",
    "lots of ?[[:alnum:][:space:]]* tight coupling",
    "no thought given to",
    "this works fine, but",
    "code written on the fly",
    "shouldn't be a problem unless",
    "if you want to restore ?[[:alnum:][:space:]]*, you must uncomment",
    "disguised switch statements",
    "weird",
    "future releases? will"
    
  ) %>% 
    sort()
  
}

acha_kludge <- function(x = "something really gone wrong"){
  
  expressions <- get_kludge_expressions()

  expressions <- str_glue("\\b{expressions}\\b")
  
  bateu <- str_detect(string = x, pattern = expressions)
  
  sum(bateu) %>% as.integer()
  
  
}
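
# Usage sketch: counts how many kludge expressions match a comment, after word
# boundaries are added around each pattern. "hack" is one of the expressions,
# so this should return at least 1:
#
# acha_kludge("this is a hack to keep the parser happy")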


extract_selected_comments <- function(
  path_to_comments = "comments_joda",
  output = "joda_selected_comments.rds"
){

  
  expressions <- get_kludge_expressions()
  
  expressions <- str_glue("\\b{expressions}\\b")


  future::plan(future::multiprocess)
  
  comments_raw <- list.files(path = path_to_comments ) %>% 
    enframe(
      value = "file_to_read"
    ) %>% 
    mutate(
      comments = furrr::future_map(.x = str_glue("{path_to_comments}/{file_to_read}"), .f = read_rds)
    ) %>% 
    unnest(
      comments
    ) %>%
    mutate(
      comment = stringi::stri_enc_toutf8(comment)
    ) %>% 
    mutate(
      comment = str_to_lower(comment)
    ) 

  
  comments_raw_contagem <- comments_raw %>%
    mutate(
      across(
        comment,
        .fns = ~str_remove_all(string = .x, pattern =  "\\n *?\\*")
      )
    ) %>% 
    mutate(
      string_length = str_length(comment) + 4,
      char_comment_ends = cumsum(string_length),
      char_comment_starts = lag(char_comment_ends) + 1
    ) %>% 
    replace_na(
      list(
        char_comment_starts = 1
      )
    )

  comments_flat <- str_flatten(string = comments_raw_contagem$comment, collapse = "####")
  
  localizacoes <- str_locate_all(
    string = comments_flat,
    pattern = expressions
  ) %>% 
    map(
      .f = as_tibble
    ) %>% 
    enframe() %>% 
    unnest(
      value
    ) %>% 
    mutate(
      bateu = 1 
    )
  
  
  comments_raw_contagem_joined <- localizacoes %>%
    fuzzyjoin::interval_left_join(
      comments_raw_contagem,
      by = c(
        "start" = "char_comment_starts",
        "end" = "char_comment_ends"
      ),
      type = "within"
    )
  

  write_rds(comments_raw_contagem_joined, output)
  
  
  
}
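
# Usage sketch (not run), assuming a folder of .rds files that each hold a
# `comment` column (the "comments_joda" folder mirrors the default):
#
# extract_selected_comments(
#   path_to_comments = "comments_joda",
#   output = "joda_selected_comments.rds"
# )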


extract_only_files_from_diff_pairs <- function(file = "log-9_7-9_8.rds.diff" ){
  
  diff_pairs <- extract_diff_pairs_from_diff_file(file) %>% 
    filter(
      str_detect(file_old, ".java$"),
      str_detect(file_new, ".java$"),
    ) %>% 
    select(
      file_old,
      file_new,
      mode,
      similarity_index
    )
  
}
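
# Usage sketch (not run); the diff file mirrors the default and must have been
# produced by generate_diffs_from_versions():
#
# pairs <- extract_only_files_from_diff_pairs("log-9_7-9_8.rds.diff")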


#' Create version comparison of comments
#'
#' @return counts of new and fixed kludge comments for each compared pair of versions
#' @export
#'
#' @examples
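#' # Minimal sketch, not run: it assumes the selected-comments file, the
#' # *.diff files and the only_classes reports were produced by the helpers
#' # in this file; the last pattern mirrors the read_pmd_only_classes() default.
#' \dontrun{
#' create_version_comparisons_comment(
#'   selected_comments_file = "selected_comments_3.rds",
#'   pattern_diff = "log",
#'   dir_only_classes = "only_classes",
#'   pattern_major_minor_versions_only_classes = "0_([0-9]*)_([0-9]*)"
#' )
#' }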
create_version_comparisons_comment <-  function(
  
  selected_comments_file = "selected_comments_3.rds",
  pattern_diff = "log",
  dir_only_classes = "only_classes",
  pattern_major_minor_versions_only_classes = NULL
    
){
  

  future::plan(future::multiprocess)
  
  selected_comments <- read_rds(selected_comments_file) %>% 
    ungroup() %>% 
    mutate(
      comment = str_trim(comment) %>% str_to_lower() 
    ) %>% 
    mutate(
      comment = str_replace_all(string = comment, pattern = "needs\\-more\\-work", replacement = "todo"  )
    )
  


  diff_pairs <- list.files(pattern = "{pattern_diff}-.*\\.diff" %>% str_glue()) %>% 
    enframe(   
      value = "file" 
    ) %>% 
    separate(
      col = file,
      into = c("nada", "version_old", "version_new"),
      sep = "-",
      remove = FALSE
    ) %>% 
    mutate(
      version_new = str_remove_all(version_new, pattern = "\\.rds\\.diff")
    ) %>% 
    separate(
      version_new,
      into = c("major_version_new", "minor_version_new"),
      sep = "_"
    ) %>% 
    separate(
      version_old,
      into = c("major_version_old", "minor_version_old"),
      sep = "_"
    ) %>% 
    mutate(
      across(
        .cols = matches("\\_version\\_"),
        .fns = as.integer
      )
    ) %>% 
    replace_na(
      list(
        minor_version_new = 0,
        minor_version_old = 0
      )
    ) %>% 
    mutate(
      content = furrr::future_map(.x = file, .f = extract_only_files_from_diff_pairs)
    ) %>% 
    unnest(content)
  
  
  pmd_only_classes <- 
    read_pmd_only_classes(
      dir = dir_only_classes,
      pattern_major_minor_versions = pattern_major_minor_versions_only_classes
    ) %>% 
    mutate(
      file = str_replace_all(file, pattern =  "\\\\", replacement = "/"  )
    )
    

  count_comments_per_version <-  selected_comments %>% 
    left_join(
      pmd_only_classes,
      by = c("file")
    ) %>% 
    filter(
      str_detect(package, pattern = "org\\.")
    ) %>% 
    group_by(
      major_version,
      minor_version
    ) %>% 
    summarise(
      n = n()
    ) 
  
  
  diff_pairs_with_package <- diff_pairs %>% 
    left_join(
      pmd_only_classes %>% select(file_old = file, package_old = package),
      by = c("file_old")
    ) %>% 
    left_join(
      pmd_only_classes %>% select(file_new = file, package_new = package),
      by = c("file_new")
    ) %>% 
    filter(
      str_detect(package_old, pattern = "org\\.") | str_detect(package_new, pattern = "org\\.")
    )
  
  
  
  diff_pairs_new <- diff_pairs_with_package %>% 
    filter(
      mode == "new"
    ) %>% 
    inner_join(
      selected_comments,
      by = c("file_new" = "file")
    ) %>% 
    group_by(
      across(
        .cols = matches("\\_version\\_")
      )
    ) %>% 
    summarise(
      new = n(),
      fixed = 0
    )

  
  diff_pairs_deleted <- diff_pairs_with_package %>% 
    filter(
      mode == "deleted"
    ) %>% 
    inner_join(
      selected_comments,
      by = c("file_old" = "file")
    ) %>% 
    group_by(
      across(
        .cols = matches("\\_version\\_")
      )
    ) %>% 
    summarise(
      fixed = n(),
      new = 0
    )
  

  join_comments_each_version <- function(
    major_version_old, 
    minor_version_old,
    major_version_new, 
    minor_version_new,
    file_old,
    file_new
  ){
    
    
    # if(major_version_new == 2 & minor_version_new == 0  ){
    #   browser()
    # }
    
    comments_old <- selected_comments %>% ungroup() %>% 
      filter(
        file == file_old
      ) %>%
      select(
        comment_old = comment
      )
    
    comments_new <- selected_comments %>% ungroup() %>% 
      filter(
        file == file_new
      ) %>% 
      select(
        comment_new = comment
      )
    
    comments <- comments_old %>% 
      full_join(
        comments_new,
        by = c("comment_old" = "comment_new"),
        keep = TRUE
      )
    
    
    comments    
    
    
  }
  
  

  diff_pairs_changed <- diff_pairs_with_package %>% 
    filter(
      mode == "changed"
    ) %>% 
    select(
      matches("\\_version\\_") | matches("file\\_")
    ) %>% 
    mutate(
      # comments = pmap(
      comments = furrr::future_pmap(
        .l = list(
          major_version_old = major_version_old, 
          minor_version_old = minor_version_old,
          major_version_new = major_version_new, 
          minor_version_new = minor_version_new,
          file_old = file_old,
          file_new = file_new
        ), 
        .f = join_comments_each_version,
        .progress = TRUE
      )
    ) %>% 
    unnest(comments)
  
  count_new_fixed_comments <-  diff_pairs_changed %>% 
    group_by(
      across(
        .cols = matches("\\_version\\_")
      ) 
    ) %>% 
    summarise(
      new = sum(is.na(comment_old)),
      fixed = sum(is.na(comment_new))
    ) %>% 
    bind_rows(
      diff_pairs_new
    ) %>% 
    bind_rows(
      diff_pairs_deleted
    ) %>%
    ungroup() %>% 
    group_by(
      across(
        .cols = matches("\\_version\\_")
      ) 
    ) %>% 
    summarise(
      new = sum(new),
      fixed = sum(fixed)
    ) %>% 
    left_join(
      count_comments_per_version,
      by = c("major_version_new" = "major_version", "minor_version_new" = "minor_version"   )
    ) %>% 
    ungroup() %>% 
    mutate(
      inc_n = if_else(row_number() == 1, 0, new - fixed),
      cum_inc_n = cumsum(inc_n),
      n =  first(n) + cum_inc_n
    ) %>% 
    select(
      -c(inc_n, cum_inc_n)
    )
  
  
  count_new_fixed_comments
  
  
}


unzip_files_from_folder <-  function(path_param = "C:/doutorado/junit" ){

  list.files(
    path_param,
    full.names = TRUE
  ) %>% 
    enframe(
      value = "path"
    ) %>%
    mutate(
      map(
        .x = path,        
        .f = ~unzip(zipfile = .x)
      )
    )
  
  
  
}
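
# Usage sketch (not run); extracts every archive found in the folder into the
# current working directory. The path is this function's default:
#
# unzip_files_from_folder("C:/doutorado/junit")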




rename_file_diff <- function(
  pattern_diff = "C:/doutorado/joda-time",
  prefix = "joda-time"
){
  
  content <- list.files(pattern = ".*\\.diff") %>% 
    enframe(
      value = "file"
    ) %>% 
    mutate(
      content = map(.x = file, .f = read_file)
    ) %>%
    mutate(
      pattern_detected = str_detect(content, pattern = pattern_diff )
    ) %>% 
    filter(
      pattern_detected
    ) %>% 
    rowwise() %>% 
    mutate(
      versions = str_match_all(string = content, pattern =  "diff --git.+?([0-9]+?\\.[0-9]+?).+?([0-9]+?\\.[0-9]+?).*"),
      version_new = versions[1,2],
      version_old = versions[1,3]
    ) %>% 
    select(
      -content
    ) %>% 
    mutate(
      version_new = str_replace(version_new, pattern = "\\.", replacement = "_" ),
      version_old = str_replace(version_old, pattern = "\\.", replacement = "_" ),
      new_name = str_glue("{prefix}-{version_new}-{version_old}.diff")
    )
  
  walk2(
    .x = content$file,
    .y = content$new_name,
    .f = ~file.copy(from = .x, to = .y)
  )
  
  
  
  
} 
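
# Usage sketch (not run); copies each matching *.diff file to a name of the
# form "<prefix>-<new>-<old>.diff", with the version numbers parsed from the
# first "diff --git" header. The values below are this function's defaults:
#
# rename_file_diff(
#   pattern_diff = "C:/doutorado/joda-time",
#   prefix = "joda-time"
# )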