R/report_scoring_grouping_method.R

Defines functions compare_summaries coding_summary

Documented in coding_summary compare_summaries

coding_summary <- function(user_doc_id, allocation_type="all", restrict_double_coded=TRUE){
  #' Summary of coding of document allocation.
  #'
  #' Returns tibble with number of items of each variable value combination types coded in the document by the user.
  #'
  #' @param user_doc_id The user_doc_id(s) to summarize.
  #' @param allocation_type The allocation type ("training", "testing","coding", "all").
  #' @param restrict_double_coded Only summarize articles which have been coded more than once (in FALSE summarize all articles).
  #' @return Returns tibble with number of items of predefined types coded in the document by the user.
  #' @export
  user_docs<-durhamevp::get_allocation(user_doc_id = user_doc_id, allocation_type=allocation_type)

  user_docs<-dplyr::filter(user_docs, status=="COMPLETED")

  if(restrict_double_coded){
    doublecoded_docs<-user_docs %>%
      dplyr::group_by(document_id) %>%
      dplyr::tally() %>%
      dplyr::filter(n>1)

    user_docs<-dplyr::filter(user_docs, document_id %in% doublecoded_docs$document_id)
  }

  event_report <- durhamevp::get_event_report(user_doc_id=dplyr::pull(user_docs, "id"))
  #model_event_report <- durhamevp::get_event_report(model_event_report_id)
  tags<-durhamevp::get_tag(event_report_id = dplyr::pull(event_report, "id"))
  attributes<-durhamevp::get_attribute(tag_id = dplyr::pull(tags, "id"))

  event_report<-dplyr::left_join(event_report, user_docs, by=c("user_doc_id"="id"), suffix=c("event_report", "user_doc"))
  tags<-dplyr::left_join(tags, event_report, by=c("event_report_id"="id"), suffix=c(".tags", "event_report"))

  attributes<-dplyr::left_join(attributes, tags, by=c("tag_id"="id"), suffix=c("attributes", "tags"))

  user_doc_coding_counts<-user_docs %>%
    mutate(level="user_doc") %>%
    tidyr::gather(variable, value, article_type, geo_relevant, time_relevant, electoral_nature, violence_nature, electoralviolence_nature, legibility, recommend_qualitative) %>%
    dplyr::rename(user_doc_id=id) %>%
    dplyr::group_by(user_doc_id, user_id, document_id, level, variable, value) %>%
    dplyr::tally()

  event_report_coding_counts<-event_report %>%
    mutate(level="event_report") %>%
    tidyr::gather(variable, value, event_type, environment, event_start, event_end) %>%
    dplyr::group_by(user_doc_id, user_id, document_id, level, variable, value) %>%
    dplyr::tally()

  tag_coding_counts <- tags %>%
    mutate(level="tag") %>%
    dplyr::rename(value=tag_value) %>%
    unite(variable, tag_table, tag_variable) %>%
    group_by(user_doc_id, user_id, document_id, level, variable, value) %>%
    tally()

  attribute_coding_counts <- attributes %>%
    mutate(level="attribute") %>%
    dplyr::rename(value=attribute_value, variable=attribute) %>%
    group_by(user_doc_id, user_id, document_id, level, variable, value) %>%
    tally()


  coding_summary <- bind_rows(user_doc_coding_counts,
            event_report_coding_counts,
            tag_coding_counts,
            attribute_coding_counts)


  coding_summary
}

compare_summaries<- function(coding_summary){
  #'Compares summaries of coding by user.
  #'
  #'@param coding_summary A coding summary (usually generated by the \code{coding_summary} function.)
  #'@export
  #'


  coding_allocs<-coding_summary %>%
    ungroup() %>%
    group_by(user_doc_id, document_id, user_id) %>%
    summarize()

  # There are some duplicated coding allocations (9 on 20/9/2018) - need to find out why!
  # For now remove them
  coding_allocs <- coding_allocs[!duplicated(coding_allocs[,c("document_id", "user_id")]),]


  coder_pairs<- dplyr::full_join(coding_allocs,
                                          coding_allocs,
                                          by=c("document_id"),
                                          suffix=c(".case1", ".case2"))
  # create unique pairs of respondents

  coder_pairs <- coder_pairs %>%
    dplyr::group_by(document_id) %>%
    tidyr::expand(user_id.case1, user_id.case2) %>%
    dplyr::filter(user_id.case1 < user_id.case2) %>%
    tibble::rowid_to_column("pair_no")

  long_coder_pairs <- coder_pairs %>%
    tidyr::gather(which_user_id, user_id, user_id.case1, user_id.case2) %>%
    dplyr::left_join(coding_allocs, by = c("document_id", "user_id")) %>%
    dplyr::arrange(pair_no)

  long_coding<-  dplyr::right_join(coding_summary,
                     long_coder_pairs,
                     by = c("user_doc_id", "user_id", "document_id"))

  compare_res<-long_coding %>%
    ungroup() %>%
    dplyr::select(document_id, pair_no, which_user_id, n, level, variable, value) %>%
    spread(which_user_id, n, fill=0) %>%
    mutate(abs_diff=abs(user_id.case1-user_id.case2)) %>%
    dplyr::select(-user_id.case1, -user_id.case2) %>%
    left_join(coder_pairs, by=c("pair_no", "document_id"), suffix=c(".val", "")) %>%
    gather(which_case, user_id, user_id.case1, user_id.case2)

  compare_res
}
gidonc/durhamevp documentation built on April 8, 2022, 10:31 a.m.