Import annotations

Import annotations from project directory exported by WebAnno.

project_path <- file.path(getwd(), "../data-raw", "annotations", "2017-04-26")
annotations <- 
  get_annotations.WebAnno_project(project_path) %>% 
  ### A: could abstract this step into a function to replace annotation field names, and make the replacements to be parameters to the program, rather than hardwired
  mutate(annotation_type = stringr::str_replace(annotation_type, "Score", "sentiment"),
         annotation_type = stringr::str_replace(annotation_type, "Context", "topic"),
         annotation_type = stringr::str_replace(annotation_type, "hypothetical", "nature"),
         annotation_type = stringr::str_replace(annotation_type, "dLevelchange", "level_change")) %>% 
  select(doc_id, annotator, id_annotation, annotation_type, annotation_value, everything())

Apply any project-specific fix-ups here.

# Old tag names that got changes partway through tagging
annotations %<>% 
  mutate(annotation_value = 
           ifelse(annotation_type == "topic" & annotation_value == "Remit (monetary policy)", 
                  "Objective (monetary policy)", annotation_value))

# Shorter topic tagnames to facilitate ease of reading plots (note the tags then
# won't align with any tagsets automatically extracted from the WebAnno project 
# files, if I implement such automatic extraction in future)
annotations %<>% 
  mutate(annotation_value = 
           ifelse(annotation_type == "topic",
                         `Financial stability / policy / regulation / supervision` = "Financial",
                         `Competence / implementation / approach / staff` = "Implementation",
                         `Instruments (monetary policy)` = "Instruments (MP)",
                         `Objective (monetary policy)` = "Objectives (MP)"),

Impute tags to documents with tag field left blank, where blank implies a default tag under the annotation policy.

### A: consider wrapping the following imputation into the import function -- 
### could do at document-annotator level using typesystem.xmi, or at project 
### level using project structure files (the json?) or a user-provided taglist
### (?).

# In case there are documents which *no* annotator has explicitly tagged, we need an external list of documents shown to annotators.

# Get complete list of doc_ids, annotators, and annotation types (some may never
# appear in the annotations doc). NOTE: workaround to use the getter function,
# rather than take direct from attributes of `annotations` object, because this 
# information being deleted from `annotations` object by dplyr generics
doc_ids <- get_doc_ids_from_project_path(project_path)
# Impute missing tags to annotators who failed to apply any tags and so don't
# appear in the explicit imported tagset. Not yet implemented. We implicitly
# assume that all annotators have applied at least one tag to at least one
# document... or else we ignore them.
annotators <- get_annotators(annotations) # NOTE: will miss any totally silent annotators
annotation_types <- get_annotation_types(annotations) # NOTE: will miss any totally unused annnotation types

# Complete the annotations by adding missing document-annotator-annotion type 
# triples. Values are filled with NA, pending imputation of implicit values for 
# no tag / blank tags 
## A: Consider wrapping into a function (e.g. impute_blank_tags) once our tagging policy is standardised
annotations %<>% 
  complete(doc_id = doc_ids, 
           annotator = annotators, 
           annotation_type = annotation_types)

# Tagging instructions were to not apply tags beyond the topic tag if that was the default blank (implicitly "Not BoE") tag. Therefore tag defaults for other tags should only be applied to the subset with explicit BoE topic tags. We therefore split the data.frame to implement this split policy, and then recombined. 
annotations %<>% 
  group_by(doc_id, annotator) %>% 
  mutate(BoE_topic = 
           sum(annotation_type == "topic" & ! > 0) %>% 

### WARNING: Hardwired field names and default values
defaults_list_BoE <-   
  list(topic = "Not BoE",
       nature = NA, ### Q: Does the blank tag (complement of the named tags) have a unique interpretation for this field?
       sentiment = "Neutral",
       level_change = NA) ### Q: Is this going to be given a (blank) default? 

annotations_BoE <- 
  annotations %>% 
  filter(BoE_topic) %>% 
  spread(annotation_type, annotation_value) %>% 
  filter(! %>% # drop spurious rows that spread has introduced
  replace_na(defaults_list_BoE) %>% 
  gather_("annotation_type", "annotation_value", annotation_types)

annotations_non_BoE <- 
  annotations %>% 
  filter(!BoE_topic) %>% 
  mutate(annotation_value = ifelse(annotation_type == "topic" &, defaults_list_BoE$topic, annotation_value))

stopifnot( # check that where topic is blank, no other tags have been applied
  annotations_non_BoE %>% 
  filter(annotation_type != "topic") %>% 
  filter(! %>% 
  nrow() == 0)

annotations <- 
  bind_rows(annotations_BoE, annotations_non_BoE) %>% 
  arrange(doc_id, annotator, id_annotation, annotation_type, annotation_value)


Extract topics tags.

### A: see if can map over different annotation_types, rather than repeating this pattern for each
topics <- 
  annotations %>% 
  filter(annotation_type == "topic") # %>% 
  # select(doc_id, id_annotation, annotator, annotation_value)
topics %>%
  mutate(annotation_value = ifelse(annotation_value != "Not BoE", "BoE", annotation_value)) %>% 
  ggplot(aes(x = annotator)) + 
  geom_bar(aes(fill = annotation_value)) + 
  coord_flip() + 
  scale_x_discrete(name = NULL)
topics %>% 
  filter(annotation_value != "Not BoE") %>% 
  plot_marginal_tag_frequency(sort = "frequency")

Censor cells with very few co-occurrences, to make it visually easier to identify the main features of disagreement.

plot_coincidences_annotators_pairwise(topics, cutoff = 0, sort = "frequency")

"Nature" tags

natures <- 
  annotations %>% 
  filter(BoE_topic) %>% 
  filter(annotation_type == "nature")
plot_marginal_tag_frequency(natures, sort = "frequency")
crossplot_pairwise(natures, cutoff = 0, sort = "frequency")


Extract sentiment tags into a dataframe.

sentiments <- 
  annotations %>%
  filter(BoE_topic) %>% 
  filter(annotation_type == "sentiment") %>% 
  select(doc_id, id_annotation, annotator, annotation_value)

Map sentiments strings to numbers.

# WARNING! Hardwired strings
sentiment_map_chr_to_numeric_correct <- 
    ~annotation_value, ~score,
    "1. Strongly Negative", -3L,
    "2. Negative"         , -2L,
    "3. Weakly Negative"  , -1L,
    "4. Neutral"          ,  0L,
    "5. Weakly Positive"  , +1L,
    "6. Positive"         , +2L,
    "7. Strongly Positive", +3L,
    "Couldn't score"      , NA_integer_

sentiment_map_chr_to_numeric_20170426 <- 
    ~annotation_value, ~score,
    "1. Strongly Negative", -3L,
    "2. Negative"         , -2L,
    "3. Weakly Negative"  , -1L,
    "4. Weakly Positive"  , +1L,
    "5. Positive"         , +2L,
    "6. Strongly Positive", +3L,
    "7. Couldn't score"   , NA_integer_,
    "Neutral"             , 0

# Note this will lump unscored documents in with explicit "Couldn't score" tags, so to check for documents lacking a sentiment tag, I would need to do so with NA_character_ early on
sentiments %<>%
  left_join(sentiment_map_chr_to_numeric_20170426, by = "annotation_value") %>% 
  rename(annotation_value_str = annotation_value,
         annotation_value = score) # keep varname generic to ease reuse of other code (would become unnecessary if passing as function argument)

Mean by annotator

Treating sentiment scores as on an interval scale from -3 (strongly negative) to +3 (strongly positive).

sentiments %>% 
  group_by(annotator) %>% 
  summarise(mean = mean(as.numeric(annotation_value), na.rm = TRUE))

Association between annotators

annotators <- unique(sentiments$annotator)
sentiments %>% 
  select(-annotation_value_str) %>% 
  spread(annotator, annotation_value) %>% 
  select_(.dots = as.list(annotators)) %>%
  #mutate_all(as.integer) %>% 
  cor(method = "spearman",
      use = "pairwise.complete.obs") %>% 
  knitr::kable(digits = 2)
plot_marginal_tag_frequency(sentiments, sort = "value")
### Q: How to deal with missing value, as will commonly occur? These should only come up when the topic is not BoE.
crossplot_pairwise(sentiments, cutoff = 0, sort = "value")

Level/change tags

levels_or_changes <- 
  annotations %>%
  filter(BoE_topic) %>% 
  filter(annotation_type == "level/change")


Document-by-document view

topic_unique_annotator <- 
  topics %>% 
  select(doc_id, annotator, annotation_value) %>% 
  group_by(doc_id, annotator) %>% 
  mutate(document_annotator_seq = row_number(),
         annotator_unique = ifelse(document_annotator_seq == 1, annotator, paste0(annotator, "_2"))) %>% 
  ungroup() %>% 
  select(-annotator, -document_annotator_seq) %>% 
  spread(annotator_unique, annotation_value)

Agreement / disagreement by document

topics_per_doc <- 
  topics %>%
  group_by(doc_id) %>% 
  summarise(n_distinct_topic = n_distinct(annotation_value))

Complete agreement.

topics_per_doc %>% 
  filter(n_distinct_topic == 1)

Documents with some disagreement.

topic_unique_annotator %>% 
  left_join(topics_per_doc, "doc_id") %>% 
  filter(n_distinct_topic > 1)



A: adapt for checking whatever our final tagging rules are

Each annotator was supposed to apply one and only one sentiment annotation per document. List any violations.

### A: modify this so that one and only one tag where there is also a topic tag
sentiments %>% 
  complete(doc_id, annotator) %>% 
  group_by(doc_id, annotator) %>% 
  summarise(n = sum(! %>% 
  filter(n != 1) %>% 
  arrange(annotator, doc_id, n)

From here on we work with whatever sentiment annotations are available (not assuming a balanced panel).

Completeness and multiple topic tagging

Distribution of number of topics per annotator-document pair (if any).

topics_per_annotator_document <- 
  topics %>% 
  complete(doc_id, annotator) %>% 
  group_by(doc_id, annotator) %>% 
  summarise(n_annotations = sum(!

How often is each annotator applying a given number of topic tags to the same document?

ntopic_per_annotator_document_distribution <- 
  topics_per_annotator_document %>% 
  group_by(annotator, n_annotations) %>% 
  summarise(n = n())

Annotators who have failed to annotate some documents.

ntopic_per_annotator_document_distribution %>% 
  filter(n_annotations == 0) %>% 

Annotators who have applied multiple tags to some documents.

ntopic_per_annotator_document_distribution %>% 
  filter(n_annotations > 1) %>% 
  group_by(annotator) %>%  

From here on we work with whatever topic annotations are available (not assuming a balanced panel, or a unique topic per annotator-document pair).

petereckley/webannor documentation built on May 25, 2019, 12:48 a.m.