This tutorial accompanies the SEMDIAL poster abstract on rezonateR, showing how all the analyses reported there were coded.
```r
library(tidyverse)
library(rezonateR)

sbc007_qa <- importRez("sbc007_adj_pair.rez",
                       concatFields = c("text", "transcript"))
```
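The imported object bundles each annotation layer into its own data frame; the rest of this tutorial accesses them as `sbc007_qa$unitDF`, `sbc007_qa$tokenDF`, `sbc007_qa$cardDF` and `sbc007_qa$stackDF`. A quick sanity check on the import (a minimal sketch; the exact columns depend on the `.rez` file):

```r
# Peek at two of the component data frames used throughout this tutorial
head(sbc007_qa$unitDF)  # one row per intonation unit
head(sbc007_qa$tokenDF) # one row per token
```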
Several annotations were made in the dataset used for this study. First, all question-answer sequences in the text were marked as stacks. This includes anything formally marked as a question by lexico-grammatical means, including formulaic expressions like *you know?* and *really?*. Statements that clearly function as questions are also included when the speaker is K- (i.e. the less knowledgeable party); I exclude, however, cases where a K+ speaker produces a grammatical declarative with rising intonation, since those seem to carry lower expectations of a response.
Here is an example of how a question and an answer may be marked as a stack:
{width="168"}
Each stack also carries a `seqType` variable giving the specific type of question; the stack above, for instance, is tagged as a ritualised expression of surprise.
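Once the file is imported, these annotations can be read straight off the stack DF. As a quick check (a minimal sketch using the object imported above; `seqType` is a column of `sbc007_qa$stackDF$QA`, as used in the plotting code later on):

```r
# Frequency of each annotated question type across the Q-A stacks
table(sbc007_qa$stackDF$QA$seqType)
```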
Resonance was marked throughout the text, including within question-answer pairs:
{width="252"}
Finally, each instance of the words *yeah* and *mhm* was manually annotated in Rezonator for several features, among them epistemic hierarchy, responsiveness, sequential position, and stance type; these feed into the clustering analysis below.
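After import, these manual annotations appear as columns of the token DF. A quick way to eyeball them (a sketch, assuming the column names used in the clustering code later in this tutorial):

```r
# Inspect the manually annotated features on the yeah/mhm tokens
sbc007_qa$tokenDF %>%
  filter(tolower(text) %in% c("yeah", "mhm")) %>%
  select(text, hierarchy, responsiveness, seqPosition, stanceType) %>%
  head()
```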
Let's first add participant (speaker label) information to the token and card DFs:
sbc007_qa = sbc007_qa %>% addFieldForeign("token", "", "unit", "", "unit", "participant", "participant") sbc007_qa = sbc007_qa %>% addFieldForeign("card", "QA", "unit", "", "unit", "participant", "participant")
We can now determine whether Mary or Alice is the asker of each Q-A sequence:
```r
# The asker of each Q-A sequence (chain) is the participant of the
# first card tagged as a question
sbc007_qa_asker_byseq = sbc007_qa$cardDF$QA %>%
  group_by(chain) %>%
  summarise(asker = participant[which(actionType == "question")[1]]) %>%
  ungroup()
```
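The result is an ordinary summary table with one row per Q-A sequence, keyed by `chain`; a quick look before joining it back:

```r
# One row per Q-A sequence: the chain ID plus the asker's speaker label
head(sbc007_qa_asker_byseq)
```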
Now let's put this information into the stack DF:
```r
sbc007_qa$stackDF$QA = sbc007_qa$stackDF$QA %>%
  rez_left_join(sbc007_qa_asker_byseq,
                by = c(id = "chain"), fieldaccess = "flex")
```
Next, we find the number of resonances between each question and its answer:
```r
# Count resonances linking question units to answer units within a stack
getResonancesBetweenQA = function(actionType, unit){
  findResonancesBetween(sbc007_qa,
                        list(unit[actionType == "question"]),
                        list(unit[actionType == "answer"]))
}
sbc007_qa = addUnitSeq(sbc007_qa, "rez", "default")
sbc007_countrez_qa = sbc007_qa$cardDF$QA %>%
  group_by(chain) %>%
  summarise(resonances = getResonancesBetweenQA(actionType, unit)) %>%
  ungroup()
```
Again, we put this information into the stack DF:
```r
sbc007_qa$stackDF$QA = sbc007_qa$stackDF$QA %>%
  rez_left_join(sbc007_countrez_qa,
                by = c(id = "chain"), fieldaccess = "flex")
```
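With the asker and resonance counts both in place, it is worth glancing at the distributions before plotting (a minimal check):

```r
# How many resonances per Q-A sequence, and how many questions per asker?
summary(sbc007_qa$stackDF$QA$resonances)
table(sbc007_qa$stackDF$QA$asker)
```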
Now we can look at resonances by question type:
```r
sbc007_qa$stackDF$QA %>%
  filter(resonances > 0) %>%
  ggplot(aes(y = resonances, x = seqType)) +
  geom_col() +
  scale_x_discrete(labels = c("Assessment\nsolicitation",
                              "Candidate\nco-construction",
                              "Confirmation\nrequest",
                              "Info-seeking\nquestion"),
                   name = "Type of Q-A sequence") +
  scale_y_continuous(breaks = seq(0, 8, 2))
```
For the word clouds, we first carry out a few preparatory steps:
```r
library(ggwordcloud)

# Add QA stack sequence numbers to the token DF
sbc007_qa = stackToToken(sbc007_qa, "QA")
# Add seqType to the token DF as well
sbc007_qa = addFieldForeign(sbc007_qa, "token", "", "stack", "QA",
                            "QASeq", "seqType", sourceKeyName = "stackSeq")
# Determine which tokens are actual words, as opposed to e.g. punctuation
sbc007_qa = sbc007_qa %>% addIsWordField(kind == "word")
```
Now we can look at the word clouds, highlighting interjections:
```r
# Get the commonest part of speech of a word in the text
getCommonest = function(char){
  summ = as.factor(char) %>% summary()
  names(summ[summ == max(summ)][1])
}

sbc007_qa$tokenDF %>%
  filter(!is.na(seqType), kind == "word") %>%
  mutate(seqType = as.factor(seqType),
         textLower = as.factor(tolower(text))) %>%
  group_by(seqType, textLower) %>%
  summarise(POS_dft = getCommonest(POS_dft), n = n()) %>%
  # Force the interjection tag on common response tokens;
  # keep the commonest tag for everything else
  mutate(POS_dft = case_when(
    textLower %in% c("yeah", "yes", "mhm", "right", "unhunh") ~ "ITJ",
    T ~ POS_dft)) %>%
  ungroup() %>%
  filter(seqType %in% c("info", "disbelief", "recog_solicit")) %>%
  ggplot(aes(label = textLower, size = n, col = POS_dft)) +
  geom_text_wordcloud() +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_x_continuous(expand = c(0, 0)) +
  facet_wrap("seqType",
             labeller = as_labeller(c(disbelief = "Ritualised\ndisbelief",
                                      info = "Information\nseeking",
                                      recog_solicit = "Solicitation of\nrecognition")))
```
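As a side note, the `getCommonest()` helper defined above simply returns the name of the most frequent value in a character vector; a toy call illustrates:

```r
# Ties are broken by taking the first level in alphabetical order
getCommonest(c("ITJ", "ITJ", "VERB"))
#> [1] "ITJ"
```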
This produces the Gantt chart shown in the abstract:
```r
# Getting participant information in stacks:
sbc007_qa = sbc007_qa %>%
  addFieldForeign("card", "Turn", "unit", "", "unit",
                  "participant", "participant")
sbc007_qa = sbc007_qa %>%
  addFieldForeign("stack", "Turn", "card", "Turn", "card",
                  "participant", "participant",
                  type = "complex",
                  complexAction = function(x) paste0(unique(x), collapse = ","))

getGantt(sbc007_qa, x = "token", obj = "stack", stacking = "Turn") +
  theme(axis.title = element_blank()) +
  # Overlay each yeah/mhm token as a "Y"/"M" point on the Gantt chart
  geom_point(data = sbc007_qa$tokenDF %>%
               filter(tolower(text) %in% c("yeah", "mhm")) %>%
               mutate(docTokenSeqLast = docTokenSeq,
                      docTokenSeqFirst = docTokenSeq),
             aes(y = participant, x = docTokenSeq,
                 col = tolower(text), shape = tolower(text)),
             size = 3) +
  theme(legend.position = "none") +
  scale_shape_manual(breaks = c("mhm", "yeah"), values = c("M", "Y"))
```
This code derives the features of the yeah/mhm dataset that were not manually annotated:
```r
# Get lowercase words
sbc007_qa$tokenDF <- sbc007_qa$tokenDF %>%
  rez_mutate(textLower = tolower(text))

# Get unit and turn lengths in the token DF
sbc007_qa$tokenDF = sbc007_qa$tokenDF %>%
  rez_group_by(unit) %>%
  rez_mutate(unitLen = sum(isWord)) %>%
  rez_ungroup()
sbc007_qa = stackToToken(sbc007_qa, "Turn")
sbc007_qa$tokenDF = sbc007_qa$tokenDF %>%
  rez_group_by(TurnSeq) %>%
  rez_mutate(TurnLen = sum(isWord)) %>%
  rez_ungroup()

# Get positions of IUs within turns and words within IUs,
# counting from the front (order) and from the back (back)
sbc007_qa$tokenDF = sbc007_qa$tokenDF %>%
  rez_mutate(TurnOrder = getOrderFromSeq(TurnSeq, isWord = isWord))
sbc007_qa$tokenDF = sbc007_qa$tokenDF %>%
  rez_mutate(unitBack = case_when(isWord ~ unitLen - wordOrder + 1,
                                  T ~ 0))
sbc007_qa$unitDF <- sbc007_qa$unitDF %>%
  rez_mutate(TurnOrder = getOrderFromSeq(TurnSeq)) %>%
  rez_group_by(TurnSeq) %>%
  rez_mutate(TurnLen = max(TurnOrder)) %>%
  rez_ungroup() %>%
  rez_mutate(TurnBack = TurnLen - TurnOrder + 1)

# Getting the positions of units within turns to the token DF:
sbc007_qa = sbc007_qa %>%
  addFieldForeign("token", "", "unit", "", "unitSeq",
                  "unitTurnOrder", "TurnOrder", sourceKeyName = "unitSeq") %>%
  addFieldForeign("token", "", "unit", "", "unitSeq",
                  "unitTurnLen", "TurnLen", sourceKeyName = "unitSeq") %>%
  addFieldForeign("token", "", "unit", "", "unitSeq",
                  "unitTurnBack", "TurnBack", sourceKeyName = "unitSeq")
```
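A spot check that the derived positional features look sensible (a sketch, assuming the token DF behaves like an ordinary tibble under dplyr verbs, as elsewhere in this tutorial):

```r
# Positions and lengths for a few word tokens
sbc007_qa$tokenDF %>%
  filter(isWord) %>%
  select(text, unitLen, TurnLen, unitBack, unitTurnOrder, unitTurnBack) %>%
  head()
```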
Next, we convert the annotated and derived properties into scaled numeric vectors for clustering:
```r
yeah_mhm_sbc007 = sbc007_qa$tokenDF %>%
  filter(textLower %in% c("yeah", "mhm"))

# Each feature is scaled so that it contributes comparably
# to the Manhattan distances below
yeah_mhm_sbc007_vector = data.frame(
  hierarchy = case_when(
    yeah_mhm_sbc007$hierarchy == "K_plus" ~ 1,
    yeah_mhm_sbc007$hierarchy == "K_minus" ~ 0,
    T ~ .5),
  responsiveness = case_when(
    yeah_mhm_sbc007$responsiveness == "otherElicited" ~ 1,
    yeah_mhm_sbc007$responsiveness == "self" ~ 0,
    T ~ .5),
  seqPosition_second = .5 * (yeah_mhm_sbc007$seqPosition == "second"),
  seqPosition_third = .5 * (yeah_mhm_sbc007$seqPosition == "third"),
  seqPosition_other = .5 * (yeah_mhm_sbc007$seqPosition == "other"),
  stanceType_affiliation = .5 * (yeah_mhm_sbc007$stanceType == "affiliation"),
  stanceType_understanding = .5 * (yeah_mhm_sbc007$stanceType == "understanding"),
  stanceType_other = .5 * (yeah_mhm_sbc007$stanceType == "other"),
  wordOrder = yeah_mhm_sbc007$wordOrder / max(yeah_mhm_sbc007$wordOrder),
  unitBack = as.integer(yeah_mhm_sbc007$unitBack) /
    max(as.integer(yeah_mhm_sbc007$unitBack)),
  unitTurnOrder = log(yeah_mhm_sbc007$unitTurnOrder) /
    max(log(yeah_mhm_sbc007$unitTurnOrder)),
  unitTurnBack = log(yeah_mhm_sbc007$unitTurnBack) /
    max(log(yeah_mhm_sbc007$unitTurnBack))
)

# Give each token a readable row name like yeah_1, mhm_3, ...
yeah_mhm_sbc007 = yeah_mhm_sbc007 %>%
  group_by(textLower) %>%
  mutate(idNew = paste0(textLower, "_", 1:n())) %>%
  ungroup()
rownames(yeah_mhm_sbc007_vector) = yeah_mhm_sbc007 %>% pull(idNew)
```
Now we run hierarchical clustering and cut the tree at two levels of granularity:
```r
# Manhattan distances between tokens, then hierarchical clustering
distMatrix = dist(yeah_mhm_sbc007_vector, method = "manhattan")
yeah_mhm_clusters = hclust(distMatrix)

# Cut the tree into five fine and two coarse clusters, and label them
yeah_mhm_sbc007 = yeah_mhm_sbc007 %>%
  mutate(clusters_5 = cutree(yeah_mhm_clusters, k = 5)) %>%
  mutate(clusters_2 = cutree(yeah_mhm_clusters, k = 2)) %>%
  mutate(category_fine = case_when(
    clusters_5 == 1 ~ "Self",
    clusters_5 == 2 ~ "Backchannel",
    clusters_5 == 3 ~ "NonInfoSPP",
    clusters_5 == 4 ~ "PostExp",
    clusters_5 == 5 ~ "InfoSPP") %>%
      factor(levels = c("Backchannel", "Self", "NonInfoSPP",
                        "PostExp", "InfoSPP"))) %>%
  mutate(category_coarse = case_when(
    clusters_2 == 1 ~ "Regulatory",
    clusters_2 == 2 ~ "Substantive")) %>%
  mutate(path = paste0(category_coarse, "-", category_fine),
         base = 1)
```
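Before committing to these cluster labels, it can help to eyeball the dendrogram; base R plotting suffices (a minimal sketch):

```r
# Plot the tree and outline the five fine-grained clusters
plot(yeah_mhm_clusters, cex = 0.5, main = "Clustering of yeah/mhm tokens")
rect.hclust(yeah_mhm_clusters, k = 5, border = "red")
```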
Finally, we draw the Sankey diagram:
```r
library(ggforce)

# Reshape the counts into the long format gather_set_data() expects
yeah_mhm_all_ggforce = yeah_mhm_sbc007 %>%
  group_by(category_coarse, category_fine, textLower) %>%
  summarise(value = n()) %>%
  ungroup() %>%
  gather_set_data(1:3, id_name = "id")

ggplot(yeah_mhm_all_ggforce, aes(x, id = id, split = y, value = value)) +
  geom_parallel_sets(aes(fill = textLower), alpha = 0.3, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.1) +
  geom_parallel_sets_labels(colour = 'black', angle = 0,
                            nudge_x = 0.05, hjust = 0) +
  coord_cartesian(xlim = c(1, 3.3)) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank())
```