In gandalfnicolas/SADCAT: Dictionary creation with stereotype content dictionaries

# knitr chunk defaults for this guide: collapse = TRUE and "#>" as the
# comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

This guide uses a slightly different procedure than the paper (Nicolas, Bai, & Fiske, 2020), with better performance for responses > 2 words.

To use, examine and understand the structure of the obituaries file, and format your own text responses in the same way. Then replace the appropriate variables to run your text responses through the code provided and obtain all the dictionary codings. Make sure all packages are properly installed and loaded.

Alternatively, this code also gives you direct access to the dictionaries in the first lines (e.g., SADCAT::All.steps_Dictionaries), which you can then transfer to your software or approach of choice for coding text.

Once all preprocessing is completed, please check that preprocessing completed as expected, and that your coding is not being particularly affected by irrelevant words that may be more common in your data than in the development data (some code is provided for this purpose after step 9).

#(remove pound sign from next lines to run if packages are not installed.)

# 1: install.packages("devtools") 
library(devtools)
#install_github("gandalfnicolas/SADCAT")
library(SADCAT)
#install.packages("quanteda")
library(quanteda) 
#install.packages("tidyverse")
library(tidyverse)
#install.packages("stringr")
library(stringr)
#install.packages("psych")
library(psych)
#install.packages("tm")
library(tm)
#install.packages("textstem")
library(textstem)

#2:  Load dictionaries: Using the most extensive dictionaries here. Switch to "SADCAT::Dictionaries_FT" for a smaller dictionary (i.e., it may miss some words related to the dimensions, but it may also make fewer coding mistakes when applying the dictionaries to domains different to the ones they were developed for originally).
Pre_Dictionaries <- SADCAT::All.steps_Dictionaries



#3: initial preprocess
# Build the working term column `values0` from `tv` in one pass: coerce to
# character, convert to UTF-8, then transform to lower case.
Pre_Dictionaries$values0 <- tolower(enc2utf8(as.character(Pre_Dictionaries$tv)))

#4: Lemmatize words. If lemmatization is taking too long on your data (step 8), you can opt to skip this step, or try to stem the responses instead. Please see quanteda language_stemmer documentation if you prefer to stem. (Note that, rarely, lemmatization may transform words in such a way that they are miscoded by the dictionaries, so always check your dictionary coding when the process is complete; if any words seem off, it may be due to lemmatization: I recommend removing them or preventing/reverting their lemmatization.)

# Put the dictionary terms into a tm corpus (one document per term) so that
# tm_map() can apply the lemmatizer to each document.
toks <- tm::VCorpus(tm::VectorSource(Pre_Dictionaries$values0))

corpusx <- tm::tm_map(toks, textstem::lemmatize_strings) #ignore warnings if they come up

# Applies delete_ending_Ss() to the space-separated words of each element of
# `x` and glues the words back together with single spaces. Per the original
# note, this deletes the last letter of each word in the text if that letter
# is an "s". If you prefer not to run this step, also delete it from the
# processing of data in step 8.
#
# x: a character vector or list of strings.
# Returns: the unlist()-ed result, one rebuilt string per element of `x`.
delete_ending_Ss2 <- function(x) {
  rebuild_string <- function(y) {
    word_sets <- strsplit(y, " ")
    # sapply() is kept on purpose: its simplified result is what gets pasted.
    paste(sapply(word_sets, delete_ending_Ss), collapse = " ")
  }
  unlist(lapply(x, rebuild_string))
}

# Strip trailing "s" from the lemmatized documents, then wrap each result as a
# plain text document again.
corpusx2 <- corpusx %>%
  tm_map(delete_ending_Ss2) %>%
  tm_map(PlainTextDocument)

# Pull the documents out as character data. After transposing, only the first
# column is kept.
# NOTE(review): presumably that first column is the document text itself;
# confirm against what tm_map() returns for these documents.
corpusx2 <- t(
  data.frame(text = sapply(corpusx2, as.character), stringsAsFactors = FALSE)
)[, 1]

# Tokenize, dropping numbers, punctuation and symbols.
corpusx2 <- quanteda::tokens(
  corpusx2,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)

# Collapse each document's tokens back into a single space-separated string and
# store the result as the cleaned dictionary term.
# vapply() over seq_along() replaces growing a matrix with rbind() inside a
# `1:length()` loop: it does not re-copy the result on every iteration, it
# yields character(0) instead of failing when the corpus is empty, and it
# returns a plain character vector rather than a one-column matrix.
string_new <- vapply(
  seq_along(corpusx2),
  function(i) paste(corpusx2[[i]], collapse = " "),
  character(1)
)
Pre_Dictionaries$values0 <- string_new



#5: transform into a format suitable for Quanteda dictionary coding. Here we create multiple dictionaries, for different constructs (e.g., Sociability, Morality, etc.). Dictionaries ending in "_dic" encode ALL the words for that construct. Dictionaries ending in "_hi" encode only high directional words for the construct (e.g., "friendly" for Sociability), while those ending in "_lo" encode low directional words (e.g., "unfriendly" for Sociability). Dictionaries ending in "_pos" or "_neg" encode positively and negatively valenced words for each construct, respectively (direction and valence often, but not always, correlate). See Nicolas, Bai, & Fiske, 2021 for more information.

# Returns `words` where `flag` equals 1 and "" elsewhere (an NA flag yields NA,
# as with any ifelse() test).
words_if_flagged <- function(flag, words) ifelse(flag == 1, words, "")

# One output column per dictionary: the cleaned term (values0) where the
# corresponding flag column is 1, "" otherwise. Only these columns are kept.
Pre_Dictionaries2 <- dplyr::transmute(
  Pre_Dictionaries,

  # all words per construct
  Sociability_dic = words_if_flagged(Sociability_dict, values0),
  Morality_dic = words_if_flagged(Morality_dict, values0),
  Ability_dic = words_if_flagged(Ability_dict, values0),
  Assertiveness_dic = words_if_flagged(Assertiveness_dict, values0),
  Status_dic = words_if_flagged(Status_dict, values0),
  Warmth_dic = words_if_flagged(Warmth_dict, values0),
  Competence_dic = words_if_flagged(Competence_dict, values0),
  Beliefs_dic = words_if_flagged(Beliefs_dict, values0),

  # high-direction words
  Sociability_dic_hi = words_if_flagged(Sociability_dict_hi, values0),
  Morality_dic_hi = words_if_flagged(Morality_dict_hi, values0),
  Ability_dic_hi = words_if_flagged(Ability_dict_hi, values0),
  Assertiveness_dic_hi = words_if_flagged(Assertiveness_dict_hi, values0),
  Status_dic_hi = words_if_flagged(Status_dict_hi, values0),
  Warmth_dic_hi = words_if_flagged(Warmth_dict_hi, values0),
  Competence_dic_hi = words_if_flagged(Competence_dict_hi, values0),
  Beliefs_dic_hi = words_if_flagged(Beliefs_dict_hi, values0),
  Health_dic_hi = words_if_flagged(Health_dict_hi, values0),
  Beauty_dic_hi = words_if_flagged(Beauty_dict_hi, values0),
  Deviance_dic_hi = words_if_flagged(Deviance_dict_hi, values0),

  # positively valenced words
  Sociability_dic_Pos = words_if_flagged(Sociability_dict_Pos, values0),
  Morality_dic_Pos = words_if_flagged(Morality_dict_Pos, values0),
  Ability_dic_Pos = words_if_flagged(Ability_dict_Pos, values0),
  Assertiveness_dic_Pos = words_if_flagged(Assertiveness_dict_Pos, values0),
  Status_dic_Pos = words_if_flagged(Status_dict_Pos, values0),
  Warmth_dic_Pos = words_if_flagged(Warmth_dict_Pos, values0),
  Competence_dic_Pos = words_if_flagged(Competence_dict_Pos, values0),
  Beliefs_dic_Pos = words_if_flagged(Beliefs_dict_Pos, values0),
  Health_dic_Pos = words_if_flagged(health_dict_Pos, values0),
  occupation_dic_Pos = words_if_flagged(occupation_dict_Pos, values0),
  emotions_dic_Pos = words_if_flagged(emotions_dict_Pos, values0),
  deviance_dic_Pos = words_if_flagged(deviance_dict_Pos, values0),
  socialgroups_dic_Pos = words_if_flagged(socialgroups_dict_Pos, values0),
  Geography_dic_Pos = words_if_flagged(Geography_dict_Pos, values0),
  Appearance_dic_Pos = words_if_flagged(Appearance_dict_Pos, values0),
  Other_dic_Pos = words_if_flagged(Other_dict_Pos, values0),

  # low-direction words
  Sociability_dic_lo = words_if_flagged(Sociability_dict_lo, values0),
  Morality_dic_lo = words_if_flagged(Morality_dict_lo, values0),
  Ability_dic_lo = words_if_flagged(Ability_dict_lo, values0),
  Assertiveness_dic_lo = words_if_flagged(Assertiveness_dict_lo, values0),
  Status_dic_lo = words_if_flagged(Status_dict_lo, values0),
  Warmth_dic_lo = words_if_flagged(Warmth_dict_lo, values0),
  Competence_dic_lo = words_if_flagged(Competence_dict_lo, values0),
  Beliefs_dic_lo = words_if_flagged(Beliefs_dict_lo, values0),
  Health_dic_lo = words_if_flagged(Health_dict_lo, values0),
  Beauty_dic_lo = words_if_flagged(Beauty_dict_lo, values0),
  Deviance_dic_lo = words_if_flagged(Deviance_dict_lo, values0),

  # negatively valenced words
  Sociability_dic_Neg = words_if_flagged(Sociability_dict_Neg, values0),
  Morality_dic_Neg = words_if_flagged(Morality_dict_Neg, values0),
  Ability_dic_Neg = words_if_flagged(Ability_dict_Neg, values0),
  Assertiveness_dic_Neg = words_if_flagged(Assertiveness_dict_Neg, values0),
  Status_dic_Neg = words_if_flagged(Status_dict_Neg, values0),
  Warmth_dic_Neg = words_if_flagged(Warmth_dict_Neg, values0),
  Competence_dic_Neg = words_if_flagged(Competence_dict_Neg, values0),
  Beliefs_dic_Neg = words_if_flagged(Beliefs_dict_Neg, values0),
  Health_dic_Neg = words_if_flagged(health_dict_Neg, values0),
  occupation_dic_Neg = words_if_flagged(occupation_dict_Neg, values0),
  emotions_dic_Neg = words_if_flagged(emotions_dict_Neg, values0),
  deviance_dic_Neg = words_if_flagged(deviance_dict_Neg, values0),
  socialgroups_dic_Neg = words_if_flagged(socialgroups_dict_Neg, values0),
  Geography_dic_Neg = words_if_flagged(Geography_dict_Neg, values0),
  Appearance_dic_Neg = words_if_flagged(Appearance_dict_Neg, values0),
  Other_dic_Neg = words_if_flagged(Other_dict_Neg, values0),

  # remaining topic dictionaries
  Health_dic = words_if_flagged(health_dict, values0),
  Family_dic = words_if_flagged(relative_dict, values0),
  Emotion_dic = words_if_flagged(emotions_dict, values0),
  Deviance_dic = words_if_flagged(deviance_dict, values0),
  Socialgroups_dic = words_if_flagged(socialgroups_dict, values0),
  Geography_dic = words_if_flagged(Geography_dict, values0),
  Occupation_dic = words_if_flagged(occupation_dict, values0),
  Other_dic = words_if_flagged(Other_dict, values0),
  Beauty_dic = words_if_flagged(beauty_dict, values0),
  Appearance_dic = words_if_flagged(Appearance_dict, values0)
)


# Turn each dictionary column into a character vector of its terms, dropping
# the "" placeholders and any NA entries in one pass.
# Iterating over the data frame itself (rather than `1:length()`) keeps the
# column names as list names and also works when there are zero columns.
Dicts_v2 <- lapply(
  Pre_Dictionaries2,
  function(terms) terms[!is.na(terms) & terms != ""]
)


#6: create quanteda dictionaries
Dicts_v2 <- quanteda::dictionary(Dicts_v2)



#7: Load your data. Here, for illustration, we are using the obituaries data from Nicolas, Bai, & Fiske (2021).
obituaries <- SADCAT::Obituary_data

# Text column: coerce to character, convert to UTF-8, then lower-case.
obituaries$Text <- tolower(enc2utf8(as.character(obituaries$Text)))

# Beliefs column is coerced to numeric.
obituaries$Beliefs <- as.numeric(obituaries$Beliefs)

#8: Lemmatize: if you skipped this step before, skip it here as well.

# One tm document per response.
toks <- VCorpus(VectorSource(obituaries$Text))

corpusxy <- toks %>%
  tm_map(removeWords, stopwords("english")) %>% #remove stopwords - optional
  tm_map(lemmatize_strings) #warnings may be expected

# Same trailing-"s" stripping that was applied to the dictionary terms.
corpusxy2 <- tm_map(corpusxy, delete_ending_Ss2)

toks <- tm_map(corpusxy2, PlainTextDocument)

# Pull the documents out as character data. After transposing, only the first
# column is kept.
# NOTE(review): presumably that first column is the document text itself;
# confirm against what tm_map() returns for these documents.
toks <- t(
  data.frame(text = sapply(toks, as.character), stringsAsFactors = FALSE)
)[, 1]

# Tokenize, dropping numbers, punctuation and symbols.
toks <- quanteda::tokens(
  toks,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)

# Rebuild one space-separated string per document from its tokens.
# vapply() over seq_along() replaces growing a matrix with rbind() inside a
# `1:length()` loop: it does not re-copy the result on every iteration, it
# yields character(0) instead of failing when there are no documents, and it
# returns a plain character vector rather than a one-column matrix.
string_lemma <- vapply(
  seq_along(toks),
  function(i) paste(toks[[i]], collapse = " "),
  character(1)
)

#9: Match target text to dictionaries & convert to document-feature data; bind data.
# Look each token up in the dictionaries (exclusive lookup, first level of
# keys), build the document-feature matrix of key counts, and export it as a
# data frame.
toks_dict_pre <- toks %>%
  tokens_lookup(
    dictionary = Dicts_v2,
    nested_scope = "dictionary",
    exclusive = TRUE,
    levels = 1
  ) %>%
  dfm() %>%
  convert(to = "data.frame")


# Bind the counts to the token totals, the lemmatized strings and the original
# data. The `ntoken(toks)` expression is left exactly as written because later
# steps refer to the resulting column by that name.
toks_dict <- cbind(
  toks_dict_pre,
  ntoken(toks), #raw count
  ntype(toks), #distinct count
  string_lemma,
  obituaries
)


#### Optional - examine the words coded into the dictionaries

#Shows the words dictionary matching
# Keyword-in-context matches of the dictionaries in the text, reduced to the
# matched dictionary (pattern) and the matched word (keyword).
Dictionaries_per_word <- kwic(toks, Dicts_v2)[, c("pattern", "keyword")]

# extracts counts for specific dictionaries (change dictionary name to examine different dimensions)
Dictionaries_per_word_top <- table(
  dplyr::filter(Dictionaries_per_word, pattern == "Deviance_dic")$keyword
)

#then View(Dictionaries_per_word_top) and sort to find top words.
View(Dictionaries_per_word_top)
###


#10: Calculate Valence for responses.

# Cleaned copy of the response text, used only for the valence lookup.
toks_dict$valuex <- SADCAT::clean_large_text(obituaries$Text)

toksval <- quanteda::tokens(toks_dict$valuex)

# Count Lexicoder (LSD2015) categories per document, then fold the negated
# categories into their effective valence: negated positives count as negative
# and negated negatives count as positive. Val_lexicoder is the resulting
# positive-minus-negative count.
toks_val_pre <- toksval %>%
  tokens_lookup(
    data_dictionary_LSD2015,
    nested_scope = "dictionary",
    exclusive = TRUE
  ) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  dplyr::mutate(
    negative = negative + neg_positive,
    positive = positive + neg_negative,
    Val_lexicoder = positive - negative
  )

toks_dict <- cbind(toks_dict, Val_lexicoder = toks_val_pre$Val_lexicoder)


#11: Get percentage of words in the specific text that belong to each dictionary
# For every dictionary count column from sociability_dic through
# appearance_dic, add a "<column>_percent" column: the count divided by the
# document's token total (the `ntoken(toks)` column).
# across() replaces the superseded mutate_at(vars(...)) and produces the same
# "<column>_percent" names.
# NOTE: a document with zero tokens gets NaN here (0 / 0).
toks_dict <- toks_dict %>%
  mutate(across(
    sociability_dic:appearance_dic,
    list(percent = ~ .x / `ntoken(toks)`)
  ))


#12: Get binary indicator: is dictionary in text or not?
# For every dictionary count column from sociability_dic through
# appearance_dic, add a "<column>_binary" column: 1 if the count is above
# zero, 0 otherwise.
# across() replaces the superseded mutate_at(vars(...)) and produces the same
# "<column>_binary" names.
toks_dict <- toks_dict %>%
  mutate(across(
    sociability_dic:appearance_dic,
    list(binary = ~ ifelse(.x > 0, 1, 0))
  ))


#13: Create direction indicators: Positive scores indicate text is more high (vs. low) directional, negative scores indicate the text is more low (vs. high) directional, for each dimension tested. This code can be modified to create other indicators (e.g., for valence, subtract _neg from _pos variables). Note that the indicators here are computed from the percent variables; consider using the binary variables instead for shorter texts.

# Adds "<Dimension>_dirx" (hi percent minus lo percent) and "<Dimension>_valx"
# (pos percent minus neg percent) columns, in the order listed below.
toks_dict <- local({
  # Output columns use the capitalized dimension name, e.g. "Sociability_dirx".
  capitalize <- function(s) paste0(toupper(substring(s, 1, 1)), substring(s, 2))

  # Appends, for each dimension d:
  #   <D><suffix> = d_dic_<plus>_percent - d_dic_<minus>_percent
  add_contrasts <- function(df, dims, plus, minus, suffix) {
    for (dim_name in dims) {
      df[[paste0(capitalize(dim_name), suffix)]] <-
        df[[paste0(dim_name, "_dic_", plus, "_percent")]] -
        df[[paste0(dim_name, "_dic_", minus, "_percent")]]
    }
    df
  }

  dirx_dims <- c(
    "sociability", "morality", "ability", "assertiveness", "status",
    "warmth", "competence", "beliefs", "health", "beauty", "deviance"
  )
  valx_dims <- c(
    "sociability", "morality", "ability", "assertiveness", "status",
    "warmth", "competence", "beliefs", "health", "occupation", "emotions",
    "deviance", "socialgroups", "geography", "appearance", "other"
  )

  with_direction <- add_contrasts(toks_dict, dirx_dims, "hi", "lo", "_dirx")
  add_contrasts(with_direction, valx_dims, "pos", "neg", "_valx")
})

#14: Calculate No match: For each response, does it fail to be coded into any dictionary?

# Sum the selected binary indicators per response; None2y is 1 when none of
# them is set and 0 otherwise. The selection skips the columns that sit
# between status and beliefs, and between other and appearance.
toks_dictnonepre <- toks_dict %>%
  dplyr::select(
    sociability_dic_binary:status_dic_binary,
    beliefs_dic_binary,
    health_dic_binary:other_dic_binary,
    appearance_dic_binary
  ) %>%
  dplyr::mutate(
    Sumdic = rowSums(.),
    None2y = ifelse(Sumdic > 0, 0, 1)
  )

# Copy the flag back onto the main data.
toks_dict$None2y <- toks_dictnonepre$None2y

#15: Write data
# Save the coded data as a CSV file in the current working directory.
write.csv(x = toks_dict, file = "obituaries dict.csv")


#### Variables:
# _dic variables indicate how many words in the text were coded into that dimension, regardless of whether the response is high or low (e.g., for sociability, both friendly and unfriendly are counted)
# _dic_hi and _dic_lo variables indicate how many words in the text were coded as being high and low (respectively) on the dimension (if the word is not in the dimension, it is not counted here)
# _dic_percent is simply the _dic variable divided by the total number of words in the text (i.e., the proportion, from 0 to 1, of words in the text coded into the dimension)
# _dic_binary indicates whether the text includes the dimension at all, 0 = No, 1 = yes
# _dirx indicates the direction of the dimension in the text. It is calculated as _dic_hi_percent - _dic_lo_percent, thus higher scores indicate more high-directional words in the text for that dimension, and lower scores indicate more low-directional words in the text for that dimension. A score of 0 indicates either an equal number of high and low words in the dimension or no words related to the dimension (thus no directional bias).

#**** Please review the paper to understand how the different dimensions are defined and constructed; many overlap.


####example test

# Regress the Sociability column on the dictionary-based sociability direction
# score and print the model summary.
Sociability_mod <- lm(
  formula = Sociability ~ Sociability_dirx,
  data = toks_dict
)
summary(Sociability_mod)