## inst/doc/corpustools.R

## ---- include=FALSE-----------------------------------------------------------
options(digits=3)
library(knitr)

## -----------------------------------------------------------------------------
library(corpustools)

## ---- eval=F------------------------------------------------------------------
#  ?tcorpus

## -----------------------------------------------------------------------------
colnames(sotu_texts)

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text')

## -----------------------------------------------------------------------------
tc

## -----------------------------------------------------------------------------
## first 6 rows (head) of tokens and meta data.tables
head(tc$tokens)   
head(tc$meta)

## -----------------------------------------------------------------------------
tc = tokens_to_tcorpus(corenlp_tokens, doc_col = 'doc_id',
                       sentence_col = 'sentence', token_id_col = 'id')
tc

## -----------------------------------------------------------------------------
tc = create_tcorpus('This is an example')

tc$set('new_column', "any value")
tc$tokens
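
## The meta data.table can be modified in the same way. A sketch, assuming
## the $set_meta() method mirrors the value/subset semantics of $set():
tc$set_meta('new_meta_column', 'any meta value')
tc$meta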

## -----------------------------------------------------------------------------
tc$set('new_column', toupper(token))

## -----------------------------------------------------------------------------
tc$set('new_column', tolower(token), token_id <= 2)

## -----------------------------------------------------------------------------
tc$set_name('new_column', 'token2')

## -----------------------------------------------------------------------------
tc$delete_columns('token2')

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column='id', split_sentences = T)
tc2 = subset(tc, sentence==1)

## number of tokens in full corpus
tc$n
## number of tokens in first sentence
tc2$n

## -----------------------------------------------------------------------------
tc2 = subset(tc, subset_meta = president == 'Barack Obama')

## number of tokens in President Obama's documents
tc2$n

## -----------------------------------------------------------------------------
tc$subset(sentence == 1, subset_meta = president == 'Barack Obama')
tc$n
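
## sanity check (sketch): the remaining tokens should all be from sentence 1
## of Obama documents
unique(tc$meta$president)
range(tc$tokens$sentence)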

## ---- message=F---------------------------------------------------------------
d = data.frame(doc_id = paste('doc', 1:3),
               text = c('I am a text','I am also text', 'Ich bin ein Berliner'),
               date = as.POSIXct(c('2010-01-01','2010-01-02','2010-01-03')))

# documents 1 and 2 are duplicates, and the first is deleted
tc = create_tcorpus(d)
tc$deduplicate(feature='token', date_col = 'date', similarity = 0.75, verbose = F)
tc$meta$doc_id     

# documents 1 and 2 are duplicates, and the last is deleted
tc = create_tcorpus(d)
tc$deduplicate(feature='token', date_col = 'date', similarity = 0.75, keep = 'last', verbose = F)
tc$meta$doc_id

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text')

tc$preprocess(use_stemming = T, remove_stopwords=T)
tc$tokens
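
## preprocess() can also create ngram features. A sketch, assuming the
## ngrams argument:
tc$preprocess(column = 'token', new_column = 'bigram',
              remove_stopwords = T, use_stemming = T, ngrams = 2)
head(tc$tokens)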

## ---- eval=F------------------------------------------------------------------
#  tc = create_tcorpus("This is a quick example", udpipe_model='english-ewt')
#  tc$tokens   ## output not shown in this vignette

## ---- eval=F------------------------------------------------------------------
#  tc$feats_to_columns(c('Tense','Number','Person'))
#  

## ---- eval=F------------------------------------------------------------------
#  tc = create_tcorpus(sotu_texts, doc_column = 'id', udpipe_model='english-ewt', udpipe_cores=4)

## ---- eval=F------------------------------------------------------------------
#  ## for the sake of clarity, recreate the corpus and preprocess it
#  tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text')
#  tc$preprocess(use_stemming = T, remove_stopwords=T)
#  
#  tc$feature_subset('feature', !token_id == 5)

## ---- eval=F------------------------------------------------------------------
#  tc$feature_subset('feature', min_freq=10)

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column='id', split_sentences=T)

tc$preprocess(column = 'token', new_column = 'feature',
              remove_stopwords=T, use_stemming=T, min_docfreq = 5)

dfm = get_dfm(tc, 'feature')
dfm
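
## get_dfm returns a quanteda dfm, so quanteda functions apply directly.
## For example, the most frequent features (sketch):
quanteda::topfeatures(dfm, 10)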

## -----------------------------------------------------------------------------
##  Use sentences as rows instead of documents
dfm_sent = get_dfm(tc, 'feature', context_level = 'sentence')

##  Use a weighting scheme. 
dfm_weighted = get_dfm(tc, 'feature', weight = 'tfidf')

##  Only use a subset of the tCorpus
dfm_obama = get_dfm(tc, 'feature', subset_meta = president == "Barack Obama")
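
## the sentence-level dfm has one row per sentence instead of per document (sketch)
dim(dfm)
dim(dfm_sent)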


## -----------------------------------------------------------------------------
## create tcorpus and preprocess tokens
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text')
tc$preprocess(use_stemming = T, remove_stopwords=T, min_docfreq = 5)

## fit an LDA model, using the create_feature argument to store the topic assignments
m = tc$lda_fit('feature', create_feature = 'topic', K = 5, alpha = 0.001)
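
## lda_fit returns the fitted LDA model from the topicmodels package, so it
## can be inspected with that package's functions (sketch, not evaluated here)
#  library(topicmodels)
#  terms(m, 5)   ## top 5 terms per topic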

## -----------------------------------------------------------------------------
head(tc$tokens, 10)

## ---- eval=F------------------------------------------------------------------
#  url = browse_texts(tc, category='topic', view=T)

## ---- eval=F------------------------------------------------------------------
#  ?search_features

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text', split_sentences=T)

## -----------------------------------------------------------------------------
hits = search_features(tc, query = c('terror*','war*'))

## -----------------------------------------------------------------------------
hits
summary(hits)

## -----------------------------------------------------------------------------
hits = search_features(tc, query = c('Terror# terror*','War# war*'))
summary(hits)

## -----------------------------------------------------------------------------
hits$queries
head(hits$hits)
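
## hits$hits is an ordinary data.frame (data.table), so hits per query can
## also be tallied directly (sketch)
table(hits$hits$code)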

## -----------------------------------------------------------------------------
queries = data.frame(label = c('War','Terrorism','Economy','Education'),
                     query = c('war* OR army OR bomb*', 'terroris*', 'econom* OR bank*', 'educat* OR school*'))
hits = search_features(tc, query=queries$query, code=queries$label)

## -----------------------------------------------------------------------------
count_tcorpus(tc, hits=hits)
count_tcorpus(tc, hits=hits, meta_cols = 'president')  ## break by meta variables

## ---- fig.width = 6, fig.height = 3, fig.align="center", eval=F---------------
#  library(ggplot2)
#  
#  date_hits = count_tcorpus(tc, hits, meta_cols='date', wide = F)
#  
#  ggplot(date_hits, aes(x=date, y=count, group=code)) +
#    geom_line(aes(linetype=code))
#  
#  pres_hits = count_tcorpus(tc, hits, meta_cols='president', wide = F)
#  
#  ggplot(pres_hits, aes(president, count)) +
#    geom_col(aes(fill=code), width=.5, position = "dodge")

## ---- fig.width = 6, fig.height = 3, fig.align="center"-----------------------
g = semnet(hits, measure = 'con_prob')

## -----------------------------------------------------------------------------
igraph::get.adjacency(g, attr = 'weight')

## ---- fig.width = 7, fig.height = 4, fig.align="center"-----------------------
plot(hits)

## ---- eval=F------------------------------------------------------------------
#  url = browse_hits(tc, hits, view=T)

## -----------------------------------------------------------------------------
get_kwic(tc, query = 'freedom* AND america*', n = 2)
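
## the number of tokens shown around each hit can be controlled. A sketch,
## assuming the ntokens argument of get_kwic:
get_kwic(tc, query = 'freedom* AND america*', ntokens = 5, n = 2)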

## -----------------------------------------------------------------------------
## example query that matches first words in corpus
q = 'Example# unfinished OR restore OR "basic bargain"' 
tc$code_features(q)
head(tc$tokens, 10)

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text', split_sentences=T)

con = search_contexts(tc, 'war')
con
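
## contexts can also be sentences instead of documents. A sketch, assuming
## the context_level argument:
con_sent = search_contexts(tc, 'war', context_level = 'sentence')
con_sent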

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text', split_sentences=T)
tc_war = subset_query(tc, 'war')
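
## tc_war only contains the documents in which 'war' occurs (sketch)
nrow(tc_war$meta)
nrow(tc$meta)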

## -----------------------------------------------------------------------------
tc$subset_query('war')

## ---- eval=F------------------------------------------------------------------
#  library(quanteda)
#  dict = quanteda::data_dictionary_LSD2015

## ---- eval=F------------------------------------------------------------------
#  tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text', split_sentences=T)
#  hits = search_dictionary(tc, dict)

## ---- eval=F------------------------------------------------------------------
#  library(ggplot2)
#  agg_hits = count_tcorpus(tc, meta_cols='date', hits=hits, wide = F)
#  ggplot(agg_hits, aes(x=date, y=count, group=code)) +
#    geom_line(aes(linetype=code))

## ---- eval=F------------------------------------------------------------------
#  dict = melt_quanteda_dict(dict)
#  dict$sentiment = ifelse(dict$code %in% c('positive','neg_negative'), 1, -1)
#  tc$code_dictionary(dict)
#  tc$tokens

## ---- eval=F------------------------------------------------------------------
#  browse_texts(tc, scale='sentiment')

## ---- eval=F------------------------------------------------------------------
#  agg_tcorpus(tc, sent = mean(sentiment), N = length(sentiment), .id = 'code_id')

## ---- eval=F------------------------------------------------------------------
#  agg_tcorpus(tc, sent = mean(sentiment), .id = 'code_id', by='president')
#  agg_tcorpus(tc, sent = mean(sentiment), .id = 'code_id', by=c('token','president'))

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column='id')
tc$preprocess(min_docfreq = 20, remove_stopwords = T, remove_numbers = T)

g = semnet_window(tc, 'feature')
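
## g is an igraph graph; a quick look at its size before filtering (sketch)
igraph::vcount(g)
igraph::ecount(g)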

## ---- fig.width = 8, fig.height = 6, fig.align="center"-----------------------
gb = backbone_filter(g, alpha = 0.001, max_vertices = 100)
plot_semnet(gb)

## -----------------------------------------------------------------------------
comp = compare_subset(tc, feature='feature',
                      subset_meta_x = president == "Barack Obama")

## ---- fig.width = 6, fig.height = 4, fig.align="center"-----------------------
plot(comp)

## -----------------------------------------------------------------------------
tc = create_tcorpus(sotu_texts, doc_column='id')
tc$preprocess(use_stemming = T, remove_stopwords = T, min_docfreq = 10)

fa = feature_associations(tc, 'feature', query = 'terror*')

## -----------------------------------------------------------------------------
head(fa)

## ---- fig.width = 6, fig.height = 6, fig.align="center"-----------------------
plot(fa, col=c('lightblue','navyblue'))

## ---- eval=F------------------------------------------------------------------
#  tc$subset()

## -----------------------------------------------------------------------------
tc = create_tcorpus('this is an example')
tc$subset(token_id < 3)
tc$tokens

## -----------------------------------------------------------------------------
tc = create_tcorpus('this is an example')
tc2 = tc$subset(token_id < 3)

## tc and tc2 are both subsets with token_id < 3
identical(tc$tokens, tc2$tokens)

## -----------------------------------------------------------------------------
tc = create_tcorpus('this is an example')
tc2 = tc
tc2$subset(token_id < 3)

## tc and tc2 are both subsets with token_id < 3
identical(tc$tokens, tc2$tokens)

## -----------------------------------------------------------------------------
tc2 = subset(tc, token_id < 2)

## -----------------------------------------------------------------------------
tc2 = tc$subset(token_id < 2, copy=T)

## -----------------------------------------------------------------------------
tc2 = tc         ## shallow copy
tc3 = tc$copy()  ## deep copy
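
## a quick check of the difference (sketch): subsetting the deep copy leaves
## tc (and the shallow copy tc2) untouched
tc3$subset(token_id < 2)
c(tc$n, tc2$n, tc3$n)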
