# Nothing
## ---- include=FALSE-----------------------------------------------------------
options(digits = 3)
library(knitr)
## -----------------------------------------------------------------------------
library(corpustools)
## ---- eval=F------------------------------------------------------------------
# ?tcorpus
## -----------------------------------------------------------------------------
colnames(sotu_texts)
## -----------------------------------------------------------------------------
# Build a tCorpus from the sotu_texts data, using its "id" column for the
# document ids and its "text" column for the text.
tc <- create_tcorpus(sotu_texts, text_columns = "text", doc_column = "id")
## -----------------------------------------------------------------------------
tc
## -----------------------------------------------------------------------------
# Show the head (first 6 rows) of the tokens and meta data.tables.
head(tc$tokens)
head(tc$meta)
## -----------------------------------------------------------------------------
# Build a tCorpus from the corenlp_tokens data instead, naming the columns
# that hold the document id, the sentence and the token id.
tc <- tokens_to_tcorpus(
  corenlp_tokens,
  doc_col = "doc_id",
  token_id_col = "id",
  sentence_col = "sentence"
)
tc
## -----------------------------------------------------------------------------
tc <- create_tcorpus("This is an example")
# Set "new_column" to a constant value and show the resulting token data.
tc$set("new_column", "any value")
tc$tokens
## -----------------------------------------------------------------------------
# The value is given as an expression that uses the `token` column.
tc$set("new_column", toupper(token))
## -----------------------------------------------------------------------------
# Same call with an extra condition on token_id as third argument.
tc$set("new_column", tolower(token), token_id <= 2)
## -----------------------------------------------------------------------------
# Rename the column, then delete it again under its new name.
tc$set_name("new_column", "token2")
## -----------------------------------------------------------------------------
tc$delete_columns("token2")
## -----------------------------------------------------------------------------
# `doc_column` is written out in full (matching the other create_tcorpus()
# calls in this script) instead of relying on partial matching of `doc_col`.
tc <- create_tcorpus(sotu_texts, doc_column = "id", split_sentences = TRUE)
tc2 <- subset(tc, sentence == 1)
## number of tokens in full corpus
tc$n
## number of tokens in first sentence
tc2$n
## -----------------------------------------------------------------------------
tc2 <- subset(tc, subset_meta = president == "Barack Obama")
## number of tokens in president Obama documents
tc2$n
## -----------------------------------------------------------------------------
## here the subset method is called on tc itself and its result is not
## assigned; tc$n is inspected afterwards
tc$subset(sentence == 1, subset_meta = president == "Barack Obama")
tc$n
## ---- message=F---------------------------------------------------------------
d <- data.frame(
  doc_id = paste("doc", 1:3),
  text = c("I am a text", "I am also text", "Ich bin ein Berliner"),
  date = as.POSIXct(c("2010-01-01", "2010-01-02", "2010-01-03"))
)
# Documents 1 and 2 are duplicates. The `keep` argument names the document
# that is retained: with the documented default (keep = "first") the earliest
# one stays and the later one is deleted.
# NOTE(review): confirm against the doc_id values printed below.
tc <- create_tcorpus(d)
tc$deduplicate(
  feature = "token",
  date_col = "date",
  similarity = 0.75,
  verbose = FALSE
)
tc$meta$doc_id
# Documents 1 and 2 are duplicates. With keep = "last" the latest one stays
# and the earlier one is deleted.
tc <- create_tcorpus(d)
tc$deduplicate(
  feature = "token",
  date_col = "date",
  similarity = 0.75,
  keep = "last",
  verbose = FALSE
)
tc$meta$doc_id
## -----------------------------------------------------------------------------
tc <- create_tcorpus(sotu_texts, text_columns = "text", doc_column = "id")
# Preprocess with stemming and stopword removal switched on, then show the
# token data.
tc$preprocess(remove_stopwords = TRUE, use_stemming = TRUE)
tc$tokens
## ---- eval=F------------------------------------------------------------------
# tc = create_tcorpus("This is a quick example", udpipe_model='english-ewt')
# tc$tokens ## output not shown in this vignette
## ---- eval=F------------------------------------------------------------------
# tc$feats_to_columns(c('Tense','Number','Person'))
#
## ---- eval=F------------------------------------------------------------------
# tc = create_tcorpus(sotu_texts, doc_column = 'id', udpipe_model='english-ewt', udpipe_cores=4)
## ---- eval=F------------------------------------------------------------------
# ## for sake of clarity, recreate the corpus and preprocess it
# tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text')
# tc$preprocess(use_stemming = T, remove_stopwords=T)
#
# tc$feature_subset('feature', !token_id == 5)
## ---- eval=F------------------------------------------------------------------
# tc$feature_subset('feature', min_freq=10)
## -----------------------------------------------------------------------------
# `doc_column` is written out in full (matching the other create_tcorpus()
# calls in this script) instead of relying on partial matching of `doc_col`.
tc <- create_tcorpus(sotu_texts, doc_column = "id", split_sentences = TRUE)
# Preprocess the "token" column into a new "feature" column.
tc$preprocess(
  column = "token",
  new_column = "feature",
  remove_stopwords = TRUE,
  use_stemming = TRUE,
  min_docfreq = 5
)
dfm <- get_dfm(tc, "feature")
dfm
## -----------------------------------------------------------------------------
## Use sentences as rows instead of documents
dfm_sent <- get_dfm(tc, "feature", context_level = "sentence")
## Use a weighting scheme.
dfm_weighted <- get_dfm(tc, "feature", weight = "tfidf")
## Only use a subset of the tCorpus
dfm_obama <- get_dfm(tc, "feature", subset_meta = president == "Barack Obama")
## -----------------------------------------------------------------------------
## build the tcorpus, then preprocess its tokens
tc <- create_tcorpus(sotu_texts, text_columns = "text", doc_column = "id")
tc$preprocess(min_docfreq = 5, remove_stopwords = TRUE, use_stemming = TRUE)
## fit the lda model; create_feature stores the topic assignments in the
## tcorpus
m <- tc$lda_fit("feature", K = 5, alpha = 0.001, create_feature = "topic")
## -----------------------------------------------------------------------------
head(tc$tokens, n = 10)
## ---- eval=F------------------------------------------------------------------
# url = browse_texts(tc, category='topic', view=T)
## ---- eval=F------------------------------------------------------------------
# ?search_features()
## -----------------------------------------------------------------------------
tc <- create_tcorpus(
  sotu_texts,
  text_columns = "text",
  doc_column = "id",
  split_sentences = TRUE
)
## -----------------------------------------------------------------------------
# Search with two queries at once.
hits <- search_features(tc, query = c("terror*", "war*"))
## -----------------------------------------------------------------------------
hits
summary(hits)
## -----------------------------------------------------------------------------
# Same search, with a "Label# " prefix added to each query string.
hits <- search_features(tc, query = c("Terror# terror*", "War# war*"))
summary(hits)
## -----------------------------------------------------------------------------
hits$queries
head(hits$hits)
## -----------------------------------------------------------------------------
# Keep the labels and query strings side by side in a data.frame and pass
# them on as `code` and `query`.
queries <- data.frame(
  label = c("War", "Terrorism", "Economy", "Education"),
  query = c(
    "war* OR army OR bomb*",
    "terroris*",
    "econom* OR bank*",
    "educat* OR school*"
  )
)
hits <- search_features(tc, query = queries$query, code = queries$label)
## -----------------------------------------------------------------------------
count_tcorpus(tc, hits = hits)
## break by meta variables
count_tcorpus(tc, hits = hits, meta_cols = "president")
## ---- fig.width = 6, fig.height = 3, fig.align="center", eval=F---------------
# library(ggplot2)
#
# date_hits = count_tcorpus(tc, hits, meta_cols='date', wide = F)
#
# ggplot(date_hits, aes(x=date, y=count, group=code)) +
# geom_line(aes(linetype=code))
#
# pres_hits = count_tcorpus(tc, hits, meta_cols='president', wide = F)
#
# ggplot(pres_hits, aes(president, count)) +
# geom_col(aes(fill=code), width=.5, position = "dodge")
## ---- fig.width = 6, fig.height = 3, fig.align="center"-----------------------
# Build a network from the search hits using the 'con_prob' measure.
g <- semnet(hits, measure = "con_prob")
## -----------------------------------------------------------------------------
# igraph::get.adjacency() is deprecated; as_adjacency_matrix() is its
# replacement and takes the same `attr` argument.
igraph::as_adjacency_matrix(g, attr = "weight")
## ---- fig.width = 7, fig.height = 4, fig.align="center"-----------------------
plot(hits)
## ---- eval=F------------------------------------------------------------------
# url = browse_hits(tc, hits, view=T)
## -----------------------------------------------------------------------------
get_kwic(tc, n = 2, query = "freedom* AND america*")
## -----------------------------------------------------------------------------
## an example query chosen to match the first words in the corpus
q <- 'Example# unfinished OR restore OR "basic bargain"'
tc$code_features(q)
head(tc$tokens, n = 10)
## -----------------------------------------------------------------------------
tc <- create_tcorpus(
  sotu_texts,
  text_columns = "text",
  doc_column = "id",
  split_sentences = TRUE
)
con <- search_contexts(tc, "war")
con
## -----------------------------------------------------------------------------
tc <- create_tcorpus(
  sotu_texts,
  text_columns = "text",
  doc_column = "id",
  split_sentences = TRUE
)
# Function form: the result is assigned to a new object.
tc_war <- subset_query(tc, "war")
## -----------------------------------------------------------------------------
# Method form: called on tc without assigning the result.
tc$subset_query("war")
## ---- eval=F------------------------------------------------------------------
# library(quanteda)
# dict = quanteda::data_dictionary_LSD2015
## ---- eval=F------------------------------------------------------------------
# tc = create_tcorpus(sotu_texts, doc_column = 'id', text_columns = 'text', split_sentences=T)
# hits = search_dictionary(tc, dict)
## ---- eval=F------------------------------------------------------------------
# library(ggplot2)
# agg_hits = count_tcorpus(tc, 'date', hits, wide = F)
# ggplot(agg_hits, aes(x=date, y=count, group=code)) +
# geom_line(aes(linetype=code))
## ---- eval=F------------------------------------------------------------------
# dict = melt_quanteda_dict(dict)
# dict$sentiment = ifelse(dict$code %in% c('positive','neg_negative'), 1, -1)
# tc$code_dictionary(dict)
# tc$tokens
## ---- eval=F------------------------------------------------------------------
# browse_texts(tc, scale='sentiment')
## ---- eval=F------------------------------------------------------------------
# agg_tcorpus(tc, sent = mean(sentiment), N = length(sentiment), .id = 'code_id')
## ---- eval=F------------------------------------------------------------------
# agg_tcorpus(tc, sent = mean(sentiment), .id = 'code_id', by='president')
# agg_tcorpus(tc, sent = mean(sentiment), .id = 'code_id', by=c('token','president'))
## -----------------------------------------------------------------------------
# `doc_column` is written out in full (matching the other create_tcorpus()
# calls in this script) instead of relying on partial matching of `doc_col`.
tc <- create_tcorpus(sotu_texts, doc_column = "id")
tc$preprocess(min_docfreq = 20, remove_stopwords = TRUE, remove_numbers = TRUE)
g <- semnet_window(tc, "feature")
## ---- fig.width = 8, fig.height = 6, fig.align="center"-----------------------
# Filter the network with backbone_filter() before plotting it.
gb <- backbone_filter(g, alpha = 0.001, max_vertices = 100)
plot_semnet(gb)
## -----------------------------------------------------------------------------
# Compare features, with the x subset selected by a condition on the
# president meta column.
comp <- compare_subset(
  tc,
  feature = "feature",
  subset_meta_x = president == "Barack Obama"
)
## ---- fig.width = 6, fig.height = 4, fig.align="center"-----------------------
plot(comp)
## -----------------------------------------------------------------------------
# `doc_column` is written out in full (matching the other create_tcorpus()
# calls in this script) instead of relying on partial matching of `doc_col`.
tc <- create_tcorpus(sotu_texts, doc_column = "id")
tc$preprocess(use_stemming = TRUE, remove_stopwords = TRUE, min_docfreq = 10)
# Feature associations for the query 'terror*' on the "feature" column.
fa <- feature_associations(tc, "feature", query = "terror*")
## -----------------------------------------------------------------------------
head(fa)
## ---- fig.width = 6, fig.height = 6, fig.align="center"-----------------------
plot(fa, col = c("lightblue", "navyblue"))
## ---- eval=F------------------------------------------------------------------
# tc$subset()
## -----------------------------------------------------------------------------
tc <- create_tcorpus("this is an example")
# The subset method is called without assigning its result; the token data
# of tc is printed afterwards.
tc$subset(token_id < 3)
tc$tokens
## -----------------------------------------------------------------------------
tc <- create_tcorpus("this is an example")
tc2 <- tc$subset(token_id < 3)
## both tc and tc2 are now the subset with token_id < 3
identical(tc$tokens, tc2$tokens)
## -----------------------------------------------------------------------------
tc <- create_tcorpus("this is an example")
tc2 <- tc
tc2$subset(token_id < 3)
## both tc and tc2 are now the subset with token_id < 3
identical(tc$tokens, tc2$tokens)
## -----------------------------------------------------------------------------
# Function form of subset, with the result assigned to tc2.
tc2 <- subset(tc, token_id < 2)
## -----------------------------------------------------------------------------
# Method form of subset with copy = TRUE, with the result assigned to tc2.
tc2 <- tc$subset(token_id < 2, copy = TRUE)
## -----------------------------------------------------------------------------
## shallow copy
tc2 <- tc
## deep copy
tc3 <- tc$copy()
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.