# vignettes/RNewsflow.R

## ---- include=FALSE-----------------------------------------------------------
# Limit printed numbers to 3 significant digits for compact output.
options(digits = 3)
library(knitr)

## ---- message=FALSE, warning=FALSE, echo=TRUE---------------------------------
library(quanteda)

# Toy data: three short documents with an id, a text and an author column.
d = data.frame(
  id = c(1, 2, 3),
  text = c(
    "Socrates is human",
    "Humans are mortal",
    "Therefore, Socrates is mortal"
  ),
  author = c("Aristotle", "Aristotle", "Aristotle"),
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)

# Build a corpus from the data frame (ids from `id`, texts from `text`),
# tokenize it and turn the tokens into a document-feature matrix.
corp = corpus(d, docid_field = "id", text_field = "text")
dtm = dfm(tokens(corp))

# Print the matrix and the document variables attached to it.
dtm
docvars(dtm)

## ---- message=FALSE, warning=FALSE, echo=TRUE---------------------------------
library(RNewsflow)

# Copy the demo data so the steps below do not touch the original object.
dtm = rnewsflow_dfm
dtm

## -----------------------------------------------------------------------------
# First three rows of the document variables.
head(docvars(dtm), n = 3)

## -----------------------------------------------------------------------------
# Compute the term_day_dist() table and show its last three rows.
tdd = term_day_dist(dtm)
tail(tdd, n = 3)

## -----------------------------------------------------------------------------
# Keep only the columns for terms whose `days.entropy.norm` is at most 0.1.
select_terms = tdd[["term"]][tdd[["days.entropy.norm"]] <= 0.1]
dtm = dtm[, select_terms]

## -----------------------------------------------------------------------------
# Re-weight the remaining columns with tf-idf.
dtm = quanteda::dfm_tfidf(dtm)

## ----results='hide', message=FALSE, warning=FALSE-----------------------------
# Build the document similarity network from the weighted DTM.
g = newsflow_compare(
  dtm,
  date_var = "date",
  hour_window = c(0, 36),
  min_similarity = 0.4
)

## -----------------------------------------------------------------------------
# Vertex and edge attributes can be read straight from the graph object.
vertex_sourcetype = V(g)$sourcetype
edge_hourdiff = E(g)$hourdiff

head(vertex_sourcetype)
head(edge_hourdiff)

## ---- fig.show='hold'---------------------------------------------------------
# The same graph as two data frames: one for vertices, one for edges.
v = as_data_frame(g, what = "vertices")
e = as_data_frame(g, what = "edges")

head(v[, c("name", "date", "source", "sourcetype")], n = 3)
head(e, n = 3)

## ---- fig.width = 7, fig.height = 3-------------------------------------------
# Distribution of the `hourdiff` edge attribute over all document pairs.
hist(
  E(g)$hourdiff,
  main = "Time distance of document pairs",
  xlab = "Time difference in hours",
  breaks = 150,
  # Left-closed cells [a, b). Spelled out as FALSE because `F` is a
  # reassignable variable, not a constant.
  right = FALSE
)

## -----------------------------------------------------------------------------
# Step 1: apply a 0.1-36 hour window to all vertices.
g = filter_window(g, hour_window = c(0.1, 36))

# Step 2: apply a 6-36 hour window to the vertices selected by `to_vertices`,
# i.e. those whose sourcetype is 'Print NP' (print newspapers).
g = filter_window(
  g,
  hour_window = c(6, 36),
  to_vertices = V(g)$sourcetype == "Print NP"
)

## -----------------------------------------------------------------------------
# Show the resulting windows, using the `source` attribute.
show_window(g, to_attribute = "source")

## -----------------------------------------------------------------------------
# Split the network into a list of separate sub-component graphs.
g_subcomps = decompose(g)

## ---- fig.width = 7, fig.height = 3-------------------------------------------
# Pick the 20th sub-component from the list and plot it.
gs = g_subcomps[[20]]
document_network_plot(gs)

## ---- fig.width = 7, fig.height = 4-------------------------------------------
# Same sub-component, now using `sourcetype` as the source attribute and
# passing the DTM along to the plot function.
document_network_plot(
  gs,
  source_attribute = "sourcetype",
  dtm = dtm
)

## ---- fig.width = 7, fig.height = 3-------------------------------------------
# Apply only_first_match() to the sub-component and plot the result.
gs_onlyfirst = only_first_match(gs)
document_network_plot(gs_onlyfirst)

## -----------------------------------------------------------------------------
# Aggregate the document network by `source`, summarising the `hourdiff`
# edge attribute with the median.
g_agg = network_aggregate(
  g,
  by = "source",
  edge_attribute = "hourdiff",
  agg_FUN = median
)

# Note: this overwrites the edge data frame `e` created earlier.
e = as_data_frame(g_agg, what = "edges")
head(e)

## -----------------------------------------------------------------------------
# Dense adjacency matrix filled with the `to.Vprop` edge attribute.
adj_m = as_adjacency_matrix(g_agg, attr = "to.Vprop", sparse = FALSE)
round(adj_m, digits = 2) # round to 2 decimals

## ---- fig.align='center', fig.width=7, fig.height=5---------------------------
directed_network_plot(
  g_agg,
  weight_var = "to.Vprop",
  weight_thres = 0.2
)

## ---- fig.align='center', fig.width=7, fig.height=5---------------------------
# Repeat the aggregation and the plot after applying only_first_match()
# to the full network.
g2 = only_first_match(g)
g2_agg = network_aggregate(
  g2,
  by = "source",
  edge_attribute = "hourdiff",
  agg_FUN = median
)

directed_network_plot(
  g2_agg,
  weight_var = "to.Vprop",
  weight_thres = 0.2
)

## -----------------------------------------------------------------------------
# Add a `day` vertex attribute: the vertex date formatted as YYYY-MM-DD.
V(g)$day = format(as.Date(V(g)$date), "%Y-%m-%d")

# Aggregate from `sourcetype` to the combination of `sourcetype` and `day`,
# and return the result as a data frame instead of a graph.
agg_perday = network_aggregate(
  g,
  by_from = "sourcetype",
  by_to = c("sourcetype", "day"),
  edge_attribute = "hourdiff",
  agg_FUN = median,
  return_df = TRUE
)

# First rows where the receiving side is 'Online NP', selected columns only.
head(
  agg_perday[
    agg_perday$to.sourcetype == "Online NP",
    c("from.sourcetype", "to.sourcetype", "to.day", "to.Vprop")
  ]
)

## -----------------------------------------------------------------------------
# Aggregate from individual documents (`name`) to `sourcetype`, taking the
# maximum `weight` per combination.
agg_perdoc = network_aggregate(
  g,
  by_from = "name",
  by_to = "sourcetype",
  edge_attribute = "weight",
  agg_FUN = max,
  return_df = TRUE
)

# Cross-tabulate into a dense document-by-sourcetype table.
docXsource = xtabs(
  agg.weight ~ from.name + to.sourcetype,
  data = agg_perdoc,
  sparse = FALSE
)
head(docXsource)

## ----eval=FALSE---------------------------------------------------------------
#  library(RNewsflow)
#  library(quanteda)
#  
#  # Prepare DTM
#  dtm = rnewsflow_dfm  ## copy demo data
#  
#  tdd = term_day_dist(dtm)
#  dtm = dtm[,tdd$term[tdd$days.entropy.norm <= 0.1]]
#  dtm = dfm_tfidf(dtm)
#  
#  # Prepare document similarity network
#  g = newsflow_compare(dtm, hour_window = c(-0.1,60),
#                       min_similarity = 0.4)
#  g = filter_window(g, hour_window = c(6, 36),
#                    to_vertices = V(g)$sourcetype == 'Print NP')
#  show_window(g, to_attribute = 'source')
#  
#  g_subcomps = decompose(g)
#  document_network_plot(g_subcomps[[55]], dtm=dtm)
#  
#  g_agg = network_aggregate(g, by='source',
#                            edge_attribute='hourdiff', agg_FUN=median)
#  
#  as_adjacency_matrix(g_agg, attr='to.Vprop')
#  directed_network_plot(g_agg, weight_var = 'to.Vprop',
#                        weight_thres=0.2)
#  
#  g2 = only_first_match(g)
#  g2_agg = network_aggregate(g2, by='source',
#                             edge_attribute='hourdiff', agg_FUN=median)
#  
#  as_adjacency_matrix(g2_agg, attr='to.Vprop')
#  directed_network_plot(g2_agg, weight_var = 'to.Vprop',
#                        weight_thres=0.2)
# kasperwelbers/RNewsflow documentation built on April 8, 2024, 4:39 p.m.