##########################################################################################################
# THREADNET Misc functions
# This software may be used according to the terms provided in the
# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
# Absolutely no warranty!
##########################################################################################################
#' @title make_example_DF
#' @description Make an example data frame for display...
#' @name make_example_DF
#' @return DF with some data
#' @export
make_example_DF = function(){
correct_occ = read.table(text="tStamp actor action object location
'2017-4-7 17:52:04' jimmy tosses ball playground
'2017-4-7 17:52:12' rover fetches ball playground
'2017-5-18 9:05:52' jimmy tosses ball forest
'2017-5-18 9:06:24' rover fetches stick forest
'2017-5-18 9:10:48' jimmy searches ball forest ", header=TRUE)
}
#' @title numThreads counts how many threads in the data set
#' @description Threads must have unique thred numbers for this function to work
#' @name numThreads
#' @param o data frame with occurrences or events
#' @param TN column with thread number
#' @return number of threads
#' @export
numThreads = function(o,TN) {length(unique(o[[TN]]))}
# Time range for the data set (not really needed but nice)
timeRange= function(o){
# get the min/max time in the whole set of occurrences
start = min(as.POSIXlt.date(o$tStamp))
finish = max(as.POSIXlt.date(o$tStamp))
# take the difference
difftime(finish,start)}
# Put it into a nice phrase
timeRangePhrase = function(tr){
rangeunits = attr(tr,"units")
paste(floor(as.numeric(tr)),rangeunits,"from start to finish.")}
# this function is used to split up the threads into n ~equal buckets
#' @title make_subsets
#' @description this function is used to split up the threads into n ~equal buckets
#' @name make_subsets
#' @param d data frame with occurrences or events
#' @param n number of buckets
#' @return list of smaller data frames
#' @export
make_subsets <- function(d,n){
return(split(d, ceiling(seq_along(d)/(length(d)/n))))
}
#' @title zoomColumn
#' @description This function takes a slider value and returns a valid column name for zooming
#' if the argument is null, then use ZM_1
#' @name zoomColumn
#' @param z integer for the zoom level
#' @return column name for that zoom level ('ZM_n)
#' @export
zoomColumn <- function(z){
# print(paste("In zoomColumn z=",z))
if (is.null(z))
{r="ZM_1"}
else
{r=paste0("ZM_",z)}
# print(paste("In zoomColumn r=",r))
return(r)
}
######### Functions that return column names #######
#' @title cfnames provides names of all the contextual factors (except the time stamp)
#' @description grab all of the columns except the first, which has the time stamp
#' tStamp in the first colummn
#' @name cfnames
#' @param o data frame with threads
#' @return list of column names
#' @export
cfnames <- function(o){
colnames(o)[2:length(colnames(o))]}
#' @title get_CF_levels returns the levels of a contextual factor
#' @description this is used to populate the UI for comparison of categories within a CF
#' @name get_CF_levels
#' @param o data frame with threads
#' @param cf a contextual factors (column)
#' @return list of unique factor levels
#' @export
get_CF_levels <- function(o,cf){
return(levels(o[,cf]))
}
##########################################################################################################
#' @title combineContextFactors creates a new column that combines some set of other columns
#' @description this function adds a new column to the occurrenes table based on a combination of context factors CF). For example, actor+action
#' @name combineContextFactors
#' @param o data frame with threads
#' @param CF contextual factors to be combined.
#' @param newCol name of the new combined conextual factor
#' @return data frame with the new column
#' @export
combineContextFactors <- function(o,CF,newCol){
# Use the old column if there is one
if (!(newCol %in% names(o))) {
# Need to get the CF parameters into the right format for tidyr::unite function
cfn= sapply(CF, as.character)
newCol = as.character(newCol)
# unite the columns, but keep the old ones
o= unite_(o, newCol, cfn, sep="+", remove=FALSE)
}
# Coerce the new column into a factor
o[newCol] = as.factor(o[,newCol])
return(o)
}
#' @title newColName
#' @description returns new combined column name from a list of CFs
#' @name newColName
#' @param CF_list list of context factors to define events
#' @return column name
#' @export
newColName <- function(CF_list){
return(paste0(CF_list,collapse="_")) }
# These were used on the occ-to-event tab to configure the slider
#' @title threshold_slider_min
#' @description These were used on the occ-to-event tab to configure the slider
#' @name threshold_slider_min
#' @param o dataframe of occurrences
#' @return slider min
#' @export
threshold_slider_min <- function(o){
return(floor(min(o$timeGap)))
}
#' @title threshold_slider_max
#' @description These were used on the occ-to-event tab to configure the slider
#' @name threshold_slider_max
#' @param o dataframe of occurrences
#' @return slider max
#' @export
threshold_slider_max <- function(o){
return(ceiling(max(o$timeGap)))
}
#' @title threshold_slider_selected
#' @description These were used on the occ-to-event tab to configure the slider
#' @name threshold_slider_selected
#' @param o dataframe of occurrences
#' @return selected value = min
#' @export
threshold_slider_selected <- function(o){
return(min(o$timeGap))
}
#### count the handoffs, but reverse coded -- zero = all different
#' @title diff_handoffs
#' @description count the handoffs, but reverse coded -- zero = all different
#' @name diff_handoffs
#' @param o dataframe of occurrences
#' @return o dataframe of occurrences with handoff count filled in
#' @export
diff_handoffs <- function(o){
# initialize the previous row
previous_row <<- o[1,]
return(apply(o,1, row_diff_handoff))
}
row_diff_handoff <- function(this_row){
# just add up the differences.
d <-sum(this_row==previous_row)
# store the previous row
previous_row <<-this_row
# return the number ofdifferences
return(d)
}
#### Time gaps -- just pass in the column of time stamps
#' @title diff_tStamp
#' @description Time gaps -- just pass in the column of time stamps - uses auto units
#' @name diff_tStamp
#' @param ts column of time stamps
#' @return Column of differences between timestamps
#' @export
diff_tStamp <- function(ts){
# initialize the first row
previous_row <<- ts[1]
return(sapply(ts, row_diff_tStamp))
}
row_diff_tStamp <- function(this_row){
# print(paste("this_row",this_row))
# print(paste("previous_row",previous_row))
# just add up the differences.
d <-max(0,difftime(this_row, previous_row, units='auto'))
# store the previous row
previous_row <<-this_row
# return the time difference
return(d)
}
#' @title threadSizeTable provides a distribution of the length of threads
#' @description This function should work on either ocurrences or events.
#' it returns length and duration of each thread.It requires tStamp field to compute duration.
#' @name threadSizeTable
#' @param o data frame with threads
#' @param TN column comtaining the threadNumber
#' @return data frame with table of thread lengths
#' @export
threadSizeTable <- function(o,TN){
# get the number of threads
nThreads = nrow(unique(o[TN]))
id=integer(nThreads)
num=integer(nThreads)
dur=numeric(nThreads)
sizes = data.frame(id,
num,
dur)
for (i in 1:nThreads){
sizes$id = i
sizes$num[i] = sum(o[TN]==i)
}
s = as.data.frame(table(sizes$num))
return(s)
}
#########################################################
#' @title convert_TN_to_TramineR
#' @description converts the csv format used in ThreadNet to the format used by TraMiner. Should provide a way to save this, as well.
#' @name convert_TN_to_TramineR
#' @param df threads (occurrences or events)
#' @param CF Contextual factor that will be used to define the state sequences in TraMineR
#' @return Dataframe in TraMineR format (state sequeces in horizontal rows)
#' @export
convert_TN_to_TramineR <- function(df, CF){
# dataframe must be sorted by time or sequence within each threadNumber
# TN is the threadnumber
# CF is some attribute we will use in TramineR
# first find the threads
threads = unique(df$threadNum)
nThreads = length(threads)
# Initialize list of empty lists
s = rep( list(list()), nThreads )
for ( th in 1:nThreads){
#subset of df that contains the sequence
s[[th]] = as.character(df[df$threadNum==threads[th],CF])
}
# add NA to make all the lists the same length
s = lapply(s, `length<-`, max(lengths(s)))
# convert to data frame
df <- data.frame(matrix(unlist(s), nrow=nThreads, byrow=T))
return(df)
}
# these functions support the moving window
#' @title get_threadList
#' @description get_threadList returns a list of all thread numbers
#' @name get_threadList
#' @param e data frame with threaded events
#' @param TN Column with threadNumber
#' @param SN Column with sequence numbers
#' @return list of thread numbers
#' @export
get_threadList <- function(e,TN,SN){
# for the current data structure for events, you just pick all of the threads where seqNum == 1
return(e[e[[SN]]==1,TN])
}
#' @title get_moving_window
#' @description get_moving_window returns a set of threads for a moving window
#' @name get_moving_window
#' @param e data frame with threads (needs to have threadNum and seqNum)
#' @param s size of window
#' @param l location of window
#' @return data from with just the threads in the window
#' @export
get_moving_window <- function(e, s, l ){
# get the list of threads
w=get_threadList(e,"threadNum","seqNum")
# get get the appropriate subset of threads for the window
w=w[l:(l+s-1)]
# and now subset the rows for those threads
return(e[e$threadNum %in% w,])
}
#' @title window_correlation
#' @description Correlation between moving windows
#' @name window_correlation
#' @param e data fraom for POV
#' @param w width of moving window
#' @param s step - how far to move window in each increment (default is 1)
#' @param n number of windows (default is 2)
#' @export
window_correlation <- function(e,w,s=1,n=2){
# make data frame
vt=data.frame( ngrams=character(), freq=integer(), wid=integer() )
# use the finest granularity
zcf = zoom_upper_limit(e)
# now many threads?
nThreads = numThreads(e,'threadNum')
wcount=0
# scan through the data
for (wloc in seq( 1, nThreads, s)){
wcount= wcount +1
# print(paste('wloc =',wloc))
# get text vector for the whole data set - just keep the first two colomns
ngdf = count_ngrams(get_moving_window(e, w, wloc), 'threadNum', zcf, n)[1:2]
# print(paste('nrow ngdf =',nrow(ngdf)))
# add a the row number
ngdf$wid = wcount
# append the columns to the end
vt=rbind(vt,ngdf)
}
# convert to factor so that we can compute distances using the factor levels
vt$ngrams = factor(vt$ngrams)
nWindows = length(unique(vt$wid))
# get the set of unique ngrams for the whole data set
vt_unique = data.frame(ngrams=unique(vt$ngrams))
# put the results here
windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique))
for (i in 1:nWindows){
# get the merged list
vtmerge = merge(x=vt_unique, y=vt[vt$wid==i,], by='ngrams', all.x = TRUE)
# use the wid.y to get the whole vector, but replace the NA with zeros
b=vtmerge[vtmerge$wid==i,'freq']
b[is.na(b)] <- 0
windowFreqMatrix[i,]=b
}
# old way: correlate one row with the next and stick it in a dataframe
df =data.frame(window=1:(nWindows-1),
thread = seq( 1, nThreads-s, s),
correlation= unlist(lapply(1:(nWindows-1),
function(i){abs( cor(windowFreqMatrix[i,],windowFreqMatrix[i+1,])) })))
# add the last row explicitly
df = rbind( df, data.frame(window=nWindows, thread=nThreads, correlation=0))
# return( df )
# get the ngram data and labels
b_df=as.data.frame(windowFreqMatrix)
colnames(b_df)=vt_unique$ngrams
# stick the ngram frequencies on the end for good measure
return(cbind(df,b_df))
}
#' @title dual_window_correlation
#' @description similar to moving window, except one window on each side of a focal thread.
#' @name dual_window_correlation
#' @param e data fraom for POV
#' @param w width of moving window
#' @param s step - how far to move window in each increment (default is 1)
#' @param n number of windows (default is 2)
#' @export
dual_window_correlation <- function(e,w,s=1,n=2){
# make data frame
vt=data.frame( ngrams=character(), freq=integer(), id=integer() )
# use the finest granularity
zcf = zoom_upper_limit(e)
# now many threads?
nThreads = numThreads(e,'threadNum')
# scan through the threads - treat each thread as a window of one
# can probably do with the split and apply much faster
for (t in 1:nThreads){
# print(paste('wloc =',wloc))
# get text vector for the whole data set - just keep the first two colomns
ngdf = count_ngrams(get_moving_window(e, 1, t), 'threadNum', zcf, n)[1:2]
# print(paste('nrow ngdf =',nrow(ngdf)))
# add a the row number
ngdf$id = t
# append the columns to the end
vt=rbind(vt,ngdf)
}
# convert to factor
vt$ngrams = factor(vt$ngrams)
# compute number of windows.
nWindows = floor(nThreads/w)
# get the set of unique ngrams for the whole data set
vt_unique = data.frame(ngrams=unique(vt$ngrams))
# put the results here
ngramFreqMatrix = matrix(0,nrow=nThreads, ncol=nrow(vt_unique))
for (i in 1:nThreads){
# get the merged list
vtmerge = merge(x=vt_unique, y=vt[vt$id==i,], by='ngrams', all.x = TRUE)
# use the wid.y to get the whole vector, but replace the NA with zeros
b=vtmerge[vtmerge$id==i,'freq']
b[is.na(b)] <- 0
ngramFreqMatrix[i,]=b
}
# return(ngramFreqMatrix)
# old way: correlate one row with the next and stick it in a dataframe
df =data.frame( thread = seq(w,nThreads-(w+1),s),
correlation= unlist(lapply(seq(w,nThreads-(w+1),s),
function(i){abs( cor(colSums( ngramFreqMatrix[(i-w):i, ] ),
colSums( ngramFreqMatrix[(i+1):(i+w+1), ] ))) })))
# # add the last row explicitly
# df = rbind( df, data.frame( thread=nThreads, correlation=0))
return( df )
# # get the ngram data and labels
# b_df=as.data.frame(ngramFreqMatrix)
# colnames(b_df)=vt_unique$ngrams
#
# # stick the ngram frequencies on the end for good measure
# return(cbind(df,b_df))
}
# Make a nice dataframe to display
#' @title make_nice_event_DT
#' @description Issue is that DT::renderdatatable cannot display lists correctly, so cut then out.
#' @name make_nice_event_DT
#' @description Removes columns that do not need to be displayed
#' @param e data frame with POV
#' @export
make_nice_event_DT <- function(e){
# Add new column for the occurrences as a character string for display
# tibble::add_column(e, paste(e$occurrences,sep=","), .after=1)
e$OccurrenceList = paste(e$occurrences,sep=",")
# move occurences directly after tstamp
col_occ = which(colnames(e)=="OccurrenceList")
e = e[, c(1, col_occ, (2:ncol(e))[-col_occ])]
# Now remove the columns that have lists
e$occurrences = NULL
e$OccurrenceList.1 = NULL
e[grep("V_",colnames(e))]=NULL
return(e)
}
#' @title zoom_upper_limit
#' @description find the biggest column with ZM_, and then get the number that goes with that.
#' @name zoom_upper_limit
#' @description Used to set upper limit on sliders for zooming
#' @param e data frame
#' @return biggest zoom level
#' @export
zoom_upper_limit <- function(e){
upper_limit = as.integer(str_replace(colnames(e[max(grep("ZM_",colnames(e)))]),"ZM_",""))
return(upper_limit)
}
# sliderInput("ThreadMapZoomID",
# "Zoom in and out by event similarity:",
# 1,100,5, step = 1, ticks=FALSE)
######################################################
# Just putting this code here to play with for now.
# this function finds the common events in two subsets of thread data
# common_events <- function(ss1, ss2, TN, CF, n){
#
# # get the list of ngrams for each subset of threads
# e1 = count_ngrams(ss1, TN, CF, n)[1]
# e2 = count_ngrams(ss2, TN, CF, n)[1]
#
# # return the intersection
# return(intersect(as.matrix(e1), as.matrix(e2)))
#
# }
#
# rr_grams <- function(o,TN, CF, N, R) {
# # N - max length of ngram
# # R = threshold for repetition
#
# }
# Ideas for regex work
# https://stackoverflow.com/questions/35704369/identify-repetitive-pattern-in-numeric-vector-in-r-with-fuzzy-search
# sapply(1:(length(x)/2), function(m) sum(rep(x[1:m], length = length(x)) != x))
# x <- rep(c(1, 4, 2), 10)
# for(k in seq_len(length(x)/2)) {
# pat <- x[1:k]
# if (identical(rep(pat, length = length(x)), x)) {
# print(pat)
# break
# }
# }
## [1] 1 4 2
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.