# @title Create sessions on timestamp data
#
# @description Extracts session information, i.e. start and end date of session, duration, badges involved,
# nr of detects in this session. Session data is usually generated by the
# \code{cluster_interact}, \code{cluster_ts} or \code{cluster_ids} functions which group several
# timestamp data rows together. This function then merely summarizes the available data,
# extracting start and end time, number of detects, duration and badges involved.
#
# @param df dataframe containg interaction data.
# @param by.col String, specify the name of the column which groups interactions into the same session.
# Usually this is either "clusterTS" or "clusterDyad" produced by the cluster.interaction.X methods.
# @param min.detects A number. Sessions with lower number of detects will be discarded. Default 0
# will not write anything.
# @param edge.list String c("none" | "undirected" | "directed"). Produces additional result columns for
# easily constructing networks and timebased network data from sessions. Adds the following columns:
# Head: head of edge
# Tail: tail of edge
# Onset: starting time for spell (session) used in library(networkDynamic)
# Terminus: ending time for spell (session)
# Pairs: badge pair involved
# If "undirected" creates edges from all badges listed in "Badges". If a session contains more than 2 badges,
# edges for all combinations of nodes will be constructed.
# If "directed", directed edges are constructed. Default is "none".
#
# @return dataframe indicating cluster id, start and end timestamp, day, the nr of detects in the
# session, the duration of the session, all badge ids involved in session. If edge.list is set, then
# Head and Tail columns are added, indicating edges between all involved badges.
#
get_ses <- function(df, by.col, min.detects=0, write.session=NULL, edge.list="none"){
#tmp storeage
bids <- c()
rsb <- data.frame()
#get starting timestamps of all clusters
tsmin <- aggregate(df$Timestamp, by=list(df[[by.col]]), FUN=min)
#ending timestamp of all clusters
tsmax <- aggregate(df$Timestamp, by=list(df[[by.col]]), FUN=max)
#create dateframe with cluster id, start and end time of interaction
rs <- merge(tsmin, tsmax, by="Group.1")
names(rs) <- c("Cluster", "Start", "End")
rs$Day <- format(rs$Start, "%Y-%m-%d")
#calculate number of detects within this session
detects <- aggregate(df[[by.col]], by=list(df[[by.col]]), FUN=length)
rs$Detects <- detects$x
#list the badge ids involved in this session.
#this is coming from interaction data where column "Pairs" stores data Badge.ID_Other.ID
if("Pairs" %in% names(df)){
badges <- aggregate(df$Pairs, by=list(df[[by.col]]), FUN=function(bid){
bids <- append(bids, as.character(levels(bid))[bid])
bids <- unlist(strsplit(bids, "_"))
bids <- sort(unique(bids))
bids <- paste(bids, collapse="+")
bids
})
#store badges
rs$Badges <- badges$x
#this is coming from cluster badge sequence (audio) see cluster.badge.seq()
} else if ("Badge.ID" %in% names(df)){
#the session data looses the Badge.ID column that we need
rs <- df %>% rename(Cluster=clusterIDSeq) %>%
left_join(rs, ., by="Cluster") %>% #copy the Badge.ID
distinct(., Cluster, .keep_all=T) %>% #remove duplicated rows
select(-c(Timestamp,Volume)) #remove columns not needed
}
#duration of session (or interaction)
rs$Duration <- round(difftime(rs$End, rs$Start, units="secs"),2)
if (any(rs$Duration/3600 > 8)) {
warning("\nSession duration > 8 hours detected!")
}
if (any(format(rs$Start, "%Y-%m-%d") != format(rs$End, "%Y-%m-%d"))){
warning("\n Session across two days detected!")
}
#constructs a head - tail edge list for all badges in each cluster
#this probably should be separate function!!
if (edge.list == "undirected" | edge.list=="directed"){
rs$Head <- rs$Badges
rs$Tail <- rs$Badges
rs <- separate_rows(rs, Head, sep="\\+" )
rs <- separate_rows(rs, Tail, sep="\\+" )
rs$Head <- as.numeric(rs$Head)
rs$Tail <- as.numeric(rs$Tail)
rs$Onset <- as.numeric(rs$Start)
rs$Terminus <- as.numeric(rs$End)
#remove self-loops
rs <- filter(rs, Head != Tail)
#collapse direct to undirected edges
if (edge.list=="undirected"){
rs <- to.undirected.pairs(df, col=c("Head","Tail"))
#remove duplicated rows - for undirected graphs
rs <- rs[duplicated(rs[,c("Cluster","Pairs")]),]
}
}
rs
}
# Write session data to xls (or other formats)
#
# @param sessions Dataframe produced by calling \code{get.sessions}.
# @param ses.prefix A string. Sessions can have names stored in column of the xls. The session name
# is given by the start and end time, but can have an additional prefix.
# @param file.name String of the name for this file
# @param ses.offset Sessions are numbered by default by the \code{get.sessions} function. Sessions
# can be renumbered by the specified numeric offset.
# @param format String Currently only "xls" or "txt" files can be saved.
# @param millisecs Logical. If set to FALSE (default) will remove milliseconds from timestamp before saving to xls
#
write_ses <- function(sessions, ses.prefix="", file.name="SessionData.xls", ses.offset=0, format="xls", millisec=F){
library(WriteXLS)
#create session name
sessions$sesname <- paste0(ses.prefix,"_",format(sessions$Start, "%d%b_%H%M"),"_", format(sessions$End, "%H%M"))
#formatting for sociometric data lab. Date needs to be: 2/15/16 9:00:00,000
#m/d/aa H:M:OS
sessions$Start <- format(sessions$Start, "%m/%d/%y %H:%M:%OS")
sessions$End <- format(sessions$End, "%m/%d/%y %H:%M:%OS")
#remove milliseconds
if (!millisec){
sessions$Start <- str_replace(sessions$Start, "\\.[0-9]+", "")
sessions$End <- str_replace(sessions$End, "\\.[0-9]+", "")
} else {
#replace "." with "," for miliseconds separator. Required for import format
sessions$Start <- str_replace(sessions$Start, "\\.", ",")
sessions$End <- str_replace(sessions$End, "\\.", ",")
}
#offset session numbering
sessions$Cluster <- as.numeric(sessions$Cluster) + ses.offset
sessions <- select(sessions, Cluster, sesname, Start, End)
names(sessions) <- c("Session number", "Session name", "Start", "End")
WriteXLS(sessions, ExcelFileName=file.name, Encoding="UTF-8")
}
# Summarize session data, clustered over timestamps according to number of participants.
# Shows the duration for each group size, i.e. small and large configurations or small
# interactions between dyads and then big groups. Could tell something about how
# groups interact/cluster.
#
# @param Dataframe, output of \code{get.sessions()} Needs to have at least the "Badges" column
# of concetenated Badges like 34+56+23 of session participants.
#
# @return Dataframe with columns SesSize (session size), Duration, Duration normalized, and involved
# Badges listed as 23+34+45
summarize.sessions <- function(ses){
#same badge groups can have several sessions. Sum up total duration for each badge cluster
dd <- ses %>%
group_by(Badges) %>%
summarise(sum(Duration))
dd$SesSize <- sapply(dd$Badges, FUN= function(x){
b <- length(unique(unlist(str_split(x, pattern="[+|_]"))))
b
})
names(dd) <- c("Badges", "Duration", "SesSize")
dd <- dd %>%
mutate(DurationNorm = Duration/SesSize) %>%
arrange(desc(SesSize)) %>%
select(3,2,4,1)
dd
}
# Batch reading experimental xls files inside a directory requires to
# extract the session ID which is given in the file name in the format
# S1, S2, S3, etc. This function extracts the numeric value of the session in order
# to store it with the actual audio/interaction data.
extract_sesinfo_file <- function(file, pattern="S[0-9]+"){
ses_id<- regmatches(file, regexpr(pattern, file))
ses_id <- substr(ses_id, 2, nchar(ses_id))
ses_id <- as.numeric(ses_id)
ses_id
}
# Calculates basic session data such as mean duration for give badge pairs over the whole dataframe
#
# THIS IS OBSOLTE BY tiedDuration() of network package?!
#
# @param badges A list of numeric badge pairs, e.g list(c(x,y), c(x, z)). If no list is specified (default)
# all badge pairs of data.frame will be used.
# @param filterRSSI Integer value indicating the strength of the RSSI signal to be included.
# @param twl Integer value indicating the time window length in which detects should be counted as
# being part of the same session.
# @param average Boolean value if basic metrics of session are calculated.
#
# @return Returns dataframe with interaction sessions, containing start, end timestamp
# day, nr of detects, badge pair involved, and duraction in seconds.
#
get_ses_pairs <- function(df, badges=NULL, filterRSSI=NULL, twl=120, average=F){
#take all pairs of data.frame
if (is.null(badges)){
badges <- lapply(unique(df$Pairs), as.NumPair)
}
dfr <- NULL
for (i in 1:length(badges)){
tmp <- filter.interactions(df, badges=badges[[i]], filterRSSI=filterRSSI)
if (nrow(tmp)>0){
tmp <- cluster.interaction.ts(tmp, twl=twl)
ses <- get.sessions(tmp, by.col="clusterTS")
} else {
ses <- NULL
}
if (is.null(dfr)){
dfr <- ses
} else {
dfr <- rbind(dfr, ses)
}
}
if (average){
#aggregate session durations of each pair
meanSessionDuration <- aggregate(dfr$Duration, by=list(dfr$Badges), FUN=mean )
meanSessionDuration$N <- aggregate(dfr$Badges, by=list(dfr$Badges), length )["x"]
names(meanSessionDuration) <- c("Pair", "MeanMinutes")
meanSessionDuration$sd <- aggregate(dfr$Duration, by=list(dfr$Badges), FUN=sd )["x"]
meanSessionDuration <- meanSessionDuration[with(meanSessionDuration, order(-MeanMinutes)), ]
return(meanSessionDuration)
} else {
return(dfr)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.