R/session.R

Defines functions get_ses_pairs extract_sesinfo_file summarize.sessions write_ses get_ses

# @title Create sessions on timestamp data
#
# @description Extracts session information, i.e. start and end date of session, duration, badges involved,
#  nr of detects in this session. Session data is usually generated by the
#  \code{cluster_interact}, \code{cluster_ts} or \code{cluster_ids} functions which group several
#  timestamp data rows together. This function then merely summarizes the available data,
#  extracting start and end time, number of detects, duration and badges involved.
#
# @param df dataframe containg interaction data.
# @param by.col String, specify the name of the column which groups interactions into the same session.
# Usually this is either "clusterTS" or "clusterDyad" produced by the cluster.interaction.X methods.
# @param min.detects A number. Sessions with lower number of detects will be discarded. Default 0
# will not write anything.
# @param edge.list String c("none" | "undirected" | "directed"). Produces additional result columns for
# easily constructing networks and timebased network data from sessions. Adds the following columns:
#   Head: head of edge
#   Tail: tail of edge
#   Onset: starting time for spell (session) used in library(networkDynamic)
#   Terminus: ending time for spell (session)
#   Pairs: badge pair involved
# If "undirected" creates edges from all badges listed in "Badges". If a session contains more than 2 badges,
# edges for all combinations of nodes will be constructed.
# If "directed", directed edges are constructed. Default is "none".
#
# @return dataframe indicating cluster id, start and end timestamp, day, the nr of detects in the
# session, the duration of the session, all badge ids involved in session. If edge.list is set, then
# Head and Tail columns are added, indicating edges between all involved badges.
#
get_ses <- function(df, by.col, min.detects=0, write.session=NULL, edge.list="none"){

  #tmp storeage
  bids <- c()
  rsb <- data.frame()

  #get starting timestamps of all clusters
  tsmin <- aggregate(df$Timestamp, by=list(df[[by.col]]), FUN=min)

  #ending timestamp of all clusters
  tsmax <- aggregate(df$Timestamp, by=list(df[[by.col]]), FUN=max)

  #create dateframe with cluster id, start and end time of interaction
  rs <- merge(tsmin, tsmax, by="Group.1")
  names(rs) <- c("Cluster", "Start", "End")

  rs$Day <- format(rs$Start, "%Y-%m-%d")

  #calculate number of detects within this session
  detects <- aggregate(df[[by.col]], by=list(df[[by.col]]), FUN=length)
  rs$Detects <- detects$x

  #list the badge ids involved in this session.
  #this is coming from interaction data where column "Pairs" stores data Badge.ID_Other.ID
  if("Pairs" %in% names(df)){
    badges <- aggregate(df$Pairs, by=list(df[[by.col]]), FUN=function(bid){
      bids <- append(bids, as.character(levels(bid))[bid])
      bids <- unlist(strsplit(bids, "_"))
      bids <- sort(unique(bids))
      bids <- paste(bids, collapse="+")
      bids
    })

    #store badges
    rs$Badges <- badges$x

    #this is coming from cluster badge sequence (audio) see cluster.badge.seq()
  } else if ("Badge.ID" %in% names(df)){

    #the session data looses the Badge.ID column that we need
    rs <- df %>% rename(Cluster=clusterIDSeq) %>%
      left_join(rs, ., by="Cluster") %>% #copy the Badge.ID
      distinct(., Cluster, .keep_all=T) %>% #remove duplicated rows
      select(-c(Timestamp,Volume)) #remove columns not needed
  }


  #duration of session (or interaction)
  rs$Duration <- round(difftime(rs$End, rs$Start, units="secs"),2)

  if (any(rs$Duration/3600 > 8)) {
    warning("\nSession duration > 8 hours detected!")
  }

  if (any(format(rs$Start, "%Y-%m-%d") != format(rs$End, "%Y-%m-%d"))){
    warning("\n Session across two days detected!")
  }

  #constructs a head - tail edge list for all badges in each cluster
  #this probably should be separate function!!
  if (edge.list == "undirected" | edge.list=="directed"){

    rs$Head <- rs$Badges
    rs$Tail <- rs$Badges

    rs <- separate_rows(rs, Head, sep="\\+" )
    rs <- separate_rows(rs, Tail, sep="\\+" )

    rs$Head <- as.numeric(rs$Head)
    rs$Tail <- as.numeric(rs$Tail)

    rs$Onset <- as.numeric(rs$Start)
    rs$Terminus <- as.numeric(rs$End)

    #remove self-loops
    rs <- filter(rs, Head != Tail)

    #collapse direct to undirected edges
    if (edge.list=="undirected"){
      rs <- to.undirected.pairs(df, col=c("Head","Tail"))

      #remove duplicated rows - for undirected graphs
      rs <- rs[duplicated(rs[,c("Cluster","Pairs")]),]
    }

  }

  rs
}




# Write session data to xls (or other formats)
#
# @param sessions Dataframe produced by calling \code{get.sessions}.
# @param ses.prefix A string. Sessions can have names stored in column of the xls. The session name
# is given by the start and end time, but can have an additional prefix.
# @param file.name String of the name for this file
# @param ses.offset Sessions are numbered by default by the \code{get.sessions} function. Sessions
# can be renumbered by the specified numeric offset.
# @param format String Currently only "xls" or "txt" files can be saved.
# @param millisecs Logical. If set to FALSE (default) will remove milliseconds from timestamp before saving to xls
#
write_ses <- function(sessions, ses.prefix="", file.name="SessionData.xls", ses.offset=0, format="xls", millisec=F){

  library(WriteXLS)

  #create session name
  sessions$sesname <- paste0(ses.prefix,"_",format(sessions$Start, "%d%b_%H%M"),"_", format(sessions$End, "%H%M"))

  #formatting for sociometric data lab. Date needs to be: 2/15/16 9:00:00,000
  #m/d/aa H:M:OS
  sessions$Start <- format(sessions$Start, "%m/%d/%y %H:%M:%OS")
  sessions$End <- format(sessions$End, "%m/%d/%y %H:%M:%OS")

  #remove milliseconds
  if (!millisec){
    sessions$Start <- str_replace(sessions$Start, "\\.[0-9]+", "")
    sessions$End <- str_replace(sessions$End, "\\.[0-9]+", "")

  } else {

    #replace "." with "," for miliseconds separator. Required for import format
    sessions$Start <- str_replace(sessions$Start, "\\.", ",")
    sessions$End <- str_replace(sessions$End, "\\.", ",")
  }


  #offset session numbering
  sessions$Cluster <- as.numeric(sessions$Cluster) + ses.offset

  sessions <- select(sessions, Cluster, sesname, Start, End)

  names(sessions) <- c("Session number", "Session name", "Start", "End")


  WriteXLS(sessions, ExcelFileName=file.name, Encoding="UTF-8")

}



# Summarize session data, clustered over timestamps according to number of participants.
# Shows the duration for each group size, i.e. small and large configurations or small
# interactions between dyads and then big groups. Could tell something about how
# groups interact/cluster.
#
# @param Dataframe, output of \code{get.sessions()} Needs to have at least the "Badges" column
# of concetenated Badges like 34+56+23 of session participants.
#
# @return Dataframe with columns SesSize (session size), Duration, Duration normalized, and involved
# Badges listed as 23+34+45
summarize.sessions <- function(ses){

  #same badge groups can have several sessions. Sum up total duration for each badge cluster
  dd <- ses %>%
    group_by(Badges) %>%
    summarise(sum(Duration))


  dd$SesSize <- sapply(dd$Badges, FUN= function(x){
    b <- length(unique(unlist(str_split(x, pattern="[+|_]"))))
    b
  })

  names(dd) <- c("Badges", "Duration", "SesSize")

  dd <- dd %>%
    mutate(DurationNorm = Duration/SesSize) %>%
    arrange(desc(SesSize)) %>%
    select(3,2,4,1)

  dd
}

# Batch reading experimental xls files inside a directory requires to
# extract the session ID which is given in the file name in the format
# S1, S2, S3, etc. This function extracts the numeric value of the session in order
# to store it with the actual audio/interaction data.
extract_sesinfo_file <- function(file, pattern="S[0-9]+"){
  ses_id<- regmatches(file, regexpr(pattern, file))
  ses_id <- substr(ses_id, 2, nchar(ses_id))
  ses_id <- as.numeric(ses_id)
  ses_id
}



# Calculates basic session data such as mean duration for give badge pairs over the whole dataframe
#
# THIS IS OBSOLTE BY tiedDuration() of network package?!
#
# @param badges A list of numeric badge pairs, e.g list(c(x,y), c(x, z)). If no list is specified (default)
# all badge pairs of data.frame will be used.
# @param filterRSSI Integer value indicating the strength of the RSSI signal to be included.
# @param twl Integer value indicating the time window length in which detects should be counted as
# being part of the same session.
# @param average Boolean value if basic metrics of session are calculated.
#
# @return Returns dataframe with interaction sessions, containing start, end timestamp
# day, nr of detects, badge pair involved, and duraction in seconds.
#
get_ses_pairs <- function(df, badges=NULL, filterRSSI=NULL, twl=120, average=F){

  #take all pairs of data.frame
  if (is.null(badges)){
    badges <- lapply(unique(df$Pairs), as.NumPair)
  }

  dfr <- NULL

  for (i in 1:length(badges)){

    tmp <- filter.interactions(df, badges=badges[[i]], filterRSSI=filterRSSI)

    if (nrow(tmp)>0){
      tmp <- cluster.interaction.ts(tmp, twl=twl)
      ses <- get.sessions(tmp, by.col="clusterTS")
    } else {
      ses <- NULL
    }

    if (is.null(dfr)){
      dfr <- ses
    } else {
      dfr <- rbind(dfr, ses)
    }
  }

  if (average){
    #aggregate session durations of each pair
    meanSessionDuration <- aggregate(dfr$Duration, by=list(dfr$Badges), FUN=mean )
    meanSessionDuration$N <- aggregate(dfr$Badges, by=list(dfr$Badges), length )["x"]

    names(meanSessionDuration) <- c("Pair", "MeanMinutes")
    meanSessionDuration$sd <- aggregate(dfr$Duration, by=list(dfr$Badges), FUN=sd )["x"]

    meanSessionDuration <- meanSessionDuration[with(meanSessionDuration, order(-MeanMinutes)), ]

    return(meanSessionDuration)
  } else {
    return(dfr)
  }
}
jmueller17/sociometrics documentation built on March 20, 2024, 1:04 a.m.