R/read.R

Defines functions load_strings hash_source read_smtrx_file read_body read_audio read_interaction

Documented in hash_source read_audio read_body read_interaction

#' @importFrom readr read_delim
#' @importFrom tibble tibble
#' @importFrom readxl read_excel
#'
#' @import stringr
#' @import tidyr
#' @import dplyr
#'
#'
#'
#' @title Import interaction data
#'
#' @description Read CSV, XLS or XLSX files from Sociometrics that contain Infrared-, Bluetooth- or both data types.
#'
#' @param file Path to source data file (xls, xlxs or csv).
#' @param type Indicates the type of interaction data to be imported (see details for available
#'  abbreviations).
#' @param undirect Logical. Convert directed to undirected edge: \code{A->B | B->A = A<->B}
#' @param replv Anonymization. Default \code{replv=FALSE} will leave original (badge) IDs in place. Set
#'  to \code{replv=TRUE} will replace IDs with numbers starting from 1..n. Provide \code{replv=data.frame} with
#'  values: First column holds original values, second column replacement values.
#' @param delim Single delimiter character for reading CSV data. Ignored for Excel files.
#' @param format Optional format for parsing timestamp data. If no format is specified two
#'  pre-established timestamp formats are tried ut. See  \code{\link{parse.smtrx}} for details.
#' @param tz String. Default \code{tz=NULL} will use system timezone (\code{Sys.timezone()}) and
#'  assign to timestamp. Useful for explicitly setting other than system timezone for timestamp
#'  data.
#' @param cls Vector of class names. Default \code{cls=NULL} uses pre-defined sociometric
#'  classes associated with the \code{type} abbreviation. However, class names can be specified
#'  explicitly as well.
#'
#' @return Tibble with data in tidy format
#'
#' @details Excel file reading is performed by \code{readxl::read_excel} function. Column type
#' specification might be required at times with the \code{col_type} parameter, passed via the "..."
#'
#' The following data sheets can be specified by setting the appropriate \code{type}
#'  parameter:
#'
#'  \itemize{
#'    \item{"IR" - Infrared data}
#'    \item{"BT" - Bluetooth detects}
#'    \item{"IR_BT" - Both, Infrared and Bluetooth detects.}
#'    \item{"RIC" - Reads the interactions spell (or durations) between nodes. It contains "from", "to",
#'     columns and start/end timestamps combining both Infrared and Bluetooth readings.}
#'    \item{"NIC" - Reads the exported matrix of absolute interaction counts between all badges. Does
#'     not contain any timestamp data.}
#'  }
#'
#'  All specified types will produce a data frame with five columns. Timestamp, Badge.ID, Other.ID,
#'  RSSI and Source. The data frame format is the same for IR, BT, IR_BT imports. The RSSI (Radio
#'  Signal Strength Indictor) holds a numeric value for Bluetooth signals (usually ranging from -40 to
#'  -90) indicating the relative proximity during BT signal detects. Smaller values indicate a weaker
#'  signal. Infrared detects do not have a measure of signal strength. IR detects are stored as NA values
#'  in the RSSI column.
#'
#' @examples
#'
#' #load interaction including both BT and IR.
#' df1 <- read_interaction(file="data/datalab-full.xlsx", type="IR_BT")
#' df1
#'
#' #load only Bluetooth data, replace Badge ids with numbers starting 1...n
#' df2 <- read_interaction(file="data/datalab-full.xlsx", type="BT", replv=T)
#' df2
#'
#' #load only Infrared data, replace Badge ids with provided list
#' replv <- data.frame(mv=c(3182, 3189, 3183, 3178, 3184, 3081), rv=c(LETTERS[1:5]))
#' df3 <- read_interaction(file="data/datalab-full.xlsx", type="IR", replv=replv)
#' df3
#'
#' #load interaction data, adding an undirected pair column
#' df4 <- read_interaction(file="data/datalab-full.xlsx", type="IR_BT", undirect=T)
#' df4
#'
#'
#' @seealso \code{\link{read_audio}}, \code{\link{read_body}}
#'
#' @export
#'
read_interaction <- function(file, type, undirect=F,
                             replv=F, delim="\t", format=NULL, tz=NULL, cls=NULL, ...){

  raw_df <- read_smtrx_file(file, type=type, delim=delim, ...)

  if (is.null(cls)){
    class(raw_df) <- c(attr(raw_df, "pclass"), class(raw_df))
  } else {
    class(raw_df) <- cls
  }

  df <- parse(raw_df, format=format, tz=tz)

  #anonymize Badge ID entries
  if (is.data.frame(replv)){
    df <- anonymize(df, ids=replv[[1]], replv=replv[[2]])

  } else if (replv == T){
    df <- anonymize(df)
  }

  #add extra column of undirected edge list
  if (undirect){
    df <- mreverse(df, into="Pairs")
  }

  df
}




#' @title Import audio data
#'
#' @description Read audio CSV, XLS or XLSX files from Sociometrics. Original Excel sheets have
#'  often a nested column structure where 1 or more badges produce data over several subcolumns.
#'  This strucutre will be converted into a tidy data format.
#'
#' @inheritParams read_interaction
#' @param type Indicates the type of audio data to be imported (see details for available
#'  abbreviations).
#' @param ses_info Logical. Extract and store session info from file path if available.
#' @param na.rm Logical. Calls \code{na.omit} on the entire data frame after conversion to
#'  tidy format.
#'
#' @details Excel file reading is performed by \code{readxl::read_excel} function. Column type
#'  specification might be required at times with the \code{col_type} parameter, passed via the "..."
#'
#' Volume, pitch and frequencies are avaible for the front- and back microphone which
#'  can be indicated by the "_F" or "_B" suffix on each abbreviation. If no suffix is included for
#'  both microphone data sheets (back + front) will be loaded. The following abbreviations are
#'  available for the \code{type} parameter:
#'
#'  \itemize{
#'    \item{"VOL[_F|_B]" - Volume. "VOL" will load both front- and back microphone sheet. "VOL_F"
#'     only front and "VOL_B" only back microphone data. Volume levels range between 0 and 1.
#'     Values < 0.01 indicate not speaking, 0.01 - 0.02 speaking quitely, 0.03 - 0.08 speaking louder, and
#'     > 0.08 speaking loudly.}
#'    \item{"PITCH[_F|_B]" - Pitch. Depending on the DataLab export settings pitch measures are
#'     aggregated over a given time interval, starting from 1 - 60+ seconds. Typical male fundamental
#'     frequency ranges from 85 to 180 Hz; typical adult female from 165 to 255 Hz.}
#'    \item{"SP" - Speech profile (no front/back microphone option). The speech profile indicates for each badge
#'     Speaking, Overlap, Listening, Silent, Total Speaking and Total Silent duration. Values depend on the
#'     chosen time interval: if the speech profile has been exported over a period of 60s, Speaking (and all
#'     other) measures indicate fractions over the 60 second period, ranging from 0 to 60.
#'     If the speech profile has been exported over 1 second intervals, the columns indicate values that
#'     range from 0 to 1 second. \emph{Speaking} indicates the total time fraction a particular badge
#'     wearer was speaking; \emph{Overlap} the time fraction a person was speaking while someone else
#'     was speaking; \emph{Listening} the time fraction a particular badge was silent while someone else was speaking; \emph{Silent} the time fraction nobody
#'     was speaking. \emph{Total Speaking} is the sum of Speaking + Overlap per badge.}
#'    \item{"PAR" - Speech participation (no front/back microphone option). Logical. Indicates if a particular
#'     badge was speaking or not during the given time interval.}
#'    \item{"VOL_MIR[_F|_B]" - Volume mirroring. \emph{Similar} indicates the similarity between volume
#'     readings between two badges and ranges between 0 (no match) and 1 (perfect match) within the
#'     given time interval. \emph{Lag} indicates the time lag between matches. }
#'    \item{"VOL_CON[_F|_B]" - Volume consistency of each badge’s front audio amplitude, as measured in
#'     Activity (volume) (front). Consistency ranges from 0 to 1, where 1 indicates no changes in speech
#'     amplitude, and 0 indicates the maximum amount of variation in speech amplitude.}
#'    \item{"FRQ_[_F|_B]" - Dominant frequency. Contains three frequency bands hz_0, hz_1...hz_2 and
#'     corresponding amplitude readings amp_0, ...amp_2. Converted to tidy format, the resulting
#'     tibble contains the usual Timestamp, Badge.ID column followed by \emph{Band} column indicating
#'     one of the three bands \emph{Band_0, ... Band_2}} and two further columns \emph{Hz} and \emph{Amplitude}.
#'     There are potentially 4 frequency bands shown, hz_0 & amp_0 is the strongest PEAK in cepstrum,
#'     hz_1 & amp_1 is the second strongest PEAK, and so on. If there are fewer than k peaks in cepstrum,
#'     the hz_k and larger values are empty. E.g if there are only two peaks in cepstrum, hz_2 and hz_3 are empty and not exported.
#'    \item{"TT" - Turn taking sheet. \emph{Speaking Segment}: Any continuous, uninterrupted length of
#'     speech made by a single person. \emph{Turns}: Turns are speaking segments that occur after and
#'     within 10 seconds of, another speaking segment. By default a speech segment must be made within
#'     10 seconds after the previous one ended in order to be considered a turn. \emph{Self-turn}: A
#'     speaker starts speaking, pauses for greater than 0.5 seconds (but less than 10 seconds), and then
#'     resumes speaking. \emph{Successful interruptions}: Person A is talking. Peron B starts talking over
#'      A. If Person A talks for less than 5 out of the next 10 seconds, then Person B successfully
#'      interrupted Person A. \emph{Unsuccessful interruptions}: Person A is talking. Peron B starts talking
#'      over A. If Person A talks for more than 5 out of the next 10 seconds, then Person B successfully
#'      interrupted Person A. \emph{Pause}: A pause is a period of time within which there is no speaking.
#'      All pauses are between .5s and 10s.
#'    }
#'  }
#'
#'
#' @return Tibble with data in tidy format
#'
#' @seealso \code{\link{read_body}} \code{\link{read_interaction}}
#'
#' @export
#'
read_audio <- function(file, type, ses_info=F, replv=F, delim="\t",
                       format = NULL, tz=NULL, na.rm=F, cls=NULL, ...){

  raw_df <- NULL

  ext <- tolower(tools::file_ext(file))

  #load either front or back microphone sheet for excel files
  if (ext == "xls" | ext == "xlsx") {

    if (length(type)>1){
      warning("Loading several sheets simultaneously likely causes column mismatch!")
    }

    for (tt in type){
      abbrs <- load_strings(tt) #get abbreviations, including expansion from VOL to VOL_F VOL_B
      for (abbr in abbrs){
        tmp <- read_smtrx_file(file, type=abbr, delim=delim, ...)

        if (is.null(raw_df)){
          raw_df <- tmp
        } else {
          raw_df <- rbind(raw_df, tmp)
        }
      }
    }

  #type is ignored for CSV files which have no sheets
  } else {
    raw_df <- read_smtrx_file(file, type=type, delim=delim, ...)

  }


  if (is.null(cls)){
    class(raw_df) <- c(attr(raw_df, "pclass"), class(raw_df))
  } else {
    class(raw_df) <- cls
  }

  df <- parse(raw_df, format=format, tz=tz)

  if (na.rm){
    df <- na.omit(df)
    class(df) <- class(raw_df)
  }

  #anonymize Badge ID entries
  if (is.data.frame(replv)){
    df <- anonymize(df, ids=replv[[1]], replv=replv[[2]])

  } else if (replv == T){
    df <- anonymize(df)
  }

  df
}


#' @title Import accelerometer data
#'
#' @description
#'
#' @inheritParams read_interaction
#' @param type Indicates the type of accelerometer data to be imported (see details for available
#'  abbreviations).
#' @param na.rm Logical. Calls \code{na.omit} on the entire data frame after conversion to
#'  tidy format.
#'
#' @return Object of type "act", "smtrx"
#'
#' @details Excel file reading is performed by \code{readxl::read_excel} function. Column type
#'  specification might be required at times with the \code{col_type} parameter, passed via the "..."
#'
#'  The following accelerometer data sheets can be read by setting the \code{type} parameter
#'  to one of the following values:
#'  \itemize{
#'    \item{"BM" - Body movement activity. This is the absolute value of the first derivative of energy.
#'     This provides a more reliable measure of someone’s activity, while eliminating the accelerometer’s
#'     magnitude natural offset. Values < 0.002 indicate very low activity; values between
#'     0.003 - 0.19 moderate amounts of activity and > 0.2 movements like walking.}
#'    \item{"BM_ACC" - Body movement. Accelerometer’s energy magnitude over the 3 axes of measurement.}
#'    \item{"BM_RATE" - Body movement rate. This is the second derivative of energy. The sign
#'     (positive or negative) of Rate (BM) indicates the direction of the change in someone’s activity
#'     levels, as measured by Activity (BM). A positive Rate (BM) indicates the person’s activity is
#'     increasing. A negative Rate(BM) indicates the activity is decreasing.}
#'    \item{"BM_CON" - Body movement consistency. Consistency ranges from 0 to 1, where 1 indicates no
#'     changes in activity level, and 0 indicates the maximum amount of variation in activity levels.}
#'    \item{"BM_MIR" - Body movement mirroring. Mirroring (BM) values indicate how similar one badge’s
#'     Activity (BM) data series is to another badge’s Activity (BM) data series over time. The values
#'     range from 0 to 1, where 0 indicates no similarity and 1 indicates the two data series are identical.}
#'    \item{"POS" - Posture. Left-right / front-back.}
#'    \item{"POS_ACT" - Posture activity. Activity (Posture) shows the absolute angular velocity for
#'     every badge at every timestamp. }
#'    \item{"POS_RATE"- Posture rate. Rate (Posture) shows the angular acceleration for every badge at every timestamp.}
#'    \item{"POS_MIR" - Posture mirroring. See "BM_MIR".}
#' }
#'
#'
#'
#' @export
#'
read_body <- function(file, type, undirect=F,
                      replv=F, delim="\t", format=NULL, tz=NULL, na.rm=F, cls=NULL, ...){


  raw_df <- read_smtrx_file(file, type=type, delim=delim, ...)

  if (is.null(cls)){
    class(raw_df) <- c(attr(raw_df, "pclass"), class(raw_df))
  } else {
    class(raw_df) <- cls
  }

  df <- parse(raw_df, format=format, tz=tz)

  if (na.rm){
    df <- na.omit(df)
    class(df) <- class(raw_df)
  }


  #anonymize Badge ID entries
  if (is.data.frame(replv)){
    df <- anonymize(df, ids=replv[[1]], replv=replv[[2]])

  } else if (replv == T){
    df <- anonymize(df)
  }

  #add extra column for undirected edge list
  if (undirect & type %in% c("BM_MIR", "POS_MIR")){
    df <- mreverse(df, into="Pairs")
  }

  df
}


# Read xls or csv file
#
# @details Read xls or csv data file. The function returns the "raw" sociometric data sheet.
#
# @param file File path.
# @param type The abbreviated data type to be loaded. \See{hash_source}
# @param delim Single delimiting character for CSV files. Default is TAB
# @param ... Further parameters passed on to \code{read_delim()} or \code{read_excel()} functions
#
# @return Tibble of raw sociometric data sheet
#
read_smtrx_file <- function(file, type, delim="\t",...){

  fformat <- tolower(tools::file_ext(file))

  if (fformat == "xls" || fformat == "xlsx") {
    sheet <- hash_source(type)$sheet #convert abbreviation type to real sheet name
    df <- readxl::read_excel(file, sheet=sheet, ...)

  } else if (fformat == "csv") {
    df <- readr::read_delim(file, delim=delim, ...)

  } else {
    stop("Can't recognize file format. Should be 'xlsx', 'xls' or 'csv': ", sQuote(fformat), call. = FALSE)
  }

  message("\nReading ", nrow(df), " rows from ", file, "\n")

  if (nrow(df) == 0){
    stop("File is empty.", call. = F)
  }

  df$Source <- type

  #suggest class
  attr(df, "pclass") <- hash_source(type)$pclass

  df
}



#' @title Map data type abbreviations to source names
#'
#' @description Sociometric DataLab exports several different data sheets with often
#'  cumbersome names. The following function converts abbreviated shortcuts
#'  to actual sheet names and suggests the S3 class which determines the corresponding parsing function.
#'
#' @param abbr String of abbreviated data types.
#'
#' @return List with source (sheet) name and likely S3 class for parsing
#'
#' @examples
#'
#' #produce a list of all currently available abbreviations
#' hash_source()
#'
#' #name of data sheet for Infrared data
#' hash_shource("IR")
#'
#' #name of data sheet for front microphone volume
#' hash_shource("VOL_F")
#'
#' @export
hash_source <- function(abbr=NULL){

  aabbr <- c("IR", "BT", "IR_BT", "RIC", "NIC",
             "VOL_F", "VOL_B", "PITCH_F", "PITCH_B", "PAR", "SP", "VOL_MIR_F", "VOL_MIR_B",
             "VOL_CON_F", "VOL_CON_B", "FRQ_F", "FRQ_B", "TT",
             "BM", "BM_ACC", "BM_RATE", "BM_CON", "BM_MIR",
             "POS", "POS_ACT", "POS_RATE", "POS_MIR")

  if (is.null(abbr)){
    return(aabbr)
  }

  if (abbr == "IR") {
    return(list(sheet="r_facetoface_matrix1", pclass=c("interact", "smtrx")))

  } else if (abbr == "BT"){
    return(list(sheet="r_proximity_matrix1", pclass=c("interact", "smtrx")))

  } else if (abbr == "IR_BT"){
    return(list(sheet="r_combined_matrix1", pclass=c("interact", "smtrx")))

  } else if (abbr == "RIC"){
    return(list(sheet="r_interactions_combined1", pclass=c("ric", "smtrx")))

  } else if (abbr == "NIC"){
    return(list(sheet="n_combined_matrix", plcass=c("rmatrix")))

  } else if (abbr == "VOL_F"){
    return(list(sheet="t_audio_front_volume1", pclass=c("vol", "ego", "smtrx")))

  } else if (abbr == "VOL_B") {
    return(list(sheet="t_audio_back_volume1", pclass=c("vol", "ego", "smtrx")))

  } else if (abbr == "PITCH_F") {
    return(list(sheet="t_audio_front_pitch1", pclass=c("pitch", "ego", "smtrx")))

  } else if (abbr == "PITCH_B"){
    return(list(sheet="t_audio_back_pitch1", pclass=c("pitch", "ego", "smtrx")))

  } else if (abbr == "PAR") {
    return(list(sheet="t_speech_participation1", pclass=c("par", "ego", "smtrx")))

  } else if (abbr == "SP") {
    return (list(sheet="t_speech_profile1", pclass=c("sp", "ego", "smtrx")))

  } else if (abbr == "VOL_MIR_F"){
    return(list(sheet="t_audio_front_vol_mirroring1", pclass=c("mirror")))

  } else if (abbr == "VOL_MIR_B") {
    return(list(sheet="t_audio_back_vol_mirroring1", pclass=c("mirror", "smtrx")))

  } else if (abbr == "VOL_CON_F") {
    return(list(sheet="t_audio_front_vol_consistency1", pclass=c("con", "ego", "smtrx")))

  } else if (abbr == "VOL_CON_B") {
    return(list(sheet="t_audio_back_vol_consistency1", pclass=c("con", "ego", "smtrx")))

  } else if (abbr == "FRQ_F"){
    return(list(sheet="t_audio_front_frequency1", pclass=c("frq", "ego", "smtrx")))

  } else if (abbr == "FRQ_B") {
    return(list(sheet="t_audio_back_frequency1", pclass=c("frq", "ego",  "smtrx")))

  } else if (abbr == "TT") {
    return(list(sheet="r_tt_turntaking1", pclass=c("tt")))

  } else if (abbr == "BM") {
    return(list(sheet="t_BM_activity1", pclass=c("act", "ego", "smtrx")))

  } else if (abbr == "BM_ACC") {
    return(list(sheet="t_BM_bm1", pclass=c("act", "ego", "smtrx")))

  } else if (abbr == "BM_RATE") {
    return(list(sheet="t_BM_rate1", pclass=c("rate","ego", "smtrx")))

  } else if (abbr == "BM_CON") {
    return(list(sheet="t_BM_consistency1", pclass=c("con", "ego", "smtrx")))

  } else if (abbr == "BM_MIR") {
    return(list(sheet="t_BM_mirroring1", pclass=c("mirror", "smtrx")))

  } else if (abbr == "POS") {
    return(list(sheet="t_posture_posture1", pclass=c("pos", "ego", "smtrx")))

  } else if (abbr=="POS_ACT"){
    return(list(sheet="t_posture_activity1", pclass=c("act", "ego", "smtrx")))

  } else if (abbr =="POS_RATE") {
    return(list(sheet="t_posture_rate1", pclass=c("rate", "ego", "smtrx")))

  } else if (abbr == "POS_MIR") {
    return(list(sheet="t_posture_mirroring1", pclass=c("mirror", "smtrx")))

  } else {
    stop("Unknown sheet type '", abbr, "'. Available abbreviations are: \n", paste(aabbr, sep=", "))
    return(abbr)
  }

}



# @title Constructs string abbreviations for loading excel data sheets.
#
# @description Excel sheets store audio data in two different data sheets: one for
#   front microphone and one for the back-mic. The loading abbreviation indicates
#   the sheet globally, e.g "VOL" indicates the "VOL_F" and "VOL_B" sheet. If both sheets
#   need to be loaded, "VOL" will be converted into the back and front sheet.
#   Some data such as speech participation or speech profile or turn-taking
#   do no have the front/back difference. This function constructs the necessary loading
#   abbreviations passed to \code{hash_source}
#
# @param abbr String. Global or front/back loading abbreviation.
#
# @return Vector of loading abbreviations.
load_strings <- function(abbr){

  single_sheets <- c("SP", "PAR", "TT")

  is_single <- stringr::str_detect(abbr, single_sheets)

  if (any(is_single)) {
    return(single_sheets[is_single])

  } else if (stringr::str_detect(abbr, "_F") | stringr::str_detect(abbr, "_B")){
    return(abbr)

  #speech participation or speech profile do not have back/front difference
  } else {
    return(c(paste0(abbr, "_F"), paste0(abbr, "_B")))
  }
}
jmueller17/sociometrics documentation built on March 20, 2024, 1:04 a.m.