R/agg.sleepsight.R

Defines functions agg.sleepsight

Documented in agg.sleepsight

#' agg.sleepsight
#'
#' @param aggregatefile RData file where the output of aggregation will be stored.
#' @param csvfile csv file where data overview was stored
#' @param surveyfile RData file where the preprocessed survey data is stored
#' @param desiredtz timezone (character) in Europe/London format
#' @param minmisratio minimum missing ratio per day, day will be ignored if more data is missing
#' @param shortwindow short window length in minutes to aggregate to
#' @param longwindow long window length in minutes to aggregate to
#' @param withings.mode character ("pdk", "dd") to indicate whether to prioritise pdk or dd
#' @param startDate Character,  optional argument to specify the startDate of the recording, e.g. 2019-01-01
#' @param endDate Character, optional argument to specify the startDate of the recording, e.g. 2019-01-01
#' @return List with data frames that hold the aggregated data: D24HR, Dshort, Dlong, and Dsurvey. 
#' Note that sleepdur_night starts at noon and continous to the noon on the following day.
#' @export
agg.sleepsight = function(aggregatefile, csvfile, surveyfile, 
                          desiredtz, minmisratio=1/3, shortwindow = 1, longwindow = 30,
                          withings.mode = "dd",startDate = c(), endDate=c()) {
  Sys.setlocale("LC_TIME", "C") # Set language to English
  D24HR = Dshort = Dlong = Dsurvey = c()
  if (!file.exists(csvfile)) {
    cat("\nWarning: csvfile not found as input for aggregation")
  } else {
    #----------------------------------------------------
    # declear generic functions used in the code below
    mysum = function(x) {
      if (length(which(is.na(x) == FALSE)) > 0) {
        S = sum(x, na.rm = TRUE)
      } else {
        S = 0
      }
      return(S)
    }
    mymean = function(x) {
      if (length(which(is.na(x) == FALSE)) > 0) {
        M = mean(x, na.rm = TRUE)
      } else {
        M = 0
      }
      return(M)
    }
    AddTimeToDF = function(df, desiredtz) {
      df$time.POSIX = as.POSIXlt(df$time,tz=desiredtz,origin="1970-01-01")
      df$date = as.character(as.Date(df$time.POSIX))
      df$date_12HrShiftBack = as.character(as.Date(df$time.POSIX - (12*3600)))
      df$hour = df$time.POSIX$hour
      df$min = df$time.POSIX$min
      df$hour_in_day = df$hour + (df$min/60)
      df$month = paste0(format(df$time.POSIX,"%m"),"-",format(df$time.POSIX,"%Y"))
      df$day = format(df$time.POSIX,"%d")
      df$weekday = weekdays(df$time.POSIX)
      return(df)
    }
    status2factor = function(df) {
      # To ease intrerpretation, replace numeric coding by factor names
      df$status = as.character(df$status)
      df$status[which(df$status == "1")] = "active"
      df$status[which(df$status == "0")] = "inactive"
      df$status[which(df$status == "-1")] = "sleep"
      df$status[which(df$status == "-2")] = "sustained inactive"
      df$status[which(df$status == "4")] = "inconclusive"
      df$status[which(df$status == "5")] = "no data"
      df$status = as.factor(df$status)
      return(df)
    }
    mydivfun = function(tmp,dn) {
      # divide tmp by dn while ignore NA values
      nax = which(is.na(tmp$x) == FALSE); tmp$x[nax] = tmp$x[nax] / dn 
      return(tmp)
    }
    #----------------------------------------------------
    # Load data
    # check that file exists:
    if (file.exists(csvfile) == FALSE) {
      stop("\nCsv-file ",csvfile," (as specified by used) is not available, first use export2csv to export the preprocessed data to csv format.")
    }
    
    D = data.table::fread(csvfile)
    D = as.data.frame(D)
    if (length(startDate) > 0 & length(endDate) > 0) {
      startDateNum = as.numeric(as.Date(startDate)) * 3600*24
      endDateNum = as.numeric(as.Date(endDate)) * 3600*24
      timeNum = as.numeric(as.POSIXlt(D$time,"1970-01-01",tz=desiredtz))
      validdates = which(timeNum > startDateNum & timeNum < endDateNum)
      if (length(validdates) != 0) D = D[validdates,]
    }
    
    # Potentially Withings data is available from both Direct download and from PDK.
    # Use direct download if available, if not then use pdk, if neither is available then do not process 
    # any data from this person.
    CDF = colnames(D)
    do.withings = TRUE
    # cat(paste0("\nCDF: ",CDF[which(CDF %in% c("steps_dd", "withingsMove_dd","steps_pdk", "withingsMove_pdk") == TRUE)]),collapse = "")
    if ("withingsMove_dd" %in% CDF == FALSE & "withingsMove_pdk" %in% CDF == FALSE) {
      do.withings = FALSE
      cat("\nNo Withings data was found from either direct download or pdk, this package only runs when Withings data is avaialble.")
    } else {
      
      if ("withingsMove_dd" %in% CDF & withings.mode != "pdk" | 
          (withings.mode == "pdk" & ("withingsMove_pdk" %in% CDF == FALSE))) { # use direct download
        D$withingsactive = D$withingsMove_dd
        D$steps = D$steps_dd
        D = D[,-which(colnames(D) %in% c("steps_dd", "withingsMove_dd") == TRUE)]
        D$withingsleep = rowSums(cbind(D$deepsleep_dd, D$lightsleep_dd),na.rm=TRUE)
        D = D[,-which(colnames(D) %in% c("deepsleep_dd", "lightsleep_dd") == TRUE)]
        if ("withingsMove_pdk" %in% CDF) { #ignore pdk if direct download is available
          D = D[,-which(colnames(D) %in% c("steps_pdk", "withingsMove_pdk") == TRUE)]
        }
        withings.mode = "dd"
        # cat("\nusing direct download")
      } else {
        if ("withingsMove_pdk" %in% CDF) { # use pdk
          D$withingsactive = D$withingsMove_pdk
          D$steps = D$steps_pdk
          D = D[,-which(colnames(D) %in% c("steps_pdk", "withingsMove_pdk") == TRUE)]
          D$withingsleep = rowSums(cbind(D$deepsleep_pdk, D$lightsleep_pdk),na.rm=TRUE)
          D = D[,-which(colnames(D) %in% c("deepsleep_pdk", "lightsleep_pdk") == TRUE)]
          if ("withingsMove_dd" %in% CDF) { #ignore dd if direct download is available
            D = D[,-which(colnames(D) %in% c("steps_dd", "withingsMove_dd") == TRUE)]
          }
          withings.mode = "pdk"
          # cat("\nusing pdk")
        }
      }
    }
    if (do.withings == TRUE) { # only process file if there is Withingsdata
      # Define status categories
      #-------------------------------------------------------
      D$status = 5 # no data is default
      #----------------------------------------------------
      # SLEEP
      # old (before 10 May 2019):
      # sleep = which(D$withingsleep == 1 | (is.na(D$GPSmove) == TRUE & is.na(D$batinteract) == TRUE &
      #                 is.na(D$withingsactive) == TRUE & is.na(D$phoneacc) == TRUE)) #& is.na(D$AppAct) == TRUE)
      sleep = which(D$withingsleep == 1) #& is.na(D$AppAct) == TRUE)
      D$status[sleep] = -1 # withings sleep AND no phone movement AND no phone app activity AND no withings activity
      #----------------------------------------------------
      # INACTIVE
      inactive = which(is.na(D$GPSmove) == TRUE & is.na(D$batinteract) == TRUE &
                         is.na(D$withingsactive) == TRUE & is.na(D$phoneacc) == TRUE &
                         (D$withingsleep == 0 | is.na(D$withingsleep) == TRUE))
      D$status[inactive] = 0 # lack of movement, but no sleep detected with Withings (phone app activity is allowed)
      #----------------------------------------------------
      # ACTIVE
      active = which((D$GPSmove == TRUE | D$batinteract == TRUE | D$phoneacc == TRUE | D$withingsactive == TRUE) & 
                       (D$withingsleep == 0 | is.na(D$withingsleep) == TRUE)) # note: I added this line to make sure sleep is not overwritten
      D$status[active] = 1 # any movement (phone app activity is not sufficient, only phone movement or Withings activity counts)
      #----------------------------------------------------
      # INCONCLUSIVE
      inconclusive = which(is.na(D$screenon) == TRUE & #is.na(D$lighton) == TRUE & 
                             is.na(D$GPSmove) == TRUE & is.na(D$AppAct == TRUE) &
                             is.na(D$batinteract == TRUE) & is.na(D$phoneacc) == TRUE &
                             is.na(D$AppHalted) == TRUE & is.na(D$withingsactive) == TRUE & is.na(D$withingsleep) == TRUE)
      D$status[inconclusive] = 4 
      #----------------------------------------------------
      # Create new dataframe with only status and timestamps
      ligth.available = as.logical(length(which(colnames(D) %in% "lightLevel" == TRUE)))
      if (ligth.available == TRUE) {
        tmpmin = D[,which(colnames(D) %in% c("time","status", "steps","lightLevel") == TRUE)]
      } else {
        tmpmin = D[,which(colnames(D) %in% c("time","status", "steps") == TRUE)]
        cat("\nWarning: Light sensor data is not available for this individual!")
      }
      # Untill here the data may include gaps in time, to minimize data storage needs.
      # We will now create continuous time series to ease plotting
      tmpmin$time_num = as.numeric(as.POSIXlt(tmpmin$time, tz=desiredtz, origin = "1970-1-1", format="%Y-%m-%d %H:%M:%OS"))
      D_complete_time = data.frame(time_num=seq(min(tmpmin$time_num, na.rm = T),max(tmpmin$time_num, na.rm = T),by=60))

      tmpmin = tmpmin[,-c(which(colnames(tmpmin) %in% c("time") == TRUE))]
      Dminute = base::merge(D_complete_time,tmpmin,by="time_num",all = TRUE)
      
      missing_status = which(is.na(as.numeric(Dminute$status)) == TRUE) # missing
      if (length(missing_status) > 0) Dminute$status[missing_status] = 5
      
      Dminute$time = as.POSIXlt(Dminute$time_num,origin="1970-1-1",tz=desiredtz)
      Dminute$date = as.Date(Dminute$time)
    
      #========================================================
      # create new category sustained inactivity, if 95% or more of 90 minutes is inactive
      D6 = zoo::rollapply(Dminute$status,FUN=function(x) length(which(x == 0)),width = 90)
      replacestatus = which(D6 > (90*0.95))
      statuszero = which(Dminute$status == 0)
      replacestatus = replacestatus[which(replacestatus %in% statuszero == TRUE)]
      if (length(replacestatus) > 0) Dminute$status[replacestatus] = -2 # sustained inactive
      #========================================================
      #
      # NOTE: This is probably a good place to impute missing values, if we want to impute values...
      #
      #========================================================
      # Aggregate per shortwindow and longwindow (the most common status is used if windowlength is larger than 1)
      Dminute$time_num_short = round(Dminute$time_num / (shortwindow *60)) * (shortwindow *60)
      Dminute$time_num_long = round(Dminute$time_num / (longwindow *60)) * (longwindow*60)
      if (nrow(Dminute) > 1440) {
        calcmode = function(x) {
          if (length(x) == 1) {
            MM = x
          } else {
            MM = pracma::Mode(x)
            if (length(MM) == 0) MM = NA
          }
          return(MM)
        }
        if (shortwindow == 1) {
          if (ligth.available == TRUE) {
            Dshort = Dminute[,c("time", "status", "steps","lightLevel")]
          } else {
            Dshort = Dminute[,c("time", "status", "steps")]
          }
        } else {
          Dshort = aggregate(x = Dminute[,c("status")],by = list(time = Dminute$time_num_short),FUN = calcmode)
          Dshort2 = aggregate(x = Dminute[,c("steps")],by = list(time = Dminute$time_num_short),FUN = mysum)
          colnames(Dshort) = c("time", "status")
          colnames(Dshort2) = c("time", "steps")
          if (ligth.available == TRUE) {
            Dshort3 = aggregate(x = Dminute[,c("lightLevel")],by = list(time = Dminute$time_num_short),FUN = mymean)
            colnames(Dshort3) = c("time", "lightLevel")
          }
          Dshort = merge(Dshort, Dshort2,by="time")
          if (ligth.available == TRUE) {
            Dshort = merge(Dshort, Dshort3,by="time")
          }
        }
        Dlong = aggregate(x = Dminute[,c("status")],by = list(time = Dminute$time_num_long),FUN = calcmode)
        Dlong2 = aggregate(x = Dminute[,c("steps")],by = list(time = Dminute$time_num_long),FUN = mysum)
        colnames(Dlong) = c("time", "status")
        colnames(Dlong2) = c("time", "steps")
        if (ligth.available == TRUE) {
          Dlong3 = aggregate(x = Dminute[,c("lightLevel")],by = list(time = Dminute$time_num_long),FUN = mymean)
          colnames(Dlong3) = c("time", "lightLevel")
        }
        Dlong = merge(Dlong, Dlong2,by="time")
        if (ligth.available == TRUE) {
          Dlong = merge(Dlong, Dlong3,by="time")
        }
        Dshort = AddTimeToDF(Dshort, desiredtz) 
        DshortB = Dshort # used only for calculating D24HR
        Dshort = status2factor(Dshort) # used as output
        Dlong = AddTimeToDF(Dlong, desiredtz)
        Dlong = status2factor(Dlong)
        #========================================================
        # Aggregate per day
        NSinH = 60 / shortwindow #number of shortwindows (length in minuntes) in an hours
        
        sleepmedian = function(x) {
          slp = which(x==-1)
          if (length(slp) > 1) {
            md = slp[round(length(slp)/2)]
          } else {
            md = NA
          }
          return(md)
        }
        sleepmedian_pernight = aggregate(x = DshortB$status,by = list(date = DshortB$date_12HrShiftBack),FUN = sleepmedian)
        colnames(sleepmedian_pernight) = c("date","sleep_median") # medium of all timestamps in a day on which sleep occurs

        sleepdur_pernight = aggregate(x = DshortB$status,by = list(date = DshortB$date_12HrShiftBack),FUN = function(x) length(which(x==-1)))
        sleepdur_pernight = mydivfun(sleepdur_pernight,dn=NSinH) # convert x-5-minute window of a day into hour
        colnames(sleepdur_pernight) = c("date","sleepdur_night") # duration expressed in minutes
        
        sleepdur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==-1)))
        sleepdur_perday = mydivfun(sleepdur_perday,dn=NSinH) # convert x-5-minute window of a day into hour
        colnames(sleepdur_perday) = c("date","sleepdur_day") # duration expressed in minutes
        
        susindur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==-2)))
        susindur_perday = mydivfun(susindur_perday,dn=NSinH) # convert x-5-minute window of a day into hour
        colnames(susindur_perday) = c("date","susindur") # duration expressed in minutes
        
        activedur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==1)))
        activedur_perday = mydivfun(activedur_perday,dn=NSinH)
        colnames(activedur_perday) = c("date","activedur")
        
        inactivedur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==0)))
        inactivedur_perday = mydivfun(inactivedur_perday,dn=NSinH)
        colnames(inactivedur_perday) = c("date","inactivedur")
        
        inconclusive_dur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==4)))
        inconclusive_dur_perday = mydivfun(inconclusive_dur_perday,dn=NSinH)
        colnames(inconclusive_dur_perday) = c("date","inconclusive_dur")
        
        missing_dur_perday = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(which(x==5)))
        missing_dur_perday = mydivfun(missing_dur_perday,dn=NSinH)
        colnames(missing_dur_perday) = c("date","missing_dur")
        if (ligth.available == TRUE) {
          lightExposureDur_perday = aggregate(x = DshortB$lightLevel,by = list(date = DshortB$date),FUN = function(x) length(which(x>0)))
          lightExposureDur_perday = mydivfun(lightExposureDur_perday,dn=NSinH)
          colnames(lightExposureDur_perday) = c("date","lightExposureDur_perday")
          
          lightExposureMean_perday = aggregate(x = DshortB$lightLevel,by = list(date = DshortB$date),mymean)
          colnames(lightExposureMean_perday) = c("date","lightExposureMean_perday")
        }
        steps_perday = aggregate(x = DshortB$steps,by = list(date = DshortB$date),FUN = mysum)
        # Note: not dividing by number of short windows in a hours, because we want this to be the sum
        colnames(steps_perday) = c("date","total_steps")
        D24HR = merge(sleepdur_perday,activedur_perday,by="date")
        D24HR = merge(D24HR,sleepmedian_pernight,by="date")
        D24HR = merge(D24HR,inactivedur_perday,by="date")
        D24HR = merge(D24HR,susindur_perday,by="date")
        D24HR = merge(D24HR,inconclusive_dur_perday,by="date")
        D24HR = merge(D24HR,missing_dur_perday,by="date")
        D24HR = merge(D24HR,steps_perday,by="date")
        if (ligth.available == TRUE) {
          D24HR = merge(D24HR,lightExposureDur_perday,by="date")
          D24HR = merge(D24HR,lightExposureMean_perday,by="date")
        }
        D24HR = merge(D24HR,sleepdur_pernight,by="date")
        
        #--------------------------------------------------------
        # Calculate other summary statistics:
        # simplify classes to be able to do L5M10 analysis
        DshortB$status[which(DshortB$status == -1)] = 0 #sleep becomes inactivity
        DshortB$status[which(DshortB$status == -2)] = 0 #possible sleep becomes inactivity
        is.na(DshortB$status[which(DshortB$status == 4 | DshortB$status == 5)]) = TRUE #inconclusive and missing data become NA
        
        DshortB$rollmean5HR = zoo::rollapply(DshortB$status,FUN=function(x) mean(x,na.rm=TRUE),width=(5*NSinH),fill=NA)
        tmp = aggregate(x = DshortB$rollmean5HR,by = list(date = DshortB$date),FUN = function(x) which.min(x)[1])
        tmp = mydivfun(tmp,dn=NSinH) # convert x-5-minute window of a day into hour
        colnames(tmp) = c("date","L5hour")
        D24HR = merge(D24HR,tmp,by="date")
        tmp = aggregate(x = DshortB$rollmean5HR,by = list(date = DshortB$date),FUN = function(x) {
          mv = NA
          if (length(which(is.na(x) == FALSE)) > 0) {
            mv = min(x,na.rm = TRUE)
            if (mv == Inf | mv == Inf) mv = NA
          }
          return(mv)
        })
        colnames(tmp) = c("date","L5value")
        D24HR = merge(D24HR,tmp,by="date")
        #--------------------------------------------------------
        DshortB$rollmean10HR = zoo::rollapply(DshortB$status,FUN=function(x) mean(x,na.rm=TRUE),width=(10*NSinH),fill=NA)
        tmp = aggregate(x = DshortB$rollmean10HR,by = list(date = DshortB$date),FUN = function(x) which.max(x)[1])
        tmp = mydivfun(tmp,dn=NSinH)
        colnames(tmp) = c("date","M10hour")
        D24HR = merge(D24HR,tmp,by="date")
        tmp = aggregate(x = DshortB$rollmean10HR,by = list(date = DshortB$date),FUN = function(x) {
          mv = NA
          if (length(which(is.na(x) == FALSE)) > 0) {
            mv = max(x,na.rm = TRUE)
            if (mv == Inf | mv == Inf) mv = NA
          }
          return(mv)
        })
        colnames(tmp) = c("date","M10value")
        D24HR = merge(D24HR,tmp,by="date")
        #--------------------------------------------------------
        tmp = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) length(x))
        tmp = mydivfun(tmp,dn=NSinH)
        colnames(tmp) = c("date","daylength")
        D24HR = merge(D24HR,tmp,by="date")
        #--------------------------------------------------------
        tmp = aggregate(x = DshortB$status,by = list(date = DshortB$date),FUN = function(x) mean(x,na.rm = TRUE))
        colnames(tmp) = c("date","mean")
        D24HR = merge(D24HR,tmp,by="date")
        #----------------------------------------------
        # Also load survey data
        if (file.exists(surveyfile) == TRUE) {
          SleepSurvey = c()
          load(surveyfile)
          Dsurvey = SleepSurvey[,c("positiveFeelings","negativeFeelings","Sleep.Quality.Value","surveytime","bedtime","risetime")]
          Dsurvey$date = as.Date(as.POSIXlt(Dsurvey$surveytime,tz=desiredtz))
        }
        
      }
    }
    Dshort = Dshort[!duplicated(Dshort),]
    Dlong = Dlong[!duplicated(Dlong),]
    D24HR = D24HR[!duplicated(D24HR),]
    return(list(D24HR=D24HR,Dshort=Dshort,Dlong=Dlong,Dsurvey=Dsurvey, withings.mode = withings.mode))
  }
}
wadpac/sleepsight-analytics-pipeline documentation built on Aug. 1, 2020, 10:41 a.m.