postGGIR part 7b: Extracts features from minute level actigraphy data

knitr::opts_chunk$set(echo = TRUE)
# {r, results="hide"} - The chunks is run but all results are hidden. The code shows in the doc,  
# {r, include=FALSE} - the code is run but neither the code or the results are shown
# {r, echo=FLASE} - The code is not shown, but results are

After complete extracting features using R actigraphy package [1], We need to merge the other features from GGIR output summary files. For example, UKbiobank use 8 measures in the genetic study [2]. Most features are at the day-level and were averaged at the subject level for the data analysis in the next step.

Reference

  1. Di J, Spira A, Bai J, et al. Joint and individual representation of domains of physical activity, sleep, and circadian rhythmicity. Stat Biosci. 2019;11(2):371-402. doi:10.1007/s12561-019-09236-4

    This R pargram extracts multiple commonly used features from minute level actigraphy data using for "actigraphy" package (Junrui Di).
    
  2. Jones SE, van Hees VT, Mazzotti DR, et. al. Genetic studies of accelerometer-based sleep measures yield new insights into human sleep behaviour. Nat Commun. 2019 Apr 5;10(1):1585. doi: 10.1038/s41467-019-09576-1.

    They analyse a total of eight accelerometer-based measures of sleep and activity timing. These include measures representative of 
       + (1) sleep quality, including sleep efficiency (sleep duration divided by the time between the start and end of the first and last nocturnal inactivity period, respectively) and 
       + (2 "Nblocks_nightsleep") the number of nocturnal sleep episodes. 
       + (3) timing of sleep midpoint, 
       + (4) timing of the least-active 5 h (L5), defined as the midpoint of the least-active 5h of each day.
       + (5) timing of the most-active 10 h (M10)), and 
       + (6)  diurnal inactivity was the total daily duration of estimated bouts of inactivity that fell outside of the SPT-window. 
       + (7)  sleep duration. Individuals with an average sleep suration <3h or >12h were excluded from all analyses.
       + (8)  sleep variability of duration
    
currentdir_xxx 
studyname_xxx
bindir_xxx
outputdir_xxx 
epochIn_xxx
epochOut_xxx 
flag.epochOut_xxx 
log.multiplier_xxx 
QCnights.feature.alpha_xxx 
part5FN_xxx
setwd(currentdir)

Parameters

r paste("Calculate subject-level mean when Nvalid.night>=",QCnights.feature.alpha[1], " for each feature.",sep="")

r paste("Calculate subject-level SD when Nvalid.night>=",QCnights.feature.alpha[2], " for each feature.",sep="")

#---------------------------------main-body--------------------------------------- 

foutFN<-function(x,studyname) paste("part7a_",studyname,"_some_features_page",x,".csv",sep="")   
page1<-read.csv(foutFN("1_features",studyname),nrow=100,head=1,stringsAsFactors=F)
S1<-which(colnames(page1)=="SubjectLevel")+1
page1.subjectlevel<-colnames(page1)[S1:ncol(page1)] 
subjectlevel<-c(page1.subjectlevel, paste("PC",1:20,sep="")) 


outFN<-paste("part7b_",studyname,"_all_features_",1:4,c("_day","_dayclean","_subject","_subjectSD"),".csv",sep="")
outFN.anno<-paste("part7_",studyname,"_all_features_dictionary.xlsx",sep="") 

library(xlsx)   
library(knitr) 
library(postGGIR)
library(dplyr)
library(kableExtra)

setwd(currentdir) 

feaFN<-system.file("template", "features.dictionary.xlsx", package = "postGGIR")  
# system(paste("cat ",feaFN," > ",outFN.anno,sep=""))
file.copy(from=feaFN, to=outFN.anno, overwrite = TRUE)

1 Read R features

#BD1<-read.xlsx(inFN,sheetName="1_features", header = TRUE)
#BD2<-read.xlsx(inFN, sheetName = "5_denseFLMM_subject_PCs", header = TRUE)

BD1<-read.csv(foutFN("1_features",studyname), header = TRUE,stringsAsFactors=F)
BD2<-read.csv(foutFN("5_denseFLMM_subject_PCs",studyname), header = TRUE,stringsAsFactors=F)

colnames(BD2)[3:ncol(BD2)]<-paste("PC",1:20,sep="")
BD<-merge(BD1,BD2[,1:12],by=c("filename","newID"),all=TRUE)

2 Read sleep features from GGIR summary files

data.dir2<-paste(outputdir,"/results",sep="")   
inFN2<-c("part2_summary",
         "part2_daysummary",
         "part4_summary_sleep_cleaned",
         "part4_nightsummary_sleep_cleaned",
          paste("part5_",c("personsummary","daysummary"),"_",part5FN,sep="") )  
inFN3<-paste(data.dir2,"/",inFN2,".csv",sep="")  
sleepFull.fn<-paste(outputdir,"/results/QC/part4_nightsummary_sleep_full.csv",sep="")  
sleepFull<-read.csv(sleepFull.fn,head=1,stringsAsFactors=F)  
dict<-read.xlsx(outFN.anno,head=1,sheetName="dictionary",stringsAsFactors=F)

SLdomain<-dict[which(dict[,"Domain"]=="SL"),"Variable"] 
PAdomain<-dict[which(dict[,"Domain"]=="PA"),"Variable"] 
CRdomain<-dict[which(dict[,"Domain"]=="CR"),"Variable"] 

part2<-dict[which(dict[,"Source"]=="part2"),"Variable"] #9/1 IV and IS
part4<-dict[which(dict[,"Source"]=="part4"),"Variable"]
part6<-dict[which(dict[,"Source"]=="part5"),"Variable"]
partR<-dict[which(dict[,"Source"]=="R"),"Variable"]

kable(dict[,2:5]) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

table(dict[,"Source"])
##################################################### 
# 1.1 sleep variables for sleep domain
D<-list()
for (i in 1:length(inFN3)) {
   D[[i]]<-read.csv(inFN3[i],head=1,stringsAsFactors=F)
   #print(colnames(D[[i]]))
}

data<-D[[4]]
colnames(data)
data[,"sleep_Midpoint"]<-(data[,"sleeponset"]+data[,"wakeup"])/2   
data[,"sleep_efficiency"]<- data[,"SleepDurationInSpt"]/data[,"SptDuration"]     
colnames(data)[colnames(data)=="calendar_date"]<- "calendardate"  #6/5/2020



S0<-which(data[,"sleep_Midpoint"]==0) 
if (length(S0)>=1) {data[S0,c("wakeup","sleeponset")]
                    data[S0,"sleep_Midpoint"]<-NA}

S1<- c("filename","calendardate",intersect(part4,colnames(data))) 
d4<-data[,S1] 
d4[,"calendardate"]<- format(as.Date(d4[,"calendardate"],"%d/%m/%Y"), "%Y-%m-%d")



##################################################### 
# 1.2 sleep variables for PA domain
# adjust part5ww to match and extract features such as N_atleast5minwakenight,ACC_spt_sleep_mg and dur_day_total_IN_min....
# difficulty: part5mm not right in date, part5ww has duplicate dates. 

fixDup<-function(part5ww,part4){ #fix duplicates in part5ww

# Note: part4= 25/2/2016, but part5=2016-02-26.

length(which(is.na(part5ww[,"calendar_date"]))) 
part5ww[,"calendar_date_old"]<-part5ww[,"calendar_date"]
id5w<-paste(part5ww[, "filename"],part5ww[, "calendar_date"],sep="@")   
dupid<-id5w[which(duplicated(id5w))] 

if (length(dupid)>=1){ 
S1<-which(id5w %in% dupid)  
for (i in  S1){
# print(i)
j<-which(part4[,"filename"]==part5ww[i,"filename"] & part4[,"sleeponset_ts"]==part5ww[i,"sleeponset_ts"] & part4[,"wakeup_ts"]==part5ww[i,"wakeup_ts"] )
if (length(j)==1) part5ww[i,"calendar_date"]<-as.character(as.Date(part4[j,"calendar_date"],format="%d/%m/%Y")) 
if (length(j)==0)  part5ww[i,"calendar_date"]<-NA #no match in part4
if (length(j)>=2)  {j2<- which.min(abs(as.Date(part4[j,"calendar_date"],format="%d/%m/%Y")-as.Date(part5ww[i,"calendar_date"],format="%Y-%m-%d")) )
                    part5ww[i,"calendar_date"]<-as.character(as.Date(part4[j2,"calendar_date"],format="%d/%m/%Y"))
                    print(paste("Found duplicate sleeponset and wakeup on ",id5w[i],", so use the closest day",sep="")) 
                   } 
} 
change<-part5ww[S1, c("filename","calendar_date","sleeponset_ts","wakeup_ts", "calendar_date_old")] 
} else change<-"NO duplicate days" 

# remove calendar_date=NA from the report, which cause error in chunk4~aggregate
S2<-which(is.na(part5ww[,"calendar_date"])) 
if (length(S2)>=1)  part5ww<-part5ww[-S1,]

ans<-list(data=part5ww,note=change,N=length(which(duplicated(id5w)))  ) 
return(ans)
}


t<-fixDup(part5ww=D[[6]],part4=sleepFull)
data<-t$data  #D[[6]] #part5 daysummary
S2a<-which(data[,"cleaningcode"]>=2) #If the study did not propose a sleep log to the individuals, then all nights are removed with cleaningcode higher than 1.
S2b<-which(colnames(data) %in% intersect(part6,SLdomain))
data[S2a,S2b]<-NA 
colnames(data) 

quantile(as.numeric(data[,"L10TIME_num"]),na.rm=T)
quantile(as.numeric(data[,"M10TIME_num"]),na.rm=T)
quantile(as.numeric(data[,"dur_day_total_IN_min"]),na.rm=T)
colnames(data)[colnames(data)=="calendar_date"]<- "calendardate"  #6/5/2020 
data[,"RA_ggir"]<-(data[,"M10VALUE"]- data[,"L5VALUE"])/(data[,"M10VALUE"]+ data[,"L5VALUE"])  
S2<-c("filename","calendardate",intersect(part6 ,colnames(data)),"cleaningcode"   ) 
d6<-data[,S2] 
table(d6[,"cleaningcode"])

##################################################### 
# 1.3 part2_summary for IS and IV

data<-D[[1]]
colnames(data)    
S3<-c("filename", intersect(part2 ,colnames(data))   ) 
d2<-data[,S3] 
miss<-which(rowSums(is.na(d2[,-1]))==ncol(d2)-1)
if (length(miss)>=1) d2<-d2[-miss,]

Note 1: r inFN2[6] had r t$N duplicate days

if (t$N>=1) kable(t$note,caption="Duplicate days Correction according to part4")  %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

Note 2: Cleaningcode in GGIR part4 and part5; Set sleep feature=NA for Cleaningcode>=2 (freq=r length(S2a) nights)

To monitor possible problems with the sleep assessment, the variable cleaningcode is recorded for each night. Cleaningcode per night (noon-noon or 6pm-6pm as described above) can have one of the following values:

All the information for each night is stored in the results/QC folder allowing tracing of the data analysis and night selection. The cleaned results stored in the results folder. In part 4 a night is excluded from the 'cleaned' results based on the following criteria:

If the study proposed a sleep log to the individuals, then nights are excluded for which the sleep log was not used as a guider (i.o.w. nights with cleaningcode not equal to 0 or variable sleep log used equals FALSE).
If the study did not propose a sleep log to the individuals, then all nights are removed with cleaningcode higher than 1.
Notice that part 4 is focused on sleep research, by which the cleaned reported is the way it is. In the next section we will discuss the analysis done by part 5. There, the choice of guider may be considered less important, by which we use different criteria for including nights. So, you may see that a night that is excluded from the cleaned results in part 4 still appears in the cleaned results for part 5.

3 Merge all features (all=TRUE)

# d4,d6=bin.RData, d2=.bin
##################################################### 
# 1.3  merge all features
# Dmer=summary file,BD=junrui ouput using clean imputation data only

Dmer0<-merge(d4,d6,by=c("filename","calendardate"),all=TRUE)
Dmer0[,"filename"]<-gsub(".RData","",Dmer0[,"filename"])  
Dmer<-merge(Dmer0,d2,by=c("filename" ),all.x=TRUE,all.y=FALSE)  
dim(d4)
dim(d6)
dim(Dmer0) 
dim(d2)
head(Dmer)

Fmer<-merge(BD,Dmer,by.x=c("filename","Date"),by.y=c("filename","calendardate"),all=TRUE)
dim(BD)
dim(Dmer)
dim(Fmer)
which(is.na(BD[,"newID"])) 
which(is.na(Fmer[,"Date"]))  
dim(Fmer)
head(Fmer)


if (length(setdiff(c(SLdomain,PAdomain,CRdomain),colnames(Fmer)))>=1) stop("check missing features")

S3<-c("filename","Date","newID","cleaningcode",SLdomain,PAdomain,CRdomain)
Fmer2<-Fmer[,S3]
anno<-NULL
anno[1:4]<-"#"
anno[which(colnames(Fmer2) %in% SLdomain)]<-"SL"
anno[which(colnames(Fmer2) %in% PAdomain)]<-"PA" 
anno[which(colnames(Fmer2) %in% CRdomain)]<-"CR"



category<-NULL
category[1:4]<-"#"
category[5:ncol(Fmer2)]<-"day"  
category[which(colnames(Fmer2) %in% part2)]<-"ggir-subject"  
category[which(colnames(Fmer2) %in% subjectlevel)]<-"subject"  

dim(Fmer2) 
Fmer3<-rbind(anno,category,Fmer2)
dim(Fmer3)

#write.xlsx(Fmer2, file=outFN, sheetName = "features",   col.names = TRUE, row.names = FALSE,append=TRUE)  
write.csv(Fmer3, file=outFN[1],   row.names = FALSE )  
# Note: newID=NA for those lines with R features, but no part2/5summary input. So use filename as ID index.

# d<-read.csv(outFN,head=1,comment.char = '#',stringsAsFactors=F) #not work
A=min(which(colnames(Fmer2) %in% SLdomain))-1  #A=4  number of head columns
B=A+1 #first column need to average
MustHaveTAC<-TRUE

4 Average variable at the subject level

One thing to note is that a lot of nights have 0 sleep duration - this may be where GGIR has failed to identify the start and end points of the sleep period window during that day. To account for this, I excluded all nights where 'acc_wake' minus 'acc_onset' was zero. This means that individuals might have different numbers of weekday and weekend day measures going into their averages. It's a good idea to keep track of how many 'valid' weekdays and weekend days were included in each person's mean, then adjust any statistical analyses for these covariates.

Here, the values of sleep variables were changed to NA when the nights have 0 sleep duration.

Fmer4<-Fmer2

numWD<-function(x) which(c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")==x) 
Fmer4[,"weekday"] <- unlist(lapply( weekdays(as.Date(Fmer4[,"Date"])) ,numWD))   

S6<-which(Fmer4[,"wakeup"]-Fmer4[,"sleeponset"] ==0)
Fmer4[,"zeroSleep"]<-0
Fmer4[S6,"zeroSleep"]<-1
for (i in 1:length(SLdomain)) Fmer4[S6,SLdomain[i]]<-NA  #change sldomain to NA for zerosleep rows
Fmer4[S6,1:16]  

Fmer4[,"Nwd"] <-ifelse(Fmer4[,"weekday"]<=5,1,0)
Fmer4[,"Nwe"] <-ifelse(Fmer4[,"weekday"]>=6,1,0) 
Fmer4[,"Nwd_0s"] <-Fmer4[,"Nwd"]*Fmer4[,"zeroSleep"]  # Number of weekdays of zero sleep
Fmer4[,"Nwe_0s"] <-Fmer4[,"Nwe"]*Fmer4[,"zeroSleep"]  # Number of weekends of zero sleep

S7<-which(Fmer4[,"TAC"]>=0)
if (MustHaveTAC & length(S7)>=1) Fmer5<-Fmer4[S7,] else Fmer5<-Fmer4   #clean file: filename,Date,newID,cleaningcode 

write.csv(Fmer5, file=outFN[2],   row.names = FALSE )  


Ncol8<-which(colnames(Fmer5)=="PC10")+1 # Ncol8 after are weekday,zeroSleep,Nwd,Nwe,..
Fmer81<-aggregate(.~filename,data=Fmer5[, c(1,B:(Ncol8-1))], mean,na.rm=T,na.action=na.pass)
Fmer82<-aggregate(.~filename,data=Fmer5[,c(1,Ncol8:ncol(Fmer5))], sum,na.rm=T,na.action=na.pass)

# set mean=NA for those features with Nvalid night< 3 or 7.
NcountSum<-function(x) sum(as.numeric(!is.na(x))) #calcuate number of good value of each feature
Fmer81.N<-aggregate(.~filename,data=Fmer5[, c(1,B:(Ncol8-1))], NcountSum,na.action=na.pass) 
Fmer81.mask<-Fmer81
for (j in 2:ncol(Fmer81.mask)) {S81<- which(Fmer81.N[,j] < QCnights.feature.alpha[1] )
                                if (length(S81)>=1) Fmer81.mask[S81,j]<-NA}
if (length(which(Fmer81.N[,"filename"]!=Fmer81[,"filename"]))>=1) stop("Mismatch IDs in Fmer81.mask")

length(unique(Fmer5[,1]))
dim(Fmer81)
dim(Fmer82) 
if (sum(Fmer81[,1]!=Fmer82[,1])>0)  stop("Check IDs of two Fmer8 matrix")
Fmer8<-cbind(Fmer81.mask,Fmer82[,-c(1,2)]) 
Fmer8[,"cleancode2"]<-rowSums( (is.na(Fmer81.mask)))
Fmer8.bad<-Fmer81.N[which(Fmer8[,"cleancode2"]>=1),]
rownames(Fmer8.bad)<-NULL
write.csv(Fmer8, file=outFN[3],row.names = FALSE )  

r paste("Calculate subject-level mean when Nvalid.night>=",QCnights.feature.alpha[1], " for each feature. Set them as NAs and will be removed in JIVE for ", nrow(Fmer8.bad), " subjects (",length(which(Fmer5[,1] %in% Fmer8.bad[,1]))," days) as follows,",sep="")

cleancode2: number of missingness of features of each subject

if (nrow(Fmer8.bad)>=1) kable(Fmer8.bad,caption="N valid night of removed subjects for mean features")  %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

5 SD of each variable at the subject level

sd=0 for day-level features such as amp_L, acro_L, PC1, PC2, ...

r paste("Calculate subject-level SD for Nvalid.nihgt>=",QCnights.feature.alpha[2], " for features",sep="")

Fmer91<-aggregate(.~filename,data=Fmer5[, c(1,B:(Ncol8-1))], sd,na.rm=T,na.action=na.pass)
Fmer92<-aggregate(.~filename,data=Fmer5[,c(1,Ncol8:ncol(Fmer5))], sum,na.rm=T,na.action=na.pass)  
Fmer91.mask<-Fmer91
for (j in 2:ncol(Fmer91.mask)) {S91<- which(Fmer81.N[,j] < QCnights.feature.alpha[2] )
                                if (length(S91)>=1) Fmer91.mask[S91,j]<-NA}
if (length(which(Fmer81.N[,"filename"]!=Fmer91[,"filename"]))>=1) stop("Mismatch IDs in Fmer91.mask")

length(unique(Fmer5[,1]))
dim(Fmer91)
dim(Fmer92) 
if (sum(Fmer91[,1]!=Fmer92[,1])>0)  stop("Check IDs of two Fmer9 matrix")
Fmer9<-cbind(Fmer91.mask,Fmer92[,-c(1,2)])  
Fmer9[,"cleancode2"]<-rowSums( (is.na(Fmer91.mask)))
Fmer9.bad<-Fmer81.N[which(Fmer9[,"cleancode2"]>=1),]
rownames(Fmer9.bad)<-NULL
write.csv(Fmer9, file=outFN[4],row.names = FALSE )  

r paste("Calculate subject-level SD when Nvalid.nihgt>=",QCnights.feature.alpha[2], " for each feature. Set them as NAs and will be removed in JIVE for ", nrow(Fmer9.bad), " subjects (",length(which(Fmer5[,1] %in% Fmer9.bad[,1]))," days) as follows,",sep="")

if (nrow(Fmer9.bad)>=1) kable(Fmer9.bad,caption="N valid night of removed subjects for SD features")  %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

6 Output

library(knitr)
ansM0<-cbind(outFN[1:A],c("all features by merging Junrui and ggir outputs","Keep sample with valid ENMO inputs","Average variable at the subject level","calculate SD of each feature"))


numberCheck<-function(Data,S=1:nrow(Data)){ d<-Data[S,]
ids<-unique(d[,"filename"])
ans<-c(dim(d),length(ids))
return(list(N=ans,ids=ids))
}

ans1<-numberCheck(Fmer2) 
ans2<-numberCheck(Fmer5) 
ans3<-numberCheck(Fmer8)
ans4<-numberCheck(Fmer9)
ansM<-rbind(c(ans1$N,""),c(ans2$N,paste(setdiff(ans1$ids,ans2$ids),collapse=",")), 
                         c(ans3$N,paste(setdiff(ans2$ids,ans3$ids),collapse=",")), 
                         c(ans4$N,paste(setdiff(ans2$ids,ans4$ids),collapse=","))  ) 
ansM<-cbind(ansM0,ansM)
colnames(ansM)<-c("Filename","Description","Nrow","Ncol","Nsubjects","Removed Subjects") 
kable(ansM,caption="Description of output files")   %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

Note. In the output of part7b_1, the day summary files include either

(1) activity data TAC etc for cleaned data only
(2) sleep features from part4_clean file, set to be missing when cleaningcode>1 from part5 (3) dur_PA features are kept from part5 even sleep cleaningcode>1

7 Plot features

t1<-pheno.plot(inputFN=outFN[2],csv=TRUE, start=4)   
t2<-pheno.plot(inputFN=outFN[3],csv=TRUE, start=4)    
kable(t1,caption=paste("Summary of features (",outFN[2],")",sep=""))   %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

How to run this?



Try the postGGIR package in your browser

Any scripts or data that you put into this service are public.

postGGIR documentation built on Jan. 6, 2022, 5:08 p.m.