require(dplyr)
require (psych)
require(foreach)
require(doParallel)
require(Rcpp)
build_vector_of_sizes <- function(m1_size,m2_size,m3_size,is_inverse){
if (!is_inverse){
total <- m1_size+m2_size+m3_size
if (total>0){
w<- c(m1_size/total, m2_size/total,m3_size/total)
} else {
w<- c(0,0,0)
}
} else{
m1_inv <- ifelse(m1_size>0,1/m1_size,0)
m2_inv <- ifelse(m2_size>0,1/m2_size,0)
m3_inv <- ifelse(m3_size>0,1/m3_size,0)
total <- m1_inv+m2_inv+m3_inv
if (total>0){
w<- c(m1_inv/total, m2_inv/total,m3_inv/total)
} else {
w<- c(0,0,0)
}
}
w
}
to.valid.matrix<- function(data){
tmp<-data
if(!is.null(data)){
if(is.vector(data)){
tmp<-matrix(unlist(data),nrow=1)
colnames(tmp)<- names(data)
} else{
# if(dim(data)[1]>0){
tmp<- as.matrix(data)
# }
}
}
tmp
}
# R_sample_start_info<- function(m,factor.1,factor.2,factor.3,method,p_cluster_id=NULL){
#
# if( !is.null(p_cluster_id)){
# cur.nofactor.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.start_info),cluster_id==p_cluster_id)[,c(2,3,6,7,8)])
# #if(dim(cur.nofactor.start_info)[1]>0 & !is.null(p_cluster_id)){
# cur.factor.1.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.start_info),cluster_id==p_cluster_id,factor==factor.1)[,c(3,4,7,8,9,10)])
# cur.factor.2.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.start_info),cluster_id==p_cluster_id,factor==factor.2)[,c(3,4,7,8,9,10)])
# cur.factor.3.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.start_info),cluster_id==p_cluster_id,factor==factor.3)[,c(3,4,7,8,9,10)])
# cur.m1<- ifelse(dim(cur.factor.1.start_info)[1]>0,sum(cur.factor.1.start_info[,6]),0)
# cur.m2<- ifelse(dim(cur.factor.2.start_info)[1]>0,sum(cur.factor.2.start_info[,6]),0)
# cur.m3<- ifelse(dim(cur.factor.3.start_info)[1]>0,sum(cur.factor.3.start_info[,6]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.start.info <- sample_start_info(
# m1=to.valid.matrix(cur.factor.1.start_info[,-6]),
# m2=to.valid.matrix(cur.factor.2.start_info[,-6]),
# m3=to.valid.matrix(cur.factor.3.start_info[,-6]),
# m4=cur.nofactor.start_info,
# w=c.w)
# } else{
# cur.nocluster.nofactor.start_info<- to.valid.matrix(m$nocluster.nofactor.start_info[,c(1,2,5,6,7)])
# if(dim(cur.nocluster.nofactor.start_info)[1]>0){
# cur.nocluster.factor.1.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.start_info),factor==factor.1)[,c(2,3,6,7,8,9)])
# cur.nocluster.factor.2.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.start_info),factor==factor.2)[,c(2,3,6,7,8,9)])
# cur.nocluster.factor.3.start_info<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.start_info),factor==factor.3)[,c(2,3,6,7,8,9)])
# cur.m1<- ifelse(dim(cur.nocluster.factor.1.start_info)[1]>0,sum(cur.nocluster.factor.1.start_info[,6]),0)
# cur.m2<- ifelse(dim(cur.nocluster.factor.2.start_info)[1]>0,sum(cur.nocluster.factor.2.start_info[,6]),0)
# cur.m3<- ifelse(dim(cur.nocluster.factor.3.start_info)[1]>0,sum(cur.nocluster.factor.3.start_info[,6]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.start.info <- sample_start_info(
# m1=to.valid.matrix(cur.nocluster.factor.1.start_info[,-6]),
# m2=to.valid.matrix(cur.nocluster.factor.2.start_info[,-6]),
# m3=to.valid.matrix(cur.nocluster.factor.3.start_info[,-6]),
# m4=cur.nocluster.nofactor.start_info,
# w=c.w)
# } else{
# if(dim(m$mean.seq_duration)[1]>0){
# cur.index<- sample(1:dim(m$mean.seq_duration)[1],1,prob=m$mean.seq_duration$freq)
# cur.numeric_duration <- rnorm(1,mean = unlist(m$mean.seq_duration[cur.index,"mean"]),sd = unlist(m$mean.seq_duration[cur.index,"sd"]))
# cur.seq_duration <-m$mean.seq_duration[cur.index,"seq_duration"]
# } else{
# cur.numeric_duration <- rnorm(1,mean = m$mean.seq_duration[2],sd = m$mean.seq_duration[3])
# cur.seq_duration <- m$mean.seq_duration[1]
# }
# if(dim(m$mean.start_hour)[1]>1){
# cur.hour<- sample(m$mean.start_hour$hour,size=1,prob=m$mean.start_hour$freq)
# } else if(dim(m$mean.start_hour)[1]==0){
# cur.hour<- m$mean.start_hour[1]
# } else{
# cur.hour<- m$mean.start_hour$hour
# }
# cur.start.info <- c(cur.hour,cur.seq_duration,cur.numeric_duration)
# }
# }
# cur.start.info
# }
# R_sample_cluster<- function(m,factor.1,factor.2,factor.3,method,prev_id=NULL){
#
# #first cluster
# if(is.null(prev_id)){
# cur.nofactor.start.cluster<- to.valid.matrix(as.data.frame(m$nofactor.start.cluster)[,c(1,4)])
# cur.factor.1.start.cluster<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.start.cluster),factor==factor.1)[,c(1,5,6)])
# cur.factor.2.start.cluster<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.start.cluster),factor==factor.2)[,c(1,5,6)])
# cur.factor.3.start.cluster<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.start.cluster),factor==factor.3)[,c(1,5,6)])
# cur.m1<- ifelse(dim(cur.factor.1.start.cluster)[1]>0,sum(cur.factor.1.start.cluster[,3]),0)
# cur.m2<- ifelse(dim(cur.factor.2.start.cluster)[1]>0,sum(cur.factor.2.start.cluster[,3]),0)
# cur.m3<- ifelse(dim(cur.factor.3.start.cluster)[1]>0,sum(cur.factor.3.start.cluster[,3]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
#
# if(dim(cur.nofactor.start.cluster)[1]>0){
# cur.sampled.cluster <- sample_starting_state(
# m1=to.valid.matrix(cur.factor.1.start.cluster[,-3]),
# m2=to.valid.matrix(cur.factor.2.start.cluster[,-3]),
# m3=to.valid.matrix(cur.factor.3.start.cluster[,-3]),
# m4=cur.nofactor.start.cluster,
# w=c.w)
# } else{
# if (dim(m$common_clusters)[1]>0){
# cur.sampled.cluster <- sample(unlist(unique(m$common_clusters[,1])),size=1)
# } else{
# cur.sampled.cluster <- NA
# }
# }
# } else{
# cur.nofactor.cluster.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.cluster.transition),cluster_id==prev_id)[,c(2,5,6,7)])
# cur.factor.1.cluster.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.cluster.transition),factor==factor.1,cluster_id==prev_id)[,c(3,6,7,8,9)])
# cur.factor.2.cluster.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.cluster.transition),factor==factor.2,cluster_id==prev_id)[,c(3,6,7,8,9)])
# cur.factor.3.cluster.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.cluster.transition),factor==factor.3,cluster_id==prev_id)[,c(3,6,7,8,9)])
# cur.m1<- ifelse(dim(cur.factor.1.cluster.transition)[1]>0,sum(cur.factor.1.cluster.transition[,5]),0)
# cur.m2<- ifelse(dim(cur.factor.2.cluster.transition)[1]>0,sum(cur.factor.2.cluster.transition[,5]),0)
# cur.m3<- ifelse(dim(cur.factor.3.cluster.transition)[1]>0,sum(cur.factor.3.cluster.transition[,5]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
# if(dim(cur.nofactor.cluster.transition)[1]>0){
# cur.sampled.cluster <- sample_transition(
# m1=to.valid.matrix(cur.factor.1.cluster.transition[,-5]),
# m2=to.valid.matrix(cur.factor.2.cluster.transition[,-5]),
# m3=to.valid.matrix(cur.factor.3.cluster.transition[,-5]),
# m4=cur.nofactor.cluster.transition,
# w=c.w)
# } else{
# c_cluster_id = NA
# cur.near <- dplyr::filter(
# as.data.frame(to.valid.matrix(m$common_clusters)),"cluster_id.x" ==prev_id)[,c(2,5)]
#
# if(dim(cur.near)[1]>1){
# c_cluster_id <- sample(as.character(cur.near[,1]),size=1,prob=as.numeric(cur.near[,2]))
# } else if(dim(cur.near)[1]==1){
# c_cluster_id <-as.character(cur.near[1])
# }
# # if (!is.na(c_cluster_id)){
# if (!is.null(prev_id)){
# cur.sampled.cluster <- c(c_cluster_id,as.numeric(m$mean.cluster.tbe))
# }
# else{
# cur.sampled.cluster <- c(c_cluster_id)
# }
# # }
# }
# }
# cur.sampled.cluster
# }
# R_sample_state<- function(m,p_cluster_id,factor.1,factor.2,factor.3,method,prev_id=NULL){
# #first state
# if(is.null(prev_id)){
# if(is.null(p_cluster_id)){
# cur.nofactor.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.starting.state),cluster_id==p_cluster_id)[,c(2,5)])
# #if(dim(cur.nofactor.starting.state)[1]>0 & !is.null(p_cluster_id)){
# cur.factor.1.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.starting.state),cluster_id==p_cluster_id,factor==factor.1)[,c(3,6,7)])
# cur.factor.2.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.starting.state),cluster_id==p_cluster_id,factor==factor.2)[,c(3,6,7)])
# cur.factor.3.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.starting.state),cluster_id==p_cluster_id,factor==factor.3)[,c(3,6,7)])
# cur.m1<- ifelse(dim(cur.factor.1.starting.state)[1]>0,sum(cur.factor.1.starting.state[,3]),0)
# cur.m2<- ifelse(dim(cur.factor.2.starting.state)[1]>0,sum(cur.factor.2.starting.state[,3]),0)
# cur.m3<- ifelse(dim(cur.factor.3.starting.state)[1]>0,sum(cur.factor.3.starting.state[,3]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.state <- sample_starting_state(
# m1=to.valid.matrix(cur.factor.1.starting.state[,-3]),
# m2=to.valid.matrix(cur.factor.2.starting.state[,-3]),
# m3=to.valid.matrix(cur.factor.3.starting.state[,-3]),
# m4=cur.nofactor.starting.state,
# w=c.w)
# #}
# } else{
# cur.nocluster.nofactor.starting.state<- to.valid.matrix(as.data.frame(m$nocluster.nofactor.starting.state[,c(1,4)]))
# if(dim(cur.nocluster.nofactor.starting.state)[1]>0){
# cur.nocluster.factor.1.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.starting.state),factor==factor.1)[,c(2,5,6)])
# cur.nocluster.factor.2.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.starting.state),factor==factor.2)[,c(2,5,6)])
# cur.nocluster.factor.3.starting.state<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.starting.state),factor==factor.3)[,c(2,5,6)])
# cur.m1<- ifelse(dim(cur.nocluster.factor.1.starting.state)[1]>0,sum(cur.nocluster.factor.1.starting.state[,3]),0)
# cur.m2<- ifelse(dim(cur.nocluster.factor.2.starting.state)[1]>0,sum(cur.nocluster.factor.2.starting.state[,3]),0)
# cur.m3<- ifelse(dim(cur.nocluster.factor.3.starting.state)[1]>0,sum(cur.nocluster.factor.3.starting.state[,3]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.state <- sample_starting_state(
# m1=to.valid.matrix(cur.nocluster.factor.1.starting.state[,-3]),
# m2=to.valid.matrix(cur.nocluster.factor.2.starting.state[,-3]),
# m3=to.valid.matrix(cur.nocluster.factor.3.starting.state[,-3]),
# m4=cur.nocluster.nofactor.starting.state,
# w=c.w)
# } else if(dim(m$common_states)[1]>0){
# cur.state <-sample(unlist(unique(m$common_states[,1])),1)
# } else{
# cur.state <- -1
# }
# }
# #not the starting state
# } else {
# if( !is.null(p_cluster_id)){
# cur.nofactor.transition<- to.valid.matrix((dplyr::filter(as.data.frame(m$nofactor.transition),cluster_id==p_cluster_id,state_id==prev_id))[,c(3,6,7,8)])
# #if(dim(cur.nofactor.transition)[1]>0 & !is.null(p_cluster_id)){
# cur.factor.1.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.transition),cluster_id==p_cluster_id,factor==factor.1,state_id==prev_id)[,c(4,7,8,9,10)])
# cur.factor.2.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.transition),cluster_id==p_cluster_id,factor==factor.2,state_id==prev_id)[,c(4,7,8,9,10)])
# cur.factor.3.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.transition),cluster_id==p_cluster_id,factor==factor.3,state_id==prev_id)[,c(4,7,8,9,10)])
# cur.m1<- ifelse(dim(cur.factor.1.transition)[1]>0,sum(cur.factor.1.transition[,5]),0)
# cur.m2<- ifelse(dim(cur.factor.2.transition)[1]>0,sum(cur.factor.2.transition[,5]),0)
# cur.m3<- ifelse(dim(cur.factor.3.transition)[1]>0,sum(cur.factor.3.transition[,5]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.state <- sample_transition(
# m1=to.valid.matrix(cur.factor.1.transition[,-5]),
# m2=to.valid.matrix(cur.factor.2.transition[,-5]),
# m3=to.valid.matrix(cur.factor.3.transition[,-5]),
# m4=cur.nofactor.transition,
# w=c.w)
# } else{
# cur.nocluster.nofactor.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.nofactor.transition),state_id==prev_id)[,c(2,5,6,7)])
# if(dim(cur.nocluster.nofactor.transition)[1]>0){
# cur.nocluster.factor.1.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.transition),factor==factor.1,state_id==prev_id)[,c(3,6,7,8,9)])
# cur.nocluster.factor.2.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.transition),factor==factor.2,state_id==prev_id)[,c(3,6,7,8,9)])
# cur.nocluster.factor.3.transition<- to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.transition),factor==factor.3,state_id==prev_id)[,c(3,6,7,8,9)])
# cur.m1<- ifelse(nrow(cur.nocluster.factor.1.transition)>0,sum(cur.nocluster.factor.1.transition[,5]),0)
# cur.m2<- ifelse(nrow(cur.nocluster.factor.2.transition)>0,sum(cur.nocluster.factor.2.transition[,5]),0)
# cur.m3<- ifelse(nrow(cur.nocluster.factor.3.transition)>0,sum(cur.nocluster.factor.3.transition[,5]),0)
#
# c.w<- build_vector_of_sizes(m1_size=cur.m1,
# m2_size=cur.m2,
# m3_size=cur.m3,
# is_inverse=method)
#
# cur.state <- sample_transition(
# m1=to.valid.matrix(cur.nocluster.factor.1.transition[,-5]),
# m2=to.valid.matrix(cur.nocluster.factor.2.transition[,-5]),
# m3=to.valid.matrix(cur.nocluster.factor.3.transition[,-5]),
# m4=cur.nocluster.nofactor.transition,
# w=c.w)
# } else{cur.state<-c(-1,0)}
# }
# }
# if(cur.state[1]==-1){
# c.state_id <- -1
# cur.near <- dplyr::filter(as.data.frame(to.valid.matrix(m$common_states)),"state_id.x" ==prev_id)[,c(2,5)]
# if(dim(cur.near)[1]>1){
# c.state_id <- sample(cur.near[,1],size=1,prob=cur.near[,2])
# } else if(dim(cur.near)[1]==1){
# c.state_id<-cur.near[1]
# }
# if (c.state_id!=-1){
# cur.tbe<-NULL
# if (!is.null(prev_id)){
# cur.sampled.tbe<- dplyr::filter(as.data.frame(m$mean.transition.tbe),cluster_id==p_cluster_id)[2]
# if(nrow(cur.sampled.tbe)==0){
# cur.tbe<- mean(m$mean.transition.tbe[,2])
# } else if(nrow(cur.sampled.tbe)==1){
# cur.tbe<-cur.sampled.tbe
# }
# cur.state <- c(c.state_id,as.numeric(cur.tbe))
# }
# else{
# cur.state <- c(c.state_id)
# }
# }
# }
# cur.state
# }
sample.sequence.chain<- function(m,p_objID,p_seqID,p_cluster_id,p_seq_duration,p_start_time,p_end_time,p_method){
#seq_duration
cur.factor.1 <- p_seq_duration
#cur.hour
cur.factor.2 <- as.integer(as.POSIXlt(p_start_time,origin = "1970-01-01")$hour)
#cur.weekday
cur.factor.3 <- as.integer(as.POSIXlt(p_start_time,origin = "1970-01-01")$wday)
cur.timestamp <- p_start_time
cur.state <- R_sample_state(m,p_cluster_id=p_cluster_id,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method)
cur.sequence <- NULL
if(!is.null(cur.state) && cur.state!=-1){
cur.sequence <- as.matrix(c(p_objID,p_seqID,as.character(cur.timestamp),cur.state))
dim(cur.sequence)<-c(1,4)
names(cur.sequence)<-c("objectid","seq_id","timestamp","state_id")
cur.sequence<- to.valid.matrix(cur.sequence)
while(cur.timestamp<=p_end_time & (!is.null(cur.state) && cur.state!=-1)){
cur.transition <- R_sample_state(m,p_cluster_id=p_cluster_id,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method,prev_id=cur.state)
cur.state <- cur.transition[1]
if(is.na(cur.state)){
break
}
if(cur.state==-1){
break
}
cur.tbe <- as.numeric(cur.transition[2])
if(cur.tbe<=2){
cur.tbe <- sample(c(2,60,3600),size=1,replace = T,prob=c(0.5,0.3,0.2))
}
cur.timestamp <- cur.timestamp + cur.tbe
#seq_duration
#cur.factor.1 <- p_seq_duration
#cur.hour
cur.factor.2 <- as.integer(as.POSIXlt(cur.timestamp,origin = "1970-01-01")$hour)
#cur.weekday
cur.factor.3 <- as.integer(as.POSIXlt(cur.timestamp,origin = "1970-01-01")$wday)
cur.sequence<-rbind(cur.sequence,c(p_objID,p_seqID,as.character(cur.timestamp),cur.state))
}
}
to.valid.matrix(cur.sequence)
}
sample.meta.chain<- function(m,p_method){
sequences.frame<-NULL
end.timestamp<-NULL
for(i in 1:dim(m$start.list)[1]){
for(cur.obj in 1:m$start.list$objects[i]){
cur.date <- m$start.list$date[i]
#cur.weekday
cur.factor.1<- as.integer(as.numeric(format(as.POSIXlt(cur.date,origin = "1970-01-01"), "%w")))
#cur.month
cur.factor.2<- as.integer(as.POSIXlt(cur.date,origin = "1970-01-01")$mon)
#cur.year
cur.factor.3<- as.integer(as.POSIXlt(cur.date,origin = "1970-01-01")$year)
mean.num.seq <- as.numeric(unlist(m$start.list$mean.num.seq[i]))
sd.num.seq <- as.numeric(unlist(m$start.list$sd.num.seq[i] ))
cur.num.seq <- max(1,round(rnorm(n=1, mean=mean.num.seq, sd=sd.num.seq),0))
start.timestamp<- as.POSIXlt(paste(cur.date,"00:00:01",sep=""),origin = "1970-01-01")
cur.seq.duration.numeric<- 0
for (cur.seq in 1:cur.num.seq){
if (cur.seq==1){
c.cluster_id <- R_sample_cluster(m,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method,prev_id=NULL)
} else{
if(is.na(c.cluster_id)){
c.cluster_id<-NULL
cur.cluster_tbe <- m$mean.cluster.tbe
cur.cluster_id <- R_sample_cluster(m,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method,prev_id=c.cluster_id )
} else{
cur.cluster_id <- R_sample_cluster(m,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method,prev_id=c.cluster_id )
cur.cluster_tbe <- as.numeric(cur.cluster_id[2])
}
c.cluster_id <- cur.cluster_id[1]
}
cur.start_info <- R_sample_start_info(m,factor.1=cur.factor.1,factor.2=cur.factor.2,factor.3=cur.factor.3,method=p_method,p_cluster_id=c.cluster_id)
cur.hour<-as.numeric(cur.start_info[1])
if (cur.seq==1){
start.timestamp<- as.POSIXlt(paste(substr(start.timestamp,1,11),cur.hour,":00:00",sep=""),origin = "1970-01-01")+runif(1,1,3599)
} else{
start.timestamp<- start.timestamp + max(cur.cluster_tbe,cur.seq.duration.numeric+1)
start.timestamp<- as.POSIXlt(paste(substr(start.timestamp,1,11),cur.hour,substr(start.timestamp,14,19),sep=""),origin = "1970-01-01")
}
if (!is.null(end.timestamp)){
while( start.timestamp<= end.timestamp){
start.timestamp<- start.timestamp + 60*60*24 #add one day
}
}
#cur.weekday
cur.factor.1<- as.integer(as.numeric(format(as.POSIXlt(start.timestamp,origin = "1970-01-01"), "%w")))
#cur.month
cur.factor.2<- as.integer(as.POSIXlt(start.timestamp,origin = "1970-01-01")$mon)
#cur.year
cur.factor.3<- as.integer(as.POSIXlt(start.timestamp,origin = "1970-01-01")$year)
cur.seq.duration<-cur.start_info[2]
cur.seq.duration.numeric<-as.numeric(cur.start_info[3])
end.timestamp <- start.timestamp +cur.seq.duration.numeric
if(is.null(sequences.frame)){
sequences.frame <- as.matrix(c(i,cur.obj,cur.seq,c.cluster_id,cur.seq.duration,as.character(start.timestamp),as.character(end.timestamp)))
dim(sequences.frame)<- c(1,7)
sequences.frame<-to.valid.matrix(sequences.frame)
colnames(sequences.frame)<- c("dateNo","objectID","seqID","clusterID","seq_duration","startTime","endTime")
} else{
sequences.frame<- rbind(sequences.frame,c(i,cur.obj,cur.seq,c.cluster_id,cur.seq.duration,as.character(start.timestamp),as.character(end.timestamp)))
}
}
}
}
sequences.frame[,c("objectID")]<- as.numeric(factor(paste0(sequences.frame[,c("dateNo")],"-",sequences.frame[,c("objectID")])))
sequences.frame[,c("seqID")]<- as.numeric(factor(paste0(sequences.frame[,c("dateNo")],"-",sequences.frame[,c("objectID")],"-",sequences.frame[,c("seqID")])))
apply(sequences.frame,2,unlist)
}
generate.synthetic.data <- function(m,p.method){
doParallel::registerDoParallel(parallel::detectCores())
sequences.frame<-sample.meta.chain(m,p_method=p.method)
require(foreach)
synthetic.data<-foreach(i = 1:dim(sequences.frame)[1],.packages=c("dplyr","psych","Rcpp","sequencesAnonymizer"),.combine="rbind") %dopar% {
#for(i in 1:dim(sequences.frame)[1]) {
cur.cluster_id <- as.integer(sequences.frame[i,"clusterID"])
if(is.na(cur.cluster_id)){
cur.cluster_id<-NULL
}
cur.seq_duration <- as.integer(sequences.frame[i,"seq_duration"])
cur.objectID <- sequences.frame[i,"objectID"]
cur.seqID <- as.integer(sequences.frame[i,"seqID"])
cur.startTime <- as.POSIXlt(unlist(sequences.frame[i,"startTime"]),origin = "1970-01-01")
cur.endTime <- as.POSIXlt(unlist(sequences.frame[i,"endTime"]),origin = "1970-01-01")
cur.bulk<- sample.sequence.chain(m,
p_objID=cur.objectID,
p_seqID=cur.seqID,
p_cluster_id=cur.cluster_id,
p_seq_duration=cur.seq_duration,
p_start_time=cur.startTime,
p_end_time=cur.endTime,
p_method=p.method)
if(is.null(cur.bulk) || dim(cur.bulk)[1]==0){
cur.bulk<-c("*","*","*","*")
dim(cur.bulk)<-c(1,4)
#colnames(cur.bulk)<-c("objectid","seq_id","timestamp","state_id")
cur.bulk<- to.valid.matrix(cur.bulk)
}
colnames(cur.bulk)<-c("objectid","seq_id","timestamp","state_id")
cur.bulk
}
foreach::registerDoSEQ()
synthetic.data<-as.data.frame(synthetic.data,stringsAsFactors = F)
synthetic.data<-apply(synthetic.data,2,unlist)
synthetic.data<-synthetic.data[synthetic.data[,"objectid"]!='*',]
synthetic.data[order(synthetic.data[,"objectid"],synthetic.data[,"timestamp"]),]
}
sample.by.ensemble<- function(m1,m2,m3,m4,p_func,method){
cur.m1<- ifelse(dim(m1)[1]>0,sum(m1[,"total_numeric"]),0)
cur.m2<- ifelse(dim(m2)[1]>0,sum(m2[,"total_numeric"]),0)
cur.m3<- ifelse(dim(m3)[1]>0,sum(m3[,"total_numeric"]),0)
c.w<- build_vector_of_sizes(m1_size=cur.m1,
m2_size=cur.m2,
m3_size=cur.m3,
is_inverse=method)
if(c.w==c(0,0,0) && dim(m4)[1]==0){
res<- NA
} else{
func<- match.fun(p_func)
res<- func(m1=to.valid.matrix(m1[,!colnames(m1) %in% c("total_numeric")]),
m2=to.valid.matrix(m2[,!colnames(m2) %in% c("total_numeric")]),
m3=to.valid.matrix(m3[,!colnames(m3) %in% c("total_numeric")]),
m4=m4,
w=c.w)
}
res
}
R_sample_start_info <- function(m,factor.1,factor.2,factor.3,method,p_cluster_id=NULL){
cur.support_level<- 1 #1 for factor&cluster(inclusing cluster); 2 for factor (including all); 3 for basic means
if(is.null(p_cluster_id)){
cur.support_level<- 2
}
if(cur.support_level==1){# for cluster&factor+cluster
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.start_info),cluster_id==p_cluster_id,factor==factor.1)[,c(3,4,7,8,9,10)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.start_info),cluster_id==p_cluster_id,factor==factor.2)[,c(3,4,7,8,9,10)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.start_info),cluster_id==p_cluster_id,factor==factor.3)[,c(3,4,7,8,9,10)]),
m4= to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.start_info),cluster_id==p_cluster_id)[,c(2,3,6,7,8)]),
p_func= "sample_start_info",
method)
if(is.na(res[1])){
cur.support_level<- 2
}
}
if(cur.support_level==2){# for factor+all
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.start_info),factor==factor.1)[,c(2,3,6,7,8,9)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.start_info),factor==factor.2)[,c(2,3,6,7,8,9)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.start_info),factor==factor.3)[,c(2,3,6,7,8,9)]),
m4= to.valid.matrix(m$nocluster.nofactor.start_info[,c(1,2,5,6,7)]),
p_func= "sample_start_info",
method)
if(is.na(res[1])){
cur.support_level<- 3
}
}
if(cur.support_level==3){# for basic statistics
#sampling sequence duration
if(dim(m$mean.seq_duration)[1]>0){
cur.index<- sample(1:dim(m$mean.seq_duration)[1],1,prob=m$mean.seq_duration$freq)
cur.numeric_duration <- rnorm(1,mean = unlist(m$mean.seq_duration[cur.index,"mean"]),sd = unlist(m$mean.seq_duration[cur.index,"sd"]))
cur.seq_duration <-m$mean.seq_duration[cur.index,"seq_duration"]
} else{
cur.numeric_duration <- rnorm(1,mean = m$mean.seq_duration[2],sd = m$mean.seq_duration[3])
cur.seq_duration <- m$mean.seq_duration[1]
}
#sampling start hour
if(dim(m$mean.start_hour)[1]>1){
cur.hour<- sample(m$mean.start_hour$hour,size=1,prob=m$mean.start_hour$freq)
} else if(dim(m$mean.start_hour)[1]==0){
cur.hour<- m$mean.start_hour[1]
} else{
cur.hour<- m$mean.start_hour$hour
}
res <- c(cur.hour,cur.seq_duration,cur.numeric_duration)
}
res
}
R_sample_cluster<- function(m,factor.1,factor.2,factor.3,method,prev_id=NULL){
cur.support_level<- 2 # 2 for factor (including all); 3 for basic means
starting.cluster<-F
if (is.null(prev_id)){
starting.cluster<-T
}
#first cluster
if(starting.cluster){
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.start.cluster),factor==factor.1)[,c(1,5,6)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.start.cluster),factor==factor.2)[,c(1,5,6)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.start.cluster),factor==factor.3)[,c(1,5,6)]),
m4= to.valid.matrix(as.data.frame(m$nofactor.start.cluster)[,c(1,4)]),
p_func= "sample_starting_state",
method)
if(is.na(res[1])){
cur.support_level<- 3
}
if(cur.support_level==3){# for basic statistics
if (dim(m$common_clusters)[1]>1){
cur.sampled.cluster <- sample(unlist(unique(m$common_clusters[,1])),size=1)
} else if(dim(m$common_clusters)[1]==1){
cur.sampled.cluster<- m$common_clusters[1]
} else {
cur.sampled.cluster <- NA
}
res <- c(cur.sampled.cluster,as.numeric(m$mean.cluster.tbe))
}
} else{ #cluster transition
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.cluster.transition),factor==factor.1,cluster_id==prev_id)[,c(3,6,7,8,9)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.cluster.transition),factor==factor.2,cluster_id==prev_id)[,c(3,6,7,8,9)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.cluster.transition),factor==factor.3,cluster_id==prev_id)[,c(3,6,7,8,9)]),
m4= to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.cluster.transition),cluster_id==prev_id)[,c(2,5,6,7)]),
p_func= "sample_transition",
method)
if(is.na(res[1])){
cur.support_level<- 3
}
if(cur.support_level==3){# for basic statistics
#sample cluster
cur.near <- dplyr::filter(
as.data.frame(to.valid.matrix(m$common_clusters)),"cluster_id.x" ==prev_id)[,c(2,5)]
if(dim(cur.near)[1]>1){
c_cluster_id <- sample(as.character(cur.near[,1]),size=1,prob=as.numeric(cur.near[,2]))
} else if(dim(cur.near)[1]==1){
c_cluster_id <-as.character(cur.near[1])
} else{
c_cluster_id<- NA
}
#add tbe
res <- c(c_cluster_id,as.numeric(m$mean.cluster.tbe))
}
}
res
}
R_sample_state<- function(m,p_cluster_id,factor.1,factor.2,factor.3,method,prev_id=NULL){
starting.state<-F
if (is.null(prev_id)){
starting.state<-T
}
cur.support_level<- 1 #1 for factor&cluster(inclusing cluster); 2 for factor (including all); 3 for basic means
if(is.null(p_cluster_id)){
cur.support_level<- 2
}
if(starting.state){
if(cur.support_level==1){# for cluster&factor+cluster
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.starting.state),cluster_id==p_cluster_id,factor==factor.1)[,c(3,6,7)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.starting.state),cluster_id==p_cluster_id,factor==factor.2)[,c(3,6,7)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.starting.state),cluster_id==p_cluster_id,factor==factor.3)[,c(3,6,7)]),
m4= to.valid.matrix(dplyr::filter(as.data.frame(m$nofactor.starting.state),cluster_id==p_cluster_id)[,c(2,5)]),
p_func= "sample_starting_state",
method)
if(is.na(res[1])){
cur.support_level<- 2
}
}
if(cur.support_level==2){# for factor+all
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.starting.state),factor==factor.1)[,c(2,5,6)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.starting.state),factor==factor.2)[,c(2,5,6)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.starting.state),factor==factor.3)[,c(2,5,6)]),
m4= to.valid.matrix(as.data.frame(m$nocluster.nofactor.starting.state[,c(1,4)])),
p_func= "sample_starting_state",
method)
if(is.na(res[1])){
cur.support_level<- 3
}
}
if(cur.support_level==3){# for basic statistics
if(dim(m$common_states)[1]>1){
cur.state <-sample(unlist(unique(m$common_states[,1])),1)
} else if(dim(m$common_states)[1]==1){
cur.state <-unlist(m$common_states[1])
} else {
cur.state <- -1
}
res<- cur.state
}
} else{ # for state transition
if(cur.support_level==1){# for cluster&factor+cluster
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.1.transition),cluster_id==p_cluster_id,factor==factor.1,state_id==prev_id)[,c(4,7,8,9,10)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.2.transition),cluster_id==p_cluster_id,factor==factor.2,state_id==prev_id)[,c(4,7,8,9,10)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$factor.3.transition),cluster_id==p_cluster_id,factor==factor.3,state_id==prev_id)[,c(4,7,8,9,10)]),
m4= to.valid.matrix((dplyr::filter(as.data.frame(m$nofactor.transition),cluster_id==p_cluster_id,state_id==prev_id))[,c(3,6,7,8)]),
p_func= "sample_transition",
method)
if(is.na(res[1])){
cur.support_level<- 2
}
}
if(cur.support_level==2){# for factor+all
res<- sample.by.ensemble(m1= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.1.transition),factor==factor.1,state_id==prev_id)[,c(3,6,7,8,9)]),
m2= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.2.transition),factor==factor.2,state_id==prev_id)[,c(3,6,7,8,9)]),
m3= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.factor.3.transition),factor==factor.3,state_id==prev_id)[,c(3,6,7,8,9)]),
m4= to.valid.matrix(dplyr::filter(as.data.frame(m$nocluster.nofactor.transition),state_id==prev_id)[,c(2,5,6,7)]),
p_func= "sample_transition",
method)
if(is.na(res[1])){
cur.support_level<- 3
}
}
if(cur.support_level==3){# for basic statistics
#sample state
cur.near <- dplyr::filter(as.data.frame(to.valid.matrix(m$common_states)),"state_id.x" ==prev_id)[,c(2,5)]
if(dim(cur.near)[1]==0){
cur.state <- -1
} else if(dim(cur.near)[1]==1){
cur.state <-unlist(cur.near[1])
} else {
if(dim(cur.near)[1]>1){
c.state_id <- sample(cur.near[,1],size=1,prob=cur.near[,2])
} else if(dim(cur.near)[1]==1){
c.state_id<-cur.near[1]
} else{
c.state_id <- -1
}
}
#sample tbe
if(!is.null(p_cluster_id)){
cur.tbe<- dplyr::filter(as.data.frame(m$mean.transition.tbe),cluster_id==p_cluster_id)[2]
} else{
cur.tbe<- mean(m$mean.transition.tbe[,2])
}
res <- c(c.state_id,as.numeric(cur.tbe))
}
}
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.