R/IRASD.R

Defines functions IRASD

Documented in IRASD

#' Results for Irregular Activity Sequence Detection
#'
#' @param newseqdf Dataframe of new sequences to be tested: first column time
#' stamps in POSIXct format, second column sequence vector in factor format.
#' @param seqdf Dataframe of regular sequences: first column time
#' stamps in POSIXct format, second column sequence vector in factor format.
#' @param timetype ime window for separating the dataset, i.e. day = '%Y-%m-%d', week = '%Y-%W' and month '%Y-%m'.
#' @param k Maximum length of sequence silhouette.
#' @param beta Score parameter.
#' @param lam Score parameter.
#' @param percentpwr Significance level.
#' @param bndwidth Bandwidth for the KDE (refer to stats::density function for options).
#' @param kern Kernel of choice for the KDE (refer to stats::density function for options).
#'
#' @return A list with the results of the test, showing which SQS have flagged as irregular.
#' @export
#'
#' @details
#' Ensure lam(k-1)k/2<beta to comply with the method constraints.
#'
#' @examples
#' seqdf<-data.frame(Time=c("2020-01-01 00:10:09", "2020-01-01 01:12:34" , "2020-01-02 06:38:09",
#' "2020-01-02 07:21:51"),Cat=as.factor(c('A','B','A','C')))
#' newseqdf<-data.frame(Time=c("2020-01-03 01:30:20", "2020-01-03 04:19:14" , "2020-01-03 06:51:29"),
#' Cat=as.factor(c('A','B','A')))
#' #For daily data:
#' IRASD(newseqdf, seqdf, '%Y-%m-%d',2, 1, 0.5,0.05,'nrd0','gaussian')
IRASD <- function(newseqdf, seqdf, timetype, k, beta, lam, percentpwr, bndwidth, kern) {
  if(class(newseqdf)!='data.frame'){
    stop('newseqdf is not a data frame. Try data.frame()')
  }
  if(class(seqdf)!='data.frame'){
    stop('seqdf is not a data frame. Try data.frame()')
  }
  if(dim(seqdf)[2]!=2){
    stop('newseqdf does not have two columns. See documentation for instructions.')
  }
  if(dim(newseqdf)[2]!=2){
    stop('seqdf does not have two columns. See documentation for instructions.')
  }
  if(class(seqdf[,2])!='factor'){
    stop('newseqdf second column is not a factor. Try as.factor()')
  }
  if(class(newseqdf[,2])!='factor'){
    stop('seqdf second column is not a factor. Try as.factor()')
  }
  reglev<-levels(seqdf[,2])
  newlev<-levels(newseqdf[,2])
  catvec<-unique(c(reglev,newlev))
  if(length(catvec)==0){
    stop('No categories found, check column two of your dataframes and the instructions in the documentation.')
  }
  if(length(catvec)==1){
    warning('Only one category found!')
  }
  seq1<-seqdf
  colnames(seq1)<-c('Time','Cat')
  probabilities<-seq1 %>% dplyr::group_by(Cat) %>% dplyr::tally() %>% dplyr::mutate(freq=n/sum(n)) %>% dplyr::select(freq) %>% unlist(use.names = FALSE)
  seqlist<-SeqList(seqdf,'%Y-%m-%d')
  if(length(seqlist)==1){
    stop('Need more than one regular sequence for analysis.')
  }
  if(length(seqlist)<14){
    warning('For a better false positive rate, more regular sequences are needed. Over 30 is preferred. ')
  }
  newseqlist<-SeqList(newseqdf,'%Y-%m-%d')
  dimseq<-sapply(seqlist,function(x) dim(x)[1])
  dimnewseq<-sapply(newseqlist,function(x) dim(x)[1])
  if(!all(dimseq>length(catvec))){
    warning('Inconsistent results may occur with sequences smaller than the number of categories.')
  }

  numtestseq <- length(newseqlist)
  allsqs <- AllSQS(catvec, k)
  sqslist <- SQSList(seqlist, k)
  sqsindex <- SQSIndex(allsqs, sqslist)
  regseqscore <- RegSeqScore(seqlist, allsqs, sqsindex, catvec, probabilities, k, beta, lam)
  newsqslist <- SQSList(newseqlist, k)
  newsqsindex <- SQSIndex(allsqs, newsqslist)
  scoretable <- vector("list", numtestseq)
  for (i in 1:numtestseq) {
    scoretable[[i]] <- ScoreTable(regseqscore, newseqlist[[i]], seqlist, allsqs, newsqsindex[,i], sqsindex, catvec, probabilities, k, beta, lam)
  }

  dataset <- DensEst(scoretable, percentpwr, bndwidth, kern)
  dataset <- dataset[[1]]

  return(dataset)
}
jgillam13/IRASD documentation built on Feb. 10, 2021, 9:38 a.m.