R/SPTMData.R

Defines functions SPTMData

Documented in SPTMData

#' Conversion from a long dataframe to a sptm object
#'
#' This function allows you to calculate a Gaussian process kernel
#' @param df.obs entry points
#' @param frequency frequency
#' @keywords data conversion
#' @export
#' @examples
#' data("wq_analysis_week2")
#' SPTMData(wq.raw.obs, frequency = "quarter")
SPTMData = function(df.obs, agg = "month", seasonality = "month", method = "loess", debug=F, degFree = 10){

  SiteID = levels(df.obs$ID)

  if(agg == "year"){
    df.obs$date.temp = paste0(format(df.obs$date, "%Y"), "-01-01")
    temp = aggregate(obs ~ date.temp + ID, data = df.obs, FUN = mean)
    colnames(temp)[1] <- "date"
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "year")
    dfd = data.frame(date=dates)
  }
  if(agg == "6-months"){
  temp = aggregate(list(obs = df.obs$obs),
            list(date = cut(df.obs$date, "6 months"), ID = df.obs$ID),
            mean)
  dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "6 months")
  dfd = data.frame(date=dates)
  }
  if(agg == "4-months"){
    temp = aggregate(list(obs = df.obs$obs),
                     list(date = cut(df.obs$date, "4 months"), ID = df.obs$ID),
                     mean)
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "4 months")
    dfd = data.frame(date=dates)
  }
  if(agg == "quarter"){
    df.obs$date.temp = as.yearqtr(df.obs$date)
    temp = aggregate(obs ~ date.temp + ID, data = df.obs, FUN = mean)
    colnames(temp)[1] <- "date"
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "quarter")
    dfd = data.frame(date=dates)
  }
  if(agg == "2-months"){
    temp = aggregate(list(obs = df.obs$obs),
                     list(date = cut(df.obs$date, "2 months"), ID = df.obs$ID),
                     mean)
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "2 months")
    dfd = data.frame(date=dates)
  }
  if(agg == "month"){
    df.obs$date.temp = as.yearmon(df.obs$date)
    temp = aggregate(obs ~ date.temp + ID, data = df.obs, FUN = mean)
    colnames(temp)[1] <- "date"
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "month")
    dfd = data.frame(date=dates)
  }
  if(agg == "week"){
    df.obs$date.temp = format(df.obs$date, "%Y-%U %u")
    temp = aggregate(obs ~ date.temp + ID, data = df.obs, FUN = mean)
    temp$date = format(strptime(temp$date.temp, "%Y-%U %u"), "%Y-%m-%d")
    temp$date.temp <- NULL
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "week")
    dfd = data.frame(date=dates)
  }
  if(agg == "day"){
    df.obs$date.temp = format(df.obs$date, "%Y-%m-%d")
    temp = aggregate(obs ~ date.temp + ID, data = df.obs, FUN = mean)
    colnames(temp)[1] <- "date"
    dates = seq(min(as.Date(temp$date)), max(as.Date(temp$date)), by = "day")
    dfd = data.frame(date=dates)
  }

  temp$date = as.Date(temp$date)
  dfd$date= as.Date(dfd$date)
  y=NULL
  for(i in SiteID){
    df.m = merge(dfd, temp[temp$ID %in% i,], by="date", all.x = TRUE)
    df.m[which(is.na(df.m[,3])),2] <- i
    y = rbind(y, df.m)
  }

  y = y[order(y$date),]


  if(debug==T){browser()}

  basis = NULL
  list.idx = NULL

  if(method == "loess"){

  season = array(NA, dim = c(length(dates), length(seasonality) + 2, length(SiteID)),
                   dimnames=list(as.character(dates), c("intercept","trend",seasonality), SiteID))

  idx.n = which(!(colnames(season) %in% c("intercept","trend","year","6-months","4-months","quarter","month","week","day")))

  if(length(idx.n)>0){season <- season[,-idx.n,]}

  for(i in 1:length(SiteID)){

    Y = y[which(y$ID %in% SiteID[i]),]

    freq.gas = freqFun(agg, seasonality)

    if(agg == "year"){
      start.date = as.numeric(format(min(dates),"%Y"))
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(format(dates, "%Y"))
    }
    if(agg == "6-months"){
      start.date = as.numeric(format(min(dates),"%Y")) + round((as.numeric(format(min((dates)),"%m"))-1)/6)/2 -.5
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(format((dates),"%Y")) + round((as.numeric(format(((dates)),"%m"))-1)/6)/2 -.5
    }
    if(agg == "4-months"){
      start.date = as.numeric(format(min(dates),"%Y")) + (as.numeric(format(min((dates)),"%m"))-1)/12
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(format((dates),"%Y")) + (as.numeric(format(min((dates)),"%m"))-1)/12
    }
    if(agg == "quarter"){
      start.date = as.numeric(format(min(dates),"%Y")) + (as.numeric(format(min(as.yearqtr(dates)),"%q"))-1)/4
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(as.yearqtr(dates))
    }
    if(agg == "month"){
      start.date = as.numeric(format(min(dates),"%Y")) + (as.numeric(format(min(dates),"%m"))-1)/12
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(as.yearmon(dates))
    }
    if(agg == "week"){
      dates = as.Date(dates)
      start.date = as.numeric(format(min(dates),"%Y")) + (as.numeric(format(min(dates),"%U"))-1)/52
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(format(dates, "%Y")) + as.numeric(format(dates,"%U")) / 52
    }
    if(agg == "day"){
      dates = as.Date(dates)
      start.date = as.numeric(format(min(dates),"%Y")) + (as.numeric(format(min(dates),"%j"))-1)/366
      gas <- ts((Y$obs),freq=freq.gas, start=start.date)
      dates.i = as.numeric(format(dates, "%Y")) + as.numeric(format(dates,"%m")) / 12 + as.numeric(format(dates,"%d")) / 31
    }



    if (freq.gas < 2 || as.integer(length(gas)) <= 2 * freq.gas){
      dim.j=1
      idx.season = match(round(index(gas),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){
        if(length(idx.season)- sum(is.na(as.matrix(gas)))==1){
          season[idx.season,dim.j,i] <- as.matrix(gas)
        }else{
          season[idx.season,dim.j,i] <- as.matrix(gas - mean(gas, na.rm=T))
        }
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = 0
    }else{
    dim.j = 1
    if("decade"%in% seasonality){
      s.win = round(diff(as.numeric(format(range(dates),"%Y")))/10)
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("5-years"%in% seasonality){
      s.win = round(diff(as.numeric(format(range(dates),"%Y")))/5)
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("2-years"%in% seasonality){
      s.win = round(diff(as.numeric(format(range(dates),"%Y")))/2)
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("year"%in% seasonality){
      s.win = diff(as.numeric(format(range(dates),"%Y")))
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("quarter"%in% seasonality){
      s.win = diff(as.numeric(format(range(dates),"%Y")))*4
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("month"%in% seasonality){
      s.win = diff(as.numeric(format(range(dates),"%Y")))*12
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("week"%in% seasonality){
      s.win = diff(as.numeric(format(range(dates),"%Y")))*52
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    if("day"%in% seasonality){
      s.win = diff(as.numeric(format(range(dates),"%Y")))*365
      loess.ts = stl(gas, s.window = s.win, na.action = na.approx)
      idx.season = match(round(index(loess.ts$time.series),3),round(dates.i,3))
      dim.j = dim.j + 1
      if(dim.j==2){season[idx.season,dim.j,i] <- loess.ts$time.series[,2] - mean(loess.ts$time.series[,2],na.rm=T)
      dim.j = dim.j + 1}
      season[idx.season,dim.j,i] = loess.ts$time.series[,1]
      start.date = index(loess.ts$time.series)[1]
      gas <- ts(loess.ts$time.series[,3],freq=freq.gas, start=start.date)
    }
    }
    season[,1,i] <- 1
  }

  }
  if(method == "splines"){

  season = array(NA, dim = c(length(dates), length(seasonality) + 2, length(SiteID)),
                   dimnames=list(as.character(dates), c("intercept","trend",seasonality), SiteID))

  idx.n = which(!(colnames(season) %in% c("intercept","trend","year","6-months","4-months","quarter","month","week","day")))

  if(length(idx.n)>0){season <- season[,-idx.n,]}

    for(i in 1:length(SiteID)){

      # i = 1

      Y = y[which(y$ID %in% SiteID[i]),]

        bbF = matrix(1, ncol = 1, nrow = length(Y$date))
        list.idx = list(1)
        #if("trend" %in% seasonality){
          bbTemp = bs(as.numeric(Y$date)/1000,df = degFree)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        #}
        if("decade"%in% seasonality){
          datef = as.numeric(format(Y$date, "%Y")) + (as.numeric(format(Y$date, "%m"))-1)/12 + (as.numeric(format(Y$date, "%d"))-1)/31
          bbTemp = bs(datef,df = diff(range(as.numeric(format(Y$date, "%Y"))))/10)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("5-years"%in% seasonality){
          datef = as.numeric(format(Y$date, "%Y")) + (as.numeric(format(Y$date, "%m"))-1)/12 + (as.numeric(format(Y$date, "%d"))-1)/31
          bbTemp = bs(datef,df = diff(range(as.numeric(format(Y$date, "%Y"))))/5)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("2-years"%in% seasonality){
          datef = as.numeric(format(Y$date, "%Y")) + (as.numeric(format(Y$date, "%m"))-1)/12 + (as.numeric(format(Y$date, "%d"))-1)/31
          bbTemp = bs(datef,df = diff(range(as.numeric(format(Y$date, "%Y"))))/2)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("year"%in% seasonality){
          datef = as.numeric(format(Y$date, "%Y")) + (as.numeric(format(Y$date, "%m"))-1)/12 + (as.numeric(format(Y$date, "%d"))-1)/31
          bbTemp = bs(datef,df = diff(range(as.numeric(format(Y$date, "%Y"))))/1)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("quarter"%in% seasonality){
          datef = as.numeric(format(Y$date, "%m")) + as.numeric(format(Y$date, "%d"))/31
          bbTemp = bs(datef,df = 4)
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("month"%in% seasonality){
          datef = as.numeric(format(Y$date, "%m")) + as.numeric(format(Y$date, "%d"))/31
          bbTemp = bs(datef,df = length(unique(as.numeric(format(Y$date, "%m")))))
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("week"%in% seasonality){
          datef =as.numeric(format(Y$date, "%W")) + as.POSIXlt(Y$date)$wday / 7
          bbTemp = bs(datef,df = length(unique(as.numeric(format(Y$date, "%W")))))
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }
        if("day"%in% seasonality){
          bbTemp = bs(as.numeric(format(Y$date, "%j")),df = length(unique(as.numeric(format(Y$date, "%j")))))
          tmp <- cor(bbTemp)
          tmp[upper.tri(tmp)] <- 0
          diag(tmp) <- 0
          bbTemp <- bbTemp[,!apply(tmp,2,function(x) any(x > 0.75))]
          bbF = cbind(bbF,bbTemp)
          list.idx = c(list.idx, list((ncol(bbF) - ncol(bbTemp)+1):ncol(bbF)))
        }

        idx.na = which(!is.na(Y$obs))
        w = ginv(t(bbF[idx.na,]) %*% bbF[idx.na,]) %*% t(bbF[idx.na,]) %*% matrix(Y$obs[idx.na], ncol=1)
        var.w = ginv(t(bbF[idx.na,]) %*% bbF[idx.na,]) * mean((Y$obs-bbF %*% w)^2, na.rm=T)

        for(j in 1:length(list.idx)){
          if(length(list.idx[[j]]) == 1){season[,j,i] = bbF[,list.idx[[j]]] * w[list.idx[[j]]]}
          if(length(list.idx[[j]]) > 1){season[,j,i] = bbF[,list.idx[[j]]] %*% w[list.idx[[j]]]}
          delta = diff(range(season[,j,i]))
          if(delta == 0){delta = season[,j,i] = 1}else{
            season[,j,i] = (season[,j,i] - min(season[,j,i]))/delta*2 - 1
          }
        }

    }

  basis = bbF
  }

  sptm.data = list(obs.data = y, season = season, basis = basis, list.idx = list.idx)
  class(sptm.data) <- 'sptm'
  return(sptm.data)
}
ick003/SpTMixture documentation built on May 18, 2019, 2:32 a.m.