kdpec: K-Dimensional Partitioned Episodic Clustering

Description Usage Arguments Value Author(s) Examples

View source: R/kdpec.R

Description

Overlapping or sudo-overlapping observation clustering within k-dimensional partitions.

Usage

1
2
kdpec(id, kdim, startdate, enddate, 
      slack = 0, restartindex = FALSE)

Arguments

id

A vector or data.frame representing a unique identifier or key.

kdim

A vector or data.frame representing a "k-dimensional" index across which clusters cannot be formed.

startdate

A date vector of each observation's start date.

enddate

A date vector of each observation's end date.

slack

a positive numeric value representing the gap in days over which a cluster can be formed. the default is 0.

restartindex

If TRUE, the squential cluster IDs will restart at 1 within each k-dimensional partition.

Value

kdpec returns a data.frame with the column or group of columns used to uniquely identify each observation along with the following:

kdimidx

K-Dimensional Index. A sequential ID indexing each k-dimensional set.

episode

A sequential ID indexing each episodic cluster.

Author(s)

Robert P. Bronaugh

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
## Not run: 
  # merge a patient's claims for a specific diagnosis together:
  # use kdim to prevent episode clustering across patient and diagnosis
  # (i.e.,) the combination of PatientID and Diagnosis become a partition
  # across which episodic clusters cannot be formed). 
  # restartindex = TRUE starts the episode index over at 1 for each k-dimensional partition
  data(epclaims)
  attach(epclaims)
  require(sqldf)
  kd = kdpec(id = epclaims$ClaimNumber,kdim = cbind(epclaims$PatientID,epclaims$Diagnosis)
            ,startdate = epclaims$ServiceStart,enddate = epclaims$ServiceEnd
            ,restartindex=TRUE)
  
  # print the id, k-dimensional partition index (kdimidx), and the episodes
  print(kd)
  
  # restartindex = FALSE 
  kd = kdpec(epclaims$ClaimNumber,cbind(epclaims$PatientID,epclaims$Diagnosis),
             epclaims$ServiceStart,epclaims$ServiceEnd,restartindex=FALSE)
  print(kd)
  
  # merge episode indexes with original data 
  ep.2 = sqldf("SELECT  ep.PatientID
               ,ep.ClaimNumber
               ,ep.Diagnosis
               ,ep.ServiceStart
               ,ep.ServiceEnd
               ,kd.kdimidx
               ,kd.episode
               FROM    epclaims ep
               INNER JOIN kd
               ON ep.ClaimNumber = kd.id")
  
  
  # plot time spans of original records
  washcol = wash("gry",0.8)
  for (i in 1:nrow(epclaims)) {
    if (i ==1) {
      plot(c(epclaims$ServiceStart[i],epclaims$ServiceEnd[i]),rep(i,2)
           ,type="l", col = washcol, lwd = 3
           ,xlim = c(min(epclaims$ServiceStart)-3
                     ,max(epclaims$ServiceStart)+3)
           ,ylim = c(0,15)
           ,xlab = "length of service"
           ,ylab = "claim record index")
    } else if (i < 6) {
      lines(c(epclaims$ServiceStart[i],epclaims$ServiceEnd[i])
            ,rep(i,2),col = washcol, lwd = 3) 
    } else if ( i < 10) {
      lines(c(epclaims$ServiceStart[i],epclaims$ServiceEnd[i])
            ,rep(i,2),col = washcol, lwd = 3) 
    } else if (i == 10) {
      lines(c(epclaims$ServiceStart[i],epclaims$ServiceEnd[i])
            ,rep(i,2),col = washcol, lwd = 3) 
    } else {
      lines(c(epclaims$ServiceStart[i],epclaims$ServiceEnd[i])
            ,rep(i,2),col = washcol, lwd = 3) 
    }
  }
  
  # plot time spans of original records.  Color by assigned k-dim index
  washcol = c(wash("blu1",1),wash("grn2",1),wash("org",1),wash("red1",1))
  for (i in 1:nrow(ep.2)) {
    if (i ==1) { 
      plot(c(ep.2$ServiceStart[i],ep.2$ServiceEnd[i]),rep(i,2)
           ,type="l", col = washcol[ep.2$kdimidx[i]], lwd = 3
           ,xlim = c(min(ep.2$ServiceStart)-3,max(ep.2$ServiceStart)+3)
           ,ylim = c(0,15)
           ,xlab = "length of service"
           ,ylab = "claim record index")
    } else if (i < 6) {
      lines(c(ep.2$ServiceStart[i],ep.2$ServiceEnd[i]),rep(i,2)
            ,col = washcol[ep.2$kdimidx[i]], lwd = 3) 
    } else if ( i < 10) {
      lines(c(ep.2$ServiceStart[i],ep.2$ServiceEnd[i]),rep(i,2)
            ,col = washcol[ep.2$kdimidx[i]], lwd = 3) 
    } else if (i == 10) {
      lines(c(ep.2$ServiceStart[i],ep.2$ServiceEnd[i]),rep(i,2)
            ,col = washcol[ep.2$kdimidx[i]], lwd = 3) 
    } else {
      lines(c(ep.2$ServiceStart[i],ep.2$ServiceEnd[i]),rep(i,2)
            ,col = washcol[ep.2$kdimidx[i]], lwd = 3) 
    }
  }
  
  # merge records to get the full length of each episode
  ep.episodes = data.frame("kdimidx" = tapply(ep.2$kdimidx,ep.2$episode,min),
                           "episodeStart" = as.Date(tapply(ep.2$ServiceStart
                              ,ep.2$episode,min),origin = "1970-01-01"),
                           "episodeEnd" = as.Date(tapply(ep.2$ServiceEnd
                              ,ep.2$episode,max),origin = "1970-01-01"))
  
  # plot the length of service of each episode.  kdimidx, not claim 
  # records, are on the y axis colors represent each kdimidx
  washcol = c(wash("blu1",1),wash("grn2",1),wash("org",1),wash("red1",1))
  i = 1
  for (i in 1:nrow(ep.episodes)) {
    if (i ==1) {
      plot(c(ep.episodes$episodeStart[i],ep.episodes$episodeEnd[i])
            ,rep(ep.episodes$kdimidx[i],2)
           ,type="l", col = washcol[ep.episodes$kdimidx[i]], lwd = 3
           ,xlim = c(min(ep.2$ServiceStart)-3,max(ep.2$ServiceStart)+3)
           ,ylim = c(0,4)
           ,xlab = "length of episode"
           ,ylab = "k-dimensional index")
    } else if (i < 6) {
      lines(c(ep.episodes$episodeStart[i],ep.episodes$episodeEnd[i])
            ,rep(ep.episodes$kdimidx[i],2)
            ,col = washcol[ep.episodes$kdimidx[i]], lwd = 3) 
    } else if ( i < 10) {
      lines(c(ep.episodes$episodeStart[i],ep.episodes$episodeEnd[i])
            ,rep(ep.episodes$kdimidx[i],2)
            ,col = washcol[ep.episodes$kdimidx[i]], lwd = 3) 
    } else if (i == 10) {
      lines(c(ep.episodes$episodeStart[i],ep.episodes$episodeEnd[i])
            ,rep(ep.episodes$kdimidx[i],2)
            ,col = washcol[ep.episodes$kdimidx[i]], lwd = 3) 
    } else {
      lines(c(ep.episodes$episodeStart[i],ep.episodes$episodeEnd[i])
            ,rep(ep.episodes$kdimidx[i],2)
            ,col = washcol[ep.episodes$kdimidx[i]], lwd = 3) 
    }
  }
  detach(epclaims)

## End(Not run)
  

phalen documentation built on May 29, 2017, 4:22 p.m.

Related to kdpec in phalen...