R/makeCOVbin.R

#' make the bins with equal number of observations  
#' or using user-defined breaks
#'
#' @title Bin-maker
#' @param COV.data numeric vector that need to make bins
#' @param N.covbin the number of bins
#' @param breaks.data user-defined breaks
#' @return information of the binning with summary
#' @export
#' @examples
#' data(origdata)
#' makeCOVbin(origdata$TIME,7)

makeCOVbin<-function(COV.data,N.covbin=NULL,breaks.data=NULL){   
   find.LU<-function(bin.ID){
      LU<-NULL
      for(i in 1:length(bin.ID)){
         LU<-rbind(LU,as.numeric(unlist(strsplit(unlist(strsplit(
                   unlist(strsplit(unlist(strsplit(
                   unlist(strsplit(as.character(bin.ID[i]),",")),"]")),
                                 '\\[')),"\\(")),"\\("))))    
      }
      colnames(LU)<-c("Lower","Upper")
      return(LU) 
   }

   data.temp<-data.frame(COV.data=COV.data)
   if(!is.null(breaks.data)){
      range.temp<-range(COV.data)
      if(min(breaks.data)>range.temp[1]){
         breaks.data[1]<-range.temp[1]-(range.temp[2]-range.temp[1])*0.5
      } else if(max(breaks.data)<range.temp[2]){
         breaks.data[length(breaks.data)]<-range.temp[2]+
                                         (range.temp[2]-range.temp[1])*0.5      
      }
      cut.temp<-cut(COV.data, breaks=breaks.data) 
      tab<-ddply(data.temp,.(cut.temp), summarize,
                       mid.COV=round(mean(COV.data, na.rm=T),2),.drop=FALSE)
        
   } else{
      if(N.covbin<length(table(COV.data))){
         cutpoints<-quantile(COV.data,(0:N.covbin)/N.covbin)
         temp.id<-which(diff(cutpoints)==0)
         if(length(temp.id)!=0)
            cutpoints<-cutpoints[-temp.id]
         cut.temp<-cut(COV.data,cutpoints,include.lowest=TRUE)
         if(sum(table(cut.temp)==0)!=0){
            temp.id<-which(table(cut.temp)==0)
            cutpoints<-cutpoints[-temp.id]
            cut.temp<-cut(COV.data,cutpoints,include.lowest=TRUE)
         }
         tab<-ddply(data.temp,.(cut.temp),summarize, 
                      med.COV=round(median(COV.data, na.rm=T),2),.drop=FALSE)
      } else{
         cut.temp.id<-as.numeric(names(table(COV.data)))
         temp.diff<-diff(cut.temp.id)/2
         cut.temp.id<-c(cut.temp.id[1]-temp.diff[1],
                        cut.temp.id[-length(cut.temp.id)]+temp.diff,
                        cut.temp.id[length(cut.temp.id)]+temp.diff[length(temp.diff)])
         cut.temp<-cut(COV.data,cut.temp.id)     
         tab<-ddply(data.temp,.(cut.temp),summarize,
                   med.COV=round(median(COV.data, na.rm=T),2),.drop=FALSE)      
      }
   } 
   LU.temp<-find.LU(tab[,1])
   colnames(LU.temp)<-c("lower.COV","upper.COV")
   mid.LU<-apply(LU.temp,1,mean)
   tab<-data.frame(tab,n.bin=c(table(cut.temp)),LU.temp,mid.LU=mid.LU)
   return(list(COV.bin=cut.temp,COV.bin.summary=tab))
}
EK-Lee/asVPC documentation built on May 6, 2019, 3:09 p.m.