R/coscilibrary.R

#' Rank the p features in an n by p design matrix
#'
#' Ranks the p features in an n by p design matrix
#' where n represents the sample size and p is the number of features.
#'
#' @param dat n by p data matrix
#' @param min.alpha the smallest threshold (typically set to 0)
#' @param small.perturbation a small positive number to remove ties. Default value is 10^(-6)
#'
#' @return  a p vector of scores
#'
#' @details Uses the univariate merging algorithm \code{\link{bmt}} and produces a score
#'     for each feature that reflects its relative importance for clustering.
#'
#' @seealso \code{\link{bmt}},\code{\link{cosci_is_select}}
#'
#' @examples
#' \donttest{
#' library(fusionclust)
#' set.seed(42)
#' noise<-matrix(rnorm(49000),nrow=1000,ncol=49)
#' set.seed(42)
#' signal<-c(rnorm(500,-1.5,1),rnorm(500,1.5,1))
#' x<-cbind(signal,noise)
#' scores<- cosci_is(x,0)
#' }
#' \dontshow{
#' library(fusionclust)
#' set.seed(42)
#' x<-matrix(rnorm(10,0,1),5,2)
#' scores<- cosci_is(x,0)
#' }
#'
#' @references
#' \enumerate{
#' \item  Banerjee, T., Mukherjee, G. and Radchenko P., Feature Screening in
#' Large Scale Cluster Analysis, Journal of Multivariate Analysis,
#' Volume 161, 2017, Pages 191-212
#' \item P. Radchenko, G. Mukherjee, Convex clustering via l1 fusion penalization,
#'  J. Roy. Statist, Soc. Ser. B (Statistical Methodology) (2017)
#'  doi:10.1111/rssb.12226.
#' }
#'
#' @export

cosci_is<-function(dat,min.alpha,small.perturbation=10^(-6)){

  p <- ncol(dat)
  dim.score<- matrix(0,nrow=p,ncol=3)
  for (j in 1:p){
    #remove ties
    x<- dat[,j]
    set.seed(1)
    nn<- rnorm(length(x))*small.perturbation
    x<-x+nn
    x<-sort(x);
    n=length(x);
    #initializiations
    C=n;#representing number of clusters by C
    ind<-1:C;#cluster index of observations
    ind.along.path<-c();
    l.old<-0;
    lambda<-0;
    merge.points<-c();
    no.of.clusters<-c();
    probability<-c();
    boundaries<-c();
    freq<-rep(1,n);
    centroids<-x;
    unique.ind<-ind;
    counter=3;
    state<- 1
    while(state==1){
      l.new<-(centroids[2:C]-centroids[1:(C-1)])/(freq[2:C]+freq[1:(C-1)]);
      l.new.mod<-l.new[l.new>=l.old];
      l.old<-min(l.new.mod);
      i=match(l.old,l.new);
      merge.points<-c(merge.points,C);
      a.2<-min(x[(ind==unique.ind[i+1])]);
      a.1<-max(x[(ind==unique.ind[i])])
      probability<-rbind(probability,c(freq[i],freq[i+1])/n);
      centroids[i]<-(centroids[i]*freq[i]+centroids[i+1]*freq[i+1])/(freq[i]+freq[i+1]);
      centroids<-centroids[-(i+1)];
      freq[i]=freq[i]+freq[i+1];
      freq<-freq[-(i+1)];
      ind[(ind==unique.ind[i+1])]<-unique.ind[i];
      unique.ind<-unique.ind[-(i+1)];
      lambda<-c(lambda,l.old);
      C=C-1;
      if (C==1){
        #paste('We only have one cluster if the penalization parameter exceeds');
        l<-list(prob=probability);
        state<-0;
      }
    }
    prob1<- as.matrix(l$prob)
    score<- as.matrix(cbind(apply(prob1,1,min),apply(prob1,1,sum)))
    grp<- matrix(0,nrow=nrow(prob1),ncol=1)
    idx <- c(score[,1]>min.alpha & score[,2]>=0.5)
    grp[idx]<-score[idx,1]
    m<- max(grp)
    l<- min(grp)
    dim.score[j,]<- cbind(max(grp[idx]),mean(grp[idx]),sum(idx))

  }
  dim.score[is.nan(dim.score)]<-0

  return(dim.score[,1])
}

#' Use a data driven approach to select the features
#'
#' Once you have the feature scores from \code{\link{cosci_is}}, you can select the features
#' \enumerate{
#' \item based on a pre-defined threshold,
#' \item using table A.10 in the paper[1] to determine an appropriate threshold or,
#' \item using a data driven approach described in the references to select the features
#' and obtain an implicit threshold value.
#' }
#' cosci_is_select implements option 3.
#'
#' @import bbmle
#' @importFrom graphics hist
#' @importFrom stats dbeta
#' @importFrom stats glm
#' @importFrom stats pbeta
#' @importFrom stats rnorm
#'
#' @param score a p vector of scores
#' @param gamma what proportion of the p features is noise? If your sample size n
#' is smaller than 100, setting gamma = 0.85 is recommended. Otherwise set gamma = 0.9
#'
#' @return  a vector of selected features
#'
#' @details Converts the problem of screening out features with lower scores into a
#'     problem in large scale multiple testing and uses the procedure described in
#'     reference [2] to determine the signal features.
#'
#' @seealso \code{\link{cosci_is}}
#'
#' @examples
#' \donttest{
#' library(fusionclust)
#' set.seed(42)
#' noise<-matrix(rnorm(49000),nrow=1000,ncol=49)
#' set.seed(42)
#' signal<-c(rnorm(500,-1.5,1),rnorm(500,1.5,1))
#' x<-cbind(signal,noise)
#' scores<- cosci_is(x,0)
#' features<-cosci_is_select(scores,0.9)
#' }
#' \dontshow{
#' library(fusionclust)
#' set.seed(42)
#' scores<- c(0.418,0.35,0.5*rbeta(49,0.7,6))
#' features<-cosci_is_select(scores,0.9)
#' }
#'
#' @references
#' \enumerate{
#' \item  Banerjee, T., Mukherjee, G. and Radchenko P., Feature Screening in
#' Large Scale Cluster Analysis, Journal of Multivariate Analysis,
#' Volume 161, 2017, Pages 191-212
#' \item T. Cai, W. Sun, W., Optimal screening and discovery of sparse signals
#' with applications to multistage high throughput studies,
#' J. Roy.Statist. Soc. Ser. B (Statistical Methodology) 79, no. 1 (2017) 197-223
#' }
#'
#' @export

cosci_is_select<-function(score,gamma){

  warnings('off')
  p<- length(score)
  x<- score/0.5
  see<- order(x)
  interval<- c(0,x[see[ceiling(gamma*p)]])
  xi<- x[x<=interval[2]]
  N0<- length(xi)
  N<- length(x)
  theta<- (N0/N)

  nll<-function(a,b,n0,u){

    -sum(log(dbeta(xi,a,b)))+n0*log(pbeta(u,a,b))
  }

  out <- mle2(nll,start=list(a=0.2,b=5),data=list(n0=N0,u=interval[2]))

  ahat<-as.vector(out@fullcoef[1])
  bhat<- as.vector(out@fullcoef[2])
  pi0<- min(theta/pbeta(interval[2],ahat,bhat),0.99)

  f.null<-matrix(0,p,1)
  index<- x<=interval[2]
  f.null<- dbeta(x,ahat,bhat)

  histinfo<-hist(x,breaks=min(p/2,150),plot=FALSE)
  xx<- histinfo$mids
  X<- cbind(xx,xx^{2},xx^{3},xx^{4},xx^{5})
  y<- histinfo$counts
  m<-glm(y~X,family="poisson")
  beta<-as.vector(m$coefficients)
  beta<- matrix(beta,ncol=1,byrow=TRUE)
  X<-cbind(matrix(1,p,1),x,x^{2},x^{3},x^{4},x^{5})
  v<- exp(X%*%beta)
  gap<-histinfo$breaks[2]-histinfo$breaks[1]
  fhat<- v/(length(x)*gap)

  fdr<- (pi0*f.null)/fhat
  fdr[fdr>1]=1
  # ---------------- We will do MDR screening now -----------------------------------------

  Ti<- fdr[order(fdr)]
  Ti[Ti>1]<- 1
  Yi<- 1-Ti
  eps<- 1-pi0
  nom.level<-(1/log(p))
  cutoff.1<- p*eps*nom.level
  ks<- min(which((rev(cumsum(rev(Yi))))<=cutoff.1))
  idx.1<- (fdr<=Ti[ks])
  alpha.1<- score[idx.1]
  select.d1<- (1:p)*idx.1
  select.d1<- select.d1[select.d1>0]
  r.s<- sum(idx.1)

  # ---------------- Now do FPR signal discovery -----------------------------------------
  fdr.1<- fdr[idx.1]
  Ti<- fdr.1[order(fdr.1)]
  Ti[Ti>1]<- 1
  cutoff.2<- min(nom.level,0.1)
  kd<-  max(which((cumsum(Ti)/(1:r.s))<=cutoff.2))
  idx.2<- (fdr.1<=Ti[kd])
  alpha.2<- alpha.1[idx.2]
  select.d2<- select.d1[idx.2]

return(list("selected"=select.d2))

}

Try the fusionclust package in your browser

Any scripts or data that you put into this service are public.

fusionclust documentation built on May 2, 2019, 9:39 a.m.