R/ExtrapolationFunctions.R

Defines functions IndexesValues.extrap IndexesValues.calculation ConsM.generation MostProbableClustering.Extrapolation ConsMatrix.Extrapolation IndexesPlot.Extrapolation

Documented in ConsMatrix.Extrapolation IndexesPlot.Extrapolation MostProbableClustering.Extrapolation

#' Extrapolate objects from ClusterAnalysis
#' 
#' @description Extrapolate objects from ClusterAnalysis output list (see \code{\link{ClusterAnalysis}}).
#'
#' @param stability.list The list obtained from the ClusterAnalysis function. (see \code{\link{ClusterAnalysis}})
#' @param G  The number of clusters.
#' @param q The quantiles used to calculate the time grid interval on which the distances are calculated. If NULL then the time grid outliers will be ignored through the distance calculation. If double (0<q<1) then the cutting is symmetrical w.r.t. the quantile setled (e.g., q = 0.25). If a vector, then the minimum value is used for the lower cutting and the maximum value for the upper cutting.
#' @details 
#' \itemize{
#' \item{IndexesPlot.Extrapolation}{extrapolates from ClusterAnalysis output list the box plot fixing the h value.}
#' \item{ConsMatrix.Extrapolation}{extrapolates from ClusterAnalysis output list the Stability Matrix for G fixed.}
#' \item{MostProbableClustering.Extrapolation}{extrapolates from ClusterAnalysis output list the most frequent clustering among the several runs obtained  for G fixed.}
#' }
#' 
#' @author Cordero Francesca, Pernice Simone, Sirovich Roberta
#'  
#' @name ExtrapolationFuncs
NULL
#> NULL
#' @import ggplot2 viridis tidyr statmod dplyr
#' 
#' @rdname ExtrapolationFuncs
#' @export
IndexesPlot.Extrapolation <- function(stability.list,q=NULL){
  
  Clusters.List<-stability.list$Clusters.List 
  G <- as.numeric(sub("G","",names(Clusters.List)))
  
  Allerrors = which(sapply(1:length(Clusters.List),
                           function(x) is.character(Clusters.List[[x]]) ))
  if(length(Allerrors)>0)
  {
    errors = sapply(Allerrors, function(x) 
      paste("Cluster",G[x],"has the following errors: ", Clusters.List[[x]]) )
    
    print(errors)
    G = G[-Allerrors]
    Clusters.List = Clusters.List[-Allerrors]
  }
  if(length(G)==0)
  {
    return("All the CONNECTOR runs have errors!")
  }
  
  IndexesValues.list <- IndexesValues.calculation(Clusters.List, G,q)
  IndexesValues <- IndexesValues.extrap(IndexesValues.list,Clusters.List, G)
  Indexes.MostProb <- IndexesValues$Indexes.MostProb
  Indexes.Rep <- IndexesValues$Indexes.Rep
  
  Box.pl<- ggplot(data= Indexes.Rep)+
    facet_wrap(~Index,scales = "free")+
    geom_violin(aes(x=Cluster,y=V,fill=ClusterH,group=Cluster),scale = "width")+
    geom_line(data= Indexes.MostProb,aes(x=Cluster,y=V,col="Most probable"))+
    geom_jitter(aes(x=Cluster,y=V),color="black", width = .1, height = 0, alpha=0.5)+   
    scale_fill_viridis("",discrete = TRUE, alpha=0.6) +
    #geom_point(col="red",aes(x=Cluster,y=V,size=Freq/runs) )+
    labs(size="Counts freq.",col= "",x = "Number of Clusters",y="Indexes Values")+
    scale_x_continuous(breaks = G)+
    scale_color_manual(values = c("Most probable" = "blue") ) +
    theme(axis.text=element_text(size = 14, hjust = 0.5),
          axis.text.x=element_text(vjust=0.5, hjust=1),
          axis.title=element_text(size=16,face="bold"),
          axis.line = element_line(colour="black"),
          plot.title=element_text(size=30, face="bold", vjust=1, lineheight=0.6),
          legend.text=element_text(size=16),
          legend.position="bottom",
          legend.key=element_blank(),
          legend.title = element_text(size=16,face="bold"),
          legend.key.size = unit(.9, "cm"),
          legend.key.width = unit(.9,"cm"),
          panel.background = element_rect(colour = NA),
          plot.background = element_rect(colour = NA),
          plot.margin=unit(c(5,5,5,5),"mm"),
          strip.text = element_text(size = 20)) 
  
  return(list(Plot=Box.pl, IndexesValues = IndexesValues$DataIndexes) )
}

#' @rdname ExtrapolationFuncs
#' @export
ConsMatrix.Extrapolation <- function(stability.list,q=NULL){
  Clusters.List<-stability.list$Clusters.List 
  data <-stability.list$CONNECTORList
  runs <-stability.list$runs 
  
  G <- as.numeric(sub("G","",names(Clusters.List)))
  Allerrors = which(sapply(1:length(Clusters.List),
                           function(x) is.character(Clusters.List[[x]]) ))
  if(length(Allerrors)>0)
  {
    errors = sapply(Allerrors, function(x) 
      paste("Cluster",G[x],"has the following errors: ", Clusters.List[[x]]) )
    
    print(errors)
    G = G[-Allerrors]
    Clusters.List = Clusters.List[-Allerrors]
  }
  if(length(G)==0)
  {
    return("All the CONNECTOR runs have errors!")
  }
  
  IndexesValues.list <- IndexesValues.calculation(Clusters.List, G,q)
  IndexesValues <- IndexesValues.extrap(IndexesValues.list,Clusters.List, G)
  Indexes.MostProb <- IndexesValues$Indexes.MostProb
  
  Freq = Indexes.MostProb %>% 
    dplyr::select(Cluster,Config) %>%
    dplyr::distinct(Cluster, .keep_all = T)
  
  Freq.ConfigCl<- Freq[order(Freq$Cluster),]
  
  ConsensusInfo<-lapply(1:length(G), function(Gind){
    ConsM.generation(Gind,Clusters.List,runs,data,Freq.ConfigCl,q)
  })
  names(ConsensusInfo)<-paste0("G",G)
  
  return(ConsensusInfo)
}

#' @rdname ExtrapolationFuncs
#' @export
MostProbableClustering.Extrapolation <- function(stability.list, G,q=NULL){
  Clusters.List<-stability.list$Clusters.List 
  runs <-stability.list$runs 
  
  G.all <- as.numeric(sub("G","",names(Clusters.List)))
  Allerrors = which(sapply(1:length(Clusters.List),
                           function(x) is.character(Clusters.List[[x]]) ))
  if(length(Allerrors)>0)
  {
    errors = sapply(Allerrors, function(x) 
      paste("Cluster",G.all[x],"has the following errors: ", Clusters.List[[x]]) )
    
    print(errors)
    G.all = G.all[-Allerrors]
    Clusters.List = Clusters.List[-Allerrors]
  }
  if(length(G.all)==0)
  {
    return("All the CONNECTOR runs have errors!")
  }
  
  IndexesValues.list <- IndexesValues.calculation(Clusters.List, G.all,q)
  IndexesValues <- IndexesValues.extrap(IndexesValues.list,Clusters.List, G.all)
  Indexes.MostProb <- IndexesValues$Indexes.MostProb
  
  Freq.ConfigCl<-unique(Indexes.MostProb[,c("Cluster","Config")])
  ############# the most probably clustering:
  IndexBestClustering <- Freq.ConfigCl[which(Freq.ConfigCl$Cluster == G),"Config"]
  
  # If there are more configuration with the same freq. than we will selct the one with smaller fdb
  if(length(IndexBestClustering) >1){
    fdb = sapply(IndexBestClustering,function(ic){
      return(c(ic,IndexesValues.list[[paste0("G", G)]][[ic]]$Coefficents$fDB.index))
    })
    IndexBestClustering <- fdb[1,which.min(fdb[2,])]
  }
  ##
  MostProbableClustering<-Clusters.List[[paste0("G",G)]]$ClusterAll[[IndexBestClustering]]
  
  MostProbableClustering$Cl.Info <- IndexesValues.list[[paste0("G",G)]][[IndexBestClustering]]
  MostProbableClustering$CONNECTORList <- stability.list$CONNECTORList
  
  return(MostProbableClustering)
}

ConsM.generation<-function(Gind,ALL.runs,runs,data,Freq.ConfigCl,q) 
{
  FixedG.runs<-ALL.runs[[Gind]]$ClusterAll
  L1<- length(FixedG.runs)
  #Check some errors in the predictions:
  FixedG.runs.tmp <-lapply(1:L1,function(x){
    if(!is.character(FixedG.runs[[x]]$Error))
      FixedG.runs[[x]]
    else NA
  } )
  # if(length(which(is.na(FixedG.runs.tmp)))!=0){
  #   ErrorConfiguration <- list(FromFitting=ALL.runs[[Gind]]$ErrorConfigurationFit,
  #                              FromPrediction = ALL.runs[[which(is.na(FixedG.runs.tmp))]] )
  # }else{
  #   ErrorConfiguration <-list(FromFitting=ALL.runs[[Gind]]$ErrorConfigurationFit,
  #                             FromPrediction = NULL)
  # }
  ###
  
  L1<- length(FixedG.runs)
  ############# first we found the most probably clustering:
  IndexBestClustering <- Freq.ConfigCl[Gind,"Config"]
  BestClustering<-FixedG.runs[[IndexBestClustering]]
  
  ##########################################################
  #### Let build the consensus matrix
  
  consensus.runs<-lapply(1:L1, function(x){
    consensusM<-diag(0,length(data$LenCurv))
    fcm.G<-FixedG.runs[[x]]
    cl.vector<-fcm.G$FCM$cluster$cluster.member
    for(i in 1:length(data$LenCurv)) consensusM[i,which(cl.vector%in%cl.vector[i])]<-FixedG.runs[[x]]$ParamConfig.Freq
    consensusM
  })
  
  consensusM<-Reduce("+",consensus.runs)
  
  colnames(consensusM)<- data$LabCurv$ID
  row.names(consensusM)<- data$LabCurv$ID
  
  consensusM<-as.data.frame(consensusM)/runs
  
  ################
  ### ordering the samples to have all curves in the same cluster close with each otehrs.
  
  cl.member<-BestClustering$FCM$cluster$cluster.m
  #  mat[do.call(order, as.data.frame(mat)),]
  
  fcm<-BestClustering
  curvename<-data$LabCurv$ID
  curve<-fcm$FCM$prediction$gpred
  rownames(curve)<-curvename
  
  grid<-fcm$FCM$fit$grid
  
  # gauss.quad(10) -> gauss
  # a <- min(grid)
  # b <- max(grid)
  # itempi <- (a+b)/2 + (b-a)/2*gauss$nodes
  # 
  # match(itempi,grid) -> itimeindex 
  # 
  # itimeindex <- match(database$Time[database$ID == i],grid)
  # fxG <- (curve[,itimeindex] )^2
  # int <- (b-a)/2 * rowSums( gauss$weights * fxG )
  # dist.curve <- sqrt(int)
  
  dist.curve<-L2dist.curve20(clust = fcm$FCM$cluster$cluster.member,
                             fcm.curve = fcm$FCM$prediction,
                             database = data$Dataset,
                             q = q)
  
  names(dist.curve) <- curvename
  # names(which.min(dist.curve))->lowest.curve
  # 
  # m.lowercurve<-matrix(curve[lowest.curve,itimeindex],
  #                      nrow = length(curve[,1]),
  #                      ncol = length(itimeindex),byrow = T)
  # fxG <- (curve[,itimeindex]-m.lowercurve )^2
  # int <- (b-a)/2 * rowSums( gauss$weights * fxG )
  # dist.curve <- sqrt(int)
  ################## Sorting the names!!!
  
  #1) sorting depending by the l2 distance with the zero x-axis!
  curvename.ordered<-names(sort(dist.curve)) 
  
  #2) Sorting depending on the cluster membership
  
  cl.member.tmp <- data.frame(curvename,names(cl.member),cl.member)
  
  cl.member.tmp<-cl.member.tmp[order(cl.member.tmp$names.cl.member.),]
  cl.member <- cl.member.tmp$cl.member
  names(cl.member) <- cl.member.tmp$curvename
  
  #3) Defining a dataframe in order to sort the curves depending by the cluster and distance!
  
  namefroml2<-1:length(curvename)
  names(namefroml2)<-curvename.ordered
  namefroml2<-namefroml2[names(cl.member)] # ordering the curve ordered by the L2 distance
  
  curve.name.dtfr<-data.frame(namefroml2=namefroml2,cl.mem=cl.member,1:length(curvename.ordered))
  
  # ind<-curve.name.dtfr[order(curve.name.dtfr$namefroml2,curve.name.dtfr$cl.mem,curve.name.dtfr$namefroml2),3]
  ind<-curve.name.dtfr[order(curve.name.dtfr$cl.mem,curve.name.dtfr$namefroml2),3]
  curvename.ordered<-names(cl.member)[ind]
  consensusM <- consensusM[curvename.ordered,curvename.ordered]
  consensusM <- as.matrix(consensusM)
  consensusM[upper.tri(consensusM)] <- NA

  m<- as.data.frame(consensusM) %>% 
    mutate(Var1 = row.names(consensusM)) %>% 
    gather(-Var1,value = "value", key = "Var2") %>%
    na.omit()
  
  m$Var2<-factor(m$Var2,levels = curvename.ordered  )
  m$Var1<-factor(m$Var1,levels = rev(curvename.ordered )  )
  
  ############
  length.cl<-rev(as.numeric(table(cl.member[ind])[unique(cl.member[ind])]))
  coor<-c(0,cumsum(length.cl))
  coor.2<-sum(length.cl)-coor
  
  cluster.lines<-data.frame(x=c(coor[-length(coor)],coor[-length(coor)]),
                            y=c(coor.2[-1],coor.2[-length(coor.2)]),
                            xend=c(coor[-1],coor[-length(coor)]),
                            yend=c(coor.2[-1],coor.2[-1] ) ) +.5
  
  x.text<-(cluster.lines$xend-cluster.lines$x)/2 + cluster.lines$x
  y.text<-(cluster.lines$yend-cluster.lines$y)/2 + cluster.lines$y
  x.text <- x.text[1:length(length.cl)]
  y.text <- y.text[-(1:length(length.cl) )]
  
  lab = rev(BestClustering$FCM$cluster$cluster.names)[rev(BestClustering$FCM$cluster$cluster.names) %in% unique(names(BestClustering$FCM$cluster$cluster.member)) ]
  
  ### Freq in each cluster
  G <- unique(cl.member)
  Freq.cl<-lapply(G,function(g){
    as.numeric(names(cl.member[which(cl.member == g)])) -> curve.cl.fixed
    m %>% filter( Var1 %in% curve.cl.fixed & Var2 %in% curve.cl.fixed  & Var1 != Var2) %>%
      dplyr::summarise(Mean = round(mean(value),2),Median= round(median(value),2), Cluster = g )
    
    # indexes <- which(as.numeric(m$Var1) %in% curve.cl.fixed &
    #                    m$Var2 %in% curve.cl.fixed &
    #                    as.numeric(levels(m$Var1)[c(m$Var1)]) - as.numeric(levels(m$Var2)[c(m$Var2)])!=0)
    # data.frame(Median= median(m[indexes,3]), Mean = round(mean(m[indexes,3]),2),Cluster = g)
  })
  Freq.cl<-do.call("rbind",Freq.cl)
  Freq.cl$Cluster<-BestClustering$FCM$cluster$cluster.names[Freq.cl$Cluster]
  Freq.cl<-Freq.cl[order(match(Freq.cl$Cluster, lab)),]
  MeanFreq<-mean(Freq.cl$Mean,na.rm = T)
  Freq.cl$Mean[is.na(Freq.cl$Mean)] <- 1
  
  labText<-sapply(1:length(G),function(i) paste(Freq.cl[i,c("Cluster","Mean")],collapse = ": ") )
  
  ##
  
  ConsensusPlot <- ggplot() +
    geom_tile(data = m, 
              aes(x = Var1, y = Var2, fill = value)) +
    labs(x = "",y = "", fill = "Same cluster \ncounting") + 
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
          panel.background = element_blank(), axis.line = element_line(colour = "black"), 
          axis.text.x = element_text(angle = 45, hjust = 1)) + 
    scale_fill_gradient2(midpoint = 0.5, low = "blue", 
                         mid = "yellow", high = "red") +
    geom_segment(data = cluster.lines, 
                 aes(x, y, xend = xend, yend = yend), size = 1.5, 
                 inherit.aes = F) +
    labs(title = paste( "Stability Matrix, G=", length(G) ),
         subtitle = paste("Black line for the most probable clustering: ",MeanFreq) ) + 
    annotate(geom = "text", x = x.text, y = y.text, 
             label = labText , 
             size = 6)
  
  return(list(ConsensusMatrix = consensusM,
              ConsensusPlot = ConsensusPlot) )
} 

IndexesValues.calculation <- function(Clusters.List, G,q){
  
  Indexes <- lapply(1:length(G),function(i){
    ## Check the same parameter configurations:
    Indexes.Uniq.Par<-length(Clusters.List[[i]]$ClusterAll)
    
    Cl.Info<- lapply(1:Indexes.Uniq.Par,function(j){
      if(is.character(Clusters.List[[i]]$ClusterAll[[j]]$Error) ){
        return(Clusters.List[[i]]$ClusterAll[[j]]$Error)
      }else{
        Clusters.List[[i]]$ClusterAll[[j]]$FCM$prediction -> fcm.prediction
        Clusters.List[[i]]$ClusterAll[[j]]$FCM$cluster$cl.info$class.pred -> cluster
        Clusters.List[[i]]$ClusterAll[[j]]$FCM$cluster$ClustCurve[,c("ID","Observation","Time")] -> database
        Clusters.List[[i]]$ClusterAll[[j]]$FCM$fit -> fcm.fit  
        ## Goodness coefficents calculation
        fcm.prediction$meancurves->meancurves
        distances <- L2dist.curve2mu(clust=cluster, fcm.curve = fcm.prediction, database = database, fcm.fit = fcm.fit, deriv = 0,q)
        distances.zero<-L2dist.mu20(clust=cluster,fcm.prediction,database = database,fcm.fit,deriv=0,q)
        Coefficents<-Rclust.coeff(clust=cluster, fcm.curve = fcm.prediction, database = database, fcm.fit = fcm.fit, deriv = 0,q)
        Deriv.Coefficents<-Rclust.coeff(clust=cluster, fcm.curve = fcm.prediction, database = database, fcm.fit = fcm.fit, deriv = 1,q)
        Deriv2.Coefficents<-Rclust.coeff(clust=cluster, fcm.curve = fcm.prediction, database = database, fcm.fit = fcm.fit, deriv = 2,q)
        
        return(list(Tight.indexes = sum(distances),
                    Coefficents=Coefficents,
                    Deriv.Coefficents=Deriv.Coefficents,
                    Deriv2.Coefficents=Deriv2.Coefficents))
      }
      
    })
    return(Cl.Info)
  })
  names(Indexes) <-paste0("G",G)
  
  return(Indexes)
}

IndexesValues.extrap <- function(IndexesValues.list,Clusters.List,G){
  
  l.tight <- lapply(1:length(G),function(i){
    ClusterAll <- IndexesValues.list[[i]]
    V.all<- lapply(1:(length(ClusterAll)),function(j){
      if(!is.character(ClusterAll[[j]]) )
        data.frame(Config = j, 
                   V = ClusterAll[[j]]$Tight.indexes, 
                   Freq= Clusters.List[[i]]$ClusterAll[[j]]$ParamConfig.Freq)
      # else NA
    })
    V.all <- do.call("rbind",V.all)
    V.all$Cluster <-G[i]
    V.all$Index = "Tightness"
    V.all$ClusterH = paste("G:",G[i],"; h:",Clusters.List[[i]]$h.selected)
    return(V.all)
  })
  l.fdb <- lapply(1:length(G),function(i){
    ClusterAll <- IndexesValues.list[[i]]
    V.all<- lapply(1:(length(ClusterAll)),function(j){
      if(!is.character(ClusterAll[[j]]) )
        data.frame(Config = j, 
                   V = ClusterAll[[j]]$Coefficents$fDB.index,
                   Freq= Clusters.List[[i]]$ClusterAll[[j]]$ParamConfig.Freq)
      # else NA
    })
    V.all <- do.call("rbind",V.all)
    V.all$Cluster <-G[i]
    V.all$Index = "fDB"
    V.all$ClusterH = paste("G:",G[i],"; h:",Clusters.List[[i]]$h.selected)
    return(V.all)
  })
  l.fdb1 <- lapply(1:length(G),function(i){
    ClusterAll <- IndexesValues.list[[i]]
    V.all<- lapply(1:(length(ClusterAll)),function(j){
      if(!is.character(ClusterAll[[j]]) )
        data.frame(Config = j, V = ClusterAll[[j]]$Deriv.Coefficents$fDB.index,
                   Freq= Clusters.List[[i]]$ClusterAll[[j]]$ParamConfig.Freq)
      #  else NA
    })
    V.all <- do.call("rbind",V.all)
    V.all$Cluster <-G[i]
    V.all$Index = "Deriv.fDB"
    V.all$ClusterH = paste("G:",G[i],"; h:",Clusters.List[[i]]$h.selected)
    return(V.all)
  })
  l.fdb2 <- lapply(1:length(G),function(i){
    ClusterAll <-IndexesValues.list[[i]]
    V.all<- lapply(1:(length(ClusterAll)),function(j){
      if(!is.character(ClusterAll[[j]]) )
        data.frame(Config = j, V = ClusterAll[[j]]$Deriv2.Coefficents$fDB.index, 
                   Freq= Clusters.List[[i]]$ClusterAll[[j]]$ParamConfig.Freq)
      #  else NA
    })
    V.all <- do.call("rbind",V.all)
    V.all$Cluster <-G[i]
    V.all$Index = "Deriv2.fDB"
    V.all$ClusterH = paste("G:",G[i],"; h:",Clusters.List[[i]]$h.selected)
    return(V.all)
  })
  ## Grouping all the indexes:
  Data <- list(Tight =do.call("rbind", l.tight),
               fDB =do.call("rbind", l.fdb),
               fDB1 =do.call("rbind", l.fdb1),
               fDB2 =do.call("rbind", l.fdb2))
  ##
  dt.fr <- rbind(do.call("rbind", l.tight), do.call("rbind",l.fdb))
  dt.fr.rep <- lapply(1:length(dt.fr[, 1]), function(i) {
    freq <- dt.fr[i, "Freq"]
    do.call("rbind", lapply(1:freq, function(j) dt.fr[i,]))
  })
  dt.fr.rep <- do.call("rbind", dt.fr.rep)
  dt.fr.max <- aggregate(dt.fr$Freq, by = list(Cluster = dt.fr$Cluster,
                                               Index = dt.fr$Index,
                                               ClusterH = dt.fr$ClusterH),
                         FUN = "max")
  colnames(dt.fr.max)[4] <- "Freq"
  dt.fr.max <- merge(dt.fr.max, dt.fr)
  dt.fr.max <- aggregate(dt.fr.max$V, by = list(Cluster = dt.fr.max$Cluster,
                                                Index = dt.fr.max$Index, ClusterH = dt.fr.max$ClusterH,
                                                Freq = dt.fr.max$Freq), FUN = "min")
  colnames(dt.fr.max)[5] <- "Obs"
  dt.fr.max <- merge(dt.fr.max, dt.fr)
  
  return(list(Indexes.MostProb = dt.fr.max,Indexes.Rep = dt.fr.rep,DataIndexes = Data))
}
sysbioTurin/connector documentation built on April 9, 2024, 12:10 p.m.