R/innerLoop.R

#  R package UPMASK file R/innerLoop.R
#  Copyright (C) 2014 Alberto Krone-Martins
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License version 3 as published by
#the Free Software Foundation.

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/
#

#' @title UPMASK inner loop
#' 
#' @description \code{innerLoop} executes the UPMASK method's inner loop and returns 
#' the stars which were considered as cluster member stars.
#' 
#' The \code{innerLoop} perform the PCA, runs the clustering algorithm and check for 
#' overdensities in the spatial distribution for the clustered stars in the PC space using
#' a 2d kernel density estimation. 
#' 
#' @param ocdata_full a data frame with the data to perform the analysis
#' @param ocdata a data frame with the data to consider in the PCA step
#' @param classAlgol a string indicating the type of clustering algorithm to consider. Only k-means is implemented at this moment (defaults to kmeans)
#' @param autoThresholdLevel an integer indicating the level for thresholding of the spatial distribution
#' @param autoThreshold a boolean indicating if autoThresolding should be adopted (defaults to TRUE)
#' @param iiter and integer indicating the number of the iteration (passed by the \code{outerLoop})
#' @param plotIter a boolean indicating if the user wants to see iteration plots (defaults to FALSE)
#' @param verbosity a flag indicating the verbosity level: it can be 0 (no screen output at all), 1 (minimum), >=2 (all)
#' @param starsPerClust_kmeans an integer with the average number of stars per k-means cluster
#' @param nstarts_kmeans an integer the amount of random re-initializations of the k-means clustering method (usually it is not necessary to modify this)
#' @param runId an integer greater than zero indicating the run Id (passed by the \code{outerLoop})
#' @param autoCalibrated a boolean indicating if the number of random field realizations for the clustering check in the position space should be autocalibrated (experimental code, defaults to FALSE).
#' @param stopIfEmpty a boolean indicating if the code should completely stop if no spatial clustering is detected (defaults to FALSE)
#' @param positionDataIndexes an array of integers indicating the columns of the data frame containing the spatial position measurements
#' @param smartTableDB a database connection to the smart look-up table
#' @param nDimsToKeep an integer with the number of dimensions to consider (defaults to 4)
#' @param dimRed a string with the dimensionality reduction method to use (defaults to PCA. The only other options are LaplacianEigenmaps or None)
#' @param scale a boolean indicating if the data should be scaled and centered
#' 
#'
#' @return A data frame with objects considered as members at this iteration.
#'
#' @references \href{http://dx.doi.org/10.1051/0004-6361/201321143}{Krone-Martins, A. & Moitinho, A., A&A, v.561, p.A57, 2014}
#' 
#' @examples
#' \dontrun{
#' # Perform a one run of the innerLoop using a simulated open cluster with 
#' # spatial and photometric data 
#' # Load the data into a data frame
#' fileName <- "oc_12_500_1000_1.0_p019_0880_1_25km_120nR_withcolors.dat"
#' inputFileName <- system.file("extdata", fileName, package="UPMASK")
#' ocData <- read.table(inputFileName, header=TRUE)
#' ocData <- data.frame(ocData, id=(1:length(ocData[,1]))) # create an id
#' 
#' # Prepare the data to run the inner loop
#' posIdx <- c(1,2)
#' photIdx <- c(3,5,7,9,11,19,21,23,25,27)
#' 
#' # Create the look up table
#' library(RSQLite)
#' stcon <- create_smartTable()
#' 
#' # Run the inner loop 
#' innerLoopRes <- innerLoop(ocData, ocData[,photIdx], autoThresholdLevel=1, verbosity=2,
#'                           starsPerClust_kmeans=25, positionDataIndexes=posIdx, 
#'                           smartTableDB=stcon)
#' 
#' # Clean the environment
#' rm(list=c("inputFileName", "ocData", "posIdx", "photIdx", "innerLoopRes", 
#'    "fileName"))
#' dbDisconnect(stcon)
#' } 
#'  
#' @usage innerLoop(ocdata_full, ocdata, classAlgol="kmeans", autoThresholdLevel=3, 
#' autoThreshold=TRUE, iiter=0, plotIter=FALSE, verbosity=1, starsPerClust_kmeans=50, 
#' nstarts_kmeans=50, runId=0, autoCalibrated=FALSE, stopIfEmpty=FALSE, 
#' positionDataIndexes=c(1,2), smartTableDB, nDimsToKeep=4, dimRed="PCA", scale=TRUE)
#' 
#' @author Alberto Krone-Martins, Andre Moitinho
#' 
#' @keywords cluster, methods, multivariate, nonparametric
#' @export
#
innerLoop <- function(ocdata_full, ocdata, classAlgol="kmeans", autoThresholdLevel=3, 
                         autoThreshold=TRUE, iiter=0, plotIter=FALSE, verbosity=1, 
                         starsPerClust_kmeans=50, nstarts_kmeans=50, runId=0, 
                         autoCalibrated=FALSE, stopIfEmpty=FALSE, 
                         positionDataIndexes=c(1,2), smartTableDB, nDimsToKeep=4, dimRed="PCA", scale=TRUE) {
  if(verbosity!=0) {
    cat(paste(" [runId:",runId,"] ITERATION:",iiter," RUNNING...\n"))
  }
  inSize <- length(ocdata_full)
  
  # Perform the dimensionality reduction step
  if(dimRed!="None") {
    if(verbosity>=1) {
      cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- [1/3] Performing Dimensionality Reduction...\n"))
    }
    if(dimRed=="PCA") {
      if(verbosity>=1) {
        cat(paste(" [runId:",runId,"] ITERATION:",iiter," --       Using PCA...\n"))
      }
      ocdata_pca <- prcomp(ocdata, scale=scale, center=scale, cor=TRUE)
      ocdata_px <- predict(ocdata_pca)
    } else 
    if(dimRed=="LaplacianEigenmaps") {
      minPoints <- round(runif(1, min=5, max=starsPerClust_kmeans))
      if(starsPerClust_kmeans > 25) {
        minPoints <- round(runif(1, min=5, max=25))
      } 
      if(nrow(ocdata) <= minPoints) {
        minPoints <- round(nrow(ocdata) - 1)
      }
      if(verbosity>=1) {
        cat(paste(" [runId:",runId,"] ITERATION:",iiter," --       Using Laplacian Eigenmaps... pts =",minPoints," Data=",nrow(ocdata),"\n"))
      }
      leim <- dimRed::LaplacianEigenmaps()
      leim@stdpars$ndim <- nDimsToKeep
      leim@stdpars$knn <- minPoints
      if(scale==TRUE) {
        ocemb <- leim@fun(dimRed::dimRedData(scale(ocdata)), leim@stdpars)
      } else {
        ocemb <- leim@fun(dimRed::dimRedData(ocdata), leim@stdpars)
      }
      ocdata_px <- as.data.frame(ocemb@data@data)
    } else {
      stop("You should select PCA, LaplacianEigenmaps as the dimensionality reduction method, or implement your own. Alternatively, you can select None to ignore dimensionality reduction.")
    }
  } else {
    if(verbosity>=1) {
      cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- [1/3] You have chosen not to perform dimensionality reduction...\n"))
    }
    if(scale==TRUE) {
      ocdata_px <- scale(ocdata)
    } else {
      ocdata_px <- ocdata
    }
  }
  #########
  
  # Perform the clustering step
  if (classAlgol=="kmeans") {
    if(verbosity>=1) {
      cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- [2/3] Performing k-means clustering analysis in the transformed data space...\n"))
    }
    starsPerClust <- starsPerClust_kmeans
    nclust <- round(length(ocdata_full[,1])/starsPerClust)
    if(nclust > 1) {
      fit <- kmeans(ocdata_px[,1:nDimsToKeep], nclust, nstart=nstarts_kmeans, iter.max=100)
      # get cluster means
      aggregate(ocdata_px, by=list(fit$cluster), FUN=mean)
      # append cluster assignment
      ocdata_px <- data.frame(ocdata_px, resMclust.class=fit$cluster)
    } else {
      # if the number of predicted clusters is less than one, lets prevent the kmeans method from 
      # crashing the code by assigning a randomized choice as the result (this is expected to happen if
      # a very small number of stars are assigned at a certain iteration).
      ocdata_px <- data.frame(ocdata_px, resMclust.class=round(runif(length(ocdata_px[,1]), 0, 1)))
    } 
  } else {
     stop(" Error: the selected method for the clustering in the inner loop is not implemented.")
  }
  
  # Print Dimensionality reduction info and create plots -- KEPT FOR DEPURATION PURPOSES
  if(plotIter){
    dev.new()
    par(cex=0.3)
    #	plot(resMclust, data=ocdata_px[,1:4])
    pairs(ocdata_px[,1:4], pch=19, cex=0.2, col=rainbow(max(ocdata_px$resMclust.class))[ocdata_px$resMclust.class])
  }
  
  # Merge the colors, the dimensionality reduction columns and cluster classification with the original results
  ocdata_full_withColorsAndPcaCols <- data.frame(ocdata_full, ocdata, ocdata_px)
    
  # Select the classes with densities above the threshold
  if(verbosity>=1) {
    cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- [3/3] Performing comparison with random density fields for",max(ocdata_px$resMclust.class),"individual classes...\n"))
    if(verbosity==1) {
      cat(paste(" [runId:",runId,"] ITERATION:",iiter,"          You can get a coffee. This can take long! \n"))
    }
  }
  
  vclass <- c()
  not_class <- c()
  for(i in 1:max(ocdata_px$resMclust.class)) {
    dfn <- subset(ocdata_full_withColorsAndPcaCols, ocdata_full_withColorsAndPcaCols$resMclust.class==i)
        
    if(nrow(dfn)>2) {
      
      dif_max_mean <- kde2dForSubset(ocdata_full_withColorsAndPcaCols, setw=i, returnDistance=TRUE, showStats=FALSE, printPlots=FALSE, positionDataIndexes=positionDataIndexes)
      
      # First, get the thresholding level...
      if(autoThreshold) {
        if(verbosity>=2) {
          cat(paste(" [runId:",runId,"] ITERATION:",iiter,"	-- Class",i," -- Performing analysis of random fields...\n"))
        }
        
        if(autoCalibrated) {
          # This is an experimental code for performing automatic calibration of the 
          # number of random realisations
          at <- analyse_randomKde2d_AutoCalibrated(
                 nstars=length(dfn$resMclust.class), 
                 (max(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[1]])-min(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[1]])),
                 (max(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[2]])-min(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[2]])), 
                 nKde=50, showStats=FALSE, returnStats=TRUE)
        } else {					
          at <- analyse_randomKde2d_smart(
                 nfields=2000, nstars=length(dfn$resMclust.class), 
                 (max(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[1]])-min(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[1]])),
                 (max(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[2]])-min(ocdata_full_withColorsAndPcaCols[,positionDataIndexes[2]])), 
                 nKde=50, showStats=FALSE, returnStats=TRUE, smartTableDB=smartTableDB)
        }
        threshold <- at$mean + autoThresholdLevel*at$sd
        if(verbosity>=2) {
          cat(paste(" [runId:",runId,"] ITERATION:",iiter," 		Automatic threshold for class",i," spatial clustering selected at ",round(threshold,1),"above the mean density.\n"))
          cat(paste(" [runId:",runId,"] ITERATION:",iiter," 		                        class",i," got dif_max_mean             = ",round(dif_max_mean,1),"above the mean density.\n"))
        }
      } else {
      	stop(" The code without autothresolding was deprecated. \b Aborting cowardly.\n\n")
      }
      
      if(is.na(round(threshold,1))) {
        cat(paste(" [runId:",runId,"] ITERATION:",iiter,"/- PROBLEM REPORT -----------------------------------------------------------------\n"))
        cat(paste(" [runId:",runId,"] ITERATION:",iiter,"| 		Class",i," has a NA value in the threshold!\n"))
        cat(paste(" [runId:",runId,"] ITERATION:",iiter,"| 		probably due to its small number of stars: ", nrow(dfn),".\n"))
        kde2dForSubset(ocdata_full_withColorsAndPcaCols, setw=i, returnDistance=FALSE, showStats=TRUE, printPlots=FALSE, positionDataIndexes=positionDataIndexes)
        print(dfn)
        cat(paste(" [runId:",runId,"] ITERATION:",iiter,"\\----------------------------------------------------------------------------------\n"))
      } 
      
      if(!is.na(round(threshold,1))) {
        if(round(dif_max_mean,1) >= round(threshold,1)) {
          vclass <- c(vclass, i)
          if(verbosity>=2) {
            cat(paste(" [runId:",runId,"] ITERATION:",iiter," <<<< --	Class",i," : ok!\n"))
          }
        } else {
          not_class <- c(not_class, i)
          if(verbosity>=2) {
            cat(paste(" [runId:",runId,"] ITERATION:",iiter,"      --	Class",i," : WILL BE ELIMINATED!!\n"))
          }
        }
      }
    } else {
      if(verbosity>=2) {
        cat(paste(" [runId:",runId,"] ITERATION:",iiter,"	-- Class",i," -- THERE ARE TWO OR LESS STARS IN THIS CLASS! \n"))
      }
      not_class <- c(not_class, i)
    }
  }
  
  if(length(vclass)==0) {
    cat(" No spatial clustering detected in the real space based on the clustered photometric data in the transformed data space!\n")
    #cat(paste(" Spatial thresholding at ",round(threshold,2), "\n")) ### ---- USED FOR DEPURATION PURPOSES
    if(stopIfEmpty) {
      stop(" No spatial clustering detected!\n Aborting cowardly!")
    } else {
      oc_reconst <- ocdata_full_withColorsAndPcaCols[0,]
    }
  }
  
  if(verbosity>=1) {
    if(length(vclass)!=0) {
      if(verbosity >= 2) {
         cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- The ",length(vclass),"selected classes from the kernel density estimation in the X-Y space are: "))
         print(vclass)
      } else {
         cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- Number of classes selected from the kde analysis : ",length(vclass)))
      }
      cat("\n")
    } else {
      cat(paste(" [runId:",runId,"] ITERATION:",iiter," -- No classes were selected from the kernel density estimation in the X-Y space.\n"))
    }
  }
  
  # Organize the data for returning to the outer loop
  if(length(vclass)>=1) {
    # First get the data of the selected objects
    for(i in 1:length(vclass)) {
      oc_tmp <- subset(ocdata_full_withColorsAndPcaCols, ocdata_full_withColorsAndPcaCols$resMclust.class==vclass[i])
      if(i==1) {
        oc_reconst <- oc_tmp
      } else {
        oc_reconst <- rbind(oc_reconst, oc_tmp)
      }
    }
    # now the data of the not selected objects
    for(i in 1:length(not_class)) {
      not_tmp <- subset(ocdata_full_withColorsAndPcaCols, ocdata_full_withColorsAndPcaCols$resMclust.class==not_class[i])
      if(i==1) {
        field_reconst <- not_tmp
      } else {
        field_reconst <- rbind(field_reconst, not_tmp)
      }			
    }
  }
  
  if(verbosity!=0) {
    cat(paste(" [runId:",runId,"] ITERATION:",iiter," DONE!\n"))
    cat(paste("-------------------------------------------------------------------\n"))
  }
  
  # What is going out of this function must be only the astronomical cluster stars only...
  return(oc_reconst[,1:inSize])    
}

Try the UPMASK package in your browser

Any scripts or data that you put into this service are public.

UPMASK documentation built on May 2, 2019, 2:39 p.m.