tests/testthat/test-computeKmeans.R

context("computeKmeans") 

batting_info = dget("_battingInfo.dat")

test_that("computeKmeans throws errors", {
  
  expect_error(computeKmeans(NULL),
               "Connection is not valid RODBC object.")
  
  expect_error(computeKmeans(NULL, tableName="fielding", test=TRUE),
               "Must provide tableInfo when test==TRUE")
  
  expect_error(computeKmeans(NULL, tableName="fielding", tableInfo=batting_info, 
                             persist = TRUE, test=TRUE, version="5.20"),
               "Persisting clustered data with versions before AAF 6.21 is not supported.")
  
  expect_error(computeKmeans(NULL, tableName='XXXXX', centers="a", 
                             tableInfo=batting_info, test=TRUE),
               "Parameter centers must be one of following: number of clusters, numeric matrix of initial centroids, or canopy object.")
  
  expect_error(computeKmeans(NULL, tableName='XXXXX', centers=0.1, 
                             tableInfo=batting_info, test=TRUE),
               "Number of clusters must be greater or equal to 1.")
  
  expect_error(computeKmeans(NULL, tableName="batting", centers=4, id="id", include=c('lgid','playerid'),
                             tableInfo=batting_info, test=TRUE),
               "Kmeans operates on one or more numeric variables.")
  
  expect_error(computeKmeans(NULL, tableName="batting", centers=4, id="id", idAlias="g",
                             tableInfo=batting_info, test=TRUE),
               "Id alias 'g' can't be one of variable names")
  
  expect_error(computeKmeans(NULL, tableName="batting", centers=4, id="id", include=c('g','h','r'),
                             idAlias="g", tableInfo=batting_info, test=TRUE), 
               "Id alias 'g' can't be one of variable names")
  
  expect_error(computeKmeans(NULL, tableName="batting", centers=matrix(c(10,20,30,40,50,60,70,80), nrow=2, byrow=TRUE),
                             id="id", include=c('g','h','r'), tableInfo=batting_info, test=TRUE),
               "Kmeans received incompatible parameters: dimension of initial cluster centers doesn't match variables: 'g', 'h', 'r'")
  
  expect_error(computeKmeans(NULL, tableName="batting", centers=4, id="id", include=c('g','h','r'),
                             aggregates=c("AVG(a) a", "a"), tableInfo=batting_info, test=TRUE),
               "Check aggregates: at least one missing alias found.")
  
})


test_that("computeClusterSample throws errors", {
  
  expect_error(computeClusterSample(NULL),
               "Connection is not valid RODBC object.")
  
  expect_error(computeClusterSample(NULL, test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeClusterSample(NULL, NULL, test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeClusterSample(NULL, character(1), test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeClusterSample(NULL, data.frame(1:5), test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0)), class = c("toakmeans", "kmeans")), 
                                    test=TRUE),
               "Sample fraction or sample size must be specified.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleFraction = 10.0, test=TRUE),
               "All sample fractions must be between 0 and 1 inclusively.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleFraction = c(0.0, -0.1), test=TRUE),
               "All sample fractions must be between 0 and 1 inclusively.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleFraction = c(0.0, 0.1, 1.0001), test=TRUE),
               "All sample fractions must be between 0 and 1 inclusively.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleFraction = c(0.0, 0.1, 1.0), test=TRUE),
               "Fraction vector length must be either 1 or equal to the number of clusters.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleFraction = c(0.0, 0.1, 1.0, 0.5, 0.5, 0.6), test=TRUE),
               "Fraction vector length must be either 1 or equal to the number of clusters.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleSize = c(1,2,-100), test=TRUE),
               "All sample sizes must be equal to or greater than 0.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleSize = c(0, 0, 1.0), test=TRUE),
               "Size vector length must be either 1 or equal to the number of clusters.")
  
  expect_error(computeClusterSample(NULL, structure(list(cluster=integer(0), centers=matrix(1:10, nrow = 5)), 
                                                    class = c("toakmeans", "kmeans")), 
                                    sampleSize = c(0, 0, 1.0, 2, 5, 8), test=TRUE),
               "Size vector length must be either 1 or equal to the number of clusters.")
})



test_that("computeSilhouette throws errors", {
  
  expect_error(computeSilhouette(NULL),
               "Connection is not valid RODBC object.")
  
  expect_error(computeSilhouette(NULL, test=TRUE),
               "Silhouette table name is required when test=TRUE.")
  
  expect_error(computeSilhouette(NULL, silhouetteTableName='name', test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeSilhouette(NULL, NULL, silhouetteTableName='name', test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeSilhouette(NULL, character(1), silhouetteTableName='name', test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeSilhouette(NULL, data.frame(1:5), silhouetteTableName='name', test=TRUE),
               "Kmeans object must be specified.")
  
  expect_error(computeSilhouette(NULL, km=structure(list(centers=matrix(c(1), nrow=1, byrow = TRUE)),
                                                    class = c("toakmeans", "kmeans")), 
                                 silhouetteTableName = 'name', test=TRUE),
               "Silhouette values are trivial in case of single cluster model.")
})


test_that("computeKmeans SQL is correct", {
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=3, iterMax = 25,
                                        tableInfo=batting_info, include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", idAlias="id",
                                        aggregates = c("COUNT(*) cnt", "AVG(g) avg_g", "AVG(ab) avg_ab", "AVG(r) avg_r", "AVG(h) avg_h"),
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        schema='baseball', test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS baseball.kmeans_test_scaled;
                          CREATE FACT TABLE baseball.kmeans_test_scaled DISTRIBUTE BY HASH(id) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('id')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS baseball.kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('baseball.kmeans_test_scaled')
                            OUTPUTTABLE('baseball.kmeans_test_centroids')
                            NUMBERK('3')
                            THRESHOLD('0.0395')
                            MAXITERNUM('25')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                          FROM (SELECT clusterid, means, COUNT(*) cnt, AVG(g) avg_g, AVG(ab) avg_ab, AVG(r) avg_r, AVG(h) avg_h          
                                  FROM (SELECT c.clusterid, c.means means, d.* 
                                          FROM baseball.kmeans_test_centroids c JOIN 
                                               kmeansplot (
                                                 ON baseball.kmeans_test_scaled PARTITION BY ANY
                                                 ON baseball.kmeans_test_centroids DIMENSION
                                                 centroidsTable('baseball.kmeans_test_centroids')
                                               ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                               (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, * 
                                                  FROM batting ) d on (kmp.id = d.id)
                                       ) clustered_data
                                 GROUP BY clusterid, means
                                ) c1 JOIN 
                               ( SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                   FROM VectorDistance(
                                     ON ( SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value 
                                            FROM unpivot(
                                              ON (SELECT d.* 
                                                    FROM kmeansplot (
                                                      ON baseball.kmeans_test_scaled PARTITION BY ANY
                                                      ON baseball.kmeans_test_centroids DIMENSION
                                                      centroidsTable('baseball.kmeans_test_centroids')
                                                    ) d 
                                                   WHERE clusterid = 0
                                              )
                                              COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                              COLSTOACCUMULATE('id','clusterid')
                                              ATTRIBUTECOLUMNNAME('variable')
                                              VALUECOLUMNNAME('value')
                                              KEEPINPUTCOLUMNTYPES('true')
                                            )
                                         ) AS target PARTITION BY id
                                      ON ( SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                FROM baseball.kmeans_test_centroids
                                WHERE clusterid = 0
                              ) AS ref DIMENSION
                              TARGETIDCOLUMNS('id')
                              TARGETFEATURECOLUMN('variable')
                              TARGETVALUECOLUMN('value')
                              REFIDCOLUMNS('clusterid')
                              REFFEATURECOLUMN('variable')
                              REFVALUECOLUMN('value')
                              MEASURE('Euclidean')
                            )
                            UNION ALL
                            SELECT 1 clusterid, SUM(distance::double ^ 2) withinss FROM VectorDistance(
                              ON (
                                SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value
                                FROM unpivot(
                                  ON (SELECT d.* 
                                        FROM kmeansplot (
                                          ON baseball.kmeans_test_scaled PARTITION BY ANY
                                          ON baseball.kmeans_test_centroids DIMENSION
                                          centroidsTable('baseball.kmeans_test_centroids')
                                        ) d 
                                      WHERE clusterid = 1
                                  )
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id','clusterid')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                )
                              ) AS target PARTITION BY id
                              ON (
                                SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                FROM baseball.kmeans_test_centroids
                                WHERE clusterid = 1
                              ) AS ref DIMENSION
                              TARGETIDCOLUMNS('id')
                              TARGETFEATURECOLUMN('variable')
                              TARGETVALUECOLUMN('value')
                              REFIDCOLUMNS('clusterid')
                              REFFEATURECOLUMN('variable')
                              REFVALUECOLUMN('value')
                              MEASURE('Euclidean')
                            )
                            UNION ALL
                            SELECT 2 clusterid, SUM(distance::double ^ 2) withinss FROM VectorDistance(
                              ON (
                                SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value
                                FROM unpivot(
                                  ON (SELECT d.* 
                                        FROM kmeansplot (
                                          ON baseball.kmeans_test_scaled PARTITION BY ANY
                                          ON baseball.kmeans_test_centroids DIMENSION
                                          centroidsTable('baseball.kmeans_test_centroids')
                                        ) d 
                                      WHERE clusterid = 2
                                  )
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id','clusterid')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                )
                              ) AS target PARTITION BY id
                              ON (
                                SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                FROM baseball.kmeans_test_centroids
                                WHERE clusterid = 2
                              ) AS ref DIMENSION
                              TARGETIDCOLUMNS('id')
                              TARGETFEATURECOLUMN('variable')
                              TARGETVALUECOLUMN('value')
                              REFIDCOLUMNS('clusterid')
                              REFFEATURECOLUMN('variable')
                              REFVALUECOLUMN('value')
                              MEASURE('Euclidean')
                            )
                          ) c2 ON (c1.clusterid = c2.clusterid)
                          ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT id, variable, coalesce(value_double, value_long, value_str::double) value
                                FROM unpivot(
                                  ON baseball.kmeans_test_scaled
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                ) 
                            ) AS target PARTITION BY id
                            ON (SELECT id, variable, value_double
                                FROM unpivot(
                                  ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('id')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                         "kmeans for 3 clusters with aggregates without WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=2, iterMax = 25,
                                        tableInfo=batting_info, include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", idAlias="id",
                                        aggregates = c("COUNT(*) cnt", "AVG(g) avg_g", "AVG(ab) avg_ab", "AVG(r) avg_r", "AVG(h) avg_h"),
                                        persist = TRUE,
                                        scaledTableName='kmeans_test_scaled', 
                                        centroidTableName='kmeans_test_centroids', 
                                        clusteredTableName = 'kmeans_test_clustered',
                                        tempTableName = 'kmeans_test_temp',
                                        schema='public', test=TRUE, version="6.21"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS public.kmeans_test_scaled;
                          CREATE FACT TABLE public.kmeans_test_scaled DISTRIBUTE BY HASH(id) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('id')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS public.kmeans_test_centroids;
                          DROP TABLE IF EXISTS public.kmeans_test_temp;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('public.kmeans_test_scaled')
                            OUTPUTTABLE('public.kmeans_test_centroids')
                            ClusteredOutput('public.kmeans_test_temp')
                            NUMBERK('2')
                            THRESHOLD('0.0395')
                            MAXITERNUM('25')
                          );
                          --;
                          -- Combine clustered ids with data;
                          DROP TABLE IF EXISTS public.kmeans_test_clustered;
                          CREATE FACT TABLE public.kmeans_test_clustered 
                          DISTRIBUTE BY HASH(id) AS 
                          SELECT d.*, c.clusterid 
                            FROM public.kmeans_test_temp c JOIN 
                                 public.kmeans_test_scaled d ON (c.id = d.id);
                          DROP TABLE IF EXISTS public.kmeans_test_temp;
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                          FROM (SELECT clusterid, means, COUNT(*) cnt, AVG(g) avg_g, AVG(ab) avg_ab, AVG(r) avg_r, AVG(h) avg_h          
                                  FROM (SELECT c.clusterid, c.\"ab g h r\" means, d.* 
                                          FROM public.kmeans_test_centroids c JOIN 
                                               public.kmeans_test_clustered kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                               (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, * 
                                                  FROM batting ) d on (kmp.id = d.id)
                                       ) clustered_data
                                 GROUP BY clusterid, means
                                ) c1 JOIN 
                               ( SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                   FROM VectorDistance(
                                     ON ( SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value 
                                            FROM unpivot(
                                              ON (SELECT d.* 
                                                    FROM public.kmeans_test_clustered d 
                                                   WHERE clusterid = 0
                                              )
                                              COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                              COLSTOACCUMULATE('id','clusterid')
                                              ATTRIBUTECOLUMNNAME('variable')
                                              VALUECOLUMNNAME('value')
                                              KEEPINPUTCOLUMNTYPES('true')
                                            )
                                         ) AS target PARTITION BY id
                                      ON ( SELECT *, regexp_split_to_table(\"ab g h r\", ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                FROM public.kmeans_test_centroids
                                WHERE clusterid = 0
                              ) AS ref DIMENSION
                              TARGETIDCOLUMNS('id')
                              TARGETFEATURECOLUMN('variable')
                              TARGETVALUECOLUMN('value')
                              REFIDCOLUMNS('clusterid')
                              REFFEATURECOLUMN('variable')
                              REFVALUECOLUMN('value')
                              MEASURE('Euclidean')
                            )
                            UNION ALL
                            SELECT 1 clusterid, SUM(distance::double ^ 2) withinss FROM VectorDistance(
                              ON (
                                SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value
                                FROM unpivot(
                                  ON (SELECT d.* 
                                        FROM public.kmeans_test_clustered d 
                                      WHERE clusterid = 1
                                  )
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id','clusterid')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                )
                              ) AS target PARTITION BY id
                              ON (
                                SELECT *, regexp_split_to_table(\"ab g h r\", ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                FROM public.kmeans_test_centroids
                                WHERE clusterid = 1
                              ) AS ref DIMENSION
                              TARGETIDCOLUMNS('id')
                              TARGETFEATURECOLUMN('variable')
                              TARGETVALUECOLUMN('value')
                              REFIDCOLUMNS('clusterid')
                              REFFEATURECOLUMN('variable')
                              REFVALUECOLUMN('value')
                              MEASURE('Euclidean')
                            )
                          ) c2 ON (c1.clusterid = c2.clusterid)
                          ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT id, variable, coalesce(value_double, value_long, value_str::double) value
                                FROM unpivot(
                                  ON public.kmeans_test_scaled
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                ) 
                            ) AS target PARTITION BY id
                            ON (SELECT id, variable, value_double
                                FROM unpivot(
                                  ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                  COLSTOACCUMULATE('id')
                                  ATTRIBUTECOLUMNNAME('variable')
                                  VALUECOLUMNNAME('value')
                                  KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('id')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                         "kmeans for 2 clusters kmeans version 6.21 with aggregates without WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", idAlias="id",
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(id) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting WHERE yearid > 2000  ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('id')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, * 
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.id = d.id)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('id','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY id
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('id')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT id, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY id
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('id')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with no aggregates with WHERE clause")
  

  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", 
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(playerid_stint_teamid_yearid) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('playerid_stint_teamid_yearid')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY playerid_stint_teamid_yearid
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('playerid_stint_teamid_yearid')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY playerid_stint_teamid_yearid
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with no aggregates with default id and WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", 
                                        aggregates = NULL,
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(playerid_stint_teamid_yearid) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('playerid_stint_teamid_yearid')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY playerid_stint_teamid_yearid
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('playerid_stint_teamid_yearid')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY playerid_stint_teamid_yearid
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with NULL aggregates with default id and WHERE clause")
    
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", 
                                        aggregates = character(0),
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(playerid_stint_teamid_yearid) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('playerid_stint_teamid_yearid')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY playerid_stint_teamid_yearid
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('playerid_stint_teamid_yearid')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY playerid_stint_teamid_yearid
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with 0-length aggregates with default id and WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", 
                                        aggregates = c("AVG( a ) a"),
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(playerid_stint_teamid_yearid) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('playerid_stint_teamid_yearid')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, AVG( a ) a, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY playerid_stint_teamid_yearid
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('playerid_stint_teamid_yearid')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY playerid_stint_teamid_yearid
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('playerid_stint_teamid_yearid')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with aggregates without COUNT with default id and WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'), scale=FALSE,
                                        id="playerid || '-' || stint || '-' || teamid || '-' || yearid", idAlias="id",
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: omit nulls
DROP TABLE IF EXISTS kmeans_test_scaled;
CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(id) AS 
       SELECT * FROM (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, ab, g, h, r FROM batting WHERE yearid > 2000  ) d
      WHERE id IS NOT NULL AND ab IS NOT NULL AND g IS NOT NULL AND h IS NOT NULL AND r IS NOT NULL;
--;
-- Run k-means;
DROP TABLE IF EXISTS kmeans_test_centroids;
SELECT * FROM kmeans(
      ON (SELECT 1)
      PARTITION BY 1
      INPUTTABLE('kmeans_test_scaled')
      OUTPUTTABLE('kmeans_test_centroids')
   NUMBERK('1')
      THRESHOLD('0.0395')
      MAXITERNUM('10')
    );
--;
-- Run cluster assignment, cluster stats, and within-cluster sum of squares;
SELECT c1.*, c2.withinss  
       FROM (SELECT clusterid, means, COUNT(*) cnt          FROM (SELECT c.clusterid, c.means means, d.* 
      FROM kmeans_test_centroids c JOIN 
    kmeansplot (
      ON kmeans_test_scaled PARTITION BY ANY
      ON kmeans_test_centroids DIMENSION
      centroidsTable('kmeans_test_centroids')
    ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
    (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid id, * FROM batting WHERE yearid > 2000  ) d on (kmp.id = d.id)
                    ) clustered_data
              GROUP BY clusterid, means
            ) c1 JOIN ( 
            SELECT 0 clusterid, SUM(distance::double ^ 2) withinss FROM VectorDistance(
       ON (
         SELECT clusterid, id, variable, coalesce(value_double, value_long, value_str::double) value
           FROM unpivot(
                  ON (SELECT d.* 
                        FROM kmeansplot (
                               ON kmeans_test_scaled PARTITION BY ANY
                               ON kmeans_test_centroids DIMENSION
                               centroidsTable('kmeans_test_centroids')
                             ) d 
                       WHERE clusterid = 0
                  )
                  COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                  COLSTOACCUMULATE('id','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                )
       ) AS target PARTITION BY id
       ON (
         SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
           FROM kmeans_test_centroids WHERE clusterid = 0
       ) AS ref DIMENSION
       TARGETIDCOLUMNS('id')
       TARGETFEATURECOLUMN('variable')
       TARGETVALUECOLUMN('value')
       REFIDCOLUMNS('clusterid')
       REFFEATURECOLUMN('variable')
       REFVALUECOLUMN('value')
       MEASURE('Euclidean')
     )
            ) c2 ON (c1.clusterid = c2.clusterid)
      ORDER BY clusterid;
--;
-- Compute Total Sum of Squares;
SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
       ON (SELECT id, variable, coalesce(value_double, value_long, value_str::double) value
             FROM unpivot(
               ON kmeans_test_scaled
               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
               COLSTOACCUMULATE('id')
               ATTRIBUTECOLUMNNAME('variable')
               VALUECOLUMNNAME('value')
               KEEPINPUTCOLUMNTYPES('true')
             ) 
       ) AS target PARTITION BY id
       ON (SELECT id, variable, value_double
             FROM unpivot(
               ON (SELECT 1 id, AVG(ab)::double ab, AVG(g)::double g, AVG(h)::double h, AVG(r)::double r FROM kmeans_test_scaled)
               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
               COLSTOACCUMULATE('id')
               ATTRIBUTECOLUMNNAME('variable')
               VALUECOLUMNNAME('value')
               KEEPINPUTCOLUMNTYPES('true')
             )
       ) AS ref DIMENSION
       TARGETIDCOLUMNS('id')
       TARGETFEATURECOLUMN('variable')
       TARGETVALUECOLUMN('value')
       REFIDCOLUMNS('id')
       REFFEATURECOLUMN('variable')
       REFVALUECOLUMN('value_double')
       MEASURE('Euclidean')
     );",
                          "kmeans for 1 cluster not scaled with no aggregates with WHERE clause")
  
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid",
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(__playerid__) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid __playerid__, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid __playerid__, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('__playerid__')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid __playerid__, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.__playerid__ = d.__playerid__)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, __playerid__, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('__playerid__','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY __playerid__
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('__playerid__')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT __playerid__, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('__playerid__')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY __playerid__
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('__playerid__')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with no aggregates with id one of the columns and WHERE clause")
  
  expect_equal_normalized(computeKmeans(NULL, "batting", centers=1, tableInfo=batting_info, 
                                        include=c('g','ab','r','h'),
                                        id="playerid", idAlias="playerid",
                                        scaledTableName='kmeans_test_scaled', centroidTableName='kmeans_test_centroids', 
                                        where="yearid > 2000", test=TRUE, version="5.20"),
                         "-- Data Prep: scale
                          DROP TABLE IF EXISTS kmeans_test_scaled;
                          CREATE FACT TABLE kmeans_test_scaled DISTRIBUTE BY HASH(__playerid__) AS 
                          SELECT * FROM Scale(
                            ON (SELECT playerid __playerid__, 
                                       ab, g, h, r FROM batting WHERE yearid > 2000  
                            ) AS input PARTITION BY ANY
                            ON (SELECT * FROM ScaleMap (
                              ON (SELECT playerid __playerid__, 
                                         ab, g, h, r FROM batting WHERE yearid > 2000  )
                              InputColumns ('ab', 'g', 'h', 'r')
                              -- MissValue ('OMIT')
                            )) AS STATISTIC DIMENSION
                            Method ('STD')
                            Accumulate('__playerid__')
                            GlobalScale ('false')
                            InputColumns ('ab', 'g', 'h', 'r')
                          );
                          --;
                          -- Run k-means;
                          DROP TABLE IF EXISTS kmeans_test_centroids;
                          SELECT * FROM kmeans(
                            ON (SELECT 1)
                            PARTITION BY 1
                            INPUTTABLE('kmeans_test_scaled')
                            OUTPUTTABLE('kmeans_test_centroids')
                            NUMBERK('1')
                            THRESHOLD('0.0395')
                            MAXITERNUM('10')
                          );
                          --;
                          -- Run cluster assignment, cluster stats, and within-cluster sum of squares;
                          SELECT c1.*, c2.withinss  
                            FROM (SELECT clusterid, means, COUNT(*) cnt          
                                    FROM (SELECT c.clusterid, c.means means, d.* 
                                      FROM kmeans_test_centroids c JOIN 
                                           kmeansplot (
                                             ON kmeans_test_scaled PARTITION BY ANY
                                             ON kmeans_test_centroids DIMENSION
                                             centroidsTable('kmeans_test_centroids')
                                           ) kmp ON (c.clusterid = kmp.clusterid) JOIN 
                                           (SELECT playerid __playerid__, *
                                              FROM batting WHERE yearid > 2000 ) d on (kmp.__playerid__ = d.__playerid__)
                                          ) clustered_data
                                    GROUP BY clusterid, means
                                  ) c1 JOIN ( 
                                  SELECT 0 clusterid, SUM(distance::double ^ 2) withinss 
                                    FROM VectorDistance(
                                      ON ( SELECT clusterid, __playerid__, variable, coalesce(value_double, value_long, value_str::double) value
                                             FROM unpivot(
                                               ON (SELECT d.* 
                                                     FROM kmeansplot (
                                                       ON kmeans_test_scaled PARTITION BY ANY
                                                       ON kmeans_test_centroids DIMENSION
                                                       centroidsTable('kmeans_test_centroids')
                                                   ) d 
                                            WHERE clusterid = 0
                                               )
                                               COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                               COLSTOACCUMULATE('__playerid__','clusterid')
                                               ATTRIBUTECOLUMNNAME('variable')
                                               VALUECOLUMNNAME('value')
                                               KEEPINPUTCOLUMNTYPES('true')
                                            )
                                      ) AS target PARTITION BY __playerid__
                                      ON (
                                          SELECT *, regexp_split_to_table(means, ' ')::numeric value, regexp_split_to_table('ab, g, h, r', ', ') variable 
                                            FROM kmeans_test_centroids
                                           WHERE clusterid = 0
                                      ) AS ref DIMENSION
                                      TARGETIDCOLUMNS('__playerid__')
                                      TARGETFEATURECOLUMN('variable')
                                      TARGETVALUECOLUMN('value')
                                      REFIDCOLUMNS('clusterid')
                                      REFFEATURECOLUMN('variable')
                                      REFVALUECOLUMN('value')
                                      MEASURE('Euclidean')
                                )
                              ) c2 ON (c1.clusterid = c2.clusterid)
                              ORDER BY clusterid;
                          --;
                          -- Compute Total Sum of Squares;
                          SELECT SUM(distance::double ^ 2) totss FROM VectorDistance(
                            ON (SELECT __playerid__, variable, coalesce(value_double, value_long, value_str::double) value
                                  FROM unpivot(
                                    ON kmeans_test_scaled
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('__playerid__')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                  ) 
                            ) AS target PARTITION BY __playerid__
                            ON (SELECT id, variable, value_double
                                  FROM unpivot(
                                    ON (SELECT 1 id, 0.0::double ab, 0.0::double g, 0.0::double h, 0.0::double r)
                                    COLSTOUNPIVOT('ab', 'g', 'h', 'r')
                                    COLSTOACCUMULATE('id')
                                    ATTRIBUTECOLUMNNAME('variable')
                                    VALUECOLUMNNAME('value')
                                    KEEPINPUTCOLUMNTYPES('true')
                                )
                            ) AS ref DIMENSION
                            TARGETIDCOLUMNS('__playerid__')
                            TARGETFEATURECOLUMN('variable')
                            TARGETVALUECOLUMN('value')
                            REFIDCOLUMNS('id')
                            REFFEATURECOLUMN('variable')
                            REFVALUECOLUMN('value_double')
                            MEASURE('Euclidean')
                          );",
                          "kmeans for 1 cluster with no aggregates with id one of the columns, explicit id alias, and WHERE clause")

})


kmeans_obj <- structure(list(
    cluster=integer(0),
    centers=matrix(c(1,2,3,
                     11,22,33,
                     111,222,333,
                     1111,2222,3333,
                     11111,22222,33333), nrow=5, byrow = TRUE),
    totss=100000,
    withinss = c(10,20,30,40,50),
    tot.withinss = sum(c(10,20,30,40,50)),
    betweenss = 99999,
    size = c(1,2,3,4,5),
    iter=2,
    ifault = 0,
    scale=TRUE,
    tableName="batting",
    columns=c('g', 'h', 'r'),
    scaledTableName="kmeans_test_scaled",
    centroidTableName="kmeans_test_centroids",
    id="playerid || '-' || stint || '-' || teamid || '-' || yearid",
    idAlias="playerid_stint_teamid_yearid",
    whereClause=" WHERE yearid > 2010  "),
    version = 6.21,
    class = c("toakmeans", "kmeans"))

kmeans_obj_clustered = kmeans_obj
kmeans_obj_clustered$persist = TRUE
kmeans_obj_clustered$clusteredTableName = "kmeans_test_clustered"

test_that("computeClusterSample SQL is correct", {
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, 0.01, includeId=FALSE, test=TRUE), 
                          "SELECT * FROM antiselect(
         ON 
           (SELECT * FROM sample(
             ON (SELECT  clusterid, d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   ) kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                      g, h, r 
                                 FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
                          info="Sampling fraction unscaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, c(0.01,0.02,0.03,0.04,0.05), includeId=FALSE, test=TRUE), 
                          "SELECT * FROM antiselect(
         ON 
           (SELECT * FROM sample(
             ON (SELECT  clusterid, d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   ) kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                      g, h, r 
                                 FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01','0.02','0.03','0.04','0.05')
       )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
                          info="Sampling fraction vector unscaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, 0.01, includeId=FALSE, test=TRUE), 
                          "SELECT * FROM antiselect(
         ON 
           (SELECT * FROM sample(
             ON (SELECT  clusterid, d.* 
                   FROM kmeans_test_clustered kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
                          info="Sampling clustered fraction unscaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, 0.01, test=TRUE), 
                          "SELECT * FROM sample(
             ON (SELECT  clusterid, d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   ) kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, 
                                      g, h, r 
                                 FROM batting 
                                WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )",
                          info="Sampling fraction unscaled data with row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, 0.01, test=TRUE), 
                          "SELECT * FROM sample(
             ON (SELECT  clusterid, d.* 
                   FROM kmeans_test_clustered kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )",
                          info="Sampling clustered fraction unscaled data with row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, 0.01, includeId=FALSE, scaled=TRUE, test=TRUE), 
      "SELECT * FROM antiselect(
         ON 
           (SELECT * FROM sample(
             ON (SELECT  d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   ) d 
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
      info="Sampling fraction scaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, 0.01, includeId=FALSE, scaled=TRUE, test=TRUE), 
      "SELECT * FROM antiselect(
         ON 
           (SELECT * FROM sample(
             ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
      info="Sampling clustered fraction scaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, 0.01, scaled=TRUE, includeId=TRUE, test=TRUE), 
          "SELECT * FROM sample(
             ON (SELECT  d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   ) d 
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )",
          info="Sampling fraction scaled with row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, 0.01, scaled=TRUE, includeId=TRUE, test=TRUE), 
          "SELECT * FROM sample(
             ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
             )
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             SAMPLEFRACTION('0.01')
       )",
          info="Sampling clustered fraction scaled with row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, sampleSize=1000, includeId=FALSE, test=TRUE),
"SELECT * FROM antiselect(
          ON 
            (WITH stratum_counts AS (
          SELECT clusterid stratum, count(*) stratum_count 
            FROM kmeansplot(
              ON kmeans_test_scaled PARTITION BY ANY
              ON kmeans_test_centroids DIMENSION
              centroidsTable('kmeans_test_centroids')
            ) 
           WHERE clusterid != -1
           GROUP BY 1
         )
         SELECT * FROM sample (
           ON (SELECT clusterid, d.* FROM kmeansplot(
             ON kmeans_test_scaled PARTITION BY ANY
             ON kmeans_test_centroids DIMENSION
             centroidsTable('kmeans_test_centroids')
                ) kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r 
                                 FROM batting 
                                WHERE yearid > 2010  ) d 
                         ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                   WHERE clusterid != -1
               ) AS data PARTITION BY ANY
               ON stratum_counts AS summary DIMENSION
               CONDITIONONCOLUMN('clusterid')
               CONDITIONON('0','1','2','3','4')
               ApproximateSampleSize('1000')
             )
             )
             EXCLUDE('playerid_stint_teamid_yearid')
          )",
          info="Sampling size unscaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, sampleSize=1000, includeId=FALSE, test=TRUE),
"SELECT * FROM antiselect(
         ON 
           (WITH stratum_counts AS (
         SELECT clusterid stratum, count(*) stratum_count 
           FROM kmeans_test_clustered 
          WHERE clusterid != -1
         GROUP BY 1
       )
       SELECT * FROM sample (
         ON (SELECT  clusterid, d.* 
                   FROM kmeans_test_clustered kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
            ) AS data PARTITION BY ANY
         ON stratum_counts AS summary DIMENSION
         CONDITIONONCOLUMN('clusterid')
         CONDITIONON('0','1','2','3','4')
         ApproximateSampleSize('1000')
      )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
          info="Sampling clustered size unscaled data without row id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, sampleSize = 1000, test=TRUE),
          "WITH stratum_counts AS (
            SELECT clusterid stratum, count(*) stratum_count 
              FROM kmeansplot(
                ON kmeans_test_scaled PARTITION BY ANY
                ON kmeans_test_centroids DIMENSION
                centroidsTable('kmeans_test_centroids')
              ) 
             WHERE clusterid != -1
            GROUP BY 1
           )
           SELECT * FROM sample (
             ON (SELECT clusterid, d.* FROM kmeansplot(
               ON kmeans_test_scaled PARTITION BY ANY
               ON kmeans_test_centroids DIMENSION
               centroidsTable('kmeans_test_centroids')
                 ) kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r 
                                 FROM batting 
                                WHERE yearid > 2010  ) d 
                         ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid) 
                 WHERE clusterid != -1
             ) AS data PARTITION BY ANY
             ON stratum_counts AS summary DIMENSION
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             ApproximateSampleSize('1000')
           )",
          info="Sampling size unscaled data with id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, sampleSize = 1000, test=TRUE),
          "WITH stratum_counts AS (
         SELECT clusterid stratum, count(*) stratum_count 
           FROM kmeans_test_clustered 
          WHERE clusterid != -1
         GROUP BY 1
       )
       SELECT * FROM sample (
         ON (SELECT  clusterid, d.* 
                   FROM kmeans_test_clustered kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
            ) AS data PARTITION BY ANY
         ON stratum_counts AS summary DIMENSION
         CONDITIONONCOLUMN('clusterid')
         CONDITIONON('0','1','2','3','4')
         ApproximateSampleSize('1000')
      )",
          info="Sampling clustered size unscaled data with id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, sampleSize = c(1000,100,10,100,1000), test=TRUE),
          "WITH stratum_counts AS (
         SELECT clusterid stratum, count(*) stratum_count 
           FROM kmeans_test_clustered 
          WHERE clusterid != -1
         GROUP BY 1
       )
       SELECT * FROM sample (
         ON (SELECT  clusterid, d.* 
                   FROM kmeans_test_clustered kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
            ) AS data PARTITION BY ANY
         ON stratum_counts AS summary DIMENSION
         CONDITIONONCOLUMN('clusterid')
         CONDITIONON('0','1','2','3','4')
         ApproximateSampleSize('1000','100','10','100','1000')
      )",
          info="Sampling clustered size vector unscaled data with id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, sampleSize=1000, includeId=FALSE, scaled=TRUE, test=TRUE),
          "SELECT * FROM antiselect(
             ON 
             (WITH stratum_counts AS (
               SELECT clusterid stratum, count(*) stratum_count 
                 FROM kmeansplot(
                   ON kmeans_test_scaled PARTITION BY ANY
                   ON kmeans_test_centroids DIMENSION
                   centroidsTable('kmeans_test_centroids')
                 ) 
                WHERE clusterid != -1
               GROUP BY 1
             )
             SELECT * FROM sample (
               ON (SELECT d.* FROM kmeansplot(
                 ON kmeans_test_scaled PARTITION BY ANY
                 ON kmeans_test_centroids DIMENSION
                 centroidsTable('kmeans_test_centroids')
                  ) d
                  WHERE clusterid != -1
               ) AS data PARTITION BY ANY
               ON stratum_counts AS summary DIMENSION
               CONDITIONONCOLUMN('clusterid')
               CONDITIONON('0','1','2','3','4')
               ApproximateSampleSize('1000')
             )
             )
             EXCLUDE('playerid_stint_teamid_yearid')
           )",
          info="Sampling size scaled data without id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, sampleSize=1000, includeId=FALSE, scaled=TRUE, test=TRUE),
          "SELECT * FROM antiselect(
         ON 
           (WITH stratum_counts AS (
         SELECT clusterid stratum, count(*) stratum_count 
           FROM kmeans_test_clustered 
          WHERE clusterid != -1
         GROUP BY 1
       )
       SELECT * FROM sample (
         ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
            ) AS data PARTITION BY ANY
         ON stratum_counts AS summary DIMENSION
         CONDITIONONCOLUMN('clusterid')
         CONDITIONON('0','1','2','3','4')
         ApproximateSampleSize('1000')
      )
           )
         EXCLUDE('playerid_stint_teamid_yearid')
       )",
          info="Sampling clustered size scaled data without id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj, sampleSize = 1000, scaled=TRUE, includeId=TRUE, test=TRUE),
          "WITH stratum_counts AS (
            SELECT clusterid stratum, count(*) stratum_count 
              FROM kmeansplot(
                ON kmeans_test_scaled PARTITION BY ANY
                ON kmeans_test_centroids DIMENSION
                centroidsTable('kmeans_test_centroids')
              ) 
             WHERE clusterid != -1
            GROUP BY 1
           )
           SELECT * FROM sample (
             ON (SELECT d.* FROM kmeansplot(
               ON kmeans_test_scaled PARTITION BY ANY
               ON kmeans_test_centroids DIMENSION
               centroidsTable('kmeans_test_centroids')
                 ) d
                 WHERE clusterid != -1
             ) AS data PARTITION BY ANY
             ON stratum_counts AS summary DIMENSION
             CONDITIONONCOLUMN('clusterid')
             CONDITIONON('0','1','2','3','4')
             ApproximateSampleSize('1000')
           )",
          info="Sampling size scaled data with id.")
  
  expect_equal_normalized(computeClusterSample(NULL, kmeans_obj_clustered, sampleSize = 1000, scaled=TRUE, includeId=TRUE, test=TRUE),
          "WITH stratum_counts AS (
         SELECT clusterid stratum, count(*) stratum_count 
           FROM kmeans_test_clustered 
          WHERE clusterid != -1
         GROUP BY 1
       )
       SELECT * FROM sample (
         ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
            ) AS data PARTITION BY ANY
         ON stratum_counts AS summary DIMENSION
         CONDITIONONCOLUMN('clusterid')
         CONDITIONON('0','1','2','3','4')
         ApproximateSampleSize('1000')
      )",
          info="Sampling clustered size scaled data with id.")
})


test_that("computeSilhouette SQL is correct", {
  
  expect_equal_normalized(computeSilhouette(NULL, kmeans_obj, scaled=TRUE, silhouetteTableName='kmeans_test_sil', test=TRUE),
    "-- Create Analytical Table with Silhouette Data
DROP TABLE IF EXISTS kmeans_test_sil;
CREATE ANALYTIC TABLE kmeans_test_sil
     DISTRIBUTE BY HASH(clusterid)
     AS
     WITH kmeansplotresult AS (
         SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value 
           FROM unpivot(
                  ON (SELECT  d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   )  d 
                  WHERE clusterid != -1
                  )
                  COLSTOUNPIVOT('g', 'h', 'r')
                  COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                ) 
     )
     SELECT target_clusterid clusterid, target_playerid_stint_teamid_yearid playerid_stint_teamid_yearid, a, b 
       FROM (
         SELECT target_clusterid, target_playerid_stint_teamid_yearid,
                MAX(CASE WHEN target_clusterid = ref_clusterid THEN dissimilarity ELSE 0 END) a,
                MIN(CASE WHEN target_clusterid = ref_clusterid THEN 'Infinity' ELSE dissimilarity END) b 
           FROM
             (SELECT target_clusterid, target_playerid_stint_teamid_yearid, ref_clusterid, avg(distance) dissimilarity 
                FROM VectorDistance(
                  ON kmeansplotresult AS target PARTITION BY playerid_stint_teamid_yearid
                  ON kmeansplotresult AS ref DIMENSION
                  TARGETIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  TARGETFEATURECOLUMN('variable')
                  TARGETVALUECOLUMN('value')
                  REFIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  REFFEATURECOLUMN('variable')
                  REFVALUECOLUMN('value')
                  MEASURE('Euclidean')
                ) 
          WHERE target_playerid_stint_teamid_yearid != ref_playerid_stint_teamid_yearid 
          GROUP BY 1,2,3
         ) agg  
       GROUP BY 1,2
     ) sil;
--;
-- Compute overall silhouette value;
SELECT AVG((b-a)/greatest(a,b)) silhouette_value FROM kmeans_test_sil;
--;
-- Compute silhouette cluster profiles;
SELECT * FROM Hist_Reduce(
       ON Hist_Map(
         ON (SELECT clusterid::varchar clusterid, (b-a)/greatest(a,b) silhouette_value FROM kmeans_test_sil
         )
         STARTVALUE('-1')
         BINSIZE('0.05')
         ENDVALUE('1')
         VALUE_COLUMN('silhouette_value')
         GROUP_COLUMNS('clusterid')
       ) PARTITION BY clusterid
     );
--;
-- Drop Analytical Table with Silhouette Data;
DROP TABLE IF EXISTS kmeans_test_sil;",
          "compute Silhouette on scaled data")
  
  expect_equal_normalized(computeSilhouette(NULL, kmeans_obj_clustered, scaled=TRUE, silhouetteTableName='kmeans_test_sil', test=TRUE),
    "-- Create Analytical Table with Silhouette Data
DROP TABLE IF EXISTS kmeans_test_sil;
CREATE ANALYTIC TABLE kmeans_test_sil
     DISTRIBUTE BY HASH(clusterid)
     AS
     WITH kmeansplotresult AS (
         SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
           FROM unpivot(
                  ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
                  )
                  COLSTOUNPIVOT('g', 'h', 'r')
                  COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                ) 
     )
     SELECT target_clusterid clusterid, target_playerid_stint_teamid_yearid playerid_stint_teamid_yearid, a, b 
       FROM (
         SELECT target_clusterid, target_playerid_stint_teamid_yearid,
                MAX(CASE WHEN target_clusterid = ref_clusterid THEN dissimilarity ELSE 0 END) a,
                MIN(CASE WHEN target_clusterid = ref_clusterid THEN 'Infinity' ELSE dissimilarity END) b 
           FROM
             (SELECT target_clusterid, target_playerid_stint_teamid_yearid, ref_clusterid, avg(distance) dissimilarity 
                FROM VectorDistance(
                  ON kmeansplotresult AS target PARTITION BY playerid_stint_teamid_yearid
                  ON kmeansplotresult AS ref DIMENSION
                  TARGETIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  TARGETFEATURECOLUMN('variable')
                  TARGETVALUECOLUMN('value')
                  REFIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  REFFEATURECOLUMN('variable')
                  REFVALUECOLUMN('value')
                  MEASURE('Euclidean')
                ) 
          WHERE target_playerid_stint_teamid_yearid != ref_playerid_stint_teamid_yearid 
          GROUP BY 1,2,3
         ) agg  
       GROUP BY 1,2
     ) sil;
--;
-- Compute overall silhouette value;
SELECT AVG((b-a)/greatest(a,b)) silhouette_value FROM kmeans_test_sil;
--;
-- Compute silhouette cluster profiles;
SELECT * FROM Hist_Reduce(
       ON Hist_Map(
         ON (SELECT clusterid::varchar clusterid, (b-a)/greatest(a,b) silhouette_value FROM kmeans_test_sil
         )
         STARTVALUE('-1')
         BINSIZE('0.05')
         ENDVALUE('1')
         VALUE_COLUMN('silhouette_value')
         GROUP_COLUMNS('clusterid')
       ) PARTITION BY clusterid
     );
--;
-- Drop Analytical Table with Silhouette Data;
DROP TABLE IF EXISTS kmeans_test_sil;",
          "compute Silhouette on clustered and scaled data")
  
  expect_equal_normalized(computeSilhouette(NULL, kmeans_obj, scaled=TRUE, silhouetteTableName='kmeans_test_sil', drop=FALSE, 
                                            test=TRUE),
    "-- Create Analytical Table with Silhouette Data
DROP TABLE IF EXISTS kmeans_test_sil;
CREATE ANALYTIC TABLE kmeans_test_sil
     DISTRIBUTE BY HASH(clusterid)
     AS
     WITH kmeansplotresult AS (
         SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value 
           FROM unpivot(
                  ON (SELECT  d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   )  d 
                  WHERE clusterid != -1
                  )
                  COLSTOUNPIVOT('g', 'h', 'r')
                  COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                ) 
     )
     SELECT target_clusterid clusterid, target_playerid_stint_teamid_yearid playerid_stint_teamid_yearid, a, b 
       FROM (
         SELECT target_clusterid, target_playerid_stint_teamid_yearid,
                MAX(CASE WHEN target_clusterid = ref_clusterid THEN dissimilarity ELSE 0 END) a,
                MIN(CASE WHEN target_clusterid = ref_clusterid THEN 'Infinity' ELSE dissimilarity END) b 
           FROM
             (SELECT target_clusterid, target_playerid_stint_teamid_yearid, ref_clusterid, avg(distance) dissimilarity 
                FROM VectorDistance(
                  ON kmeansplotresult AS target PARTITION BY playerid_stint_teamid_yearid
                  ON kmeansplotresult AS ref DIMENSION
                  TARGETIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  TARGETFEATURECOLUMN('variable')
                  TARGETVALUECOLUMN('value')
                  REFIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  REFFEATURECOLUMN('variable')
                  REFVALUECOLUMN('value')
                  MEASURE('Euclidean')
                ) 
          WHERE target_playerid_stint_teamid_yearid != ref_playerid_stint_teamid_yearid 
          GROUP BY 1,2,3
         ) agg  
       GROUP BY 1,2
     ) sil;
--;
-- Compute overall silhouette value;
SELECT AVG((b-a)/greatest(a,b)) silhouette_value FROM kmeans_test_sil;
--;
-- Compute silhouette cluster profiles;
SELECT * FROM Hist_Reduce(
       ON Hist_Map(
         ON (SELECT clusterid::varchar clusterid, (b-a)/greatest(a,b) silhouette_value FROM kmeans_test_sil
         )
         STARTVALUE('-1')
         BINSIZE('0.05')
         ENDVALUE('1')
         VALUE_COLUMN('silhouette_value')
         GROUP_COLUMNS('clusterid')
       ) PARTITION BY clusterid
     );",
          "compute Silhouette on scaled data without drop of silhouette table")
  
  expect_equal_normalized(computeSilhouette(NULL, kmeans_obj_clustered, scaled=TRUE, silhouetteTableName='kmeans_test_sil', drop=FALSE, 
                                            test=TRUE),
    "-- Create Analytical Table with Silhouette Data
DROP TABLE IF EXISTS kmeans_test_sil;
CREATE ANALYTIC TABLE kmeans_test_sil
     DISTRIBUTE BY HASH(clusterid)
     AS
     WITH kmeansplotresult AS (
         SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value
           FROM unpivot(
                  ON (SELECT  d.* 
                   FROM kmeans_test_clustered d 
                  WHERE clusterid != -1
                  )
                  COLSTOUNPIVOT('g', 'h', 'r')
                  COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                ) 
     )
     SELECT target_clusterid clusterid, target_playerid_stint_teamid_yearid playerid_stint_teamid_yearid, a, b 
       FROM (
         SELECT target_clusterid, target_playerid_stint_teamid_yearid,
                MAX(CASE WHEN target_clusterid = ref_clusterid THEN dissimilarity ELSE 0 END) a,
                MIN(CASE WHEN target_clusterid = ref_clusterid THEN 'Infinity' ELSE dissimilarity END) b 
           FROM
             (SELECT target_clusterid, target_playerid_stint_teamid_yearid, ref_clusterid, avg(distance) dissimilarity 
                FROM VectorDistance(
                  ON kmeansplotresult AS target PARTITION BY playerid_stint_teamid_yearid
                  ON kmeansplotresult AS ref DIMENSION
                  TARGETIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  TARGETFEATURECOLUMN('variable')
                  TARGETVALUECOLUMN('value')
                  REFIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  REFFEATURECOLUMN('variable')
                  REFVALUECOLUMN('value')
                  MEASURE('Euclidean')
                ) 
          WHERE target_playerid_stint_teamid_yearid != ref_playerid_stint_teamid_yearid 
          GROUP BY 1,2,3
         ) agg  
       GROUP BY 1,2
     ) sil;
--;
-- Compute overall silhouette value;
SELECT AVG((b-a)/greatest(a,b)) silhouette_value FROM kmeans_test_sil;
--;
-- Compute silhouette cluster profiles;
SELECT * FROM Hist_Reduce(
       ON Hist_Map(
         ON (SELECT clusterid::varchar clusterid, (b-a)/greatest(a,b) silhouette_value FROM kmeans_test_sil
         )
         STARTVALUE('-1')
         BINSIZE('0.05')
         ENDVALUE('1')
         VALUE_COLUMN('silhouette_value')
         GROUP_COLUMNS('clusterid')
       ) PARTITION BY clusterid
     );",
          "compute Silhouette on scaled and clustered data without drop of silhouette table")
  
  expect_equal_normalized(computeSilhouette(NULL, kmeans_obj, scaled=FALSE, silhouetteTableName='kmeans_test_sil', test=TRUE),
    "-- Create Analytical Table with Silhouette Data
DROP TABLE IF EXISTS kmeans_test_sil;
CREATE ANALYTIC TABLE kmeans_test_sil
     DISTRIBUTE BY HASH(clusterid)
     AS
     WITH kmeansplotresult AS (
         SELECT clusterid, playerid_stint_teamid_yearid, variable, coalesce(value_double, value_long, value_str::double) value 
           FROM unpivot(
                  ON (SELECT  clusterid, d.* 
                   FROM kmeansplot(
                     ON kmeans_test_scaled PARTITION BY ANY
                     ON kmeans_test_centroids DIMENSION
                     centroidsTable('kmeans_test_centroids')
                   )  kmp JOIN (SELECT playerid || '-' || stint || '-' || teamid || '-' || yearid playerid_stint_teamid_yearid, g, h, r 
                                  FROM batting WHERE yearid > 2010  ) d ON (kmp.playerid_stint_teamid_yearid = d.playerid_stint_teamid_yearid)
                  WHERE clusterid != -1
                  )
                  COLSTOUNPIVOT('g', 'h', 'r')
                  COLSTOACCUMULATE('playerid_stint_teamid_yearid','clusterid')
                  ATTRIBUTECOLUMNNAME('variable')
                  VALUECOLUMNNAME('value')
                  KEEPINPUTCOLUMNTYPES('true')
                ) 
     )
     SELECT target_clusterid clusterid, target_playerid_stint_teamid_yearid playerid_stint_teamid_yearid, a, b 
       FROM (
         SELECT target_clusterid, target_playerid_stint_teamid_yearid,
                MAX(CASE WHEN target_clusterid = ref_clusterid THEN dissimilarity ELSE 0 END) a,
                MIN(CASE WHEN target_clusterid = ref_clusterid THEN 'Infinity' ELSE dissimilarity END) b 
           FROM
             (SELECT target_clusterid, target_playerid_stint_teamid_yearid, ref_clusterid, avg(distance) dissimilarity 
                FROM VectorDistance(
                  ON kmeansplotresult AS target PARTITION BY playerid_stint_teamid_yearid
                  ON kmeansplotresult AS ref DIMENSION
                  TARGETIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  TARGETFEATURECOLUMN('variable')
                  TARGETVALUECOLUMN('value')
                  REFIDCOLUMNS('clusterid','playerid_stint_teamid_yearid')
                  REFFEATURECOLUMN('variable')
                  REFVALUECOLUMN('value')
                  MEASURE('Euclidean')
                ) 
          WHERE target_playerid_stint_teamid_yearid != ref_playerid_stint_teamid_yearid 
          GROUP BY 1,2,3
         ) agg  
       GROUP BY 1,2
     ) sil;
--;
-- Compute overall silhouette value;
SELECT AVG((b-a)/greatest(a,b)) silhouette_value FROM kmeans_test_sil;
--;
-- Compute silhouette cluster profiles;
SELECT * FROM Hist_Reduce(
       ON Hist_Map(
         ON (SELECT clusterid::varchar clusterid, (b-a)/greatest(a,b) silhouette_value FROM kmeans_test_sil
         )
         STARTVALUE('-1')
         BINSIZE('0.05')
         ENDVALUE('1')
         VALUE_COLUMN('silhouette_value')
         GROUP_COLUMNS('clusterid')
       ) PARTITION BY clusterid
     );
--;
-- Drop Analytical Table with Silhouette Data;
DROP TABLE IF EXISTS kmeans_test_sil;",
          "compute Silhouette on non-scaled data")
})
teradata-aster-field/toaster documentation built on May 31, 2019, 8:36 a.m.