Description Usage Arguments Details Note See Also Examples
Aggregate functions defined for Column.
approxCountDistinct(x, rsd, ...)
collect_list(x)
collect_set(x)
countDistinct(x, ...)
grouping_bit(x)
grouping_id(x, ...)
kurtosis(x)
n_distinct(x, ...)
skewness(x)
stddev(x)
stddev_pop(x)
stddev_samp(x)
sumDistinct(x)
variance(x)
var_pop(x)
var_samp(x)
.n_distinct(...)
## S4 method for signature 'Column'
approxCountDistinct(x, rsd = 0.05)
## S4 method for signature 'Column'
kurtosis(x)
## S3 method for class 'Column'
max(x, na.rm = FALSE, ...)
## S3 method for class 'Column'
mean(x, ...)
## S3 method for class 'Column'
min(x, na.rm = FALSE, ...)
## S4 method for signature 'Column'
skewness(x)
## S4 method for signature 'Column'
stddev(x)
## S4 method for signature 'Column'
stddev_pop(x)
## S4 method for signature 'Column'
stddev_samp(x)
## S4 method for signature 'Column'
sum(x)
## S4 method for signature 'Column'
sumDistinct(x)
## S4 method for signature 'Column'
variance(x)
## S4 method for signature 'Column'
var_pop(x)
## S4 method for signature 'Column'
var_samp(x)
## S4 method for signature 'Column'
countDistinct(x, ...)
## S4 method for signature 'Column'
collect_list(x)
## S4 method for signature 'Column'
collect_set(x)
## S4 method for signature 'Column'
grouping_bit(x)
## S4 method for signature 'Column'
grouping_id(x, ...)
|
x |
Column to compute on. |
rsd |
maximum estimation error allowed (default = 0.05). |
... |
additional argument(s). For example, it could be used to pass additional Columns. |
na.rm |
currently unused |
n_distinct
: Returns the number of distinct items in a group.
approxCountDistinct
: Returns the approximate number of distinct items
in a group.
kurtosis
: Returns the kurtosis of the values in a group.
max
: Returns the maximum value of the expression in a group.
mean
: Returns the average of the values in a group. Alias for avg.
min
: Returns the minimum value of the expression in a group.
skewness
: Returns the skewness of the values in a group.
stddev
: Alias for std_dev
.
stddev_pop
: Returns the population standard deviation of the
expression in a group.
stddev_samp
: Returns the unbiased sample standard deviation of the
expression in a group.
sum
: Returns the sum of all values in the expression.
sumDistinct
: Returns the sum of distinct values in the expression.
var_pop
: Returns the population variance of the values in a group.
var_samp
: Returns the unbiased variance of the values in a group.
countDistinct
: Returns the number of distinct items in a group.
collect_list
: Creates a list of objects with duplicates.
Note: the function is non-deterministic because the order of collected results depends
on the order of rows, which may be non-deterministic after a shuffle.
collect_set
: Creates a list of objects with duplicate elements eliminated.
Note: the function is non-deterministic because the order of collected results depends
on the order of rows, which may be non-deterministic after a shuffle.
grouping_bit
: Indicates whether a specified column in a GROUP BY list is aggregated or
not; returns 1 for aggregated or 0 for not aggregated in the result set. Same as GROUPING
in SQL and the grouping function in Scala.
grouping_id
: Returns the level of grouping.
Equal to
grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn)
.
n_distinct since 1.4.0
approxCountDistinct(Column) since 1.4.0
kurtosis since 1.6.0
max since 1.5.0
mean since 1.5.0
min since 1.5.0
skewness since 1.6.0
stddev since 1.6.0
stddev_pop since 1.6.0
stddev_samp since 1.6.0
sum since 1.5.0
sumDistinct since 1.4.0
variance since 1.6.0
var_pop since 1.5.0
var_samp since 1.6.0
countDistinct since 1.4.0
collect_list since 2.3.0
collect_set since 2.3.0
grouping_bit since 2.3.0
grouping_id since 2.3.0
Other aggregate functions: avg(), corr(), firstItem(), lastItem()
## Not run:
# Dataframe used throughout this doc
df <- spark_tbl(cbind(model = rownames(mtcars),
mtcars))
## End(Not run)
## Not run:
head(select(df, approxCountDistinct(df$gear)))
head(select(df, approxCountDistinct(df$gear, 0.02)))
head(select(df, countDistinct(df$gear, df$cyl)))
head(select(df, n_distinct(df$gear)))
head(distinct(select(df, "gear")))
## End(Not run)
## Not run:
head(select(df, mean(df$mpg), sd(df$mpg), skewness(df$mpg), kurtosis(df$mpg)))
## End(Not run)
## Not run:
head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), min(df$wt), max(df$qsec)))
# metrics by num of cylinders
tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec))
head(orderBy(tmp, "cyl"))
# car with the max mpg
mpg_max <- as.numeric(collect(agg(df, max(df$mpg))))
head(where(df, df$mpg == mpg_max))
## End(Not run)
## Not run:
head(select(df, sumDistinct(df$gear)))
head(distinct(select(df, "gear")))
## End(Not run)
## Not run:
df2 = df[df$mpg > 20, ]
collect(select(df2, collect_list(df2$gear)))
collect(select(df2, collect_set(df2$gear)))
## End(Not run)
## Not run:
# With cube
agg(
cube(df, "cyl", "gear", "am"),
mean(df$mpg),
grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
)
# With rollup
agg(
rollup(df, "cyl", "gear", "am"),
mean(df$mpg),
grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
)
## End(Not run)
## Not run:
# With cube
agg(
cube(df, "cyl", "gear", "am"),
mean(df$mpg),
grouping_id(df$cyl, df$gear, df$am)
)
# With rollup
agg(
rollup(df, "cyl", "gear", "am"),
mean(df$mpg),
grouping_id(df$cyl, df$gear, df$am)
)
## End(Not run)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.