# "tidyfst包实例分析" in tidyfst: Tidy Verbs for Fast Data Manipulation

```r
# Chunk defaults for this document: merge each chunk's source and output
# into one block and prefix printed output lines with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

## 测试数据构造

```r
library(tidyfst)

# Base data: the diamonds data set shipped with ggplot2.
diamonds <- ggplot2::diamonds
n <- 1e5  # number of simulated rows; increase it for an industrial-scale test
set.seed(2020)

# Every calendar day from 2011-01-01 to 2020-01-01, inclusive.
dtranges <- seq(
  from = as.Date("2011-01-01"),
  to = as.Date("2020-01-01"),
  by = "day"
)

# dat1: n rows drawn from diamonds with replacement, plus a random date.
n1 <- sample(nrow(diamonds), n, replace = TRUE)
dat1 <- as.data.table(diamonds[n1, ])
dat1[, "dt"] <- sample(dtranges, n, replace = TRUE)  # add the dt column

# Set the price of one row in a thousand to missing.
n2 <- sample(nrow(dat1), nrow(dat1) / 1000)
dat1[n2, "price"] <- NA

# dat2 / dat3: small lookup tables with distinct dates (sampled without
# replacement) and a random price between 1 and 1000.
# The row count is computed once; it is capped at the number of available
# dates because the dates are drawn without replacement.
n_lookup <- min(n / 1000, length(dtranges))

dat2 <- data.table(
  dt = sample(dtranges, n_lookup),
  price1 = sample(1000, n_lookup, replace = TRUE)
)

dat3 <- data.table(
  dt = sample(dtranges, n_lookup),
  price2 = sample(1000, n_lookup, replace = TRUE)
)

print(dat1)
```

## 基础

### 小技巧

```r
# Sort dat1 by the dt column and keep the sorted table for the later steps.
dat1 <- arrange_dt(dat1, dt)
dat1
```

### 聚合

#### 1.求每种切割类型、每种颜色钻石的平均价格、中位数价格与最高价格

```r
# Mean, median and maximum price for every cut/color combination, ignoring
# missing prices. The step is wrapped in sys_time_print() (tidyfst's timing
# helper), and the result is printed afterwards.
sys_time_print({
  r1_1 <- summarise_dt(
    dat1,
    by = .(cut, color),
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE),
    max_price = max(price, na.rm = TRUE)
  )
})
r1_1
```

tidyfst是永远不可能比data.table快的，但是如果你觉得上面的代码更容易掌握、更容易读懂，而在日常工作中多花零点几秒的运行时间没有太大问题（实际上节省了大家的交流时间，甚至就是节省将来自己再次读懂自己代码的时间），tidyfst就值得拥有。

#### 2.求每天最高出售价格对应的那笔订单

```r
# For each day, keep the order with the highest price:
#   1. sort by date, then by price in descending order;
#   2. drop rows whose price is missing;
#   3. within each date, keep only the first (most expensive) row.
# The per-group expression was missing from the original call
# (`group_dt(by = dt, )`); `head(1)` restores step 3.
sys_time_print({
  r1_2 <- dat1 %>%
    arrange_dt(dt, -price) %>%
    drop_na_dt(price) %>%
    group_dt(
      by = dt,
      head(1)
    )
})
r1_2
```

### join

#### 1.dat1与dat2以dt列左连接

```r
# Left join: keep every row of dat1 and attach the matching dat2 columns
# by the shared dt column.
sys_time_print({
  r2_1 <- left_join_dt(dat1, dat2, by = "dt")
})
r2_1
```

#### 2.多重join

```r
# Chain several left joins on dt: Reduce() folds the list from the left,
# so dat1 is joined with dat2 and that result is then joined with dat3.
sys_time_print({
  mymerge <- function(lhs, rhs) {
    left_join_dt(lhs, rhs, by = "dt")
  }
  r2_2 <- Reduce(mymerge, list(dat1, dat2, dat3))
})
r2_2
```

### 长宽表转换

#### 1.长表转宽表

```r
# Long-to-wide: cut is the grouping column, the levels of color are spread
# into columns, and depth and price are each aggregated with two
# NA-ignoring helpers (mean and max).
sys_time_print({
  mean1 <- function(x) {
    mean(x, na.rm = TRUE)
  }
  max1 <- function(x) {
    max(x, na.rm = TRUE)
  }
  r3_1 <- wider_dt(
    dat1,
    cut,
    value = c("depth", "price"),
    name = "color",
    fun = list(mean1, max1)
  )
})
r3_1
```

#### 2.宽表转长表

```r
# Wide-to-long: keep cut, color and the x/y/z columns, then gather the
# remaining columns (x, y, z) into a name column "xyz" and a value column
# "xyzvalue", with cut and color as the grouping columns.
sys_time_print({
  r3_2 <- longer_dt(
    select_dt(dat1, cut, color, x, y, z),
    cut, color,
    name = "xyz",
    value = "xyzvalue"
  )
})

r3_2
```

## 高阶

### 向上/下填充空值

```r
# Fill the missing values of price from neighbouring rows, using
# fill_na_dt()'s default direction, and store the result back in dat1.
sys_time_print({
  dat1 <- fill_na_dt(dat1, price)
})
dat1
```

### 添加子维度聚合结果为新列

#### 1.以dat1为例，添加两列，一列为以cut、color聚合求price的均值，另一列是求标准差

```r
# Add two columns computed within each cut/color group: the group's mean
# price and its standard deviation.
# The result is assigned back to dat1 so that the new columns are kept for
# the printout below and for the later sections; the original call dropped
# the return value.
sys_time_print({
  dat1 <- mutate_dt(
    dat1,
    mean_price = mean(price, na.rm = TRUE),
    sd_price = sd(price, na.rm = TRUE),
    by = .(cut, color)
  )
})

dat1
```

#### 2.以dat1为例，以dt分组添加一列序号id

```r
# Within each date, number the rows 1..N in a new id column.
sys_time_print({
  dat1 <- dat1 %>%
    group_dt(
      by = dt,
      mutate_dt(id = seq_len(.N))
    )
})
dat1
```

### 移动函数

```r
# Rolling statistics of price within each color group, over a window of
# 10 rows: a rolling mean (MA10_price) and a rolling standard deviation
# (MSD10_price).
sys_time_print({
  dat1 <- dat1 %>%
    group_dt(
      by = color,
      mutate_dt(
        MA10_price = frollmean(price, 10),
        MSD10_price = frollapply(price, 10, FUN = sd)
      )
    )
})

dat1
```

## 系统参数

```r
# Print the R version, platform and loaded packages used for this run.
utils::sessionInfo()
```

## Try the tidyfst package in your browser

Any scripts or data that you put into this service are public.

tidyfst documentation built on Oct. 29, 2022, 1:15 a.m.