R for Data Science(终)

R for Data Science(终)

这是学习 R for Data Science 的最后一篇笔记了。可以确定这是本很好的书。

模型基础

1
2
library(tidyverse)
library(modelr)

一个简单的模型

1
2
ggplot(sim1, aes(x, y)) +
geom_point()

我们先随机绘制 250 个线性模型拟合线:

1
2
3
4
5
6
7
8
9
models <- tibble(
a1 = runif(250, -20, 40),
a2 = runif(250, -5, 5)
)

ggplot(sim1, aes(x, y)) +
geom_abline(aes(intercept = a1,
slope = a2), data = models) +
geom_point()

最优的模型是散点和拟合线平均竖直距离最小的那个模型,先编写一个计算拟合线和散点平均竖直距离的函数,mod 参数是拟合线的截距和斜率组成的向量。

1
2
3
4
5
6
measure_distance <- function(mod, data){
diff <- data$y - (mod[1] + data$x * mod[2])
return(sqrt(mean(diff ^ 2)))
}
measure_distance(c(7, 1.5), sim1)
#> [1] 2.665212

再计算上面的 250 个模型的平均竖直距离:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
sim1_dist <- function(a1, a2){
measure_distance(c(a1, a2), sim1)
}

# 然后就可以计算上面的250个模型的平均竖直距离:
models <- models %>%
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist)) %>%
print()
#> # A tibble: 250 x 3
#> a1 a2 dist
#> <dbl> <dbl> <dbl>
#> 1 28.3 -2.22 12.5
#> 2 25.2 -0.527 10.3
#> 3 30.5 -4.47 21.1
#> 4 36.3 4.81 48.0
#> 5 -0.0800 3.09 3.93
#> 6 -13.8 2.02 18.3
#> 7 13.9 1.96 9.42
#> 8 -7.90 1.32 16.4
#> 9 -14.6 -1.21 38.0
#> 10 24.9 4.31 33.8
#> # … with 240 more rows

再来观察距离最小的前十个:

1
2
3
4
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, colour = "grey30") +
geom_abline(aes(intercept = a1, slope = a2, colour = -dist),
data = dplyr::filter(models, rank(dist) <= 10))

1
2
3
ggplot(models, aes(a1, a2)) +
geom_point(data = dplyr::filter(models, rank(dist) <= 10), size = 4, color = 'red') +
geom_point(aes(colour = - dist))

还可以通过格点搜索:

1
2
3
4
5
6
7
8
9
10
11
grid <- expand.grid(
a1 = seq(-5, 20, length = 25),
a2 = seq(1, 3, length = 25)
) %>%
mutate(
dist = purrr::map2_dbl(a1, a2, sim1_dist)
)
grid %>%
ggplot(aes(a1, a2)) +
geom_point(data = dplyr::filter(grid, rank(dist) <= 10), size = 4, color = "red") +
geom_point(aes(color = -dist))

最好的十个模式拟合的效果为:

1
2
3
4
5
6
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, colour = "grey30") +
geom_abline(
aes(intercept = a1, slope = a2, colour = - dist),
data = dplyr::filter(grid, rank(dist) <= 10)
)

使用最优化函数:

1
2
3
4
5
6
best <- optim(c(0, 0), measure_distance, data = sim1)
best$par
#> [1] 4.222248 2.051204
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, colour = "grey30") +
geom_abline(intercept = best$par[1], slope = best$par[2])

最后就是直接使用回归函数进行拟合了:

1
2
3
4
sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
#> (Intercept) x
#> 4.220822 2.051533

预测

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
grid <- sim1 %>%
data_grid(x)
grid
#> # A tibble: 10 x 1
#> x
#> <int>
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
#> 7 7
#> 8 8
#> 9 9
#> 10 10

grid <- grid %>%
add_predictions(sim1_mod)
grid
#># A tibble: 10 x 2
#> x pred
#> <int> <dbl>
#> 1 1 6.27
#> 2 2 8.32
#> 3 3 10.4
#> 4 4 12.4
#> 5 5 14.5
#> 6 6 16.5
#> 7 7 18.6
#> 8 8 20.6
#> 9 9 22.7
#>10 10 24.7
1
2
3
4
ggplot(sim1, aes(x)) +
geom_point(aes(y = y)) +
geom_line(aes(y = pred),
data = grid, colour = "red", size = 1)

残差

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
sim1 <- sim1 %>%
add_residuals(sim1_mod)
sim1
#> # A tibble: 30 x 3
#> x y resid
#> <int> <dbl> <dbl>
#> 1 1 4.20 -2.07
#> 2 1 7.51 1.24
#> 3 1 2.13 -4.15
#> 4 2 8.99 0.665
#> 5 2 10.2 1.92
#> 6 2 11.3 2.97
#> 7 3 7.36 -3.02
#> 8 3 10.5 0.130
#> 9 3 10.5 0.136
#> 10 4 12.4 0.00763
#> # … with 20 more rows
1
2
ggplot(sim1, aes(resid)) +
geom_freqpoly(binwidth = 0.5)

1
2
3
ggplot(sim1, aes(x, resid)) +
geom_ref_line(h = 0, colour = "blue") +
geom_point()

Formulas and model families

1
2
3
4
5
6
df <- tribble(
~y, ~x1, ~x2,
4, 2, 5,
5, 1, 6,
3, 4, 5
)

model_matrix()函数可以显示在拟合模型时使用的实际矩阵:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
model_matrix(df, y ~ x1)
#> # A tibble: 3 x 2
#> `(Intercept)` x1
#> <dbl> <dbl>
#> 1 1 2
#> 2 1 1
#> 3 1 4

model_matrix(df, y ~ x1 - 1)
#> # A tibble: 3 x 1
#> x1
#> <dbl>
#> 1 2
#> 2 1
#> 3 4

model_matrix(df, y ~ x1 + x2)
# A tibble: 3 x 3
#> `(Intercept)` x1 x2
#> <dbl> <dbl> <dbl>
#> 1 1 2 5
#> 2 1 1 6
#> 3 1 4 5

分类变量:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
df <- tribble(
~ sex, ~ response,
"male", 1,
"female", 2,
"male", 1
)

model_matrix(df, response ~ sex)
#> # A tibble: 3 x 2
#> `(Intercept)` sexmale
#> <dbl> <dbl>
#> 1 1 1
#> 2 1 0
#> 3 1 1

可以看到,字符串向量会被自动转换成数值向量。

1
2
ggplot(sim2) +
geom_point(aes(x, y))

1
2
3
4
5
6
7
8
9
10
11
12
mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
data_grid(x) %>%
add_predictions(mod2)
grid
#> # A tibble: 4 x 2
#> x pred
#> <chr> <dbl>
#> 1 a 1.15
#> 2 b 8.12
#> 3 c 6.13
#> 4 d 1.91
1
2
3
ggplot(sim2, aes(x)) +
geom_point(aes(y = y)) +
geom_point(data = grid, aes(y = pred), colour = "red", size = 4)

你无法对未观测到的水平进行预测:

1
2
3
4
tibble(x = "e") %>%
add_predictions(mod2)
#> Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
#> factor x has new level e

多元模型

1
2
ggplot(sim3, aes(x1, y)) +
geom_point(aes(colour = x2))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
mod1 <- lm(y ~ x1 + x2, data = sim3)
mod2 <- lm(y ~ x1 * x2, data = sim3)

grid <- sim3 %>%
data_grid(x1, x2) %>%
gather_predictions(mod1, mod2) %>%
print()

#> # A tibble: 80 x 4
#> model x1 x2 pred
#> <chr> <int> <fct> <dbl>
#> 1 mod1 1 a 1.67
#> 2 mod1 1 b 4.56
#> 3 mod1 1 c 6.48
#> 4 mod1 1 d 4.03
#> 5 mod1 2 a 1.48
#> 6 mod1 2 b 4.37
#> 7 mod1 2 c 6.28
#> 8 mod1 2 d 3.84
#> 9 mod1 3 a 1.28
#> 10 mod1 3 b 4.17
#> # … with 70 more rows
1
2
3
4
ggplot(sim3, aes(x1, y, colour = x2)) +
geom_point() +
geom_line(data = grid, aes(y = pred)) +
facet_wrap(~ model)

1
2
3
4
5
sim3 <- sim3 %>%
gather_residuals(mod1, mod2)
ggplot(sim3, aes(x1, resid, colour = x2)) +
geom_point() +
facet_grid(model ~ x2)

交互项

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
grid <- sim4 %>%
data_grid(
x1 = seq_range(x1, 5),
x2 = seq_range(x2, 5)
) %>%
gather_predictions(mod1, mod2)
grid
#> # A tibble: 50 x 4
#> model x1 x2 pred
#> <chr> <dbl> <dbl> <dbl>
#> 1 mod1 -1 -1 0.996
#> 2 mod1 -1 -0.5 -0.395
#> 3 mod1 -1 0 -1.79
#> 4 mod1 -1 0.5 -3.18
#> 5 mod1 -1 1 -4.57
#> 6 mod1 -0.5 -1 1.91
#> 7 mod1 -0.5 -0.5 0.516
#> 8 mod1 -0.5 0 -0.875
#> 9 mod1 -0.5 0.5 -2.27
#> 10 mod1 -0.5 1 -3.66
#> # … with 40 more rows

seq_range() 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
seq_range(c(0.0123, 0.923423), n = 5)
#> [1] 0.0123000 0.2400808 0.4678615 0.6956423 0.9234230

seq_range(c(0.0123, 0.923423), n = 5, pretty = TRUE)
#> [1] 0.0 0.2 0.4 0.6 0.8 1.0

# rcauchy():生成柯西分布随机数
(x1 <- rcauchy(100))
#> [1] -38.72970382 -0.96658772 -1.23310389 -1.00410032
#> [5] -2.29682139 0.49836297 1.25553202 -0.49103844
#> ······
#> [93] -0.16878755 -0.38431114 -14.65646278 -0.66320419
#> [97] 0.57966325 -0.76662258 -2.20590348 0.55266619

x1 %>% range()
#> [1] -38.72970 29.46959

seq_range(x1, n = 5)
#> [1] -38.729704 -21.679881 -4.630058 12.419766 29.469589

# 删除超过上下5%分位数的值
seq_range(x1, n = 5, trim = 0.1)
#> [1] -14.4472468 -9.8599326 -5.2726184 -0.6853042 3.9020100

# expand 参数则正好和trim参数的作用相反
x2 <- c(0, 1)
seq_range(x2, n = 5)
#> [1] 0.00 0.25 0.50 0.75 1.00
seq_range(x2, n = 5, expand = 0.1)
#> [1] -0.050 0.225 0.500 0.775 1.050

1
2
3
ggplot(grid, aes(x1, x2)) +
geom_tile(aes(fill = pred)) +
facet_grid( ~ model)

1
2
3
4
5
6
7
8
library(patchwork)
ggplot(grid, aes(x1, pred, colour = x2, group = x2)) +
geom_line() +
facet_wrap(~ model) +
ggplot(grid, aes(x2, pred, colour = x1, group = x1)) +
geom_line() +
facet_wrap(~ model) +
plot_layout(ncol = 1)

Transformations

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
df <- tribble(
~y, ~x,
1, 1,
2, 2,
3, 3
)

model_matrix(df, y ~ x^2 + x)
#> # A tibble: 3 x 2
#> `(Intercept)` x
#> <dbl> <dbl>
#> 1 1 1
#> 2 1 2
#> 3 1 3

model_matrix(df, y ~ I(x^2) + x)
#> # A tibble: 3 x 3
#> `(Intercept)` `I(x^2)` x
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 4 2
#> 3 1 9 3

model_matrix(df, y ~ poly(x, 2))
#> # A tibble: 3 x 3
#> `(Intercept)` `poly(x, 2)1` `poly(x, 2)2`
#> <dbl> <dbl> <dbl>
#> 1 1 -7.07e- 1 0.408
#> 2 1 -7.85e-17 -0.816
#> 3 1 7.07e- 1 0.408
1
2
3
4
5
6
7
8
library(splines)
model_matrix(df, y ~ ns(x, 2))
#> # A tibble: 3 x 3
#> `(Intercept)` `ns(x, 2)1` `ns(x, 2)2`
#> <dbl> <dbl> <dbl>
#> 1 1 0 0
#> 2 1 0.566 -0.211
#> 3 1 0.344 0.771
1
2
3
4
5
6
sim5 <- tibble(
x = seq(0, 3.5 * pi, length = 50),
y = 4 * sin(x) + rnorm(length(x))
)
ggplot(sim5, aes(x, y)) +
geom_point()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
mod1 <- lm(y ~ ns(x, 1), data = sim5)
mod2 <- lm(y ~ ns(x, 2), data = sim5)
mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)
mod6 <- lm(y ~ ns(x, 6), data = sim5)

grid <- sim5 %>%
data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>%
gather_predictions(mod1, mod2, mod3, mod4, mod5, mod6, .pred = "y")

ggplot(sim5, aes(x, y)) +
geom_point() +
geom_line(data = grid, colour = "red") +
facet_wrap( ~ model)

Missing values

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
df <- tribble(
~x, ~y,
1, 2.2,
2, NA,
3, 3.5,
4, 8.3,
NA, 10
)

options(na.action = na.warn)
mod <- lm(y ~ x, data = df)
#> Warning message:
#> Dropping 2 rows with missing values

mod <- lm(y ~ x, data = df, na.action = na.exclude)
# 可以看出只使用了三个观测值进行拟合:
nobs(mod)
#> [1] 3

Model building

1
2
3
4
5
library(tidyverse)
library(modelr)
library(nycflights13)
library(lubridate)
options(na.action = na.warn)

Why are low quality diamonds more expansive?

1
2
3
4
5
6
ggplot(diamonds, aes(cut, price)) + geom_boxplot() +
ggplot(diamonds, aes(color, price)) +
geom_boxplot() +
ggplot(diamonds, aes(clarity, price)) +
geom_boxplot() +
plot_layout(ncol = 1)

Price and carat:

1
2
ggplot(diamonds, aes(carat, price)) +
geom_hex(bins = 50)

1
2
3
4
5
6
diamonds2 <- diamonds %>%
dplyr::filter(carat <= 2.5) %>%
mutate(lprice = log2(price), lcarat = log2(carat))
diamonds2 %>%
ggplot(aes(lcarat, lprice)) +
geom_hex(bins = 50)

1
2
3
4
5
6
7
8
9
10
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
mutate(price = 2 ^ lprice)

ggplot(data = diamonds2, aes(carat, price)) +
geom_hex(bins = 50) +
geom_line(data = grid, colour = "red", size = 1)

1
2
3
4
5
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond, "lresid")
diamonds2 %>%
ggplot(aes(lcarat, lresid)) +
geom_hex(bins = 50)

1
2
3
4
5
6
7
ggplot(diamonds2, aes(cut, lresid)) +
geom_boxplot() +
ggplot(diamonds2, aes(cut, lresid)) +
geom_boxplot() +
ggplot(diamonds2, aes(clarity, lresid)) +
geom_boxplot() +
plot_layout(ncol = 1)

A more complicated model

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
add_predictions(mod_diamond2)
grid
#> # A tibble: 5 x 5
#> cut lcarat color clarity pred
#> <ord> <dbl> <chr> <chr> <dbl>
#> 1 Fair -0.515 G VS2 11.2
#> 2 Good -0.515 G VS2 11.3
#> 3 Very Good -0.515 G VS2 11.4
#> 4 Premium -0.515 G VS2 11.4
#> 5 Ideal -0.515 G VS2 11.4

ggplot(grid, aes(cut, pred)) +
geom_point()

1
2
3
4
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
geom_hex(bins = 50)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
diamonds2 %>%
dplyr::filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2) %>%
mutate(pred = round(2 ^ pred)) %>%
select(price, pred, carat:table, x:z) %>%
arrange(price)

#> # A tibble: 16 x 11
#> price pred carat cut color clarity depth table x y
#> <int> <dbl> <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl>
#> 1 1013 264 0.25 Fair F SI2 54.4 64 4.3 4.23
#> 2 1186 284 0.25 Prem… G SI2 59 60 5.33 5.28
#> 3 1186 284 0.25 Prem… G SI2 58.8 60 5.33 5.28
#> 4 1262 2644 1.03 Fair E I1 78.2 54 5.72 5.59
#> 5 1415 639 0.35 Fair G VS2 65.9 54 5.57 5.53
#> 6 1415 639 0.35 Fair G VS2 65.9 54 5.57 5.53
#> 7 1715 576 0.32 Fair F VS2 59.6 60 4.42 4.34
#> 8 1776 412 0.290 Fair F SI1 55.8 60 4.48 4.41
#> 9 2160 314 0.34 Fair F I1 55.8 62 4.72 4.6
#> 10 2366 774 0.3 Very… D VVS2 60.6 58 4.33 4.35
#> 11 3360 1373 0.51 Prem… F SI1 62.7 62 5.09 4.96
#> 12 3807 1540 0.61 Good F SI2 62.5 65 5.36 5.29
#> 13 3920 1705 0.51 Fair F VVS2 65.4 60 4.98 4.9
#> 14 4368 1705 0.51 Fair F VVS2 60.7 66 5.21 5.11
#> 15 10011 4048 1.01 Fair D SI2 64.6 58 6.25 6.2
#> 16 10470 23622 2.46 Prem… E SI2 59.7 59 8.82 8.76
#> # … with 1 more variable: z <dbl>

What affects the number of daily flights?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
library(nycflights13)
library(lubridate)
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarise(n = n()) %>%
print()

#> # A tibble: 365 x 2
#> date n
#> <date> <int>
#> 1 2013-01-01 842
#> 2 2013-01-02 943
#> 3 2013-01-03 914
#> 4 2013-01-04 915
#> 5 2013-01-05 720
#> 6 2013-01-06 832
#> 7 2013-01-07 933
#> 8 2013-01-08 899
#> 9 2013-01-09 902
#> 10 2013-01-10 932
#> # … with 355 more rows
1
2
3
daily %>%
ggplot(aes(x = date, n)) +
geom_line()

Day of week:

1
2
3
4
5
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))

ggplot(daily, aes(wday, n)) +
geom_boxplot()

1
2
3
4
5
6
7
mod <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, colour = "red", size = 4)

1
2
3
4
5
6
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "blue") +
geom_line()

1
2
3
ggplot(daily, aes(date, resid, colour = wday)) +
geom_ref_line(h = 0, colour = "blue") +
geom_line()

1
2
3
4
5
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line(colour = "grey50") +
geom_smooth(se = FALSE, span = 0.2)

Seasonal Saturday effect:

1
2
3
4
5
6
daily %>%
dplyr::filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point() +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b")

分季节:

1
2
3
4
5
6
7
8
9
10
11
12
13
term <- function(date){
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("春天", "夏天", "秋天"))
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
dplyr::filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3) +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b")

1
2
3
daily %>%
ggplot(aes(wday, n, colour = term)) +
geom_boxplot()

1
2
3
4
5
6
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75)

1
2
3
4
5
6
7
8
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, colour= "red") +
facet_wrap( ~ term)

拟合稳健的线性模型

1
2
3
4
5
6
7
mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, colour = "blue") +
geom_line()

Computed variables:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
compute_vars <- function(data){
data %>%
mutate(
term = term(date),
wday = wday(date, label = TRUE)
)
}
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, colour = "blue") +
geom_line()

Time of year: an alternative approach:

1
2
3
4
5
6
7
8
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, colour = wday)) +
geom_line() +
geom_point()

Many models

gapminder 数据集总结了各个国家随着时间的推移,预期寿命、GDP 等数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
library(gapminder)
gapminder
#> # A tibble: 1,704 x 6
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan Asia 1952 28.8 8425333 779.
#> 2 Afghanistan Asia 1957 30.3 9240934 821.
#> 3 Afghanistan Asia 1962 32.0 10267083 853.
#> 4 Afghanistan Asia 1967 34.0 11537966 836.
#> 5 Afghanistan Asia 1972 36.1 13079460 740.
#> 6 Afghanistan Asia 1977 38.4 14880372 786.
#> 7 Afghanistan Asia 1982 39.9 12881816 978.
#> 8 Afghanistan Asia 1987 40.8 13867957 852.
#> 9 Afghanistan Asia 1992 41.7 16317921 649.
#> 10 Afghanistan Asia 1997 41.8 22227415 635.
#> # … with 1,694 more rows

下面我们将研究每个国家的预期寿命如何随着时间变化:

1
2
3
gapminder %>%
ggplot(aes(year, lifeExp, group = country)) +
geom_line(alpha = 1/3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
cn <- dplyr::filter(gapminder, country == "China")
p1 <- cn %>%
ggplot(aes(year, lifeExp)) +
geom_line() +
ggtitle("Full data = ")
cn_mod <- lm(lifeExp ~ year, data = cn)
p2 <- cn %>%
add_predictions(cn_mod) %>%
ggplot(aes(year, pred)) +
geom_line() +
ggtitle("Linear trend + ")

p3 <- cn %>% add_residuals(cn_mod) %>%
ggplot(aes(year, resid)) +
geom_hline(yintercept = 0, colour = "blue", size = 3) +
geom_line() +
ggtitle("Remaining pattern")
p1 + p2 + p3 +
plot_layout(ncol = 1)

嵌套数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
(by_country <- gapminder %>%
group_by(country, continent) %>%
nest())
#> # A tibble: 142 x 3
#> country continent data
#> <fct> <fct> <list>
#> 1 Afghanistan Asia <tibble [12 × 4]>
#> 2 Albania Europe <tibble [12 × 4]>
#> 3 Algeria Africa <tibble [12 × 4]>
#> 4 Angola Africa <tibble [12 × 4]>
#> 5 Argentina Americas <tibble [12 × 4]>
#> 6 Australia Oceania <tibble [12 × 4]>
#> 7 Austria Europe <tibble [12 × 4]>
#> 8 Bahrain Asia <tibble [12 × 4]>
#> 9 Bangladesh Asia <tibble [12 × 4]>
#> 10 Belgium Europe <tibble [12 × 4]>
#> # … with 132 more rows

List-columns

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
country_model <- function(df){
lm(lifeExp ~ year, data = df)
}
models <- map(by_country$data, country_model)
by_country <- by_country %>%
mutate(model = map(data, country_model))
by_country
#> # A tibble: 142 x 4
#> country continent data model
#> <fct> <fct> <list> <list>
#> 1 Afghanistan Asia <tibble [12 × 4]> <S3: lm>
#> 2 Albania Europe <tibble [12 × 4]> <S3: lm>
#> 3 Algeria Africa <tibble [12 × 4]> <S3: lm>
#> 4 Angola Africa <tibble [12 × 4]> <S3: lm>
#> 5 Argentina Americas <tibble [12 × 4]> <S3: lm>
#> 6 Australia Oceania <tibble [12 × 4]> <S3: lm>
#> 7 Austria Europe <tibble [12 × 4]> <S3: lm>
#> 8 Bahrain Asia <tibble [12 × 4]> <S3: lm>
#> 9 Bangladesh Asia <tibble [12 × 4]> <S3: lm>
#> 10 Belgium Europe <tibble [12 × 4]> <S3: lm>
#> # … with 132 more rows

by_country %>%
dplyr::filter(continent == "Europe")
#> # A tibble: 30 x 4
#> country continent data model
#> <fct> <fct> <list> <list>
#> 1 Albania Europe <tibble [12 × 4]> <S3: lm>
#> 2 Austria Europe <tibble [12 × 4]> <S3: lm>
#> 3 Belgium Europe <tibble [12 × 4]> <S3: lm>
#> 4 Bosnia and Herzegovina Europe <tibble [12 × 4]> <S3: lm>
#> 5 Bulgaria Europe <tibble [12 × 4]> <S3: lm>
#> 6 Croatia Europe <tibble [12 × 4]> <S3: lm>
#> 7 Czech Republic Europe <tibble [12 × 4]> <S3: lm>
#> 8 Denmark Europe <tibble [12 × 4]> <S3: lm>
#> 9 Finland Europe <tibble [12 × 4]> <S3: lm>
#> 10 France Europe <tibble [12 × 4]> <S3: lm>
#> # … with 20 more rows

by_country %>%
arrange(continent, country)

#> # A tibble: 142 x 4
#> country continent data model
#> <fct> <fct> <list> <list>
#> 1 Algeria Africa <tibble [12 × 4]> <S3: lm>
#> 2 Angola Africa <tibble [12 × 4]> <S3: lm>
#> 3 Benin Africa <tibble [12 × 4]> <S3: lm>
#> 4 Botswana Africa <tibble [12 × 4]> <S3: lm>
#> 5 Burkina Faso Africa <tibble [12 × 4]> <S3: lm>
#> 6 Burundi Africa <tibble [12 × 4]> <S3: lm>
#> 7 Cameroon Africa <tibble [12 × 4]> <S3: lm>
#> 8 Central African Republic Africa <tibble [12 × 4]> <S3: lm>
#> 9 Chad Africa <tibble [12 × 4]> <S3: lm>
#> 10 Comoros Africa <tibble [12 × 4]> <S3: lm>
#> # … with 132 more rows

Unnesting

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
(by_country <- by_country %>%
mutate(
resid = map2(data, model, add_residuals)
))
#> # A tibble: 142 x 5
#> country continent data model resid
#> <fct> <fct> <list> <list> <list>
#> 1 Afghanistan Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 2 Albania Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 3 Algeria Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 4 Angola Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 5 Argentina Americas <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 6 Australia Oceania <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 7 Austria Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 8 Bahrain Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 9 Bangladesh Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> 10 Belgium Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
#> # … with 132 more rows

(resid <- unnest(by_country, resid))
#> # A tibble: 1,704 x 7
#> country continent year lifeExp pop gdpPercap resid
#> <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
#> 1 Afghanistan Asia 1952 28.8 8425333 779. -1.11
#> 2 Afghanistan Asia 1957 30.3 9240934 821. -0.952
#> 3 Afghanistan Asia 1962 32.0 10267083 853. -0.664
#> 4 Afghanistan Asia 1967 34.0 11537966 836. -0.0172
#> 5 Afghanistan Asia 1972 36.1 13079460 740. 0.674
#> 6 Afghanistan Asia 1977 38.4 14880372 786. 1.65
#> 7 Afghanistan Asia 1982 39.9 12881816 978. 1.69
#> 8 Afghanistan Asia 1987 40.8 13867957 852. 1.28
#> 9 Afghanistan Asia 1992 41.7 16317921 649. 0.754
#> 10 Afghanistan Asia 1997 41.8 22227415 635. -0.534
#> # … with 1,694 more rows

resid %>%
ggplot(aes(year, resid)) +
geom_line(aes(group = country), alpha = 1/3) +
geom_smooth(se = FALSE)

1
2
3
4
resid %>%
ggplot(aes(year, resid, group = country)) +
geom_line(alpha = 1/3) +
facet_wrap( ~ continent, scales = "free_y")

Model quality

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Model quality
broom::glance(cn_mod)
#> # A tibble: 1 x 11
#> r.squared adj.r.squared sigma statistic p.value df logLik AIC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 0.871 0.858 3.86 67.7 9.21e-6 2 -32.1 70.3
#> # … with 3 more variables: BIC <dbl>, deviance <dbl>,
#> # df.residual <int>

(glance <- by_country %>%
mutate(glance = map(model, broom::glance)) %>%
unnest(glance, .drop = TRUE))
#> # A tibble: 142 x 13
#> country continent r.squared adj.r.squared sigma statistic p.value
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghan… Asia 0.948 0.942 1.22 181. 9.84e- 8
#> 2 Albania Europe 0.911 0.902 1.98 102. 1.46e- 6
#> 3 Algeria Africa 0.985 0.984 1.32 662. 1.81e-10
#> 4 Angola Africa 0.888 0.877 1.41 79.1 4.59e- 6
#> 5 Argent… Americas 0.996 0.995 0.292 2246. 4.22e-13
#> 6 Austra… Oceania 0.980 0.978 0.621 481. 8.67e-10
#> 7 Austria Europe 0.992 0.991 0.407 1261. 7.44e-12
#> 8 Bahrain Asia 0.967 0.963 1.64 291. 1.02e- 8
#> 9 Bangla… Asia 0.989 0.988 0.977 930. 3.37e-11
#> 10 Belgium Europe 0.995 0.994 0.293 1822. 1.20e-12
#> # … with 132 more rows, and 6 more variables: df <int>,
#> # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
#> # df.residual <int>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
glance %>%
arrange(r.squared)

#> # A tibble: 142 x 13
#> country continent r.squared adj.r.squared sigma statistic p.value
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Rwanda Africa 0.0172 -0.0811 6.56 0.175 0.685
#> 2 Botswa… Africa 0.0340 -0.0626 6.11 0.352 0.566
#> 3 Zimbab… Africa 0.0562 -0.0381 7.21 0.596 0.458
#> 4 Zambia Africa 0.0598 -0.0342 4.53 0.636 0.444
#> 5 Swazil… Africa 0.0682 -0.0250 6.64 0.732 0.412
#> 6 Lesotho Africa 0.0849 -0.00666 5.93 0.927 0.358
#> 7 Cote d… Africa 0.283 0.212 3.93 3.95 0.0748
#> 8 South … Africa 0.312 0.244 4.74 4.54 0.0588
#> 9 Uganda Africa 0.342 0.276 3.19 5.20 0.0457
#> 10 Congo,… Africa 0.348 0.283 2.43 5.34 0.0434
#> # … with 132 more rows, and 6 more variables: df <int>,
#> # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
#> # df.residual <int>
1
2
3
glance %>%
ggplot(aes(continent, r.squared)) +
geom_jitter(width = 0.5)

1
2
3
4
5
bad.fit <- dplyr::filter(glance, r.squared < 0.25)
gapminder %>%
semi_join(bad.fit, by = "country") %>%
ggplot(aes(year, lifeExp, colour = country)) +
geom_line()

List-columns

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
data.frame(x = list(1:3, 3:5))
data.frame(
x = I(list(1:3, 3:5)),
y = c("1, 2", "3, 4, 5")
)

tibble(
x = list(1:3, 3:5),
y = c("1, 2", "3, 4, 5")
)
tribble(
~x, ~y,
1:3, "1, 2",
3:5, "3, 4, 5"
)

Creating list-columns

With nesting

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
gapminder %>%
group_by(country, continent) %>%
nest()
#> # A tibble: 142 x 3
#> country continent data
#> <fct> <fct> <list>
#> 1 Afghanistan Asia <tibble [12 × 4]>
#> 2 Albania Europe <tibble [12 × 4]>
#> 3 Algeria Africa <tibble [12 × 4]>
#> 4 Angola Africa <tibble [12 × 4]>
#> 5 Argentina Americas <tibble [12 × 4]>
#> 6 Australia Oceania <tibble [12 × 4]>
#> 7 Austria Europe <tibble [12 × 4]>
#> 8 Bahrain Asia <tibble [12 × 4]>
#> 9 Bangladesh Asia <tibble [12 × 4]>
#> 10 Belgium Europe <tibble [12 × 4]>
#> # … with 132 more rows

gapminder %>%
nest(year:gdpPercap)
#> # A tibble: 142 x 3
#> country continent data
#> <fct> <fct> <list>
#> 1 Afghanistan Asia <tibble [12 × 4]>
#> 2 Albania Europe <tibble [12 × 4]>
#> 3 Algeria Africa <tibble [12 × 4]>
#> 4 Angola Africa <tibble [12 × 4]>
#> 5 Argentina Americas <tibble [12 × 4]>
#> 6 Australia Oceania <tibble [12 × 4]>
#> 7 Austria Europe <tibble [12 × 4]>
#> 8 Bahrain Asia <tibble [12 × 4]>
#> 9 Bangladesh Asia <tibble [12 × 4]>
#> 10 Belgium Europe <tibble [12 × 4]>
#> # … with 132 more rows

From vectorised functions

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
df <- tribble(
~ x1,
"a,b,c",
"d,e,f,g"
)
df %>%
mutate(
x2 = stringr::str_split(x1, ",")
)
#> # A tibble: 2 x 2
#> x1 x2
#> <chr> <list>
#> 1 a,b,c <chr [3]>
#> 2 d,e,f,g <chr [4]>

df %>%
mutate(x2 = stringr::str_split(x1, ",")) %>%
unnest()

#> # A tibble: 7 x 2
#> x1 x2
#> <chr> <chr>
#> 1 a,b,c a
#> 2 a,b,c b
#> 3 a,b,c c
#> 4 d,e,f,g d
#> 5 d,e,f,g e
#> 6 d,e,f,g f
#> 7 d,e,f,g g

sim <- tribble(
~f, ~params,
"runif", list(min = -1, max = 1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10)
)
sim %>%
mutate(sims = invoke_map(f, params, n = 10))
#> # A tibble: 3 x 3
#> f params sims
#> <chr> <list> <list>
#> 1 runif <list [2]> <dbl [10]>
#> 2 rnorm <list [1]> <dbl [10]>
#> 3 rpois <list [1]> <int [10]>

From multivalued summaries

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# 下面的代码会出错
mtcars %>%
group_by(cyl) %>%
summarise(q = quantile(mpg))
#> 错误: Column `q` must be length 1 (a summary value), not 5

mtcars %>%
group_by(cyl) %>%
summarise(q = list(quantile(mpg)))
#> # A tibble: 3 x 2
#> cyl q
#> <dbl> <list>
#> 1 4 <dbl [5]>
#> 2 6 <dbl [5]>
#> 3 8 <dbl [5]>

probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
mtcars %>%
group_by(cyl) %>%
summarise(p = list(probs),
q = list(quantile(mpg, probs))) %>%
unnest()

#> # A tibble: 15 x 3
#> cyl p q
#> <dbl> <dbl> <dbl>
#> 1 4 0.01 21.4
#> 2 4 0.25 22.8
#> 3 4 0.5 26
#> 4 4 0.75 30.4
#> 5 4 0.99 33.8
#> 6 6 0.01 17.8
#> 7 6 0.25 18.6
#> 8 6 0.5 19.7
#> 9 6 0.75 21
#> 10 6 0.99 21.4
#> 11 8 0.01 10.4
#> 12 8 0.25 14.4
#> 13 8 0.5 15.2
#> 14 8 0.75 16.2
#> 15 8 0.99 19.1

From a named list

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
x <- list(
a = 1:5,
b = 3:4,
c = 5:6
)
df <- enframe(x)
df
#> # A tibble: 3 x 2
#> name value
#> <chr> <list>
#> 1 a <int [5]>
#> 2 b <int [2]>
#> 3 c <int [2]>

df %>%
mutate(smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1])))
#> # A tibble: 3 x 3
#> name value smry
#> <chr> <list> <chr>
#> 1 a <int [5]> a: 1
#> 2 b <int [2]> b: 3
#> 3 c <int [2]> c: 5

List to vector

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
df <- tribble(
~x,
letters[1:5],
1:3,
runif(5)
)
df
#> # A tibble: 3 x 1
#> x
#> <list>
#> 1 <chr [5]>
#> 2 <int [3]>
#> 3 <dbl [5]>

df %>%
mutate(
type = map_chr(x, typeof),
length = map_int(x, length)
)

#> # A tibble: 3 x 3
#> x type length
#> <list> <chr> <int>
#> 1 <chr [5]> character 5
#> 2 <int [3]> integer 3
#> 3 <dbl [5]> double 5

df <- tribble(
~ x,
list(a = 1, b = 2),
list(a = 2, c = 4)
)
df %>%
mutate(
a = map_dbl(x, "a"),
b = map_dbl(x, "b", .null = NA_real_),
c = map_dbl(x, "c", .null = NA_real_)
)
#> # A tibble: 2 x 4
#> x a b c
#> <list> <dbl> <dbl> <dbl>
#> 1 <list [2]> 1 2 NA
#> 2 <list [2]> 2 NA 4

Unnesting

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
tibble(x = 1:2, y = list(1:4, 1)) %>%
unnest(y)

#> # A tibble: 5 x 2
#> x y
#> <int> <dbl>
#> 1 1 1
#> 2 1 2
#> 3 1 3
#> 4 1 4
#> 5 2 1

df1 <- tribble(
~x, ~y, ~z,
1, c("a", "b"), 1:2,
2, "c", 3
)
df1
#> # A tibble: 2 x 3
#> x y z
#> <dbl> <list> <list>
#> 1 1 <chr [2]> <int [2]>
#> 2 2 <chr [1]> <dbl [1]>

df1 %>%
unnest(y, z)
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <chr> <dbl>
#> 1 1 a 1
#> 2 1 b 2
#> 3 2 c 3

如果 x 和 y 有不同熟练的元素上述操作就会出错:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
df2 <- tribble(
~x, ~y, ~z,
1, "a", 1:2,
2, c("b", "c"), 3
)
df2
#> # A tibble: 2 x 3
#> x y z
#> <dbl> <list> <list>
#> 1 1 <chr [1]> <int [2]>
#> 2 2 <chr [2]> <dbl [1]>

df2 %>%
unnest(y, z)
#> 错误: All nested columns must have the same number of elements.
#> Call `rlang::last_error()` to see a backtrace

剩余的一些内容

1
2
3
4
5
comma <- function(x) format(x, digits = 2, big.mark = ",")
comma(1245)
#> [1] "1,245"
comma(0.12345)
#> [1] "0.12"
1
2
3
4
5
6
7
8
9
10
11
df <- tibble(
x = runif(10),
y = runif(10)
)

ggplot(df, aes(x, y)) +
geom_point() +
labs(
x = quote(sum(x[i]^2, i == 1, n)),
y = quote(alpha + beta + frac(delta, theta))
)

1
2
3
4
5
6
7
8
best_in_class <- mpg %>%
group_by(class) %>%
dplyr::filter(row_number(desc(hwy)) == 1)

ggplot(mpg, aes(displ, hwy)) +
geom_point(aes(colour = class)) +
geom_point(size = 3, shape = 1, data = best_in_class) +
ggrepel::geom_label_repel(aes(label = model), data = best_in_class)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class_avg <- mpg %>%
group_by(class) %>%
summarise(
displ = median(displ),
hwy = median(hwy)
)
ggplot(mpg, aes(displ, hwy, colour = class)) +
ggrepel::geom_label_repel(
aes(label = class),
data = class_avg,
size = 6,
label.size = 0,
segment.color = NA
) +
geom_point() +
theme(legend.position = "none")

1
2
3
4
5
6
7
8
9
label <- mpg %>%
summarise(
displ = max(displ),
hwy = max(hwy),
label = "Increasing engine size is \nrelated to decreasing fuel economy."
)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_text(aes(label = label), data = label, vjust = "top", hjust = "right", family = enfont)

1
2
3
4
5
6
7
8
9
label <- mpg %>%
summarise(
displ = Inf,
hwy = Inf,
label = "Increasing engine size is \nrelated to decreasing fuel economy."
)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_text(aes(label = label), data = label, vjust = "top", hjust = "right", family = enfont)

1
2
3
4
5
6
"Increasing engine size is \nrelated to decreasing fuel economy." %>%
stringr::str_wrap(width = 40) %>%
writeLines()

#> Increasing engine size is related to
#> decreasing fuel economy.
1
2
3
4
5
ggplot(mpg, aes(displ, hwy)) +
geom_point(aes(colour = class)) +
geom_smooth(se = FALSE) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 1, override.aes = list(size = 4)))

Replacing a scale:

1
2
3
4
5
6
7
8
ggplot(diamonds, aes(carat, price)) +
geom_bin2d()
ggplot(diamonds, aes(log10(carat), log10(price))) +
geom_bin2d()
ggplot(diamonds, aes(carat, price)) +
geom_bin2d() +
scale_x_log10() +
scale_y_log10()

1
2
3
4
5
6
7
presidential %>%
mutate(id = 33 + row_number()) %>%
ggplot(aes(start, id, colour = party)) +
geom_point() +
geom_segment(aes(xend = end, yend = id)) +
scale_colour_manual(values = c(Republican = "red",
Democratic = "blue"))

1
2
3
4
5
6
7
8
9
10
11
df <- tibble(
x = rnorm(10000),
y = rnorm(10000)
)
ggplot(df, aes(x, y)) +
geom_hex() +
coord_fixed() +
ggplot(df, aes(x, y)) +
geom_hex() +
viridis::scale_fill_viridis() +
coord_fixed()

Zooming:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
p1 <- ggplot(mpg, aes(displ, hwy)) +
geom_point(aes(color = class)) +
geom_smooth() +
coord_cartesian(xlim = c(5, 7), ylim = c(10, 30))

p2 <- mpg %>%
dplyr::filter(displ >= 5,
displ <= 7,
hwy >= 10,
hwy <= 30) %>%
ggplot(aes(displ, hwy)) +
geom_point(aes(color = class)) +
geom_smooth()

p1 + p2

# R

Comments

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×