我有三个不同疾病组的三个数据集,每个疾病组都有一个健康评分。当我将它们合并到一个数据集中并绘制它们的图时,它们与各自的图相比看起来不同(如下所示)。
在“健康、心力衰竭、中风”中加入一个称为“疾病”的变量后,我以这种方式将三个数据集合并起来
d1<- rbind(healthy_clean,HF_clean, Stroke_clean)
然后,我检查了新数据集(d1)中的个体数与原始数据集的个体数是否相同,还检查了d1中每个疾病组的健康评分的均值、SD、max和min,它们与原始数据集完全相同。
你知道为什么在整个线图上不是相同的线吗?
这是我的代码为单独的情节和合并之一
#####example of separate:
ggplot(data=healthy_clean, aes(x=age, y=Mental.Health_T.score,fill=Gender, linetype=Gender)) +
geom_smooth(alpha=0) +
scale_x_log10() +
scale_y_log10() +
xlab("Age (Years)") +
ylab("Health Global Score")+
ggtitle("healthy individuals")+
xlim(19,97)
#####code for the combined plot
ggplot(data=d1, aes(x=age, y=Mental.Health_T.score, color=disease , linetype=Gender)) +
geom_smooth(alpha=0) +
scale_x_log10() +
scale_y_log10() +
xlab("Age (Years)") +
ylab("Health Global Score")+
xlim(19,97)+
scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))+
scale_linetype_manual(values = c('solid','dashed'))+
guides(lty = guide_legend(override.aes = list(col = 'black')))+
theme(legend.position = "top", legend.title = element_blank())
这里可以看到我三个独立的数据集
healthy <-
structure(
list(
PIN = 3:32,
age = c(
38L,
40L,
38L,
39L,
51L,
44L,
59L,
46L,
39L,
39L,
42L,
53L,
37L,
46L,
36L,
51L,
36L,
61L,
39L,
48L,
47L,
43L,
51L,
41L,
39L,
53L,
41L,
52L,
46L,
44L
),
Gender = c(
"M",
"F",
"M",
"F",
"M",
"M",
"F",
"M",
"F",
"M",
"M",
"M",
"F",
"F",
"F",
"M",
"F",
"M",
"F",
"M",
"M",
"F",
"F",
"F",
"F",
"F",
"M",
"M",
"M",
"M"
),
Mental.Health_T.score = c(
46.6,
32.9,
28.2,
46.8,
58.7,
58,
49.3,
54,
43.3,
37.4,
51.6,
53.1,
21.3,
61.8,
48,
33.8,
46.6,
51.6,
62.4,
54.6,
50.9,
54,
51.6,
49.8,
67.6,
66.5,
62.4,
67.6,
58.3,
56.8
),
Physical.Health_T.score = c(
42.6,
37.2,
42,
34.7,
55.4,
59.3,
55.2,
42.1,
31.1,
31.9,
45.6,
54.4,
42.6,
59.3,
39.5,
19.7,
45.2,
44,
54.5,
46.5,
48.6,
52,
41.6,
36.2,
67.7,
57.8,
57.8,
54.7,
54.4,
48.9
),
disease = c(
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy",
"Apparently healthy"
)
),
row.names = c(NA,
30L),
class = "data.frame"
)
heart failure_dataset <-
structure(
list(
PIN = c(
4201L,
4331L,
4357L,
4358L,
4360L,
4362L,
4366L,
4367L,
4368L,
4370L,
4374L,
4377L,
4378L,
4383L,
4384L,
4386L,
4387L,
4388L,
4390L,
4391L,
4393L,
4394L,
4399L,
4401L,
4408L,
4410L,
4412L,
4414L,
4415L,
4416L
),
age = c(
59L,
38L,
29L,
21L,
28L,
34L,
35L,
26L,
34L,
39L,
26L,
34L,
28L,
42L,
19L,
49L,
75L,
53L,
55L,
22L,
27L,
35L,
35L,
40L,
45L,
44L,
32L,
27L,
30L,
48L
),
Gender = c(
"M",
"M",
"M",
"F",
"F",
"F",
"F",
"M",
"M",
"F",
"F",
"M",
"M",
"M",
"F",
"F",
"M",
"M",
"F",
"M",
"F",
"M",
"F",
"M",
"F",
"M",
"M",
"F",
"F",
"F"
),
Mental.Health_T.score = c(
44.8,
51.8,
67.6,
37.5,
56.7,
52.2,
30.7,
56.7,
51.6,
48,
44.2,
54.6,
62.4,
67.6,
45.3,
48,
58.7,
56,
63.6,
27.9,
43.5,
53.2,
55.3,
67.6,
58.9,
51.6,
49.3,
63.6,
58.7,
37.5
),
Physical.Health_T.score = c(
46.3,
26.5,
61.6,
51.5,
46.3,
44.3,
28.5,
54.7,
44.3,
40.5,
62.2,
48.6,
54.6,
59.3,
45.6,
54.4,
48.2,
39.5,
44.3,
47.8,
37.5,
53.5,
42.4,
47.8,
42.1,
49.2,
51.5,
62.2,
46.9,
32.2
),
disease = c(
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure",
"Heart failure"
)
),
row.names = c(NA, 30L),
class = "data.frame"
)
stroke_dataset <-
structure(
list(
PIN = c(
4155L,
4156L,
4157L,
4158L,
4159L,
4160L,
4161L,
4162L,
4164L,
4165L,
4166L,
4167L,
4168L,
4169L,
4170L,
4171L,
4172L,
4173L,
4174L,
4175L,
4176L,
4177L,
4178L,
4179L,
4180L,
4181L,
4182L,
4183L,
4184L,
4185L
),
age = c(
54L,
42L,
66L,
45L,
28L,
55L,
70L,
36L,
60L,
65L,
40L,
57L,
41L,
41L,
37L,
39L,
19L,
42L,
58L,
33L,
63L,
41L,
45L,
42L,
42L,
39L,
55L,
41L,
52L,
63L
),
Gender = c(
"M",
"M",
"F",
"M",
"M",
"M",
"M",
"M",
"M",
"M",
"M",
"M",
"F",
"M",
"F",
"M",
"M",
"F",
"M",
"M",
"M",
"M",
"F",
"M",
"M",
"F",
"M",
"M",
"M",
"M"
),
Mental.Health_T.score = c(
29.6,
67.6,
32.8,
40.8,
50.5,
43.3,
40.7,
34.7,
46.6,
36.7,
51.6,
44.3,
25,
45.3,
35.1,
45.6,
45.5,
51.6,
45.3,
62.4,
46.6,
48.1,
40.7,
42,
56.8,
47.9,
62.4,
57.6,
51.6,
40.7
),
Physical.Health_T.score = c(
32.9,
67.7,
26.5,
38.4,
43.4,
39,
49,
31.9,
37.6,
34.2,
41.6,
36.9,
24.5,
50.4,
31.9,
53.8,
44.7,
37.3,
41.6,
57.9,
39.9,
29,
31.8,
34.2,
54.4,
40.5,
47.2,
58.7,
50.7,
41.6
),
disease = c(
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke",
"Stroke"
)
),
row.names = c(NA, 30L),
class = "data.frame"
)
1条答案
按热度按时间5n0oy7gb1#
这是因为
geom_smooth
对组合数据集和单个数据集使用了不同的平滑方法。您可以在运行绘图时出现的消息中看到这一点。来自文档:
方法:要使用的平滑方法(函数),例如lm、glm、gam、loess、rlm。对于n〈1000的数据集,默认值为loess。对于具有1000个或更多观测值的数据集,默认值为gam
如果我不选择这些方法,则较小图的单个图使用
loess
平滑,而“健康”和组合图使用gam
。你会看到这样的信息:
如果我强制每个
geom_smooth
使用loess
(与method="loess"
一起),那么图是相同的:不过要小心,我收到了一些关于数据点被删除的警告,我不确定每个数据点的黄土平滑参数是否相同,所以您仍然应该检查一下。