R语言 为什么当我使用ggplot合并平滑线图时,它们会发生变化?

relj7zay  于 2023-03-10  发布在  其他
关注(0)|答案(1)|浏览(141)

我有三个不同疾病组的三个数据集,每个疾病组都有一个健康评分。当我将它们合并到一个数据集中并绘制它们的图时,它们与各自的图相比看起来不同(如下所示)。
在为每个数据集添加一个名为“disease”的变量(取值分别为“健康、心力衰竭、中风”)之后,我按如下方式将三个数据集合并起来:

# Stack the three per-disease data frames row-wise into one combined frame.
d1 <- rbind(
  healthy_clean,
  HF_clean,
  Stroke_clean
)

然后,我检查了新数据集(d1)中的个体数与原始数据集的个体数是否相同,还检查了d1中每个疾病组的健康评分的均值、SD、max和min,它们与原始数据集完全相同。
你知道为什么合并后的图中的曲线与单独绘制时的曲线不一样吗?
下面是我用于绘制单独图和合并图的代码:

##### Example of a separate (single-group) plot:
# Smoothed mental-health score against age for the healthy group, one line
# per gender.
#
# - The smoother is named explicitly. With the default method = "auto",
#   geom_smooth() picks loess or gam depending on group size, so a plot of
#   one dataset and a plot of the combined data can end up with different
#   curves.
# - The x limits are passed to scale_x_log10() itself. A separate xlim() call
#   added after scale_x_log10() replaces that scale and silently drops the
#   log transformation. Observations outside the limits are still removed
#   before smoothing, exactly as xlim() would do.
# - se = FALSE suppresses the confidence ribbon directly instead of drawing
#   it fully transparent; the fill mapping is therefore no longer needed.
ggplot(
  data = healthy_clean,
  aes(x = age, y = Mental.Health_T.score, linetype = Gender)
) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  scale_x_log10(limits = c(19, 97)) +
  scale_y_log10() +
  labs(
    x = "Age (Years)",
    y = "Health Global Score",
    title = "healthy individuals"
  )


##### Code for the combined plot
# Smoothed mental-health score against age for all groups in d1: colour
# distinguishes the disease group, linetype distinguishes gender.
#
# - The smoother is named explicitly. With the default method = "auto",
#   geom_smooth() picks loess or gam depending on group size, so the combined
#   plot can use a different smoother than the per-dataset plots.
# - The x limits are passed to scale_x_log10() itself. A separate xlim() call
#   added after scale_x_log10() replaces that scale and silently drops the
#   log transformation. Observations outside the limits are still removed
#   before smoothing, exactly as xlim() would do.
# - se = FALSE suppresses the confidence ribbon directly instead of drawing
#   it fully transparent.
ggplot(
  data = d1,
  aes(
    x = age,
    y = Mental.Health_T.score,
    colour = disease,
    linetype = Gender
  )
) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  scale_x_log10(limits = c(19, 97)) +
  scale_y_log10() +
  labs(x = "Age (Years)", y = "Health Global Score") +
  scale_colour_manual(values = c("#00AFBB", "#E7B800", "#FC4E07")) +
  scale_linetype_manual(values = c("solid", "dashed")) +
  # Draw the linetype legend keys in black so they are not tied to any
  # disease colour.
  guides(linetype = guide_legend(override.aes = list(colour = "black"))) +
  theme(legend.position = "top", legend.title = element_blank())

这里可以看到我三个独立的数据集

# Example data: 30 "Apparently healthy" participants.
# Columns: participant id (PIN), age, gender, the mental and physical health
# T-scores, and the disease-group label (constant within this frame).
healthy <- structure(
  list(
    PIN = 3:32,
    age = c(
      38L, 40L, 38L, 39L, 51L, 44L, 59L, 46L, 39L, 39L,
      42L, 53L, 37L, 46L, 36L, 51L, 36L, 61L, 39L, 48L,
      47L, 43L, 51L, 41L, 39L, 53L, 41L, 52L, 46L, 44L
    ),
    Gender = c(
      "M", "F", "M", "F", "M", "M", "F", "M", "F", "M",
      "M", "M", "F", "F", "F", "M", "F", "M", "F", "M",
      "M", "F", "F", "F", "F", "F", "M", "M", "M", "M"
    ),
    Mental.Health_T.score = c(
      46.6, 32.9, 28.2, 46.8, 58.7, 58, 49.3, 54, 43.3, 37.4,
      51.6, 53.1, 21.3, 61.8, 48, 33.8, 46.6, 51.6, 62.4, 54.6,
      50.9, 54, 51.6, 49.8, 67.6, 66.5, 62.4, 67.6, 58.3, 56.8
    ),
    Physical.Health_T.score = c(
      42.6, 37.2, 42, 34.7, 55.4, 59.3, 55.2, 42.1, 31.1, 31.9,
      45.6, 54.4, 42.6, 59.3, 39.5, 19.7, 45.2, 44, 54.5, 46.5,
      48.6, 52, 41.6, 36.2, 67.7, 57.8, 57.8, 54.7, 54.4, 48.9
    ),
    disease = rep("Apparently healthy", 30L)
  ),
  row.names = c(NA, 30L),
  class = "data.frame"
)

# Example data: 30 "Heart failure" participants.
# Columns: participant id (PIN), age, gender, the mental and physical health
# T-scores, and the disease-group label (constant within this frame).
# The object is named heart_failure_dataset: an R identifier cannot contain a
# space, so the name "heart failure_dataset" does not parse.
heart_failure_dataset <- structure(
  list(
    PIN = c(
      4201L, 4331L, 4357L, 4358L, 4360L, 4362L, 4366L, 4367L, 4368L, 4370L,
      4374L, 4377L, 4378L, 4383L, 4384L, 4386L, 4387L, 4388L, 4390L, 4391L,
      4393L, 4394L, 4399L, 4401L, 4408L, 4410L, 4412L, 4414L, 4415L, 4416L
    ),
    age = c(
      59L, 38L, 29L, 21L, 28L, 34L, 35L, 26L, 34L, 39L,
      26L, 34L, 28L, 42L, 19L, 49L, 75L, 53L, 55L, 22L,
      27L, 35L, 35L, 40L, 45L, 44L, 32L, 27L, 30L, 48L
    ),
    Gender = c(
      "M", "M", "M", "F", "F", "F", "F", "M", "M", "F",
      "F", "M", "M", "M", "F", "F", "M", "M", "F", "M",
      "F", "M", "F", "M", "F", "M", "M", "F", "F", "F"
    ),
    Mental.Health_T.score = c(
      44.8, 51.8, 67.6, 37.5, 56.7, 52.2, 30.7, 56.7, 51.6, 48,
      44.2, 54.6, 62.4, 67.6, 45.3, 48, 58.7, 56, 63.6, 27.9,
      43.5, 53.2, 55.3, 67.6, 58.9, 51.6, 49.3, 63.6, 58.7, 37.5
    ),
    Physical.Health_T.score = c(
      46.3, 26.5, 61.6, 51.5, 46.3, 44.3, 28.5, 54.7, 44.3, 40.5,
      62.2, 48.6, 54.6, 59.3, 45.6, 54.4, 48.2, 39.5, 44.3, 47.8,
      37.5, 53.5, 42.4, 47.8, 42.1, 49.2, 51.5, 62.2, 46.9, 32.2
    ),
    disease = rep("Heart failure", 30L)
  ),
  row.names = c(NA, 30L),
  class = "data.frame"
)



# Example data: 30 "Stroke" participants.
# Columns: participant id (PIN), age, gender, the mental and physical health
# T-scores, and the disease-group label (constant within this frame).
stroke_dataset <- structure(
  list(
    # Consecutive ids 4155..4185, with 4163 absent.
    PIN = c(4155:4162, 4164:4185),
    age = c(
      54L, 42L, 66L, 45L, 28L, 55L, 70L, 36L, 60L, 65L,
      40L, 57L, 41L, 41L, 37L, 39L, 19L, 42L, 58L, 33L,
      63L, 41L, 45L, 42L, 42L, 39L, 55L, 41L, 52L, 63L
    ),
    Gender = c(
      "M", "M", "F", "M", "M", "M", "M", "M", "M", "M",
      "M", "M", "F", "M", "F", "M", "M", "F", "M", "M",
      "M", "M", "F", "M", "M", "F", "M", "M", "M", "M"
    ),
    Mental.Health_T.score = c(
      29.6, 67.6, 32.8, 40.8, 50.5, 43.3, 40.7, 34.7, 46.6, 36.7,
      51.6, 44.3, 25, 45.3, 35.1, 45.6, 45.5, 51.6, 45.3, 62.4,
      46.6, 48.1, 40.7, 42, 56.8, 47.9, 62.4, 57.6, 51.6, 40.7
    ),
    Physical.Health_T.score = c(
      32.9, 67.7, 26.5, 38.4, 43.4, 39, 49, 31.9, 37.6, 34.2,
      41.6, 36.9, 24.5, 50.4, 31.9, 53.8, 44.7, 37.3, 41.6, 57.9,
      39.9, 29, 31.8, 34.2, 54.4, 40.5, 47.2, 58.7, 50.7, 41.6
    ),
    disease = rep("Stroke", 30L)
  ),
  row.names = c(NA, 30L),
  class = "data.frame"
)
5n0oy7gb

5n0oy7gb1#

这是因为geom_smooth对组合数据集和单个数据集使用了不同的平滑方法。您可以在运行绘图时出现的消息中看到这一点。
来自文档:
方法:要使用的平滑方法(函数),例如lm、glm、gam、loess、rlm。对于n < 1000的数据集,默认值为loess。对于具有1000个或更多观测值的数据集,默认值为gam。
如果不显式指定方法,较小数据集的单独图会使用loess平滑,而“健康”组的图和合并图则使用gam。

你会看到这样的信息:

`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

如果我强制每个geom_smooth都使用loess(即指定method = "loess"),那么各图就相同了:

`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

不过要小心,我收到了一些关于数据点被删除的警告,而且我不确定每个图的loess平滑参数是否相同,所以您仍然应该检查一下。

相关问题