从交叉验证的 tidyclust 工作流集中收集指标

2nbm6dog  于 2023-03-27  发布在  其他
关注(0)|答案(1)|浏览(132)

我试图为交叉验证后的工作流集绘制结果(K 与平方和比率 WSS/TSS 的关系),但当我使用 collect_metrics() 时,结果中没有单独的 K 列可用来与平均误差作图。

我已经从配置名称中解析出了K的值,但我不确定这是一种有效的方法:

# Pool the metrics of every workflow in the set and keep only the WSS/TSS
# ratio rows.
tune_results <- filter(collect_metrics(wf_set), .metric == "sse_ratio")

# The x position is the last two characters of `.config`, parsed as a number
# and used here as a stand-in for K.
# NOTE(review): that suffix numbers the tuning configuration; confirm that it
# really equals the number of clusters before trusting this axis.
tune_results %>%
  mutate(config_suffix = as.numeric(stringr::str_sub(.config, -2, -1))) %>%
  ggplot(aes(x = config_suffix, y = mean, color = wflow_id)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 1:10) +
  labs(
    title = "Plot of WSS/TSS ratio by Cluster Number",
    x = "Number of clusters",
    y = "mean WSS/TSS ratio, over 10 folds"
  ) +
  theme_minimal()

下面是我正在处理的示例的 reprex;为了缩短计算时间,我已将交叉验证的折数改为 3:

# Make sure pacman itself is available before it is used to load the rest.
if (!requireNamespace("pacman", quietly = TRUE)) {
  message("Installing pacman...")
  install.packages("pacman")
}

# LOAD PACKAGES (pacman::p_load installs any that are missing first)
pacman::p_load(
  tidyverse, tidymodels, tidyclust, janitor, ClusterR,
  knitr, moments, visdat, skimr, DescTools
)

# Convert the categorical columns of mtcars to factors.
# am and vs are 0/1 codes and get readable labels; cyl, gear and carb keep
# their numeric values as factor levels.
mtcars <- mtcars %>%
  mutate(
    am = factor(am, levels = c(0, 1), labels = c("auto", "man")),
    vs = factor(vs, levels = c(0, 1), labels = c("V-shaped", "straight")),
    across(c(cyl, gear, carb), factor)
  )

# SET SEED FOR REPRODUCIBILITY
# The seed must be set before the resamples are created: vfold_cv() assigns
# rows to folds at random, so seeding afterwards leaves the fold assignment
# (and everything computed from it) different on every run.
set.seed(123)

# SET UP 3-FOLD CROSS VALIDATION
# v = 3 keeps the example fast; raise it (e.g. to 10) for a real analysis.
mtcars_cv <- vfold_cv(mtcars, v = 3)

# EDA ---------------------------------------------------------------------

# skimr::skim(mtcars)

# DescTools::Desc(mtcars)

# MODEL SPEC --------------------------------------------------------------

# K-means specification; the number of clusters is left open for tuning.
kmeans_spec <- k_means(num_clusters = tune())

# PREPROCESSING RECIPES ---------------------------------------------------

# rec1: dummy-encode the nominal columns, drop zero-variance columns, then
# normalize the numeric columns.
rec1 <- recipe(~., data = mtcars) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())

# rec2: same idea, with a novel-level step added before the dummy encoding
# and a projection onto the first two principal components at the end.
rec2 <- recipe(~., data = mtcars) %>%
  step_novel(all_nominal()) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), num_comp = 2)

# rec3: rec1 followed by an extra centering step.
# NOTE(review): step_center() comes right after step_normalize() on the same
# numeric columns, so it looks redundant -- confirm what rec3 is meant to test.
rec3 <- recipe(~., data = mtcars) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_center(all_numeric())

# Regular grid with 10 levels of the num_clusters() parameter.
clust_num_grid <- grid_regular(num_clusters(), levels = 10)

# WORKFLOW ----------------------------------------------------------------

# Cross every recipe with the single k-means specification.
wf_set <- workflow_set(
  preproc = list(rec1, rec2, rec3),
  models = list(kmeans_spec)
)

# TUNE HYPER-PARAMETERS ---------------------------------------------------

# Tune one workflow of `wf_set` over the cluster-number grid.
#
# id: a workflow id, i.e. one entry of wf_set$wflow_id.
# Returns the tune_cluster() result for that workflow.
# Reads the script-level objects wf_set, mtcars_cv and clust_num_grid.
tune_cluster_wf <- function(id) {
  wflow <- extract_workflow(wf_set, id)
  sse_metrics <- cluster_metric_set(sse_within_total, sse_total, sse_ratio)
  grid_control <- tune::control_grid(save_pred = TRUE, extract = identity)

  tune_cluster(
    wflow,
    resamples = mtcars_cv,
    grid = clust_num_grid,
    metrics = sse_metrics,
    control = grid_control
  )
}

# Tune every workflow in turn and store the results in the `result` column.
wf_set$result <- map(wf_set$wflow_id, tune_cluster_wf)

# Collect the metrics from each workflow's own tuning result instead of from
# the workflow set as a whole: collect_metrics() on a workflow set leaves out
# the tuning-parameter columns, while the per-result call keeps
# `num_clusters`, the actual K of every row.
tune_results <- tibble(
  wflow_id = wf_set$wflow_id,
  metrics = map(wf_set$result, collect_metrics)
) %>%
  unnest(cols = metrics) %>%
  filter(.metric == "sse_ratio")

# Plot the mean WSS/TSS ratio against the tuned number of clusters. K comes
# from `num_clusters`; the numeric suffix of `.config` is only a configuration
# index and is not guaranteed to equal K.
tune_results %>%
  ggplot(aes(x = num_clusters, y = mean, color = wflow_id)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ggtitle("Plot of WSS/TSS ratio by Cluster Number") +
  # Take the fold count from the resampling object so the label cannot drift
  # out of sync with the `v` used in vfold_cv().
  ylab(paste0("mean WSS/TSS ratio, over ", nrow(mtcars_cv), " folds")) +
  xlab("Number of clusters") +
  scale_x_continuous(breaks = sort(unique(tune_results$num_clusters)))
hkmswyz6

hkmswyz61#

workflow_set 的 collect_metrics() 方法的文档在这里可能会有帮助。
该条内容如下:
应用于工作流集时,返回的度量和预测不包含实际的调整参数列和值(与在其他对象上运行这些收集函数时不同)。原因是工作流集可以包含不同类型的模型或具有不同调整参数的模型。
如果需要这些列,有两个选项。首先,可以使用.config列将调优参数列合并到适当的对象中。或者,可以使用map()函数从原始对象中获取度量(请参见下面的示例)。
该示例中的代码演示了如何用 map() 遍历各个调优结果,并在此过程中取得每个结果对应的 k 值。
请注意,as.numeric(stringr::str_sub(.config, -2, -1)) 的输出不是该模型的 k 值,而是该模型/预处理组合的唯一标识符。

相关问题