I have tuned several algorithms (in mlr3) on a validation set:
- random forest
- xgboost
- SVM
I have extracted the balanced accuracy of each algorithm, but I would like to know whether it is possible to obtain a p-value comparing each prediction against a random prediction (without feature selection)... With a Friedman test I could compute this via benchmarking, but only on my test set. Here is my code:
# Auto tuning ranger
learner_ranger = lrn("classif.ranger",
  predict_type = "prob",
  num.trees = to_tune(1, 2000),
  mtry.ratio = to_tune(0, 1),
  sample.fraction = to_tune(1e-1, 1),
  importance = "impurity"
)
set.seed(1234)
at_ranger = auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_ranger,
  resampling = resampling_inner,
  measure = msr("classif.bacc"),
  term_evals = 20,
  store_tuning_instance = TRUE,
  store_models = TRUE
)
set.seed(1234)
# Auto tuning kknn
learner_kknn = lrn("classif.kknn", predict_type = "prob", k = to_tune(1, 30))
at_kknn = auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_kknn,
  resampling = resampling_inner,
  measure = msr("classif.bacc"),
  term_evals = 20,
  store_tuning_instance = TRUE,
  store_models = TRUE
)
set.seed(1234)
# Auto tuning xgboost
learner_xgboost = lrn("classif.xgboost",
  predict_type = "prob",
  nrounds = to_tune(1, 5000),
  eta = to_tune(1e-4, 1, logscale = TRUE),
  subsample = to_tune(0.1, 1),
  max_depth = to_tune(1, 15),
  min_child_weight = to_tune(0, 7),
  colsample_bytree = to_tune(0, 1),
  colsample_bylevel = to_tune(0, 1),
  lambda = to_tune(1e-3, 1e3, logscale = TRUE),
  alpha = to_tune(1e-3, 1e3, logscale = TRUE)
)
at_xgboost = auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_xgboost,
  resampling = resampling_inner,
  measure = msr("classif.bacc"),
  term_evals = 20,
  store_tuning_instance = TRUE,
  store_models = TRUE
)
set.seed(1234)
# Auto tuning rpart
learner_rpart = lrn("classif.rpart",
  predict_type = "prob",
  cp = to_tune(0.0001, 1),
  minsplit = to_tune(1, 60),
  maxdepth = to_tune(1, 30),
  minbucket = to_tune(1, 60)
)
at_rpart = auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_rpart,
  resampling = resampling_inner,
  measure = msr("classif.bacc"),
  term_evals = 20,
  store_tuning_instance = TRUE,
  store_models = TRUE
)
set.seed(1234)
# Auto tuning svm
learner_svm = lrn("classif.svm",
  predict_type = "prob",
  type = "C-classification",
  cost = to_tune(p_dbl(1e-5, 1e5, logscale = TRUE)),
  gamma = to_tune(p_dbl(1e-5, 1e5, logscale = TRUE)),
  kernel = to_tune(c("polynomial", "radial")),
  degree = to_tune(1, 4)
)
at_svm = auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_svm,
  resampling = resampling_inner,
  measure = msr("classif.bacc"),
  term_evals = 20,
  store_tuning_instance = TRUE,
  store_models = TRUE
)
Taking XGBoost as an example:
learners = list(lrn("classif.featureless"), at_xgboost)
set.seed(1234)
design = benchmark_grid(
  tasks = list(task, task_imp, task_Emb, task_top, task_Pearson, task_IG),
  learners = learners,
  resamplings = resampling_outer
)
bmr = benchmark(design, store_models = TRUE)
results_xgboost <- bmr$aggregate(measures)
print(results_xgboost)
archives = extract_inner_tuning_archives(bmr)
# map() comes from mlr3misc (or purrr)
inner_learners = map(archives$resample_result, "learners")
best_bacc_xgboost <- aggregate(classif.bacc ~ task_id, data = archives, max)
I have extracted my best balanced accuracy results here, but I would like to know whether there is a significant difference between my feature selection techniques, or whether the differences are just due to chance.
I found a starting point on this topic here: https://stats.stackexchange.com/questions/368176/how-to-determine-whether-a-classifier-is-significantly-better-than-random-guessi
To assess whether a classifier is better than a "benchmark" classifier (e.g., one that always assigns probability 1 to the majority class and probability 0 to every other class), you need to derive the distribution of the benchmark score. You can obtain it by bootstrapping: resample the test set, apply the benchmark to the resampled set, and record the score. Repeat this many times. You now have the distribution you need. Check how many of these resampled scores are smaller than the candidate classifier's score (when smaller scores are better).
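To make that procedure concrete, here is a minimal base-R sketch. The vectors truth (test-set labels) and response_model (the candidate's hard predictions) are hypothetical placeholders, and I use balanced accuracy as the score, so larger is better and the comparison is flipped accordingly:
library(mlr3measures)
set.seed(1234)
n_boot = 1000
boot_scores = replicate(n_boot, {
  # resample the test set with replacement
  idx = sample(seq_along(truth), replace = TRUE)
  truth_b = truth[idx]
  # the benchmark always predicts the majority class of the resampled set
  majority = names(which.max(table(truth_b)))
  response_b = factor(rep(majority, length(truth_b)), levels = levels(truth_b))
  # record the benchmark's balanced accuracy
  bacc(truth_b, response_b)
})
# share of benchmark scores at least as good as the candidate's score
model_score = bacc(truth, response_model)
p_value = mean(boot_scores >= model_score)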
The big problem: to build my test set, I manually customized the partitions between the different training, validation, and test sets...
So I am quickly limited when it comes to resampling. My raw data is quite particular (I have patients, and each patient has several lesions, which forces me to split the data so that the lesions belonging to a given patient stay together...).
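Side note: mlr3 can encode exactly this constraint through the "group" column role; once a grouping column is declared, every standard resampling keeps all rows of a group in the same set. A minimal sketch, assuming a hypothetical patient_id column in the data:
task = as_task_classif(data, target = "LesionResponse")
# declare patient_id as a grouping variable instead of a feature;
# resampling then samples whole patients and never splits their lesions
task$set_col_roles("patient_id", roles = "group")
resampling = rsmp("cv", folds = 5)
resampling$instantiate(task)  # the folds now respect patient boundaries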
Edit: following Lars's answer, I am adding the split I work with:
I created three sets - train / validation / test - by splitting the data via a custom resampling:
task = as_task_classif(data, target = "LesionResponse")
# Creation of the OUTER resampling via customization
resampling_outer = rsmp("custom")
resampling_outer$instantiate(task, train = list(train_rows_outer), test = list(test_rows))
# Creation of the INNER resampling via customization
resampling_inner = rsmp("custom")
resampling_inner$instantiate(task, train = list(train_rows), test = list(valid_rows))
# train_rows / valid_rows split my training set in two for model tuning.
# train_rows_outer / test_rows split my whole data set in two: outer training and test.
Edit: here is my strategy after benchmarking several learners. I obtained these results on the validation set.
The best results were obtained with XGBoost (absolutely no surprise here), but "task_importance" suffers from overfitting. As you can see, I checked and plotted the test results, which confirms that this feature selection technique overfits the validation set.
library(data.table)
library(ggplot2)
library(pROC)
# pull the resample results for the two feature-selection tasks out of the benchmark
resample_result_importance <- bmr$resample_results$resample_result[[6]]
resample_result_embarquee <- bmr$resample_results$resample_result[[4]]
prediction_xgboost_importance <- resample_result_importance$prediction()
prediction_xgboost_embarquee <- resample_result_embarquee$prediction()
pred_importance <- as.data.table(prediction_xgboost_importance)
pred_embarquee <- as.data.table(prediction_xgboost_embarquee)
levels(pred_importance$response) <- levels(pred_importance$truth)
levels(pred_embarquee$response) <- levels(pred_embarquee$truth)
# calculate confusion matrices
cm_importance <- mlr3measures::confusion_matrix(truth = pred_importance$truth, response = pred_importance$response, positive = "0")
cm_embarquee <- mlr3measures::confusion_matrix(truth = pred_embarquee$truth, response = pred_embarquee$response, positive = "0")
# print confusion matrices
print(cm_importance)
print(cm_embarquee)
# calculate AUC
auc_score_importance <- mlr3measures::auc(truth = pred_importance$truth, prob = pred_importance$`prob.0`, positive = "0")
auc_score_embarquee <- mlr3measures::auc(truth = pred_embarquee$truth, prob = pred_embarquee$`prob.0`, positive = "0")
# print AUC
print(auc_score_importance)
print(auc_score_embarquee)
# calculate p-values and confidence intervals with pROC
roc_obj_importance <- roc(pred_importance$truth, pred_importance$prob.1)
roc_obj_embarquee <- roc(pred_embarquee$truth, pred_embarquee$prob.1)
# constant predictions serve as the "random" reference ROC (AUC = 0.5)
random_preds_importance <- rep(1, length(pred_importance$truth))
random_preds_embarquee <- rep(1, length(pred_embarquee$truth))
roc_obj_random_importance <- roc(pred_importance$truth, random_preds_importance)
roc_obj_random_embarquee <- roc(pred_embarquee$truth, random_preds_embarquee)
roc_test_result_importance <- roc.test(roc_obj_importance, roc_obj_random_importance)
roc_test_result_embarquee <- roc.test(roc_obj_embarquee, roc_obj_random_embarquee)
p_value_importance <- roc_test_result_importance$p.value
p_value_embarquee <- roc_test_result_embarquee$p.value
auc_ci_importance <- ci.auc(roc_obj_importance, method = "delong")
auc_ci_embarquee <- ci.auc(roc_obj_embarquee, method = "delong")
print(p_value_importance)
print(p_value_embarquee)
print(auc_ci_importance)
print(auc_ci_embarquee)
rocvals_importance <- data.frame(
  threshold = unique(pred_importance$`prob.1`),
  TPR = sapply(unique(pred_importance$`prob.1`), function(x) sum(pred_importance$truth[pred_importance$`prob.1` >= x] == "1") / sum(pred_importance$truth == "1")),
  FPR = sapply(unique(pred_importance$`prob.1`), function(x) sum(pred_importance$truth[pred_importance$`prob.1` >= x] == "0") / sum(pred_importance$truth == "0")),
  model = "importance"
)
rocvals_embarquee <- data.frame(
  threshold = unique(pred_embarquee$`prob.1`),
  TPR = sapply(unique(pred_embarquee$`prob.1`), function(x) sum(pred_embarquee$truth[pred_embarquee$`prob.1` >= x] == "1") / sum(pred_embarquee$truth == "1")),
  FPR = sapply(unique(pred_embarquee$`prob.1`), function(x) sum(pred_embarquee$truth[pred_embarquee$`prob.1` >= x] == "0") / sum(pred_embarquee$truth == "0")),
  model = "embarquee"
)
rocvals <- rbind(rocvals_importance, rocvals_embarquee)
# ci.auc() returns c(lower, auc, upper), so the CI bounds are elements 1 and 3
caption_text <- paste(
  "Model importance: AUC = ", round(auc_score_importance, 3),
  ", CI = [", round(auc_ci_importance[1], 3), ", ", round(auc_ci_importance[3], 3), "], p-value = ", round(p_value_importance, 3),
  "\nModel embarquee: AUC = ", round(auc_score_embarquee, 3),
  ", CI = [", round(auc_ci_embarquee[1], 3), ", ", round(auc_ci_embarquee[3], 3), "], p-value = ", round(p_value_embarquee, 3)
)
auc_labels <- data.frame(
  model = c("embarquee", "importance"),
  AUC = c(auc_score_embarquee, auc_score_importance),
  x = c(0.7, 0.7),
  y = c(0.5, 0.4)
)
# plot ROC curves
p <- ggplot(rocvals, aes(FPR, TPR, color = model)) +
  geom_point() +
  geom_line() +
  geom_abline(linetype = "dashed", color = "red") +
  geom_text(data = auc_labels, aes(x = x, y = y, label = paste("AUC =", round(AUC, 3)), color = model), hjust = 1, size = 4) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  labs(
    title = "ROC Curves for XGBoost on Test Data",
    x = "1 - Specificity (FPR)",
    y = "Sensitivity (TPR)",
    color = "Model",
    caption = caption_text  # add the caption
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5, color = "black"),
    axis.title.x = element_text(size = 12, face = "bold", color = "black"),
    axis.title.y = element_text(size = 12, face = "bold", color = "black"),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_line(color = "grey"),
    plot.caption = element_text(hjust = 0, face = "italic")  # style for the caption
  )
print(p)
1 Answer
mlr3 supports statistical tests that can be run directly on benchmark results; see the relevant chapter of the mlr3 book. That route also corrects automatically for multiple testing, which you do not get when extracting p-values by hand.
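For reference, this functionality lives in the mlr3benchmark package; a minimal sketch on the bmr object from the question, aggregated by balanced accuracy as above, might look like this:
library(mlr3benchmark)
# aggregate the benchmark result by balanced accuracy
ba = as_benchmark_aggr(bmr, measures = msr("classif.bacc"))
# global Friedman test across learners
ba$friedman_test()
# pairwise post-hoc comparisons, corrected for multiple testing
ba$friedman_posthoc()
# critical difference diagram of the mean ranks
autoplot(ba, type = "cd")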
To get the bootstrap estimate you describe, all you need to do is set the resampling of your benchmark design to bootstrapping with the desired number of iterations.
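In mlr3 that is a small change to the benchmark design; a sketch with an arbitrary 100 repeats (a grouping column role, as sketched in the question, would additionally keep each patient's lesions together):
# bootstrap the outer evaluation instead of the single custom split
resampling_boot = rsmp("bootstrap", repeats = 100)
design = benchmark_grid(
  tasks = list(task, task_imp, task_Emb, task_top, task_Pearson, task_IG),
  learners = learners,
  resamplings = resampling_boot
)
bmr_boot = benchmark(design, store_models = TRUE)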