c++ 使用存储在链表中的数据遍历二叉决策树

我有一个二叉决策树，它将根据一些属性来决定所讨论的肿瘤是良性还是恶性。每个病人的这些属性的数据存储在一个链表中。我很难找出如何通过决策树来处理链表中存储的数据，以获得良性还是恶性的预期结果。
到目前为止，我已经为决策树和链表编写了所有代码。链表中填充了从.csv文件中读取的数据。我已经创建了一个名为ProcessingPatientData. h的头文件。
在这个文件中，我想声明一个void process_data函数，它将决策树和链表作为参数，然后在树中运行数据，以确定每个患者的结果。
我也想声明其他函数，但这不是我的问题。我在弄清楚如何将我的链表作为参数包括在内时遇到了麻烦
下面你会发现我的代码的链接列表和决策树以及我有什么到目前为止的函数“进程_数据”。
`

#include "CancerLogic.h"
#include "DecisionNode.h"
#include "LinkedList.h"
#include "ProcessPatientData.h"

typedef struct
{
    int id;
    int clump_thickness;
    int uniformity_of_cell_size;
    int uniformity_of_cell_shape;
    int marginal_adhesion;
    int single_epithelial_cell_size;
    int bare_nuclei;
    int bland_chromatin;
    int normal_nucleoli;
    int mitoses;
    int class_;
}Patients;

void cancer_tree()
{
    // Create the tree
    // Terminal Nodes
    DecisionNode<CancerLogic::patient_processing_data> malignant_node(CancerLogic::malignant);
    DecisionNode<CancerLogic::patient_processing_data> benign_node(CancerLogic::benign);

    // Right side (true) nodes
    DecisionNode<CancerLogic::patient_processing_data> ma_greater_than_3_r_node(CancerLogic::ma_greater_than_3_r, &malignant_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> ct_greater_than_5_r_node(CancerLogic::ct_greater_than_5_r, &malignant_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> ma_greater_than_5_r_node(CancerLogic::ma_greater_than_5_r, &malignant_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> csize_greater_than_3_r_node(CancerLogic::csize_greater_than_3_r, &ma_greater_than_5_r_node, &malignant_node);
    DecisionNode<CancerLogic::patient_processing_data> ct_greater_than_6_r_node(CancerLogic::ct_greater_than_6_r, &malignant_node, &csize_greater_than_3_r_node);
    DecisionNode<CancerLogic::patient_processing_data> bn_greater_than_2_r_node(CancerLogic::bn_greater_than_2_r, &ct_greater_than_6_r_node, &ma_greater_than_3_r_node);
    DecisionNode<CancerLogic::patient_processing_data> csize_greater_than_4_r_node(CancerLogic::csize_greater_than_4_r, &malignant_node, &bn_greater_than_2_r_node);
    DecisionNode<CancerLogic::patient_processing_data> cshape_greater_than_2_r_node(CancerLogic::cshape_greater_than_2_r, &csize_greater_than_4_r_node, &ct_greater_than_5_r_node);

    // Left side (false) nodes
    DecisionNode<CancerLogic::patient_processing_data> ma_greater_than_3_l_node(CancerLogic::ma_greater_than_3_l, &malignant_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> bc_greater_than_2_l_node(CancerLogic::bc_greater_than_2_l, &malignant_node, &ma_greater_than_3_l_node);
    DecisionNode<CancerLogic::patient_processing_data> ct_greater_than_3_l_node(CancerLogic::ct_greater_than_3_l, &bc_greater_than_2_l_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> bn_greater_than_3_l_node(CancerLogic::bn_greater_than_3_l, &ct_greater_than_3_l_node, &benign_node);
    DecisionNode<CancerLogic::patient_processing_data> csize_greater_than_2_node(CancerLogic::csize_greater_than_2, &cshape_greater_than_2_r_node, &bn_greater_than_3_l_node);

    
    process_data<CancerLogic::patient_processing_data>(&csize_greater_than_2_node, );   <----

class CancerLogic
{
public:
    struct patient_processing_data
    {
        bool csize_greater_than_2_l;
        bool bn_greater_than_3_l;
        bool ct_greater_than_3_l;
        bool bc_greater_than_2_l;
        bool ma_greater_than_3_l;

        bool csize_greater_than_2_r;
        bool cshape_greater_than2_r;
        bool ct_greater_than_5_r;
        bool csize_greater_than_4_r;
        bool bn_greater_than_2_r;
        bool ma_greater_than_3_r;
        bool ct_greater_than_6_r;
        bool csize_greater_than_3_r;
        bool ma_greater_than_5_r;
        bool result;
        std::string result_string;

        patient_processing_data(): csize_greater_than_2_l(false), bn_greater_than_3_l(false),
                                   ct_greater_than_3_l(false), bc_greater_than_2_l(false),
                                   ma_greater_than_3_l(false),
                                   csize_greater_than_2_r(false),
                                   cshape_greater_than2_r(false),
                                   ct_greater_than_5_r(false),
                                   csize_greater_than_4_r(false),
                                   bn_greater_than_2_r(false),
                                   ma_greater_than_3_r(false), ct_greater_than_6_r(false),
                                   csize_greater_than_3_r(false),
                                   ma_greater_than_5_r(false), result(false)
        {
        }
    };

    static bool malignant(patient_processing_data& data)
    {
        data.result = true;
        data.result_string = "Malignant";

        return data.result;
    }

    static bool benign(patient_processing_data& data)
    {
        data.result = false;
        data.result_string = "Benign";

        return data.result;
    }

    static bool csize_greater_than_4_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ma_greater_than_5_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool csize_greater_than_3_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ct_greater_than_6_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool bn_greater_than_2_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ma_greater_than_3_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool cshape_greater_than_2_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ct_greater_than_5_r(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool csize_greater_than_2(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool bn_greater_than_3_l(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ct_greater_than_3_l(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool bc_greater_than_2_l(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }

    static bool ma_greater_than_3_l(patient_processing_data& data)
    {
        data.result = true;
        return data.result;
    }
};

#include "LinkedList.h"
#include "Node.h"

PatientList::PatientList()
{
    head_ = nullptr;
    tail_ = nullptr;
}

void PatientList::insert(int ct, int csize, int cshape, int ma, int bn, int bc)
{
    Node* new_node_ = new Node(ct, csize, cshape, ma, bn, bc);

    if(head_ == nullptr)
    {
        head_ = new_node_;
        tail_ = new_node_;
        return;
    }

    new_node_->previous = tail_;
    tail_->next = new_node_;
    tail_ = new_node_;
}

#pragma once

#include "DecisionNode.h"
#include "LinkedList.h"

#include <iostream>

template<typename T>
void process_data(DecisionNode<typename T::processing_data>* d_tree, PatientList<typename T::processing_data>& patient_data)
{
    for(auto& data : patient_data)
    {
        d_tree->process(data);
    }
}

template<typename T>
void print_data(const PatientList<typename T::processing_data>& patient_data)
{
    for(const auto& data : patient_data)
    {
        std::cout << data << std::endl;
    }
}

对不起，所有的代码，我只是想确保我有尽可能多的信息，我的问题在那里的每个人都试图了解我的问题是什么。任何帮助将不胜感激。

二叉决策树非常简单。在树中的每个节点上，要么有一个决策（如果它是叶节点），要么有一个连接点。要在连接点向左或向右遍历，需要一个分类器。
分类器的一个例子是检查数据中的特定值，并将其与另一个值进行比较。因此，让我们取树中的第一个结点（引用original question中的表）：这一条规定：
细胞大小均匀性〈= 2
然后，它会根据测试的正确与否来选择一条路径或另一条路径，这就是分类器。
与其试图修复你不完整的代码，我将逐步向你介绍我自己如何用C++来处理这个问题。我的目的不是给予你代码，而是向你展示分解任务的 * 一种方法 *。如果你注意了，那么在某个时候你会大喊“啊哈！"，然后去修复你自己的程序。

步骤1：定义数据源

在考虑创建一棵树之前，你需要弄清楚分类器的外观。在此之前，你需要一些数据。我将主要复制你的结构，做一些小的修改：

struct LabResults
{
    int clump_thickness;
    int uniformity_of_cell_size;
    int uniformity_of_cell_shape;
    int marginal_adhesion;
    int single_epithelial_cell_size;
    int bare_nuclei;
    int bland_chromatin;
    int normal_nucleoli;
    int mitoses;
};

enum class Diagnosis
{
    Unknown,
    Benign,
    Malignant
};

struct Patient
{
    int id;
    LabResults results;
    Diagnosis diagnosis;
};

现在我们有了一个结构来保存病人的信息，实验室结果和诊断明确分开，尽管Patient存储了大量的信息，但我们只需要LabResults来执行分类。

步骤2：定义分类器

您现在可以考虑分类器本身，因为您现在知道它需要检查LabResults对象。
让我们更进一步，让分类器成为泛型。我们可以为此定义一个抽象类模板，它将对 * 任何 * 数据进行操作。这使得它可以重用，如果你想为其他东西构建决策树：

template<class Data>
class Classifier
{
public:
    virtual ~Classifier() = default;
    virtual bool Classify(const Data&) const = 0;
};

这就是它所需要的！我们将能够子类化它来编写特定的分类器，但是主接口将通过一个Classifier<LabResults>*值，我们将在这个值上调用多态Classify方法。这个方法接受对一个病人的LabResults数据的引用。
现在，让我们构建一个非常有用的分类器。这将是整个决策树所需要的唯一分类器！同样，所有内容都保持通用。让我们先看看它，然后我会解释：

// Classifier to perform binary comparison
template<class Data, class Value, class Compare>
class BinaryClassifier : public Classifier<Data>
{
public:
    BinaryClassifier(Value LabResults::*member, Value value)
        : mMember(member)
        , mValue(value)
    { }

    bool Classify(const Data& data) const override
    {
        return Compare()(data.*mMember, mValue);
    }

private:
    Value LabResults::*mMember;
    Value mValue;
};

此分类器的构造函数使用指向成员的指针语义来定位Data类型的任何成员。（通过Value）尽管在你的例子中，它总是int，最后它提供了一个函子Compare，它提供了一种方法来指定要做什么样的比较运算。您的决策树恰好只需要<=，它可以方便地作为std::less_equal使用。
如果您不熟悉 pointer-to-member，请搜索它。functors 也是如此。您将看到这两个函数都在Classify方法中使用。很好！
现在我们可以创建一个分类器了，让我们为clump_thickness <= 5做一个分类器：

using LessEqualClassifier = BinaryClassifier<LabResults, int, std::less_equal<int>>;
LessEqualClassifier classifier(&LabResults::clump_thickness, 5);

步骤3：测试分类器！

代码编写到此为止......现在是停下来检查分类器是否真的工作的最佳时机。让我们快速地插入一些数据，并在其上运行分类器：

int main()
{
    using LessEqualClassifier = BinaryClassifier<LabResults, int, std::less_equal<int>>;

    LessEqualClassifier classifier(&LabResults::clump_thickness, 5);

    Patient patients[] {
        { 1000025, { 5, 1, 1, 1, 2, 1, 3, 1, 1 } },
        { 1002945, { 5, 4, 4, 5, 7, 10, 3, 2, 1 } },
        { 1015425, { 3, 1, 1, 1, 2, 2, 3, 1, 1 } },
        { 1016277, { 6, 8, 8, 1, 3, 4, 3, 7, 1 } },
        { 1017023, { 4, 1, 1, 3, 2, 1, 3, 1, 1 } },
        { 1017122, { 8, 10, 10, 8, 7, 10, 9, 7, 1 } },
    };

    std::cout << std::boolalpha;
    for (auto& p : patients)
    {
        std::cout << "Patient " << p.id << ": ";
        std::cout << classifier.Classify(p.results) << "\n";
    }
}

输出量：

Patient 1000025: true
Patient 1002945: true
Patient 1015425: true
Patient 1016277: false
Patient 1017023: true
Patient 1017122: false

是的，你确实可以看到这是按预期运行的。我们几乎没有写任何代码，但我们已经有了一个通用的分类器，可以在LabResults的任何成员上工作。继续，用其他成员和值试试。

步骤4：定义决策树

有了一个有效的分类器，我们现在可以把注意力集中在决策树本身上了。正如我在开始时所说的，树中的节点要么是最终决策，要么是结点。让我们从树的一个类开始：

template <class Data, class Result>
class DecisionTree
{
public:
    class Node;

    DecisionTree(Node&& definition)
        : mRoot(std::move(definition))
    { }

    Result Run(const Data& data) const
    {
        return mRoot.Run(data);
    }

private:
    Node mRoot;
};

这个树只有一个根节点，它可以在这个节点上调用Run方法（我们很快就会定义），这将执行决策树并返回结果，数据树的类型为DecisionTree<LabResults, Diagnosis>，这意味着你将传入一个病人的LabResults，并返回一个Diagnosis值。
现在让我们定义节点类本身：

template <class Data, class Result>
class DecisionTree<Data, Result>::Node
{
public:
    typedef std::unique_ptr<Classifier<Data>> ClassifierPtr;
    typedef std::unique_ptr<Node> NodePtr;

    Node(const Result& result)
        : mResult(result)
    { }

    Node(ClassifierPtr&& classifier, Node&& trueCase, Node&& falseCase)
        : mResult()
        , mClassifier(std::move(classifier))
        , mTrueCase(std::make_unique<Node>(std::move(trueCase)))
        , mFalseCase(std::make_unique<Node>(std::move(falseCase)))
    { }

private:
    friend class DecisionTree;
    Result Run(const Data& data) const
    {
        return mClassifier
            ? (mClassifier->Classify(data) ? mTrueCase : mFalseCase)->Run(data)
            : mResult;
    }

private:
    Result          mResult;
    ClassifierPtr   mClassifier;
    NodePtr         mTrueCase;
    NodePtr         mFalseCase;
};

你可以看到有两种可能的方法来构造DecisionTree::Node。你可以指定一个Result，或者你可以提供一个指向一个分类器的指针，沿着一个表示该分类器的两种可能结果的树节点。如果分类器为空，则该节点是一个叶子，否则它是一个连接点。
Run方法将执行分类器（如果存在），然后根据结果递归运行相应的子树。如果没有分类器，则返回存储在该节点的Result。
这差不多就是整个系统了。到现在，你已经迫不及待地想开始构建你的树了，所以让我们开始吧。

步骤5：构建树

这种设计的好处是，我们可以一次性构建树，嵌套两个不同的节点构造函数来定义决策结构。但你不必这样做。你可能希望在某个文件中定义它，解析它，然后动态构建树。这取决于你。
不过，和往常一样，从一些简单的东西开始。你希望尽可能早地分阶段测试代码。

首先，让我们用一种更简洁的方式来创建分类器，因为它必须是std::unique_ptr，所以我们在所有地方都写std::make_unique<LessEqualClassifier>(...)，或者不写这个别名：std::make_unique<BinaryClassifier<LabResults, int, std::less_equal<int>>(...)...呃！
下面这个简单的函数模板如何：

// Classifier factories make life easier!
template <class Data, class Value>
std::unique_ptr<Classifier<Data>> LessEqual(Value Data::*member, Value value)
{
    typedef BinaryClassifier<LabResults, Value, std::less_equal<Value>> LessEqualType;
    return std::make_unique<LessEqualType>(member, value);
}

现在，让我们去掉树中的左分支，留下右分支诊断为“未知”，这样就有了一些东西可以测试：

DecisionTree<LabResults, Diagnosis> cancerDiagnoser({
    LessEqual(&LabResults::uniformity_of_cell_size, 2),{
        // uniformity_of_cell_size <= 2
        LessEqual(&LabResults::bare_nuclei, 3),{
            Diagnosis::Benign
        },{
            LessEqual(&LabResults::clump_thickness, 3),{
                Diagnosis::Benign
            },{
                LessEqual(&LabResults::bland_chromatin, 2),{
                    LessEqual(&LabResults::marginal_adhesion, 3),{
                        Diagnosis::Malignant
                    },{
                        Diagnosis::Benign
                    }
                },{
                    Diagnosis::Malignant
                }
            }
        }
    },{
        // uniformity_of_cell_size > 2

        //// TODO !!!
        Diagnosis::Unknown
    }
});

第6步：测试树！

我们希望能够输出诊断结果。一个简洁的方法是为它定义一个流输出操作符。这样，您就可以直接将一个Diagnosis值发送到std::cout，并让它输出友好的内容：

std::ostream& operator<<(std::ostream& s, const Diagnosis& d)
{
    std::array<const char*, 3> names{ "Unknown", "Benign", "Malignant" };
    return s << names.at(static_cast<int>(d));
}

现在更新前面的测试循环，唯一需要做的改变是运行决策树，存储结果并输出：

for (auto& p : patients)
{
    p.diagnosis = cancerDiagnoser.Run(p.results);
    std::cout << "Patient " << p.id << ": " << p.diagnosis << "\n";
}

输出量：

Patient 1000025: Benign
Patient 1002945: Unknown
Patient 1015425: Benign
Patient 1016277: Unknown
Patient 1017023: Benign
Patient 1017122: Unknown

看起来我们没有用这个数据得到任何恶性诊断。用树的那一边是恶性的数据试试。你可以随便发明一些东西。事实上，你可以编造一些假数据来测试树：

Patient unitTests[] {
    { 1001, { 0, 2, 0, 0, 0, 3, 0, 0, 0 }, Diagnosis::Benign },
    { 1002, { 3, 2, 0, 0, 0, 4, 0, 0, 0 }, Diagnosis::Benign },
    { 1003, { 4, 1, 0, 0, 0, 4, 2, 0, 0 }, Diagnosis::Malignant },
    { 1004, { 4, 1, 0, 4, 0, 4, 2, 0, 0 }, Diagnosis::Benign },
    { 1005, { 4, 2, 0, 3, 0, 4, 2, 0, 0 }, Diagnosis::Malignant },
};

for (auto& p : unitTests)
{
    auto diagnosis = cancerDiagnoser.Run(p.results);
    bool pass = diagnosis == p.diagnosis;
    std::cout << "Patient " << p.id << ": " << diagnosis;
    std::cout << " - " << (pass ? "PASS" : "FAIL") << "\n";
}

输出量：

Patient 1001: Benign - PASS
Patient 1002: Benign - PASS
Patient 1003: Malignant - PASS
Patient 1004: Benign - PASS
Patient 1005: Malignant - PASS

当你构建剩下的树的时候，你会想包含更多的测试来确保逻辑是正确的。因为你真的不想因为软件错误而误诊癌症！！！

摘要

到现在为止，你已经有了你需要的所有工具。这是故意未完成的代码。如果这是为了学校的作业，而你决定完全复制并完成它，我想人们会怀疑这不是你的！
相反，你应该做的是研究实际发生的事情，并将概念应用到你自己的程序中。如果你从这个答案中只学到一件事，我希望它是通过一系列小的、逻辑的、可测试的步骤从无到有地构建程序的方法。

c++ 使用存储在链表中的数据遍历二叉决策树

1条答案

步骤1：定义数据源

步骤2：定义分类器

步骤3：测试分类器！

步骤4：定义决策树

步骤5：构建树

第6步：测试树！

摘要

相关问题

热门标签

最新问答