python-3.x Trying to run a decision tree algorithm with PySpark

Asked by js5cn81o on 2023-03-20 in Python

I am trying to run a decision tree model on my dataset, which contains both categorical and numeric data, but I keep running into a problem when trying to fit the decision tree classifier on the training data.
Here is the code:

```python
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName('ML_algo').getOrCreate()

# Load the CSV file into a DataFrame
df = spark.read.csv('C:/Users/johnc/Downloads/hospital.csv', header=True, inferSchema=True)

# Create a list of the categorical columns in the DataFrame
categoricalColumns = ['ethnicity', 'gender', 'apache_3j_bodysystem']

# Create a list to hold the Pipeline stages
stages = []

# Create a StringIndexer for the target variable and add it to the stages list
label_stringIdx = StringIndexer(inputCol='isHospitalDeath', outputCol='label')
stages += [label_stringIdx]

# Create a StringIndexer and OneHotEncoder for each categorical column
# and add them to the stages list
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                            outputCols=[categoricalCol + 'classVec'])
    stages += [stringIndexer, encoder]

# Create a VectorAssembler to combine the categorical and numeric columns
# and add it to the stages list
numericCols = ['age', 'bmi', 'gcs_eyes_apache', 'gcs_verbal_apache', 'heart_rate_apache',
               'intubated_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache',
               'd1_mbp_min', 'd1_spo2_min', 'd1_sysbp_min', 'd1_temp_min', 'h1_diasbp_min',
               'h1_mbp_min', 'h1_resprate_max', 'h1_sysbp_min', 'apache_4a_hospital_death_prob']
assemblerInputs = [c + 'classVec' for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

# Create a Pipeline object with the stages list
pipeline = Pipeline(stages=stages)

# Fit the pipeline to the training data
pipelineModel = pipeline.fit(trainingData)

# Transform the training data with the pipeline
df = pipelineModel.transform(trainingData)

# Select the desired columns for the DataFrame
cols = df.columns
selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)

# Print the schema of the DataFrame
df.printSchema()
```

```
root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gcs_eyes_apache: integer (nullable = true)
 |-- gcs_verbal_apache: integer (nullable = true)
 |-- heart_rate_apache: double (nullable = true)
 |-- intubated_apache: integer (nullable = true)
 |-- resprate_apache: double (nullable = true)
 |-- temp_apache: double (nullable = true)
 |-- ventilated_apache: integer (nullable = true)
 |-- d1_mbp_min: double (nullable = true)
 |-- d1_spo2_min: double (nullable = true)
 |-- d1_sysbp_min: double (nullable = true)
 |-- d1_temp_min: double (nullable = true)
 |-- h1_diasbp_min: double (nullable = true)
 |-- h1_mbp_min: double (nullable = true)
 |-- h1_resprate_max: double (nullable = true)
 |-- h1_sysbp_min: double (nullable = true)
 |-- apache_4a_hospital_death_prob: double (nullable = true)
 |-- apache_3j_bodysystem: string (nullable = true)
 |-- isHospitalDeath: string (nullable = true)
 |-- label: double (nullable = false)
 |-- ethnicityIndex: double (nullable = false)
 |-- ethnicityclassVec: vector (nullable = true)
 |-- genderIndex: double (nullable = false)
 |-- genderclassVec: vector (nullable = true)
 |-- apache_3j_bodysystemIndex: double (nullable = false)
 |-- apache_3j_bodysystemclassVec: vector (nullable = true)
 |-- features: vector (nullable = true)
```

```python
# Split the data into training and testing sets
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=3)

dtModel = dt.fit(trainingData)
```

Sorry about how the code looks; this is my first time using this site and I am still figuring things out. I just really want to know why it won't run.
Here is the exception:

```
---------------------------------------------------------------------------
AnalysisException                         Traceback (most recent call last)
Input In [35], in <cell line: 2>()
      1 dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
----> 2 dtModel = dt.fit(trainingData)

File ~\anaconda3\lib\site-packages\pyspark\ml\base.py:205, in Estimator.fit(self, dataset, params)
    203         return self.copy(params)._fit(dataset)
    204     else:
--> 205         return self._fit(dataset)
    206 else:
    207     raise TypeError(
    208         "Params must be either a param map or a list/tuple of param maps, "
    209         "but got %s." % type(params)
    210     )

File ~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py:383, in JavaEstimator._fit(self, dataset)
    382 def _fit(self, dataset: DataFrame) -> JM:
--> 383     java_model = self._fit_java(dataset)
    384     model = self._create_model(java_model)
    385     return self._copyValues(model)

File ~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py:380, in JavaEstimator._fit_java(self, dataset)
    377 assert self._java_obj is not None
    379 self._transfer_params_to_java()
--> 380 return self._java_obj.fit(dataset._jdf)

File ~\anaconda3\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
   1315 command = proto.CALL_COMMAND_NAME +\
   1316     self.command_header +\
   1317     args_command +\
   1318     proto.END_COMMAND_PART
   1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
   1322     answer, self.gateway_client, self.target_id, self.name)
   1324 for temp_arg in temp_args:
   1325     temp_arg._detach()

File ~\anaconda3\lib\site-packages\pyspark\sql\utils.py:196, in capture_sql_exception.<locals>.deco(*a, **kw)
    192 converted = convert_exception(e.java_exception)
    193 if not isinstance(converted, UnknownException):
    194     # Hide where the exception came from that shows a non-Pythonic
    195     # JVM exception message.
--> 196     raise converted from None
    197 else:
    198     raise

AnalysisException: Reference 'label' is ambiguous, could be: label, label.
```

myzjeezk1#

I think the problem is here:

```python
selectedCols = ['label', 'features'] + cols
```

The list built here contains duplicate entries: `cols` (that is, `df.columns` after the pipeline transform) already contains 'label' and 'features', so prepending them makes 'label' appear twice in the select. That duplicate column is what causes the ambiguity.
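A minimal sketch of one way to avoid it, assuming you want 'label' and 'features' first and every other column exactly once:

```python
# Filter 'label' and 'features' out of cols so each column is selected only once
selectedCols = ['label', 'features'] + [c for c in cols if c not in ('label', 'features')]
df = df.select(selectedCols)
```

With unique column names, the later `dt.fit(trainingData)` can resolve the 'label' reference unambiguously.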
