我试图运行一个决策树模型对我的数据集,其中包含分类和数字数据,但我不断running到一个问题时,试图配置决策树分类器使用训练数据。
这是密码:
import pyspark from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder,VectorAssembler from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator from pyspark.sql import SparkSession
# Create a SparkSession spark = SparkSession.builder.appName('ML_algo').getOrCreate()
# Load the CSV file into a DataFrame df = spark.read.csv('C:/Users/johnc/Downloads/hospital.csv', header=True, inferSchema=True)
# Create a list of the categorical columns in the DataFrame categoricalColumns = ['ethnicity', 'gender', 'apache_3j_bodysystem']
# Create a list to hold the Pipeline stages
stages = []
# Create a StringIndexer for the target variable and add it to the stages list
label_stringIdx = StringIndexer(inputCol='isHospitalDeath', outputCol='label') stages += [label_stringIdx]
# Create a StringIndexer and OneHotEncoder for each categorical column and add them to the stages list for categoricalCol in categoricalColumns:
stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + 'classVec']) stages += [stringIndexer, encoder]
# Create a VectorAssembler to combine the categorical and numeric columns and add it to the stages list numericCols = ['age', 'bmi', 'gcs_eyes_apache', 'gcs_verbal_apache', 'heart_rate_apache', 'intubated_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache', 'd1_mbp_min', 'd1_spo2_min', 'd1_sysbp_min', 'd1_temp_min', 'h1_diasbp_min', 'h1_mbp_min', 'h1_resprate_max', 'h1_sysbp_min', 'apache_4a_hospital_death_prob']
assemblerInputs = [c + 'classVec' for c in categoricalColumns] + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features') stages += [assembler]
# Create a Pipeline object with the stages list
pipeline =Pipeline(stages=stages)
# Fit the pipeline to the training data
pipelineModel = pipeline.fit(trainingData)
# Transform the training data with the pipeline
df = pipelineModel.transform(trainingData)
# Select the desired columns for the DataFrame cols = df.columns selectedCols = ['label', 'features'] + cols df = df.select(selectedCols)
# Print the schema of the DataFrame
df.printSchema()
root
|-- label: double (nullable = false)
|-- features: vector (nullable = true)
|-- age: double (nullable = true)
|-- bmi: double (nullable = true)
|-- ethnicity: string (nullable = true)
|-- gender: string (nullable = true)
|-- gcs_eyes_apache: integer (nullable = true)
|-- gcs_verbal_apache: integer (nullable = true)
|-- heart_rate_apache: double (nullable = true)
|-- intubated_apache: integer (nullable = true)
|-- resprate_apache: double (nullable = true)
|-- temp_apache: double (nullable = true)
|-- ventilated_apache: integer (nullable = true)
|-- d1_mbp_min: double (nullable = true)
|-- d1_spo2_min: double (nullable = true)
|-- d1_sysbp_min: double (nullable = true)
|-- d1_temp_min: double (nullable = true)
|-- h1_diasbp_min: double (nullable = true)
|-- h1_mbp_min: double (nullable = true)
|-- h1_resprate_max: double (nullable = true)
|-- h1_sysbp_min: double (nullable = true)
|-- apache_4a_hospital_death_prob: double (nullable = true)
|-- apache_3j_bodysystem: string (nullable = true)
|-- isHospitalDeath: string (nullable = true)
|-- label: double (nullable = false)
|-- ethnicityIndex: double (nullable = false)
|-- ethnicityclassVec: vector (nullable = true)
|-- genderIndex: double (nullable = false)
|-- genderclassVec: vector (nullable = true)
|-- apache_3j_bodysystemIndex: double (nullable = false)
|-- apache_3j_bodysystemclassVec: vector (nullable = true)
|-- features: vector (nullable = true)
# Split the data into training and testing sets
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(trainingData) ```
代码看起来很抱歉,这是我第一次使用这个网站,我想弄清楚一些事情。我只是真的想知道为什么它不能运行。
这是个例外:
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
Input In [35], in <cell line: 2>()
1 dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
----> 2 dtModel = dt.fit(trainingData)
File ~\anaconda3\lib\site-packages\pyspark\ml\base.py:205, in Estimator.fit(self, dataset, params)
203 return self.copy(params)._fit(dataset)
204 else:
--> 205 return self._fit(dataset)
206 else:
207 raise TypeError(
208 "Params must be either a param map or a list/tuple of param maps, "
209 "but got %s." % type(params)
210 )
File ~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py:383, in JavaEstimator._fit(self, dataset)
382 def _fit(self, dataset: DataFrame) -> JM:
--> 383 java_model = self._fit_java(dataset)
384 model = self._create_model(java_model)
385 return self._copyValues(model)
File ~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py:380, in JavaEstimator._fit_java(self, dataset)
377 assert self._java_obj is not None
379 self._transfer_params_to_java()
--> 380 return self._java_obj.fit(dataset._jdf)
File ~\anaconda3\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File ~\anaconda3\lib\site-packages\pyspark\sql\utils.py:196, in capture_sql_exception.<locals>.deco(*a, **kw)
192 converted = convert_exception(e.java_exception)
193 if not isinstance(converted, UnknownException):
194 # Hide where the exception came from that shows a non-Pythonic
195 # JVM exception message.
--> 196 raise converted from None
197 else:
198 raise
AnalysisException: Reference 'label' is ambiguous, could be: label, label.
1条答案
按热度按时间myzjeezk1#
我觉得问题是
此处生成的列表包含重复条目,“label”重复两次。这将导致歧义。