尝试从 CSV 创建 Spark DataFrame 时出错

4uqofj5v  于 2021-05-24  发布在  Spark
关注(0)|答案(1)|浏览(362)

我正在尝试从 S3 读取 CSV 文件并创建 Spark DataFrame,但遇到了一些我看不懂的错误。
我的代码是这样的

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Get the already-running SparkSession, or create a new one if none exists
# (no builder options are set here).
spark = SparkSession.builder.getOrCreate()
# Load the CSV at the given path into a DataFrame; no header or schema options are passed.
# NOTE(review): this is the call that raises Py4JJavaError in the traceback below.
# The path is an https:// URL -- presumably Spark needs a Hadoop filesystem URI
# (e.g. s3a://bucket/key) instead; confirm against the full Java error message.
df = spark.read.csv("https://s3.myaws.com/datastore/apprecords.csv")

当我在 Jupyter notebook 中运行上面的代码时,出现以下错误:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-15-09011108d9e8> in <module>

----> 1 df = spark.read.csv("https://s3.myaws.com/datastore/apprecords.csv")
      2 
      3 

~/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue)
    474             path = [path]
    475         if type(path) == list:
--> 476             return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
    477         elif isinstance(path, RDD):
    478             def func(iterator):

~/spark-2.4.4-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

~/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py in deco(*a,**kw)
     61     def deco(*a,**kw):
     62         try:
---> 63             return f(*a,**kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()
sxpgvts3

sxpgvts31#

因为您是在 AWS 内部使用它,能否试试去掉 http://,改用 s3://bucket-name/file.csv 这种格式?

```
import pandas as pd
df = pd.read_csv("s3://datastore/apprecords.csv")
```

相关问题