I am trying to read a file from a Google Cloud Storage bucket. I can read it through spark-shell when I include the GCS connector jar at shell launch, but when the same job is submitted through spark-submit it throws the following error:
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Splitter.splitToList(Ljava/lang/CharSequence;)Ljava/util/List;
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase$ParentTimestampUpdateIncludePredicate.create(GoogleHadoopFileSystemBase.java:780)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.createOptionsBuilderFromConfig(GoogleHadoopFileSystemBase.java:2130)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.configure(GoogleHadoopFileSystemBase.java:1822)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:1003)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:966)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2689)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2723)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2705)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:407)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:172)
at org.apache.hadoop.mapred.JobConf.getWorkingDirectory(JobConf.java:656)
at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:440)
at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:413)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at scala.Option.map(Option.scala:145)
at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:195)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at com.google.reader.GoogleRead$.main(GoogleRead.scala:41)
at com.google.reader.GoogleRead.main(GoogleRead.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:752)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
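For reference, a minimal job consistent with the stack trace (the count at GoogleRead.scala:41); the bucket path and app name here are hypothetical:

import org.apache.spark.{SparkConf, SparkContext}

object GoogleRead {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GoogleFileReader"))
    // textFile goes through hadoopFile -> FileSystem.get, which is where
    // the GCS connector initializes and the Guava call blows up
    val lines = sc.textFile("gs://some-bucket/some-file.txt")
    println(lines.count())
    sc.stop()
  }
}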
I tried excluding the Guava dependency, but I still get the above error. Below is the build.sbt file I am using:
import sbt.ExclusionRule

name := "GoogleFileReader"

version := "1.0"

scalaVersion := "2.10.4"

libraryDependencies += "org.apache.spark" % "spark-sql_2.10" % "1.6.1" exclude("com.google.guava", "guava")

libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "1.6.0-hadoop2"

assemblyMergeStrategy in assembly := {
  case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
  case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
  case PathList("org", "apache", xs @ _*) => MergeStrategy.last
  case PathList("com", "google", xs @ _*) => MergeStrategy.last
  case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
  case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
  case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
  case "about.html" => MergeStrategy.rename
  case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
  case "META-INF/mailcap" => MergeStrategy.last
  case "META-INF/mimetypes.default" => MergeStrategy.last
  case "plugin.properties" => MergeStrategy.last
  case "log4j.properties" => MergeStrategy.last
  case "pom.properties" => MergeStrategy.last
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}
2 Answers

Answer 1:
The Guava library actually used when the job runs on the cluster is the older version that Hadoop depends on, provided on the classpath when the driver/executor processes start. Because it appears first on the classpath, the newer version your code was built against is never loaded; hence the NoSuchMethodError.
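A quick way to confirm this (not part of the original answer): print which jar the conflicting class was actually loaded from.

// Prints the jar that supplied Guava's Splitter at runtime; on the
// cluster this typically points at Hadoop's bundled (older) Guava.
println(classOf[com.google.common.base.Splitter]
  .getProtectionDomain.getCodeSource.getLocation)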
You can try the approach described at http://jhz.name/2016/01/10/spark-classpath.html, which essentially instructs Spark to check the user classpath first and the Hadoop classpath second. It uses the following Spark configuration parameters:
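The parameter list did not survive in this copy of the answer; given the linked article, these are presumably Spark's experimental user-classpath-first flags, shown here as a SparkConf sketch:

import org.apache.spark.SparkConf

// Ask Spark to resolve classes from the user's jars before falling
// back to the Spark/Hadoop classpath, on both driver and executors.
val conf = new SparkConf()
  .set("spark.driver.userClassPathFirst", "true")
  .set("spark.executor.userClassPathFirst", "true")

The same two settings can also be passed on the spark-submit command line via --conf.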
Most likely you will then run into other kinds of trouble, as I did (java.lang.LinkageError, because of the class loaders involved in this setup).
The solution that helped me was to shade the Guava version my code uses, so that it no longer conflicts with the one Hadoop depends on. See this reply: https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/8gcd8iqpy3c
It consists of adding a shading rule to build.sbt, along the lines of the sketch below.
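The original rule was not preserved here; this is a minimal sbt-assembly sketch, with the rename prefix (shadedguava) chosen purely for illustration:

// Rename Guava's packages inside the assembled jar so they can no
// longer clash with the Guava classes Hadoop ships on the cluster.
assemblyShadeRules in assembly := Seq(
  ShadeRule.rename("com.google.common.**" -> "shadedguava.@1").inAll
)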
Make sure you also update the merge strategy for com.google accordingly:
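Again a sketch, assuming the shadedguava prefix chosen above; the renamed classes need their own case alongside the existing com.google one:

assemblyMergeStrategy in assembly := {
  // New case for the package produced by the shading rule above
  case PathList("shadedguava", xs @ _*) => MergeStrategy.last
  case PathList("com", "google", xs @ _*) => MergeStrategy.last
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}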
Answer 2:
Please see the attached pom file for the dependencies.