我已经整合了 nutch
1.14以及 solr-6.6.0
在centos linux 7.3.1611版上,我在seedlist中给出了大约10个URL,位于/usr/local/apache-nutch-1.13/url/seed.txt,我遵循了教程
[root@localhost apache-nutch-1.14]# bin/nutch dedup http://ip:8983/solr/
DeduplicationJob: starting at 2018-01-09 15:07:52
DeduplicationJob: java.io.IOException: No FileSystem for scheme: http
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:258)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:45)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:329)
at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:320)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:196)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1290)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1287)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1746)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1287)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:575)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:570)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1746)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:570)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:561)
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:870)
at org.apache.nutch.crawl.DeduplicationJob.run(DeduplicationJob.java:326)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.nutch.crawl.DeduplicationJob.main(DeduplicationJob.java:369)
所有与solr相关的命令都起作用。请帮忙。他们在nutch教程中讨论的hadoop元素在哪里。我们是否必须为hadoop、nutch和solr安装除java以外的任何东西才能共同构建搜索引擎?
2条答案
按热度按时间muk1a3rh1#
试试这个
ljo96ir52#
我在看同一本指南时遇到了同样的问题。这可能有助于: