I'm a beginner with Scrapy, and I'm trying to build a spider that scrapes news articles and, for each article, its comments. I created a pipeline that stores the articles and comments in a MySQL database.
items.py
from scrapy import Item, Field

class ArticleItem(Item):
    provider = Field()
    topic = Field()
    article_url = Field()
    article_datetime = Field()
    article_title = Field()
    article_text = Field()
    article_category = Field()
    comments_lst = []  # a plain list, not a Field()

class CommentItem(Item):
    #article_id = Field()
    comment_datetime = Field()
    comment_text = Field()
npd_spider.py
import random

import scrapy

# user_agent_list (a list of User-Agent strings) and the item classes are
# defined/imported elsewhere in the project
class NpdSpider(scrapy.Spider):
    name = "npd"
    start_urls = [
        'https://localhost:81/search/interreg/',  # wordpress site
    ]

    def parse(self, response):
        for item in response.css('div.results-container'):
            """yield {
                'article_url': item.css('article a.post-preview::attr(href)').getall()
            }
            yield scrapy.Request(
                url=item.css('article a.post-preview::attr(href)').get(),
                callback=self.parse_item
            )"""
            links = item.css('article a.post-preview::attr(href)').getall()
            for link in links:
                headers = {"User-Agent": user_agent_list[random.randint(0, len(user_agent_list) - 1)]}
                yield response.follow(link, callback=self.parse_item, headers=headers)

        next_page = response.xpath('//span[contains(@class,"page-numbers")]/following-sibling::a/@href').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            headers = {"User-Agent": user_agent_list[random.randint(0, len(user_agent_list) - 1)]}
            yield response.follow(next_page_link, callback=self.parse, headers=headers)
    def parse_item(self, response):
        articleItem = ArticleItem()
        articleItem['provider'] = 1
        articleItem['topic'] = 1
        articleItem['article_url'] = response.xpath('//link[@rel="canonical"]/@href').get()
        articleItem['article_datetime'] = response.xpath('//*[@class="published"]/text()').get()
        articleItem['article_category'] = ''.join([x.get().strip() for x in response.xpath('//*[@itemprop="entry-header clearfix"]//a//text()')])
        articleItem['article_title'] = response.xpath('//*[@class="entry-title"]/text()').get()
        articleItem['article_text'] = ' '.join([x.get().strip() for x in response.xpath('//*[@class="entry-content"]//p//text()')])
        for comment in response.css('ol.comment-list'):
            commentItem = CommentItem()
            commentItem['comment_datetime'] = comment.xpath('//*[@class="comment-body"]/footer/div[2]/a/time/@datetime').get()
            commentItem['comment_text'] = '<br/>'.join([x.get().strip() for x in comment.xpath('//*[@class="comment-content"]//p//text()')])
            articleItem.comments_lst.append(commentItem)
        yield articleItem
MySQLPipeline.py
from datetime import datetime

class MySQLPipeline:
    # connection setup omitted in the question; self.con is an open MySQL connection

    def process_item(self, item, spider):
        cursor = self.con.cursor()
        ## Check to see if the article is already in the database
        cursor.execute("select id from news where uri = %s", (item['article_url'],))
        result = cursor.fetchone()
        ## If it is in the DB, create a log message
        if result:
            spider.logger.warn("Item already in database: %s" % item['article_url'])
        ## If it isn't in the DB, insert the data
        else:
            ## Insert statement for the news table
            cursor.execute("""insert into news (provider_id, topic_id, title, content, uri, published_date, crawled_date,
                           categories, author, avg_score, is_active, origin_url_id)
                           values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                           (int(item["provider"]), int(item["topic"]), item["article_title"], item["article_text"], item["article_url"],
                            item["article_datetime"], datetime.now(),
                            item["article_category"], '', -1000.00, 1, 1))
            self.con.commit()
            cursor.execute('SELECT LAST_INSERT_ID()')
            lastrowid = cursor.fetchone()[0]
            for cit in item.comments_lst:
                ## Insert statement for the comment table
                cursor.execute("""insert into comment (news_id, content, is_active, added_date, published_date,
                               published_by, avg_score)
                               values (%s,%s,%s,%s,%s,%s,%s)""",
                               (lastrowid, cit["comment_text"], 1, datetime.now(),
                                cit["comment_datetime"], '', -1000.00))
                self.con.commit()
        return item
The problem is that a comment that belongs to one specific news item gets saved multiple times, linked to other news items as well.
Example: "Comment ABC" is found on "News item 1", but during the save to the database it also gets linked to "News item 2" and "News item 3".
Is the problem in the MySQL pipeline or in the item classes in items.py?
1 Answer
The problem is in your items and in the way you add data to them. A Field() can hold anything, including a list. But by setting the comments_lst attribute to a plain list instead of using Field(), what you have actually done is make comments_lst a class attribute that is shared between all instances of ArticleItem.
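Here is a minimal sketch of that pitfall in plain Python (no Scrapy involved; the class and names are illustrative only):

class Article:
    comments = []  # a mutable class attribute: one list shared by every instance

a1 = Article()
a2 = Article()
a1.comments.append("Comment ABC")  # appends to the single shared list
print(a2.comments)  # ['Comment ABC'] -- the comment "leaks" into every other instance

This is exactly why "Comment ABC" ends up attached to "News item 2" and "News item 3": every ArticleItem appends into the same list.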
What you need to do is change comments_lst in ArticleItem to scrapy.Field(). Then, in the spider, assign a unique list to comments_lst, just like the other fields of the item. Finally, in your pipeline, read comments_lst the same way as the other item fields.
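A sketch of those three changes, assuming everything else stays as posted (all names come from the question's code):

# items.py -- declare comments_lst as a Field like the other attributes
class ArticleItem(Item):
    provider = Field()
    topic = Field()
    article_url = Field()
    article_datetime = Field()
    article_title = Field()
    article_text = Field()
    article_category = Field()
    comments_lst = Field()  # was: comments_lst = []

# npd_spider.py -- in parse_item, give each item its own fresh list
def parse_item(self, response):
    articleItem = ArticleItem()
    # ... the other field assignments stay exactly as before ...
    articleItem['comments_lst'] = []  # a new list per article, not a shared one
    for comment in response.css('ol.comment-list'):
        commentItem = CommentItem()
        # ... fill comment_datetime / comment_text as before ...
        articleItem['comments_lst'].append(commentItem)
    yield articleItem

# MySQLPipeline.py -- read the field via item lookup, not attribute access
for cit in item['comments_lst']:
    ...  # the comment-table insert stays as before

With comments_lst declared as a Field and assigned per item, each ArticleItem carries only its own comments, so the pipeline no longer links a comment to unrelated news articles.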