python-3.x MySQL pipeline mix-up: a comment unique to one specific news item is saved multiple times for different news items

f0ofjuux asked on 2023-01-27 in Python

I am a beginner with Scrapy, and I am trying to build a spider that scrapes news articles plus the comments on each article. I created a pipeline that stores the news articles and the comments in a MySQL database.
items.py

from scrapy import Item, Field


class ArticleItem(Item):
    provider = Field()
    topic = Field()
    article_url = Field()
    article_datetime = Field()
    article_title = Field()
    article_text = Field()
    article_category = Field()
    comments_lst = []

class CommentItem(Item):
    #article_id = Field()
    comment_datetime = Field()
    comment_text = Field()

npd_spider.py

import random
import scrapy

# user_agent_list (a list of User-Agent strings) is assumed to be defined elsewhere in the project

class NpdSpider(scrapy.Spider):
    name = "npd"

    start_urls = [
        'https://localhost:81/search/interreg/',  # wordpress site
    ]

    def parse(self, response):
        for item in response.css('div.results-container'):
            """yield {
                'article_url': item.css('article a.post-preview::attr(href)').getall()
            }
            yield scrapy.Request(
                url=item.css('article a.post-preview::attr(href)').get(),
                callback=self.parse_item
            )"""
            links = item.css('article a.post-preview::attr(href)').getall()
            for link in links:
                headers = {"User-Agent": user_agent_list[random.randint(0, len(user_agent_list) - 1)]}
                yield response.follow(link, callback=self.parse_item, headers=headers)

        next_page = response.xpath('//span[contains(@class,"page-numbers")]/following-sibling::a/@href').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            headers = {"User-Agent": user_agent_list[random.randint(0, len(user_agent_list) - 1)]}
            yield response.follow(next_page_link, callback=self.parse, headers=headers)



    def parse_item(self, response):
        articleItem = ArticleItem()
        articleItem['provider'] = 1
        articleItem['topic'] = 1
        articleItem['article_url'] = response.xpath('//link[@rel="canonical"]/@href').get()
        articleItem['article_datetime'] = response.xpath('//*[@class="published"]/text()').get()
        articleItem['article_category'] = ''.join([x.get().strip() for x in response.xpath('//*[@itemprop="entry-header clearfix"]//a//text()')])
        articleItem['article_title'] = response.xpath('//*[@class="entry-title"]/text()').get()
        articleItem['article_text'] = ' '.join([x.get().strip() for x in response.xpath('//*[@class="entry-content"]//p//text()')])

        for comment in response.css('ol.comment-list'):
            commentItem = CommentItem()
            commentItem['comment_datetime'] = comment.xpath('//*[@class="comment-body"]/footer/div[2]/a/time/@datetime').get()
            commentItem['comment_text'] = '<br/>'.join([x.get().strip() for x in comment.xpath('//*[@class="comment-content"]//p//text()')])
            articleItem.comments_lst.append(commentItem)

        yield articleItem

MySQLPipeline.py

def process_item(self, item, spider):
    cursor = self.con.cursor()
    ## Check to see if text is already in database
    cursor.execute("select id from news where uri = %s", (item['article_url'],))
    result = cursor.fetchone()

    ## If it is in DB, create log message
    if result:
        spider.logger.warn("Item already in database: %s" % item['article_url'])

    ## If text isn't in the DB, insert data
    else:
        ## Define insert statement for news table
        cursor.execute(""" insert into news (provider_id, topic_id, title, content, uri, published_date, crawled_date,
                                             categories, author, avg_score, is_active, origin_url_id) 
                           values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                           (int(item["provider"]),int(item["topic"]),item["article_title"],item["article_text"],item["article_url"],
                            item["article_datetime"],datetime.now(),
                            item["article_category"],'',-1000.00,1,1))
        self.con.commit()
        cursor.execute('SELECT LAST_INSERT_ID()')
        lastrowid = cursor.fetchone()[0]

        for cit in item.comments_lst:
            ## Define insert statement for comment table
            cursor.execute(""" insert into comment (news_id,content,is_active,added_date,published_date,
                                                    published_by, avg_score) 
                               values (%s,%s,%s,%s,%s,%s,%s)""",
                       (lastrowid, cit["comment_text"],1,datetime.now(),
                        cit["comment_datetime"], '', -1000.00))
    self.con.commit()
    return item

The problem is that a comment that is unique to one specific news item gets saved multiple times, attached to different news items.
Example: "Comment ABC", found on "News item 1", also ends up linked to "News item 2" and "News item 3" when the data is saved to the database.
Is there a problem in the MySQL pipeline or in the items.py classes?

rkue9o1l1#

The problem is in your items and in the way you add data to them.
A Field can hold anything, including a list, but by setting the comments_lst attribute to a plain list instead of using Field(), what you have actually done is make comments_lst a class attribute that is shared between all instances of ArticleItem.
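
To see why, here is a minimal sketch in plain Python (ArticleItemBuggy is a hypothetical stand-in, not a name from your code): a list defined in the class body is created once and shared by every instance.

class ArticleItemBuggy:
    comments_lst = []  # class attribute: ONE list object, created once for the whole class

a1 = ArticleItemBuggy()
a2 = ArticleItemBuggy()
a1.comments_lst.append("Comment ABC")       # mutates the shared list
print(a2.comments_lst)                      # ['Comment ABC'] -- leaked into a2
print(a1.comments_lst is a2.comments_lst)   # True: both names point to the same object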
All you need to do is change comments_lst in ArticleItem to a scrapy.Field():

class ArticleItem(Item):
    provider = Field()
    topic = Field()
    article_url = Field()
    article_datetime = Field()
    article_title = Field()
    article_text = Field()
    article_category = Field()
    comments_lst = Field()
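
With comments_lst declared as a Field(), each ArticleItem instance keeps its own value in the item's internal field dict, so nothing is shared between items anymore.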

Then, in the spider, you need to assign a unique list to comments_lst, just like you do for the other fields in the item.

def parse_item(self, response):
    articleItem = ArticleItem()
    articleItem['provider'] = 1
    articleItem['topic'] = 1
    articleItem['article_url'] = response.xpath('//link[@rel="canonical"]/@href').get()
    articleItem['article_datetime'] = response.xpath('//*[@class="published"]/text()').get()
    articleItem['article_category'] = ''.join([x.get().strip() for x in response.xpath('//*[@itemprop="entry-header clearfix"]//a//text()')])
    articleItem['article_title'] = response.xpath('//*[@class="entry-title"]/text()').get()
    articleItem['article_text'] = ' '.join([x.get().strip() for x in response.xpath('//*[@class="entry-content"]//p//text()')])
    comments_lst = []
    for comment in response.css('ol.comment-list'):
        commentItem = CommentItem()
        commentItem['comment_datetime'] = comment.xpath('//*[@class="comment-body"]/footer/div[2]/a/time/@datetime').get()
        commentItem['comment_text'] = '<br/>'.join([x.get().strip() for x in comment.xpath('//*[@class="comment-content"]//p//text()')])
        comments_lst.append(commentItem)
     articleItem["comments_lst"] = comments_lst
     yield articleItem

Then, in your pipeline, you need to read comments_lst the same way as the other item fields...

        for cit in item["comments_lst"]:
            ## Define insert statement for comment table
            cursor.execute(""" insert into comment (news_id,content,is_active,added_date,published_date,
                                                    published_by, avg_score) 
                               values (%s,%s,%s,%s,%s,%s,%s)""",
                           (lastrowid, cit["comment_text"],1,datetime.now(),
                            cit["comment_datetime"], '', -1000.00))
