Scrapy problem with Excel and product images

d7v8vwbk · asked 2023-04-30

My problem has since been solved. I created a script that scrapes an online shop with Scrapy. Unfortunately, not all of the specified URLs get written to the .csv file; some are always missing. In the example below I requested only 5 product pages, but only 4 of them ended up in the .csv file.
Screenshot from the .csv file: [image]
But I noticed that when I run the code again, the rows are swapped: [image]
My code:

import scrapy
import csv
import os
import requests

class ProductSpider(scrapy.Spider):
    name = "product_spider"

    start_urls = [
        'https://softwarekaufen24.de/microsoft-office-2021-home-and-student/',
        'https://softwarekaufen24.de/windows-11-pro/',
        'https://softwarekaufen24.de/windows-11-home/',
        'https://softwarekaufen24.de/windows-10-professional/',
        'https://softwarekaufen24.de/windows-10-home/'
    ]

    def __init__(self, *args, **kwargs):
        super(ProductSpider, self).__init__(*args, **kwargs)
        self.filename = 'product.csv'
        self.logo_dir = 'logos'
        if not os.path.exists(self.logo_dir):
            os.makedirs(self.logo_dir)
        with open(self.filename, mode='w', encoding='utf-8-sig', newline='') as file:
            writer = csv.writer(file, delimiter=';')
            writer.writerow(
                ['Product Title', 'Product Price', 'Old Product Price', 'Product Description', 'Breadcrumb', 'Product Link', 'Logo Path'])

    def parse(self, response):
        product_title = response.css('h1[class="product--title"]::text').get()
        if product_title:
            product_title = product_title.strip()

        product_price = response.css(
            'meta[itemprop="price"]::attr(content)').get()

        # Select the old product price
        old_product_price = response.css(
            'span.price--line-through::text').get()

        # Select the description
        product_description = response.css(
            'div.product--description:nth-child(2)').extract_first()
        if product_description:
            # Remove unwanted text
            product_description = product_description.replace(
                "SOFTWARE KAUFEN LEICHT GEMACHT!", "").strip()
            product_description = product_description.replace(
                "\n", " ").replace("\r", "")

        # Select the breadcrumb
        breadcrumb_items = response.css(
            'span.breadcrumb--title[itemprop="name"]')
        breadcrumb = ''
        for item in breadcrumb_items:
            breadcrumb += item.css('::text').get() + ' > '
        breadcrumb = breadcrumb[:-3]

        # Select the product link
        product_link = response.url

        # Download the logo
        logo_path = ''
        supplier_div = response.css('div[class="product--supplier"]')
        if supplier_div:
            img_url = supplier_div.css('img::attr(src)').get()
            img_alt = supplier_div.css('img::attr(alt)').get()
            if img_url and img_alt:
                img_filename = img_alt.replace(
                    ' ', '_') + os.path.splitext(img_url)[1]
                logo_path = os.path.join(self.logo_dir, img_filename)
                if not os.path.exists(logo_path):
                    self.log(f'Downloading logo: {img_url}')
                    img_data = requests.get(img_url).content
                    with open(logo_path, 'wb') as img_file:
                        img_file.write(img_data)
                    # Check if the downloaded image is SVG format
                    if os.path.splitext(img_filename)[1].lower() == '.svg':
                        # Convert SVG to PNG using the inkscape command-line tool
                        subprocess.run(
                            ['inkscape', '--export-type=png', logo_path], check=True)

        with open(self.filename, mode='a', encoding='utf-8-sig', newline='') as file:
            writer = csv.writer(file, delimiter=';')
            writer.writerow(
                [product_title, product_price, old_product_price, product_description, breadcrumb, product_link, logo_path])

        self.log(
            f'Produktname "{product_title}", Preis "{product_price}", Alter Preis "{old_product_price}", Beschreibung "{product_description}", Breadcrumb "{breadcrumb}", Link "{product_link}", Firmenname "{company_name}" und Firmenlogo "{company_logo}" wurden erfolgreich in {self.filename} gespeichert.')

Can anyone tell me where the error is? I've been at this all day, but I can't figure out why not all of the products are written to the .csv file on the first run.

Answer from kmb7vmvb:

I'm not sure exactly what is causing your issue, but I did notice a few errors in your code:

1. You call subprocess.run, but you never import the subprocess module.
2. The last line of the code references two variables that don't exist: company_name and company_logo are never defined anywhere in the script.

Simply fixing the issues mentioned above may already resolve the problem with the results you are currently getting.
For example:
import scrapy
import csv
import os
import requests
import subprocess

class ProductSpider(scrapy.Spider):
    name = "product_spider"

    start_urls = [
        'https://softwarekaufen24.de/microsoft-office-2021-home-and-student/',
        'https://softwarekaufen24.de/windows-11-pro/',
        'https://softwarekaufen24.de/windows-11-home/',
        'https://softwarekaufen24.de/windows-10-professional/',
        'https://softwarekaufen24.de/windows-10-home/'
    ]

    def __init__(self, *args, **kwargs):
        super(ProductSpider, self).__init__(*args, **kwargs)
        self.filename = 'product.csv'
        self.logo_dir = 'logos'
        if not os.path.exists(self.logo_dir):
            os.makedirs(self.logo_dir)
        with open(self.filename, mode='w', encoding='utf-8-sig', newline='') as file:
            writer = csv.writer(file, delimiter=';')
            writer.writerow(
                ['Product Title', 'Product Price', 'Old Product Price', 'Product Description', 'Breadcrumb', 'Product Link', 'Logo Path'])

    def parse(self, response):
        product_title = response.css('h1[class="product--title"]::text').get()
        if product_title:
            product_title = product_title.strip()

        product_price = response.css(
            'meta[itemprop="price"]::attr(content)').get()

        # Select the old product price
        old_product_price = response.css(
            'span.price--line-through::text').get()

        # Select the description
        product_description = response.css(
            'div.product--description:nth-child(2)').extract_first()
        if product_description:
            # Remove unwanted text
            product_description = product_description.replace(
                "SOFTWARE KAUFEN LEICHT GEMACHT!", "").strip()
            product_description = product_description.replace(
                "\n", " ").replace("\r", "")

        # Select the breadcrumb
        breadcrumb_items = response.css(
            'span.breadcrumb--title[itemprop="name"]')
        breadcrumb = ''
        for item in breadcrumb_items:
            breadcrumb += item.css('::text').get() + ' > '
        breadcrumb = breadcrumb[:-3]

        # Select the product link
        product_link = response.url

        # Download the logo
        logo_path = ''
        supplier_div = response.css('div[class="product--supplier"]')
        if supplier_div:
            img_url = supplier_div.css('img::attr(src)').get()
            img_alt = supplier_div.css('img::attr(alt)').get()
            if img_url and img_alt:
                img_filename = img_alt.replace(
                    ' ', '_') + os.path.splitext(img_url)[1]
                logo_path = os.path.join(self.logo_dir, img_filename)
                if not os.path.exists(logo_path):
                    self.log(f'Downloading logo: {img_url}')
                    img_data = requests.get(img_url).content
                    with open(logo_path, 'wb') as img_file:
                        img_file.write(img_data)
                    # Check if the downloaded image is SVG format
                    if os.path.splitext(img_filename)[1].lower() == '.svg':
                        # Convert SVG to PNG using the inkscape command-line tool
                        subprocess.run(
                            ['inkscape', '--export-type=png', logo_path], check=True)

        with open(self.filename, mode='a', encoding='utf-8-sig', newline='') as file:
            writer = csv.writer(file, delimiter=';')
            writer.writerow(
                [product_title, product_price, old_product_price, product_description, breadcrumb, product_link, logo_path])

        self.log(
            f'Produktname "{product_title}", Preis "{product_price}", Alter Preis "{old_product_price}", Beschreibung "{product_description}", Breadcrumb "{breadcrumb}", Link "{product_link}", Firmenname "" und Firmenlogo "" wurden erfolgreich in {self.filename} gespeichert.')

Additionally, I would suggest using Scrapy's built-in feed exports and media pipeline features instead of creating and managing the csv file yourself and downloading the images with separate requests via the requests library.
By filling in a few settings in your settings.py file or in the spider's custom_settings attribute, and/or by using the -o filename.csv command-line option, you could eliminate roughly half of the code in your script.
I suggest reading the Scrapy documentation here and here, in particular the last link's section on the -o/--output command-line argument.
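For illustration, here is a minimal sketch of that approach (the item field names and the FEEDS/IMAGES_STORE values are illustrative choices, not taken from the original spider; the ImagesPipeline additionally requires Pillow to be installed):

import scrapy

class ProductSpider(scrapy.Spider):
    name = "product_spider"

    start_urls = [
        'https://softwarekaufen24.de/windows-11-pro/',
        # ... the remaining product URLs ...
    ]

    custom_settings = {
        # Feed export: Scrapy creates the CSV and writes every yielded item,
        # so there is no manual open()/csv.writer bookkeeping in the spider.
        'FEEDS': {
            'product.csv': {'format': 'csv', 'encoding': 'utf-8-sig'},
        },
        # Media pipeline: downloads every URL listed in the item's
        # "image_urls" field into IMAGES_STORE and records the results
        # under "images".
        'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
        'IMAGES_STORE': 'logos',
    }

    def parse(self, response):
        title = response.css('h1.product--title::text').get()
        yield {
            'title': title.strip() if title else None,
            'price': response.css('meta[itemprop="price"]::attr(content)').get(),
            'link': response.url,
            'image_urls': [
                response.urljoin(src)
                for src in response.css('div.product--supplier img::attr(src)').getall()
            ],
        }

Alternatively, you can drop the FEEDS setting entirely and run scrapy crawl product_spider -o product.csv; the -o/--output argument described in the docs tells Scrapy where to export the scraped items.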
