Scrapy incorrectly outputs items that the spider scraped correctly

pgvzfuti · asked 2022-11-09 in Other

I have successfully written a spider for a site called ticketweb. It scrapes the details of event pages in a parser function named parse_event (example link: …).
Interestingly, although the spider scrapes the data points correctly, when I run scrapy with the -o option to export a CSV, some of the values in the output CSV are shifted, i.e. they get printed under the wrong data field.
I am not configuring any feed exporters or pipelines in my code. I simply pass the -o option on the terminal, and it should generate a CSV file based on the items I defined in items.py.

**Why does Scrapy output items incorrectly when the spider scraped them correctly?** It has been three days. I have rewritten the code again and again, but I still cannot fix the problem of data showing up under the wrong data points in the output CSV.
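For reference, the export is triggered with something like the following (the spider name comes from the code below, the output file name from the terminal log):

scrapy crawl xXx -o denemey22.csv

And, as a minimal sketch rather than a confirmed fix: Scrapy's FEED_EXPORT_FIELDS setting pins the CSV column order explicitly, which at least rules out header-ordering surprises; the field names here are copied from the items.py shown further down:

# settings.py -- sketch only; field list taken from items.py below
FEED_EXPORT_FIELDS = [
    'site_event_id', 'event_id', 'name', 'date', 'time',
    'venue_name', 'site_venue_id', 'venue_url', 'event_url',
    'Method', 'Recaptcha', 'no_ticket', 'no_ticket_info_selectors',
    'ticketTyp3', 'no_ticket_at_all',
]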

The problematic output CSV (screenshot):

Here is my spider code:

# Imports reconstructed from the calls used below (not shown in the original post)
import random
import re

import pandas as pd
from dateutil.parser import parse
from random_user_agent.user_agent import UserAgent
from scrapy import Request, Spider

from ticketweb.items import TicketwebItem  # assumed project module path


class XxxSpider(Spider):
    name = 'xXx'
    allowed_domains = ['ticketweb.com', 'ticketweb.ca']

    def __init__(self):
        user_agent_rotator = UserAgent(software_names=['chrome'], operating_systems=['windows', 'linux'])
        self.user_agents = user_agent_rotator.get_user_agents()
        self.UA_rand = random.choice(self.user_agents)['user_agent']  # chosen once; maybe re-pick this on every use
        self.headers = {'user-agent': random.choice(self.user_agents)['user_agent']}
        self.SCRAPED_ITEMS_NUMBER = 0
        self.FINISHED_VENUES_NUMBER = 0

    def start_requests(self):  # venue links from which the start requests are issued
        venue_urls = pd.read_csv("urls.csv").iloc[:, 0].to_list()
        for venue_url in venue_urls:
            yield Request(venue_url, callback=self.parse,
                          # meta={'proxy': self.proxy},
                          headers=self.headers)
            self.FINISHED_VENUES_NUMBER += 1  # NOTE: counts venues *requested*, not finished

        # print('---(start_requests) => NUMBER OF VENUES SCRAPED =>', self.FINISHED_VENUES_NUMBER)
    def parse(self, response):  # parses the venue page for event links
        # print("===>  Layer 1 Parser opened for => ", response.url)
        # NOTE: this single item instance is passed via cb_kwargs to *every*
        # parse_event request yielded below, so all events of a venue share
        # and mutate the same TicketwebItem object.
        item = TicketwebItem()
        myfuncs = MiniFuncs()
        # items to be scraped: event link, event name
        events_box = response.xpath('//ul[@class="media-list event-list event"]')
        event_urls_list = events_box.xpath(".//li/a/@data-ng-href").extract()
        event_urls_list = [myfuncs.url_cleaner(x) for x in event_urls_list]
        ## SENDING REQUESTS TO EVENTS
        event_names = events_box.xpath("./li/a/img/@title").extract()
        for i, event in enumerate(event_urls_list):
            yield Request(event, self.parse_event,
                          headers=self.headers,
                          priority=100,
                          cb_kwargs={'item': item,
                                     'name': event_names[i],
                                     'v_url': response.url},
                          # meta={'proxy': self.proxy},
                          )
        next_page = self.paginator(response)
        if next_page:
            yield Request(next_page, self.parse, headers=self.headers,
                          # meta={'proxy': self.proxy},
                          priority=150)
        else:
            print("====> FINISHED VENUE PAGE SCRAPING")
            # print("===> NO NEXT PAGE FOUND")
            # self.FINISHED_VENUES_NUMBER += 1
            print(f'---(parser) => VENUES SCRAPED IN TOTAL => {self.FINISHED_VENUES_NUMBER}')
    def parse_event(self, response, item, name, v_url):
        myparser = BatuParserFuncs()
        item = response.cb_kwargs['item']  # same shared object as the `item` argument
        body = response.body
        # page = NotUsedDataCategories(body)
        captcha = "<recaptcha-script>" in response.text
        if captcha:
            item['Recaptcha'] = 'True'
        else:
            item['Recaptcha'] = 'False'
        # print("===>  Layer 2 Parser opened for => ", response.url)
        item['event_url'] = response.url
        # item['error_logs'] = []
        # item['is_free_list'] = is_free

        ## HASAN => raw date string
        HASAN = response.xpath('//div[@class="info-item info-time"]/h4/text()').get()
        if HASAN:
            try:
                date_ = myparser.date_fixer(str(parse(HASAN).date()))  # hyphenated data can arrive here
                item['date'] = date_
            except Exception:
                try:
                    d1, d2 = HASAN.split("-")
                    item['date'] = str(parse(d1.split(" - ")[0])).split(" ")[0]
                except Exception:
                    item['date'] = HASAN
                    print('-----unknown date format =>', HASAN)

        elif 'ticketweb' not in response.url:
            print('-----unknown domain =>', response.url)
            item['date'] = 'OUT_OF_DOMAIN'
        else:
            item['date'] = HASAN

        # VENUE URLs
        item['venue_url'] = response.cb_kwargs['v_url']
        item['site_venue_id'] = response.cb_kwargs['v_url'].split('/')[-1].split('?')[0]
        # item['over41_tickets'] = []
        # item['over41_tickets'].append(xx.__str__)
        item['event_id'] = response.url.split('/')[-1].split('?')[0]
        item['site_event_id'] = f"tweb{item['event_id']}"
        item['name'] = response.cb_kwargs['name']
        # https://www.ticketweb.ca/venue/club-saw-ottawa-on/498585
        # NOTE: 'quantity_' is commented out in items.py below; assigning an
        # undeclared field on a scrapy.Item raises KeyError.
        try:
            item['quantity_'] = response.xpath('//span[@class="more theme-primary-color table-display-item"]/text()')[-1].get().strip().split("per")[0].replace("limit", "").strip()
        except Exception:  # ticket not on sale
            item['quantity_'] = None
        # if not item['quantity_']:
        #     item['ticket1'] = 'NONE'
        item['venue_name'] = response.xpath('//div[@class="event-summary"]/div[3]/h4/a/text()').get()
        item['time'] = response.xpath('//div[@class="info-item info-time"]/h5/text()').get()
        # delivery_box = response.xpath('//div[@class="section-body"]')
        item['no_ticket'] = False
        ticket3 = self.tickets_parser(response)
        if not ticket3:  # no tickets found
            item['no_ticket'] = True
            item['no_ticket_info_selectors'] = self.no_ticket_parser(response)
        else:
            item['ticketTyp3'] = ticket3
        print("----item=>", item)
        yield item

        self.SCRAPED_ITEMS_NUMBER += 1  # count yielded items (reported in the terminal log below)
        print("\n\n---Number of Items Scraped => ", self.SCRAPED_ITEMS_NUMBER)
        # print('ITEM YIELDED')

    def tickets_parser(self, response):
        try:
            tickets_box = response.xpath('//div[@id="edp-section-tickets"]')[0]
        except IndexError:
            return None
        sub_category_items = tickets_box.xpath('.//div[@class="content-group"]')  # no problem here
        if sub_category_items:
            tickets_list = []
            prices = [x.strip() for x in sub_category_items.xpath('.//span[contains(@class, "price theme")]/text()').extract()]
            names = [x.strip() for x in sub_category_items.xpath('.//dl[@class="definition-list"]//dt[contains(@class, "theme-title")]/text()').extract()]
            tickets_list.append(list(zip(names, prices)))
            return tickets_list  # adapter
        else:
            defANDprice = tickets_box.xpath('.//dl[@class="definition-list"]')
            ticket_descriptions = defANDprice.xpath("./dt/text()").extract()
            ticket_prices = defANDprice.xpath("./dd/span[1]/text()").extract()
            return list(zip(ticket_descriptions, ticket_prices))  # adapter

    def no_ticket_parser(self, response):
        no_ticket_info_selectors = response.xpath("""//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]""")
        if no_ticket_info_selectors:
            return no_ticket_info_selectors  # NOTE: raw Selector objects end up in the item (visible in the log below)
        else:
            return None

    def paginator(self, response):
        next_page = None
        if not response.xpath('//nav[@class="pagination-nav"]'):
            # print("=====> Page doesn't have pagination")
            return None

        next_page = self.get_next_page_number(response)
        # print("=====> Page has pagination")
        next_page_exists = self.check_if_next_page_exist(response, response.url)
        # print(f"====>PAGINATOR FOUND NEXT PAGE AS {next_page} FOR URL {response.url}")
        if next_page_exists:
            if 'page=' in response.url:
                new_url = response.url.split('?page')[0] + f"?page={next_page}"
            else:
                # print("------(paginator)=> CURRENTLY ON THE FIRST PAGE OF THE VENUE")
                new_url = response.url + f"?page={next_page}"
            # print("---(paginator) NEXT PAGE URL => ", new_url)
            return new_url
        else:
            print("---(paginator)==> NEXT PAGE CHECK RETURNED NEGATIVE")  # it should stop here
            return None
    def check_if_next_page_exist(self, response, url, checked=False):  # checks whether the current page has a next page
        if checked:
            return False

        if 'page=' in response.url:
            current_page_number = int(url.split("page=")[1])
            next_page_url = url.split("page=")[0] + '?page=' + str(current_page_number + 1)
        else:
            next_page_url = response.url + "?page=2"
        next_page_exists = self.next_page_check_callback(response)  # NOTE: inspects the *current* response; next_page_url is never actually requested
        if next_page_exists:
            # print("---(NON EMPTY NEXT PAGE SPOTTED =>", next_page_url)
            return True
        else:
            return False
    def next_page_check_callback(self, response):
        # NOTE: the original version had an unreachable `yield Request(...)` after
        # `return False`. The bare `yield` turned this method into a generator
        # function, so calling it returned an (always truthy) generator object and
        # check_if_next_page_exist always saw "next page exists". The dead code is
        # removed so the method returns plain booleans again.
        if "Oops, no matching events were found" in str(response.body):
            print('---(next_page_check_callback)=> Next page is empty')
            return False
        return True
    def get_next_page_number(self, response):
        try:
            output = response.xpath("""//ul[@class="pagination"]/li[contains(@data-ng-class,"active':true")]/following-sibling::li/a""").extract()
            pattern = r"goToPage\([0-9]*\)"
            # NOTE: the original called .captures() on the match object, which only
            # exists in the third-party `regex` module; with the stdlib `re` module
            # it raises AttributeError, which the except clause below did not catch.
            next_page_str = re.search(pattern, output[0]).group()
            last_chars = next_page_str.split("(")[-1]
            next_page = re.search("[0-9]+", last_chars).group()

            # print("====>FOUND NEXT PAGE = ", next_page)
            if len(output) > 1:
                # next page exists
                return next_page
            else:
                # next page not found
                return None
        except (IndexError, AttributeError):
            # print("---(check_next_page)====> REGEX FOUND NOTHING")
            return None
    def proxy_reorg(self, prx):
        prx = prx.strip()
        cont_list = prx.split(":")
        server = cont_list[0]
        port = cont_list[1]
        username = cont_list[2]
        passw = cont_list[3]
        new_str = username + ":" + passw + "@" + server + ":" + port
        return "http://" + new_str
class MiniFuncs:
    def url_cleaner(self, url):
        cleaned_ = url.split("{{")[0]  # cut the URL at the first "{{" (Angular template remnant in data-ng-href)
        # print("---(url_cleaner) => Cleaned url: ", cleaned_)
        return cleaned_
class BatuParserFuncs:

    def delivery_parser(self, response):
        arrow = response.xpath(
            '//div[@class="content theme-separator-strokes radio-btn-ui"]/dl[@class="definition-list"]/dt')

        item1 = [x.strip() for x in arrow.xpath("./text()").extract()]
        item2 = [x.strip() for x in arrow.xpath("./following-sibling::dd/label/small/text()").extract()]

        item3_raw = [x.strip() for x in arrow.xpath("./following-sibling::dd/label/text()").extract()]
        item3 = [y for y in item3_raw if y != ""]
        # print("====>ITEMs CAUGHT/// ITEM1=>", item1, "ITEM2=>", item2, "ITEM3=>", item3)
        if item1:
            final_delivery_text = ''
            for i in range(len(arrow)):
                final_delivery_text += item1[i] + item2[i] + item3[i]
            return final_delivery_text
        else:
            return None

    def date_fixer(self, date_str):  # reorder an incoming YYYY-MM-DD string to month-day-year
        year, month, day = date_str.split("-")
        return month + "-" + day + "-" + year
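
One detail worth flagging before the log, and a plausible (unconfirmed) source of values bleeding between CSV rows: parse creates a single TicketwebItem and passes that same instance, through cb_kwargs, to every parse_event request of a venue. The event requests complete in arbitrary order and each callback mutates the shared object, so a field that one event sets (for example ticketTyp3) can survive into the next yielded item if that event never overwrites it. Below is a minimal self-contained sketch of the per-request pattern instead; the class name and the field assignments are illustrative only:

from scrapy import Request, Spider

from ticketweb.items import TicketwebItem  # assumed project module path


class XxxSpiderSketch(Spider):
    name = 'xXx_sketch'  # hypothetical spider name, for illustration only

    def parse(self, response):
        events_box = response.xpath('//ul[@class="media-list event-list event"]')
        event_urls = events_box.xpath(".//li/a/@data-ng-href").extract()
        event_names = events_box.xpath("./li/a/img/@title").extract()
        for url, name in zip(event_urls, event_names):
            yield Request(url, self.parse_event,
                          cb_kwargs={'item': TicketwebItem(),  # fresh item per event
                                     'name': name,
                                     'v_url': response.url})

    def parse_event(self, response, item, name, v_url):
        # mutate only this request's own item; nothing leaks between events
        item['event_url'] = response.url
        item['name'] = name
        item['venue_url'] = v_url
        yield item

With one item per request, conditional fields such as ticketTyp3 and no_ticket_info_selectors can no longer carry stale values from a previously scraped event into the next row of the CSV.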

My terminal output:
'''

2022-08-02 00:23:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/rio-theatre-vancouver-bc/99754> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:02 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.ticketweb.ca/venue/the-axis-club-toronto-on/10301> from <GET https://www.ticketweb.ca/venue/the-axis-club-formerly-the-mod-club-toronto-on/10301>
2022-08-02 00:23:02 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.ticketweb.ca/venue/the-axis-club-toronto-on/10301> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2022-08-02 00:23:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-fox-cabaret-vancouver-bc/403015> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-garrison-toronto-on/200954> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-horseshoe-tavern-toronto-on/10304> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:11 [filelock] DEBUG: Attempting to acquire lock 139743194248672 on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Lock 139743194248672 acquired on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Attempting to release lock 139743194248672 on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Lock 139743194248672 released on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock

...

2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/bad-suns-last-dinosaurs-quarters-the-axis-club-tickets/12352755>
{'Method': None,
 'Recaptcha': 'False',
 'date': '10-19-2022',
 'event_id': '12352755',
 'event_url': 'https://www.ticketweb.ca/event/bad-suns-last-dinosaurs-quarters-the-axis-club-tickets/12352755',
 'name': 'Bad Suns, Last Dinosaurs, Quarters of Change',
 'no_ticket': True,
 'no_ticket_info_selectors': [<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>,
                              <Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>],
 'time': '6:00 PM Doors',
 'venue_name': 'The Axis Club'}
----item=> {'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
           'ticket for entry!',
 'Recaptcha': 'True',
 'date': '10-15-2022',
 'event_id': '11530355',
 'event_url': 'https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355',
 'name': 'Psychedelic Porn Crumpets with Acid Dad',
 'no_ticket': False,
 'ticketTyp3': [('\n'
                 '                                    General Admission\n'
                 '                                    ',
                 '    $31.62\n')],
 'time': '7:00 PM Doors',
 'venue_name': 'The Axis Club'}
2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355>
{'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
           'ticket for entry!',
 'Recaptcha': 'True',
 'date': '10-15-2022',
 'event_id': '11530355',
 'event_url': 'https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355',
 'name': 'Psychedelic Porn Crumpets with Acid Dad',
 'no_ticket': False,
 'ticketTyp3': [('\n'
                 '                                    General Admission\n'
                 '                                    ',
                 '    $31.62\n')],
 'time': '7:00 PM Doors',
 'venue_name': 'The Axis Club'}
----item=> {'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
           'ticket for entry!',
 'Recaptcha': 'True',
 'date': '10-14-2022',
 'event_id': '12273215',
 'event_url': 'https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215',
 'name': 'The Comet Is Coming',
 'no_ticket': False,
 'ticketTyp3': [('\n'
                 '                                    General Admission\n'
                 '                                    ',
                 '    $36.15\n')],
 'time': '8:00 PM Doors',
 'venue_name': 'The Axis Club'}
2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215>
{'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
           'ticket for entry!',
 'Recaptcha': 'True',
 'date': '10-14-2022',
 'event_id': '12273215',
 'event_url': 'https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215',
 'name': 'The Comet Is Coming',
 'no_ticket': False,
 'ticketTyp3': [('\n'
                 '                                    General Admission\n'
                 '                                    ',
                 '    $36.15\n')],
 'time': '8:00 PM Doors',
 'venue_name': 'The Axis Club'}
----item=> {'Method': None,
 'Recaptcha': 'False',
 'date': '10-10-2022',
 'event_id': '11674915',
 'event_url': 'https://www.ticketweb.ca/event/spacey-jane-joe-p-the-axis-club-tickets/11674915',
 'name': 'Spacey Jane, Joe P',
 'no_ticket': True,
 'no_ticket_info_selectors': [<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>,
                              <Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>],
 'time': '7:30 PM Doors',
 'venue_name': 'The Axis Club'}

2022-08-02 00:23:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/nikki-lane-the-horseshoe-tavern-tickets/12333655>
{'Recaptcha': 'True',
 'date': '12-06-2022',
 'event_id': '12333655',
 'event_url': 'https://www.ticketweb.ca/event/nikki-lane-the-horseshoe-tavern-tickets/12333655',
 'name': 'Nikki Lane',
 'no_ticket': False,
 'site_event_id': 'tweb12333655',
 'site_venue_id': '10304',
 'ticketTyp3': [('\n'
                 '                                    General Admission\n'
                 '                                    ',
                 '    $28.06\n')],
 'time': '8:00 PM Doors',
 'venue_name': 'The Horseshoe Tavern',
 'venue_url': 'https://www.ticketweb.ca/venue/the-horseshoe-tavern-toronto-on/10304'}

---Number of Items Scraped =>  13
...

...
2022-08-02 00:24:04 [scrapy.extensions.feedexport] INFO: Stored csv feed (23 items) in: denemey22.csv
2022-08-02 00:24:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 25540,
 'downloader/request_count': 45,
 'downloader/request_method_count/GET': 45,
 'downloader/response_bytes': 2081795,
 'downloader/response_count': 45,
 'downloader/response_status_count/200': 40,
 'downloader/response_status_count/301': 3,
 'downloader/response_status_count/500': 2,
 'dupefilter/filtered': 1,
 'elapsed_time_seconds': 85.742405,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'shutdown',
 'finish_time': datetime.datetime(2022, 8, 1, 21, 24, 4, 693704),
 'httpcompression/response_bytes': 8086543,
 'httpcompression/response_count': 42,
 'item_scraped_count': 23,
 'log_count/DEBUG': 74,
 'log_count/INFO': 13,
 'memusage/max': 165437440,
 'memusage/startup': 140574720,
 'request_depth_max': 3,
 'response_received_count': 40,
 'retry/count': 2,
 'retry/reason_count/500 Internal Server Error': 2,
 'scheduler/dequeued': 45,
 'scheduler/dequeued/memory': 45,
 'scheduler/enqueued': 231,
 'scheduler/enqueued/memory': 231,
 'start_time': datetime.datetime(2022, 8, 1, 21, 22, 38, 951299)}
2022-08-02 00:24:04 [scrapy.core.engine] INFO: Spider closed (shutdown)
draco@draco:~/projects/ticketweb2/ticketweb/ticketweb$

'''

iezvtpos · answer #1

Here is my items.py code:

import scrapy
from scrapy.item import Item  # the unused ItemLoader and Field imports were dropped


class TicketwebItem(Item):
    # define the fields for your item here like:

    Recaptcha = scrapy.Field()
    venue_name = scrapy.Field()
    event_url = scrapy.Field()

    #quantity= scrapy.Field()
    Method = scrapy.Field()
    time = scrapy.Field()
    site_venue_id = scrapy.Field()
    date = scrapy.Field()
    site_event_id = scrapy.Field()
    # ticket1 = scrapy.Field()
    venue_url = scrapy.Field()

    name = scrapy.Field()
    event_id = scrapy.Field()

    # quantity_ = scrapy.Field()  # NOTE: parse_event assigns item['quantity_'];
    # with this field commented out, scrapy.Item raises KeyError on that assignment
    no_ticket = scrapy.Field()
    no_ticket_info_selectors = scrapy.Field()
    ticketTyp3 = scrapy.Field()

    no_ticket_at_all = scrapy.Field()
