I have written a working spider for a website called ticketweb. It scrapes the details of event pages in a parser method named parse_event.
Interestingly, although the spider scrapes every data point correctly, when I run scrapy with the -o option to export a CSV, some values in the output CSV are shifted, i.e. they are printed under the wrong field.
I am not using any feed exports or pipelines in the code. I just pass the -o option in the terminal, and it should generate a CSV file based on the item fields I defined in items.py.
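For reference, the crawl is run along the lines of `scrapy crawl xXx -o denemey22.csv` (the feed file named in the log below; the exact command is assumed). This is a sketch of the item definition implied by the spider code and the logged items — the field names are taken verbatim from them, while the scrapy.Field() boilerplate is assumed (the real items.py is shown at the end):

# Sketch of the item definition implied by the spider and the log output.
# Only the field names are taken verbatim from the code below; the
# declarations themselves are assumed boilerplate.
import scrapy

class TicketwebItem(scrapy.Item):
    name = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()
    event_url = scrapy.Field()
    event_id = scrapy.Field()
    site_event_id = scrapy.Field()
    venue_url = scrapy.Field()
    venue_name = scrapy.Field()
    site_venue_id = scrapy.Field()
    quantity_ = scrapy.Field()
    no_ticket = scrapy.Field()
    no_ticket_info_selectors = scrapy.Field()
    ticketTyp3 = scrapy.Field()
    Recaptcha = scrapy.Field()
    Method = scrapy.Field()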
**Why does scrapy output incorrectly the items that it scrapes correctly in the spider?** It has been three days. I have rewritten the code again and again, but I still cannot fix this problem of data showing up under the wrong fields in the output CSV.
[Screenshot: the problematic output CSV]
Here is my spider code:
import random
import pandas as pd
import regex as re  # assumption: the third-party 'regex' module, since .captures() is used below
from dateutil.parser import parse
from scrapy import Spider, Request
from random_user_agent.user_agent import UserAgent
from ..items import TicketwebItem  # assumption: the project's items module


class XxxSpider(Spider):
    name = 'xXx'
    allowed_domains = ['ticketweb.com', 'ticketweb.ca']

    def __init__(self):
        user_agent_rotator = UserAgent(software_names=['chrome'], operating_systems=['windows', 'linux'])
        self.user_agents = user_agent_rotator.get_user_agents()
        self.UA_rand = random.choice(self.user_agents)['user_agent']  # user agent set; note to self: regenerate this on every use
        self.headers = {'user-agent': random.choice(self.user_agents)['user_agent']}
        self.SCRAPED_ITEMS_NUMBER = 0
        self.FINISHED_VENUES_NUMBER = 0

    def start_requests(self):  # venue links from which the start requests are started
        venue_urls = pd.read_csv("urls.csv").iloc[:, 0].to_list()
        for venue_url in venue_urls:
            yield Request(venue_url, callback=self.parse,
                          #meta={'proxy': self.proxy},
                          headers=self.headers)
            self.FINISHED_VENUES_NUMBER += 1
            #print('---(start_requests) => NUMBER OF VENUES SCRAPED =>', self.FINISHED_VENUES_NUMBER)
        return None

    def parse(self, response):  # parses the venue page for event links
        # print("===> Layer 1 Parser opened for => ", response.url)
        item = TicketwebItem()
        myfuncs = MiniFuncs()
        # items to be scraped: event link, event name
        events_box = response.xpath('//ul[@class="media-list event-list event"]')
        event_urls_list = events_box.xpath(".//li/a/@data-ng-href").extract()
        event_urls_list = [myfuncs.url_cleaner(x) for x in event_urls_list]
        ## SENDING REQUESTS TO EVENTS
        event_names = events_box.xpath("./li/a/img/@title").extract()
        for i, event in enumerate(event_urls_list):
            yield Request(event, self.parse_event,
                          headers=self.headers,
                          priority=100,
                          cb_kwargs={'item': item,
                                     'name': event_names[i],
                                     'v_url': response.url},
                          #meta={'proxy': self.proxy},
                          )
        next_page = self.paginator(response)
        if next_page:
            yield Request(next_page, self.parse, headers=self.headers,
                          #meta={'proxy': self.proxy},
                          priority=150)
        else:
            print("====> FINISHED VENUE PAGE SCRAPING")
            #print("===> NO NEXT PAGE FOUND")
            #self.FINISHED_VENUES_NUMBER += 1
            print(f'---(parser) => VENUES SCRAPED IN TOTAL => {self.FINISHED_VENUES_NUMBER}')
        return None

    def parse_event(self, response, item, name, v_url):
        myparser = BatuParserFuncs()
        item = response.cb_kwargs['item']
        body = response.body
        #page = NotUsedDataCategories(body)
        captcha = "<recaptcha-script>" in response.text
        if captcha:
            item['Recaptcha'] = 'True'
        else:
            item['Recaptcha'] = 'False'
        # print("===> Layer 2 Parser opened for => ", response.url)
        item['event_url'] = response.url
        # item['error_logs'] = []
        #item['is_free_list'] = is_free
        ## HASAN => date string entry
        HASAN = response.xpath('//div[@class="info-item info-time"]/h4/text()').get()
        if HASAN:
            try:
                date_ = myparser.date_fixer(str(parse(HASAN).date()))  # a hyphenated date string can end up here
                item['date'] = date_
            except:
                try:
                    d1, d2 = HASAN.split("-")
                    item['date'] = str(parse(d1.split(" - ")[0])).split(" ")[0]
                except:
                    item['date'] = HASAN
                    print('-----unknown date format =>', HASAN)
        elif not 'ticketweb' in response.url:
            print('-----unknown domain =>', )
            item['date'] = 'OUT_OF_DOMAIN'
        else:
            item['date'] = HASAN
        # VENUE URLs
        item['venue_url'] = response.cb_kwargs['v_url']
        item['site_venue_id'] = response.cb_kwargs['v_url'].split('/')[-1].split('?')[0]
        # item['over41_tickets'] = []
        # item['over41_tickets'].append(xx.__str__)
        item['event_id'] = response.url.split('/')[-1].split('?')[0]
        item['site_event_id'] = f"tweb{item['event_id']}"
        item['name'] = response.cb_kwargs['name']
        #https://www.ticketweb.ca/venue/club-saw-ottawa-on/498585
        try:
            item['quantity_'] = response.xpath('//span[@class="more theme-primary-color table-display-item"]/text()')[-1].get().strip().split("per")[0].replace("limit", "").strip()
        except:  # tickets are not on sale
            item['quantity_'] = None
        # if not item['quantity_']:
        #     item['ticket1']: 'NONE'
        item['venue_name'] = response.xpath('//div[@class="event-summary"]/div[3]/h4/a/text()').get()
        item['time'] = response.xpath('//div[@class="info-item info-time"]/h5/text()').get()
        #delivery_box = response.xpath('//div[@class="section-body"]')
        item['no_ticket'] = False
        ticket3 = self.tickets_parser(response)
        if not ticket3:  # no tickets
            item['no_ticket'] = True
            item['no_ticket_info_selectors'] = self.no_ticket_parser(response)
        else:
            item['ticketTyp3'] = ticket3
        print("----item=>", item)
        yield item
        print("\n\n---Number of Items Scraped => ", self.SCRAPED_ITEMS_NUMBER)
        #print('ITEM YIELDED')

    def tickets_parser(self, response):
        try:
            tickets_box = response.xpath('//div[@id="edp-section-tickets"]')[0]
        except IndexError:
            return None
        sub_category_items = tickets_box.xpath('.//div[@class="content-group"]')  # no problem here
        if sub_category_items:
            tickets_list = []
            prices = [x.strip() for x in sub_category_items.xpath('.//span[contains(@class, "price theme")]/text()').extract()]
            names = [x.strip() for x in sub_category_items.xpath('.//dl[@class="definition-list"]//dt[contains(@class, "theme-title")]/text()').extract()]
            tickets_list.append(list(zip(names, prices)))
            return tickets_list  # adapter
        else:
            defANDprice = tickets_box.xpath('.//dl[@class="definition-list"]')
            ticket_descriptions = defANDprice.xpath("./dt/text()").extract()
            ticket_prices = defANDprice.xpath("./dd/span[1]/text()").extract()
            return list(zip(ticket_descriptions, ticket_prices))  # adapter

    def no_ticket_parser(self, response):
        no_ticket_info_selectors = response.xpath("""//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]""")
        if no_ticket_info_selectors:
            return no_ticket_info_selectors
        else:
            return None

    def paginator(self, response):
        url = response.url
        next_page = None
        if not response.xpath('//nav[@class="pagination-nav"]'):
            # print("=====> Page doesn't have pagination")
            return None
        next_page = self.get_next_page_number(response)
        #print("=====> Page has pagination")
        next_page_exists = self.check_if_next_page_exist(response, response.url)
        #print(f"====>PAGINATOR FOUND NEXT PAGE AS {next_page} FOR URL {response.url}")
        if next_page_exists:
            if 'page=' in response.url:
                new_url = response.url.split('?page')[0] + f"?page={next_page}"
            else:
                #print("------(paginator)=> CURRENTLY ON THE FIRST PAGE OF THE VENUE")
                new_url = response.url + f"?page={next_page}"
            #print("---(paginator) NEXT PAGE URL => ", new_url)
            return new_url
        else:
            print("---(paginator)==> NEXT PAGE CHECK RETURNED NEGATIVE")  # it should stop here
            return None

    def check_if_next_page_exist(self, response, url, checked=False):  # checks whether the current page has a next page
        if checked:
            return False
        if 'page=' in response.url:
            current_page_number = int(url.split("page=")[1])
            next_page_url = url.split("page=")[0] + '?page=' + str(current_page_number + 1)
        else:
            next_page_url = response.url + "?page=2"
        next_page_exists = self.next_page_check_callback(response)
        if next_page_exists:
            #print("---(NON EMPTY NEXT PAGE SPOTTED =>", next_page_url)
            return True
        else:
            return False

    def next_page_check_callback(self, response):
        if "Oops, no matching events were found" in str(response.body):
            print('---(next_page_check_callback)=> Next page is empty')
            return False
            yield Request(
                response.url.split('?page=')[0] + '?page=' + str(int(response.url.split('=')[-1]) - 1),
                callback=self.check_if_next_page_exist,
                headers=self.headers,
                cb_kwargs={'checked': True}
            )
        else:
            return True

    def get_next_page_number(self, response):
        try:
            output = response.xpath("""//ul[@class="pagination"]/li[contains(@data-ng-class,"active':true")]/following-sibling::li/a""").extract()
            pattern = r"goToPage\([0-9]*\)"
            next_page_str = re.search(pattern, output[0]).captures()  # .captures() needs the third-party 'regex' module
            last_chars = next_page_str[0].split("(")[-1]
            pattern = "[0-9]+"
            next_page = re.search(pattern, last_chars).captures()[0]
            #print("====>FOUND NEXT PAGE = ", next_page)
            if len(output) > 1:
                # next page exists
                return next_page
            else:
                # next page not found
                return None
        except IndexError:
            #print("---(check_next_page)====> REGEX FOUND NOTHING")
            return None

    def proxy_reorg(self, prx):
        prx = prx.strip()
        cont_list = prx.split(":")
        server = cont_list[0]
        port = cont_list[1]
        username = cont_list[2]
        passw = cont_list[3]
        new_str = username + ":" + passw + "@" + server + ":" + port
        return "http://" + new_str


class MiniFuncs:
    def url_cleaner(self, url):
        cleaned_ = url.split("{{")[0]
        # print("---(url_cleaner) => Cleaned url: ", cleaned_)
        return cleaned_


class BatuParserFuncs:
    def delivery_parser(self, response):
        arrow = response.xpath(
            '//div[@class="content theme-separator-strokes radio-btn-ui"]/dl[@class="definition-list"]/dt')
        item1 = [x.strip() for x in arrow.xpath("./text()").extract()]
        item2 = [x.strip() for x in arrow.xpath("./following-sibling::dd/label/small/text()").extract()]
        x = arrow.xpath("./following-sibling::dd/label/text()").extract()
        item3_raw = [x.strip() for x in arrow.xpath("./following-sibling::dd/label/text()").extract()]
        item3 = [y for y in item3_raw if y != ""]
        #print("====>ITEMs CAUGHT/// ITEM1=>", item1, "ITEM2=>", item2, "ITEM3=>", item3)
        if item1:
            final_delivery_text = ''
            for i in range(len(arrow)):
                final_delivery_text += item1[i] + item2[i] + item3[i]
            return final_delivery_text
        else:
            return None

    def date_fixer(self, date_str):  # reformat the incoming string to month-day-year
        year, month, day = date_str.split("-")
        return month + "-" + day + "-" + year
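One detail that may be relevant (flagged as a possibility, not a diagnosis): parse() creates a single TicketwebItem and passes that same instance to every parse_event request via cb_kwargs, so all the concurrent event callbacks mutate one shared object, and any field an event does not set itself keeps whatever the previous event stored there. A minimal sketch of that effect, using a plain dict and made-up values instead of the real item:

# Hypothetical, simplified illustration -- not the real spider code.
# One mutable mapping is reused for every event, as with the single
# TicketwebItem created in parse() and shared through cb_kwargs.
item = {}

def parse_event(shared, name, ticket=None):
    shared['name'] = name
    if ticket:                      # mirrors the ticketTyp3 / no_ticket branch
        shared['ticketTyp3'] = ticket
    return dict(shared)             # snapshot of what would be exported

row_a = parse_event(item, 'Event A', ticket=[('GA', '$31.62')])
row_b = parse_event(item, 'Event B')  # B has no tickets on sale
print(row_b)  # {'name': 'Event B', 'ticketTyp3': [('GA', '$31.62')]}
              # Event A's ticket data leaks into Event B's row

If that is what is happening, constructing the item inside parse_event rather than once in parse would give each row its own object.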
My terminal output:
'''
2022-08-02 00:23:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/rio-theatre-vancouver-bc/99754> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:02 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.ticketweb.ca/venue/the-axis-club-toronto-on/10301> from <GET https://www.ticketweb.ca/venue/the-axis-club-formerly-the-mod-club-toronto-on/10301>
2022-08-02 00:23:02 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.ticketweb.ca/venue/the-axis-club-toronto-on/10301> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2022-08-02 00:23:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-fox-cabaret-vancouver-bc/403015> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-garrison-toronto-on/200954> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ticketweb.ca/venue/the-horseshoe-tavern-toronto-on/10304> (referer: None)
====> FINISHED VENUE PAGE SCRAPING
---(parser) => VENUES SCRAPED IN TOTAL => 15
2022-08-02 00:23:11 [filelock] DEBUG: Attempting to acquire lock 139743194248672 on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Lock 139743194248672 acquired on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Attempting to release lock 139743194248672 on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-02 00:23:11 [filelock] DEBUG: Lock 139743194248672 released on /home/draco/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
...
2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/bad-suns-last-dinosaurs-quarters-the-axis-club-tickets/12352755>
{'Method': None,
'Recaptcha': 'False',
'date': '10-19-2022',
'event_id': '12352755',
'event_url': 'https://www.ticketweb.ca/event/bad-suns-last-dinosaurs-quarters-the-axis-club-tickets/12352755',
'name': 'Bad Suns, Last Dinosaurs, Quarters of Change',
'no_ticket': True,
'no_ticket_info_selectors': [<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>,
<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>],
'time': '6:00 PM Doors',
'venue_name': 'The Axis Club'}
----item=> {'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
'ticket for entry!',
'Recaptcha': 'True',
'date': '10-15-2022',
'event_id': '11530355',
'event_url': 'https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355',
'name': 'Psychedelic Porn Crumpets with Acid Dad',
'no_ticket': False,
'ticketTyp3': [('\n'
' General Admission\n'
' ',
' $31.62\n')],
'time': '7:00 PM Doors',
'venue_name': 'The Axis Club'}
2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355>
{'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
'ticket for entry!',
'Recaptcha': 'True',
'date': '10-15-2022',
'event_id': '11530355',
'event_url': 'https://www.ticketweb.ca/event/psychedelic-porn-crumpets-acid-dad-the-axis-club-tickets/11530355',
'name': 'Psychedelic Porn Crumpets with Acid Dad',
'no_ticket': False,
'ticketTyp3': [('\n'
' General Admission\n'
' ',
' $31.62\n')],
'time': '7:00 PM Doors',
'venue_name': 'The Axis Club'}
----item=> {'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
'ticket for entry!',
'Recaptcha': 'True',
'date': '10-14-2022',
'event_id': '12273215',
'event_url': 'https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215',
'name': 'The Comet Is Coming',
'no_ticket': False,
'ticketTyp3': [('\n'
' General Admission\n'
' ',
' $36.15\n')],
'time': '8:00 PM Doors',
'venue_name': 'The Axis Club'}
2022-08-02 01:40:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215>
{'Method': 'ticketFastFREEDisplay on your mobile device or print out your '
'ticket for entry!',
'Recaptcha': 'True',
'date': '10-14-2022',
'event_id': '12273215',
'event_url': 'https://www.ticketweb.ca/event/the-comet-is-coming-the-axis-club-tickets/12273215',
'name': 'The Comet Is Coming',
'no_ticket': False,
'ticketTyp3': [('\n'
' General Admission\n'
' ',
' $36.15\n')],
'time': '8:00 PM Doors',
'venue_name': 'The Axis Club'}
----item=> {'Method': None,
'Recaptcha': 'False',
'date': '10-10-2022',
'event_id': '11674915',
'event_url': 'https://www.ticketweb.ca/event/spacey-jane-joe-p-the-axis-club-tickets/11674915',
'name': 'Spacey Jane, Joe P',
'no_ticket': True,
'no_ticket_info_selectors': [<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>,
<Selector xpath='//div[contains(@class,"event-detail")]/div[contains(@class,"section section-status theme-mod" )]' data='<div class="section section-status th...'>],
'time': '7:30 PM Doors',
'venue_name': 'The Axis Club'}
2022-08-02 00:23:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ticketweb.ca/event/nikki-lane-the-horseshoe-tavern-tickets/12333655>
{'Recaptcha': 'True',
'date': '12-06-2022',
'event_id': '12333655',
'event_url': 'https://www.ticketweb.ca/event/nikki-lane-the-horseshoe-tavern-tickets/12333655',
'name': 'Nikki Lane',
'no_ticket': False,
'site_event_id': 'tweb12333655',
'site_venue_id': '10304',
'ticketTyp3': [('\n'
' General Admission\n'
' ',
' $28.06\n')],
'time': '8:00 PM Doors',
'venue_name': 'The Horseshoe Tavern',
'venue_url': 'https://www.ticketweb.ca/venue/the-horseshoe-tavern-toronto-on/10304'}
---Number of Items Scraped => 13
...
...
2022-08-02 00:24:04 [scrapy.extensions.feedexport] INFO: Stored csv feed (23 items) in: denemey22.csv
2022-08-02 00:24:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 25540,
'downloader/request_count': 45,
'downloader/request_method_count/GET': 45,
'downloader/response_bytes': 2081795,
'downloader/response_count': 45,
'downloader/response_status_count/200': 40,
'downloader/response_status_count/301': 3,
'downloader/response_status_count/500': 2,
'dupefilter/filtered': 1,
'elapsed_time_seconds': 85.742405,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'shutdown',
'finish_time': datetime.datetime(2022, 8, 1, 21, 24, 4, 693704),
'httpcompression/response_bytes': 8086543,
'httpcompression/response_count': 42,
'item_scraped_count': 23,
'log_count/DEBUG': 74,
'log_count/INFO': 13,
'memusage/max': 165437440,
'memusage/startup': 140574720,
'request_depth_max': 3,
'response_received_count': 40,
'retry/count': 2,
'retry/reason_count/500 Internal Server Error': 2,
'scheduler/dequeued': 45,
'scheduler/dequeued/memory': 45,
'scheduler/enqueued': 231,
'scheduler/enqueued/memory': 231,
'start_time': datetime.datetime(2022, 8, 1, 21, 22, 38, 951299)}
2022-08-02 00:24:04 [scrapy.core.engine] INFO: Spider closed (shutdown)
draco@draco:~/projects/ticketweb2/ticketweb/ticketweb$
'''
Here is my items.py code: