import scrapy
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """Spider that prints every link found in the page's ``list-group`` blocks.

    The crawl starts from the single URL in ``start_urls``; ``parse`` does
    not schedule any follow-up requests or yield items, it only prints the
    absolute form of each extracted link.
    """

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']

    # Per-spider overrides: one request at a time per domain, with a
    # one-second download delay and a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Print the absolute URL of every ``href`` under a ``list-group`` div.

        Args:
            response: The downloaded page for one of ``start_urls``.
        """
        # Select every href attribute nested anywhere below a <div> whose
        # class attribute is exactly 'list-group'.
        hrefs = response.xpath("//div[@class='list-group']//@href").extract()
        for href in hrefs:
            # Resolve relative links against the URL of the response.
            url = response.urljoin(href)
            print(url)
我想从链接中删除这些不必要的网址。该网站是 https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx ,需要删除的网址如下:
http://www.unbr.ro
http://www.inppa.ro
http://www.uniuneanotarilor.ro/
http://www.caav.ro
http://www.executori.ro/
http://www.csm1909.ro
http://www.inm-lex.ro
http://www.just.ro
1条答案
按热度按时间cgyqldqp1#
您可以使用 `endswith` 方法配合 `continue` 关键字,从输出中去掉不需要的 URL: