I have a script (see below) with which I can currently scrape a single URL that may or may not have a "next page".
I am now trying to extend the script so that it reads multiple URLs from a one-column csv file. At the moment my input file contains the following URLs:
https://dockets.justia.com/search?parties=Agfa&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Aggregate+Industries+Management&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Agility+Fuel+Solutions&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Agrana&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Agtek+Development&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Ahead&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
https://dockets.justia.com/search?parties=Ahold&cases=between&sort-by-last-update=false&after=2015-1-1&before=2015-1-1&before=2023-3-27
How do I get my script to read these URLs from the csv file, and then use them to capture/extract/scrape the case URLs from the resulting html pages? The case links sit in markup like this:

<div class="has-padding-content-block-30 -zb"> <a href="https://dockets.justia.com/docket/illinois/ilndce/1:2017cv02145/337903" class="case-name"><strong>RAH Color Technologies LLC v. Agfa Gevaert N V et al</strong></a>

I just need it to print to the console, the way the current script does. I hope this is a reasonably clear description of my predicament.
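For reference, what I mean by the reading step is something along these lines (a sketch using only the standard library, assuming one URL per row and no header row; 'input_file.csv' is the file holding the URLs above):

import csv

# Read one search URL per row from a single-column csv file.
with open('input_file.csv', newline='', encoding='utf-8') as f:
    search_urls = [row[0] for row in csv.reader(f) if row]

for url in search_urls:
    print(url)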
Here is my script in its current state:
from aiohttp import ClientSession
from pyuseragents import random
from bs4 import BeautifulSoup
from asyncio import run


class DocketsJustia:
    def __init__(self):
        self.headers = {
            'authority': 'dockets.justia.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.5',
            'cache-control': 'max-age=0',
            'referer': 'https://dockets.justia.com/search?parties=Agfa&cases=between&sort-by-last-update=false&after=2015-1-1&before=2023-3-27',
            'user-agent': random(),
        }
        self.PatchFile = "nametxt.txt"

    async def Parser(self, session):
        count = 1
        while True:
            params = {
                'parties': 'Agfa',
                'page': f'{count}',
            }
            async with session.get(f'https://dockets.justia.com/search?parties=Agfa&cases=between&sort-by-last-update=false&after=2015-1-1&before=2023-3-27&page={count}',
                                   params=params) as response:
                links = BeautifulSoup(await response.text(), "lxml").find_all("div", {
                    "class": "has-padding-content-block-30 -zb"})
                for link in links:
                    try:
                        case_link = link.find("a", {"class": "case-name"}).get("href")
                        case_number = link.find("span", {"class": "citation"}).text
                        print(case_number + "\t" + case_link + "\n")
                        with open(self.PatchFile, "a", encoding='utf-8') as file:
                            file.write(case_number + "\t" + case_link + "\n")
                    except:
                        pass
            count += 1

    async def LoggerParser(self):
        async with ClientSession(headers=self.headers) as session:
            await self.Parser(session)


def StartDocketsJustia():
    run(DocketsJustia().LoggerParser())


if __name__ == '__main__':
    StartDocketsJustia()
I changed the script as suggested, but I got a SyntaxError:
from aiohttp import ClientSession
from pyuseragents import random
from bs4 import BeautifulSoup
from asyncio import run


class DocketsJustia:
    def __init__(self):
        self.headers = {
            'authority': 'dockets.justia.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.5',
            'cache-control': 'max-age=0',
            'referer': 'https://dockets.justia.com/search?parties=Agfa&cases=between&sort-by-last-update=false&after=2015-1-1&before=2023-3-27',
            'user-agent': random(),
        }
        self.PatchFile = "nametxt.txt"

    # old line: async def Parser(self, session):
    async def Parser(selfself, session, searchUrl):
        count = 1
        while True:
            params = {
                'parties': 'Agfa',
                'page': f'{count}',
            }
            async with session.get(searchUrl, params=params as response:
                links = BeautifulSoup(await response.text(), "lxml").find_all("div", {
                    "class": "has-padding-content-block-30 -zb"})
                for link in links:
                    try:
                        case_link = link.find("a", {"class": "case-name"}).get("href")
                        case_number = link.find("span", {"class": "citation"}).text
                        print(case_number + "\t" + case_link + "\n")
                        with open(self.PatchFile, "a", encoding='utf-8') as file:
                            file.write(case_number + "\t" + case_link + "\n")
                    except:
                        pass
        count += 1

    async def LoggerParser(self):
        # old line: async with ClientSession(headers=self.headers) as session:
        searchUrls=set(pd.read_csv('input_file.csv', header=None)[0])
        for url in searchUrls: await self.Parser(session, url)


def StartDocketsJustia():
    run(DocketsJustia().LoggerParser())


if __name__ == '__main__':
    StartDocketsJustia()
I had to import pandas (for the suggested searchUrls=set(pd.read_csv('input_file.csv', header=None)[0]) line), and I no longer seem to get any error messages. I changed the script as suggested, but now I don't seem to get any output (at all). This is what I'm running:
from aiohttp import ClientSession
from pyuseragents import random
from bs4 import BeautifulSoup
from asyncio import run
import pandas as pd


class DocketsJustia:
    def __init__(self):
        self.headers = {
            'authority': 'dockets.justia.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.5',
            'cache-control': 'max-age=0',
            'user-agent': random(),
        }
        self.PatchFile = "nametxt.txt"

    # old line: async def Parser(self, session):
    async def Parser(selfself, session, searchUrl):
        count = 1
        while True:
            async with session.get(f"{searchUrl}&page={count}") as response:
                links = BeautifulSoup(await response.text(), "lxml").find_all("div", {
                    "class": "has-padding-content-block-30 -zb"})
                for link in links:
                    try:
                        case_link = link.find("a", {"class": "case-name"}).get("href")
                        case_number = link.find("span", {"class": "citation"}).text
                        print(case_number + "\t" + case_link + "\n")
                        with open(self.PatchFile, "a", encoding='utf-8') as file:
                            file.write(case_number + "\t" + case_link + "\n")
                    except:
                        pass
        count += 1

    async def LoggerParser(self):
        async with ClientSession(headers=self.headers) as session:
            searchUrls=set(pd.read_csv('input_file.csv', header=None)[0])
            for url in searchUrls:
                await self.Parser(session, url)


def StartDocketsJustia():
    run(DocketsJustia().LoggerParser())


if __name__ == '__main__':
    StartDocketsJustia()
1 Answer
I'm just listing my suggestions from the comments:

* pass the search URL into Parser, as in def Parser(self, session, searchUrl) and ...session.get(searchUrl... as response:...
* in LoggerParser, read the URLs with something like searchUrls = set(pd.read_csv('input_file.csv', header=None)[0]) and loop through them with something like for url in searchUrls: await self.Parser(session, url)
* remove headers['referer'] unless absolutely necessary
* the while True... loop needs if not links: break to stop after the last page; alternatively, use soup.find('div', class_='row-label') to look up the pagination progress, but break if that div is not found - or cap the number of pages with something like if count > max_page: break
* move the count += 1 line inside the while loop, so that it doesn't keep scraping the same page over and over
* if you pass a params argument on top of the query string already inside searchUrl, use urllib.parse to break searchUrl apart and then rebuild it with count, turning it into something like:
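A minimal sketch that puts these suggestions together (the helper name with_page, the scrape/main function names, and the max_page cap are illustrative, not part of the original script; the CSS classes and the csv layout come from the question; it prints the case name and link rather than writing a file):

from asyncio import run
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

import pandas as pd
from aiohttp import ClientSession
from bs4 import BeautifulSoup


def with_page(search_url, page):
    # Break the URL into its components, set/overwrite the page
    # parameter, and reassemble the URL.
    parts = urlparse(search_url)
    query = parse_qs(parts.query)          # {key: [values], ...}
    query['page'] = [str(page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))


async def scrape(session, search_url, max_page=100):
    count = 1
    while True:
        async with session.get(with_page(search_url, count)) as response:
            soup = BeautifulSoup(await response.text(), "lxml")
        links = soup.find_all("div", {"class": "has-padding-content-block-30 -zb"})
        if not links:                      # empty page -> past the last page
            break
        for link in links:
            case = link.find("a", {"class": "case-name"})
            if case is not None:
                print(case.text.strip() + "\t" + case.get("href"))
        if count >= max_page:              # hard cap on the number of pages
            break
        count += 1                         # inside the loop, so the page advances


async def main():
    # One search URL per row in a single-column csv file.
    search_urls = set(pd.read_csv('input_file.csv', header=None)[0])
    async with ClientSession() as session:
        for url in search_urls:
            await scrape(session, url)


if __name__ == '__main__':
    run(main())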