我正在使用httpx和trio进行抓取,在链接上迭代几次后,它返回一个错误-readerror:socket connection breaked:[winerror 10053]主机中的软件中止了已建立的连接
这是我的剧本:
allin = []
async def worker(channel):
async with channel:
async for key_ in channel:
async with httpx.AsyncClient(timeout=None) as client:
client.headers.update(h)
params = {
"e": key_
}
r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
soup =bs(r.text,'html.parser')
try :
all = soup.find('td',class_='FootNote').text
except :
all = np.nan
name = pd.read_html(r.text, match='Name')[0].set_index([0, pd.read_html(r.text, match='Name')[0].groupby(0).cumcount()])[1].unstack(0)
comname = pd.read_html(r.text, match='Company Name')[0].set_index([0, pd.read_html(r.text, match='Company Name')[0].groupby(0).cumcount()])[1].unstack(0)
try :
adm = pd.read_html(r.text, match='Admission Sponsor', index_col=0)[0].T
except :
adm = pd.DataFrame({'Admission Sponsor':np.nan,'Sponsor':np.nan},index=[0])
df = name.join(comname).join(adm)
df['remark']= all
allin.append(df)
finaldf = pd.concat(allin, ignore_index=True)
# print(finaldf )
finaldf.to_excel(exportpath, index=False,sheet_name='Change_of_Company_secretary', engine='xlsxwriter')
async def main():
async with trio.open_nursery() as nurse:
sender, receiver = trio.open_memory_channel(0)
async with receiver:
for _ in range(1000):
nurse.start_soon(worker, receiver.clone())
count=0
async with sender:
for k in titlelink:
await sender.send(k)
count +=1
print(count,'ID-',k,'|', end=' ')
if __name__ == "__main__":
start = datetime.datetime.now()
trio.run(main)
finish = datetime.datetime.now() - start
print("Time Taken:",finish)
要迭代的总链接是9731,它甚至在达到2000个链接之前就中断了,我不确定我的脚本的哪一部分需要修复才能遇到这个问题。
完整的错误消息如下:
---------------------------------------------------------------------------
ReadError Traceback (most recent call last)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
60 try:
---> 61 yield
62 except Exception as exc:
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
282 extensions,
--> 283 ) = await self._pool.handle_async_request(
284 method=method,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection_pool.py in handle_async_request(self, method, url, headers, stream, extensions)
236 try:
--> 237 response = await connection.handle_async_request(
238 method, url, headers=headers, stream=stream, extensions=extensions
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection.py in handle_async_request(self, method, url, headers, stream, extensions)
147 )
--> 148 return await self.connection.handle_async_request(
149 method, url, headers, stream, extensions
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in handle_async_request(self, method, url, headers, stream, extensions)
127 headers,
--> 128 ) = await self._receive_response(timeout)
129 response_stream = AsyncIteratorByteStream(
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_response(self, timeout)
188 while True:
--> 189 event = await self._receive_event(timeout)
190 if isinstance(event, h11.Response):
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_event(self, timeout)
224 if event is h11.NEED_DATA:
--> 225 data = await self.socket.read(self.READ_NUM_BYTES, timeout)
226
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_backends\trio.py in read(self, n, timeout)
65 await self.stream.aclose()
---> 66 raise exc
67
c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
130 try:
--> 131 self.gen.throw(type, value, traceback)
132 except StopIteration as exc:
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_exceptions.py in map_exceptions(map)
11 if isinstance(exc, from_exc):
---> 12 raise to_exc(exc) from None
13 raise
ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine
The above exception was the direct cause of the following exception:
ReadError Traceback (most recent call last)
<ipython-input-14-59a009208fba> in <module>
59 if __name__ == "__main__":
60 start = datetime.datetime.now()
---> 61 trio.run(main)
62 asyncio.sleep(1)
63 finish = datetime.datetime.now() - start
[... skipping hidden 1 frame]
<ipython-input-14-59a009208fba> in main()
51 await sender.send(k)
52 count +=1
---> 53 print(count,'ID-',k,'|', end=' ')
54
55
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\trio\_core\_run.py in __aexit__(self, etype, exc, tb)
813 old_context = combined_error_from_nursery.__context__
814 try:
--> 815 raise combined_error_from_nursery
816 finally:
817 _, value, _ = sys.exc_info()
<ipython-input-14-59a009208fba> in worker(channel)
9 "e": key_
10 }
---> 11 r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
12
13 soup =bs(r.text,'html.parser')
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in get(self, url, params, headers, cookies, auth, allow_redirects, timeout)
1720 **Parameters**: See `httpx.request`.
1721 """
-> 1722 return await self.request(
1723 "GET",
1724 url,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in request(self, method, url, content, data, files, json, params, headers, cookies, auth, allow_redirects, timeout)
1479 cookies=cookies,
1480 )
-> 1481 response = await self.send(
1482 request, auth=auth, allow_redirects=allow_redirects, timeout=timeout
1483 )
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in send(self, request, stream, auth, allow_redirects, timeout)
1566 auth = self._build_request_auth(request, auth)
1567
-> 1568 response = await self._send_handling_auth(
1569 request,
1570 auth=auth,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_auth(self, request, auth, timeout, allow_redirects, history)
1602
1603 while True:
-> 1604 response = await self._send_handling_redirects(
1605 request,
1606 timeout=timeout,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_redirects(self, request, timeout, allow_redirects, history)
1638 )
1639
-> 1640 response = await self._send_single_request(request, timeout)
1641 try:
1642 response.history = list(history)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_single_request(self, request, timeout)
1679 stream,
1680 extensions,
-> 1681 ) = await transport.handle_async_request(
1682 request.method.encode(),
1683 request.url.raw,
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
276 ]:
277 with map_httpcore_exceptions():
--> 278 (
279 status_code,
280 headers,
c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
129 value = type()
130 try:
--> 131 self.gen.throw(type, value, traceback)
132 except StopIteration as exc:
133 # Suppress StopIteration *unless* it's the same exception that
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
76
77 message = str(exc)
---> 78 raise mapped_exc(message) from exc
79
80
ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine
暂无答案!
目前还没有任何答案,快来回答吧!