httpx套接字连接在几次迭代后断开

zrfyljdw  于 2021-09-29  发布在  Java
关注(0)|答案(0)|浏览(283)

我正在使用httpx和trio进行抓取,在链接上迭代几次后,它返回一个错误-readerror:socket connection breaked:[winerror 10053]主机中的软件中止了已建立的连接
这是我的剧本:

allin = []

async def worker(channel):
    async with channel:
        async for key_ in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(h)
                params = {
                    "e": key_
                }
                r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)

                soup =bs(r.text,'html.parser')

                try :
                    all = soup.find('td',class_='FootNote').text
                except :
                    all = np.nan

                name = pd.read_html(r.text, match='Name')[0].set_index([0, pd.read_html(r.text, match='Name')[0].groupby(0).cumcount()])[1].unstack(0)
                comname = pd.read_html(r.text, match='Company Name')[0].set_index([0, pd.read_html(r.text, match='Company Name')[0].groupby(0).cumcount()])[1].unstack(0)

                try :
                    adm = pd.read_html(r.text, match='Admission Sponsor', index_col=0)[0].T
                except :
                    adm = pd.DataFrame({'Admission Sponsor':np.nan,'Sponsor':np.nan},index=[0])

                df = name.join(comname).join(adm)

                df['remark']= all
                allin.append(df)

                finaldf = pd.concat(allin, ignore_index=True)

# print(finaldf )

                finaldf.to_excel(exportpath, index=False,sheet_name='Change_of_Company_secretary', engine='xlsxwriter')

async def main():
    async with trio.open_nursery() as nurse:

        sender, receiver = trio.open_memory_channel(0)

        async with receiver:
            for _ in range(1000):
                nurse.start_soon(worker, receiver.clone())
            count=0
            async with sender:

                for k in titlelink:
                    await sender.send(k)
                    count +=1
                    print(count,'ID-',k,'|', end=' ')

if __name__ == "__main__":
    start = datetime.datetime.now()
    trio.run(main)

    finish = datetime.datetime.now() - start 
    print("Time Taken:",finish)

要迭代的总链接是9731,它甚至在达到2000个链接之前就中断了,我不确定我的脚本的哪一部分需要修复才能遇到这个问题。
完整的错误消息如下:

---------------------------------------------------------------------------
ReadError                                 Traceback (most recent call last)
c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
     60     try:
---> 61         yield
     62     except Exception as exc:

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
    282                 extensions,
--> 283             ) = await self._pool.handle_async_request(
    284                 method=method,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection_pool.py in handle_async_request(self, method, url, headers, stream, extensions)
    236             try:
--> 237                 response = await connection.handle_async_request(
    238                     method, url, headers=headers, stream=stream, extensions=extensions

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\connection.py in handle_async_request(self, method, url, headers, stream, extensions)
    147         )
--> 148         return await self.connection.handle_async_request(
    149             method, url, headers, stream, extensions

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in handle_async_request(self, method, url, headers, stream, extensions)
    127             headers,
--> 128         ) = await self._receive_response(timeout)
    129         response_stream = AsyncIteratorByteStream(

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_response(self, timeout)
    188         while True:
--> 189             event = await self._receive_event(timeout)
    190             if isinstance(event, h11.Response):

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_async\http11.py in _receive_event(self, timeout)
    224             if event is h11.NEED_DATA:
--> 225                 data = await self.socket.read(self.READ_NUM_BYTES, timeout)
    226 

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_backends\trio.py in read(self, n, timeout)
     65                     await self.stream.aclose()
---> 66                     raise exc
     67 

c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
    130             try:
--> 131                 self.gen.throw(type, value, traceback)
    132             except StopIteration as exc:

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpcore\_exceptions.py in map_exceptions(map)
     11             if isinstance(exc, from_exc):
---> 12                 raise to_exc(exc) from None
     13         raise

ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine

The above exception was the direct cause of the following exception:

ReadError                                 Traceback (most recent call last)
<ipython-input-14-59a009208fba> in <module>
     59 if __name__ == "__main__":
     60     start = datetime.datetime.now()
---> 61     trio.run(main)
     62     asyncio.sleep(1)
     63     finish = datetime.datetime.now() - start

    [... skipping hidden 1 frame]

<ipython-input-14-59a009208fba> in main()
     51                     await sender.send(k)
     52                     count +=1
---> 53                     print(count,'ID-',k,'|', end=' ')
     54 
     55 

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\trio\_core\_run.py in __aexit__(self, etype, exc, tb)
    813             old_context = combined_error_from_nursery.__context__
    814             try:
--> 815                 raise combined_error_from_nursery
    816             finally:
    817                 _, value, _ = sys.exc_info()

<ipython-input-14-59a009208fba> in worker(channel)
      9                     "e": key_
     10                 }
---> 11                 r = await client.get('https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
     12 
     13                 soup =bs(r.text,'html.parser')

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in get(self, url, params, headers, cookies, auth, allow_redirects, timeout)
   1720       **Parameters**: See `httpx.request`.
   1721         """
-> 1722         return await self.request(
   1723             "GET",
   1724             url,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in request(self, method, url, content, data, files, json, params, headers, cookies, auth, allow_redirects, timeout)
   1479             cookies=cookies,
   1480         )
-> 1481         response = await self.send(
   1482             request, auth=auth, allow_redirects=allow_redirects, timeout=timeout
   1483         )

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in send(self, request, stream, auth, allow_redirects, timeout)
   1566         auth = self._build_request_auth(request, auth)
   1567 
-> 1568         response = await self._send_handling_auth(
   1569             request,
   1570             auth=auth,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_auth(self, request, auth, timeout, allow_redirects, history)
   1602 
   1603             while True:
-> 1604                 response = await self._send_handling_redirects(
   1605                     request,
   1606                     timeout=timeout,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_handling_redirects(self, request, timeout, allow_redirects, history)
   1638                 )
   1639 
-> 1640             response = await self._send_single_request(request, timeout)
   1641             try:
   1642                 response.history = list(history)

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_client.py in _send_single_request(self, request, timeout)
   1679                 stream,
   1680                 extensions,
-> 1681             ) = await transport.handle_async_request(
   1682                 request.method.encode(),
   1683                 request.url.raw,

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in handle_async_request(self, method, url, headers, stream, extensions)
    276     ]:
    277         with map_httpcore_exceptions():
--> 278             (
    279                 status_code,
    280                 headers,

c:\users\baizura\appdata\local\programs\python\python38\lib\contextlib.py in __exit__(self, type, value, traceback)
    129                 value = type()
    130             try:
--> 131                 self.gen.throw(type, value, traceback)
    132             except StopIteration as exc:
    133                 # Suppress StopIteration *unless* it's the same exception that

c:\users\baizura\appdata\local\programs\python\python38\lib\site-packages\httpx\_transports\default.py in map_httpcore_exceptions()
     76 
     77         message = str(exc)
---> 78         raise mapped_exc(message) from exc
     79 
     80 

ReadError: socket connection broken: [WinError 10053] An established connection was aborted by the software in your host machine

暂无答案!

目前还没有任何答案,快来回答吧!

相关问题