docsgpt-worker-1 | [2023-09-11 04:53:00,787: INFO/MainProcess] Task application.app.ingest[a893c966-35c5-401c-b3ac-114e6f05e95d] received
docsgpt-worker-1 | [2023-09-11 04:53:00,841: ERROR/ForkPoolWorker-1] Task application.app.ingest[a893c966-35c5-401c-b3ac-114e6f05e95d] raised unexpected: PdfReadError('EOF marker not found')
docsgpt-worker-1 | Traceback (most recent call last):
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/celery/app/trace.py", line 451, in trace_task
docsgpt-worker-1 | R = retval = fun(*args, **kwargs)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/celery/app/trace.py", line 734, in __protected_call__
docsgpt-worker-1 | return self.run(*args, **kwargs)
docsgpt-worker-1 | File "/app/application/app.py", line 164, in ingest
docsgpt-worker-1 | resp = ingest_worker(self, directory, formats, name_job, filename, user)
docsgpt-worker-1 | File "/app/application/worker.py", line 66, in ingest_worker
docsgpt-worker-1 | exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
docsgpt-worker-1 | File "/app/application/parser/file/bulk.py", line 146, in load_data
docsgpt-worker-1 | data = parser.parse_file(input_file, errors=self.errors)
docsgpt-worker-1 | File "/app/application/parser/file/docs_parser.py", line 28, in parse_file
docsgpt-worker-1 | pdf = PyPDF2.PdfReader(fp)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 319, in __init__
docsgpt-worker-1 | self.read(stream)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 1415, in read
docsgpt-worker-1 | self._find_eof_marker(stream)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 1471, in _find_eof_marker
docsgpt-worker-1 | raise PdfReadError("EOF marker not found")
docsgpt-worker-1 | PyPDF2.errors.PdfReadError: EOF marker not found
docsgpt-backend-1 | [2023-09-11 04:53:05 +0000] [8] [ERROR] Error handling request /api/task_status?task_id=a893c966-35c5-401c-b3ac-114e6f05e95d
docsgpt-backend-1 | Traceback (most recent call last):
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 136, in handle
docsgpt-backend-1 | self.handle_request(listener, req, client, addr)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 179, in handle_request
docsgpt-backend-1 | respiter = self.wsgi(environ, resp.start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2552, in __call__
docsgpt-backend-1 | return self.wsgi_app(environ, start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2532, in wsgi_app
docsgpt-backend-1 | response = self.handle_exception(e)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2529, in wsgi_app
docsgpt-backend-1 | response = self.full_dispatch_request()
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1826, in full_dispatch_request
docsgpt-backend-1 | return self.finalize_request(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1845, in finalize_request
docsgpt-backend-1 | response = self.make_response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2157, in make_response
docsgpt-backend-1 | rv = self.json.response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 309, in response
docsgpt-backend-1 | f"{self.dumps(obj, **dump_args)}\n", mimetype=mimetype
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 230, in dumps
docsgpt-backend-1 | return json.dumps(obj, **kwargs)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/__init__.py", line 238, in dumps
docsgpt-backend-1 | **kw).encode(obj)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 201, in encode
docsgpt-backend-1 | chunks = list(chunks)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 431, in _iterencode
docsgpt-backend-1 | yield from _iterencode_dict(o, _current_indent_level)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
docsgpt-backend-1 | yield from chunks
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 438, in _iterencode
docsgpt-backend-1 | o = _default(o)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 122, in _default
docsgpt-backend-1 | raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
docsgpt-backend-1 | TypeError: Object of type PdfReadError is not JSON serializable
docsgpt-backend-1 | [2023-09-11 04:53:05 +0000] [8] [ERROR] Error handling request /api/task_status?task_id=a893c966-35c5-401c-b3ac-114e6f05e95d
docsgpt-backend-1 | Traceback (most recent call last):
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 136, in handle
docsgpt-backend-1 | self.handle_request(listener, req, client, addr)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 179, in handle_request
docsgpt-backend-1 | respiter = self.wsgi(environ, resp.start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2552, in __call__
docsgpt-backend-1 | return self.wsgi_app(environ, start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2532, in wsgi_app
docsgpt-backend-1 | response = self.handle_exception(e)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2529, in wsgi_app
docsgpt-backend-1 | response = self.full_dispatch_request()
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1826, in full_dispatch_request
docsgpt-backend-1 | return self.finalize_request(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1845, in finalize_request
docsgpt-backend-1 | response = self.make_response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2157, in make_response
docsgpt-backend-1 | rv = self.json.response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 309, in response
docsgpt-backend-1 | f"{self.dumps(obj, **dump_args)}\n", mimetype=mimetype
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 230, in dumps
docsgpt-backend-1 | return json.dumps(obj, **kwargs)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/__init__.py", line 238, in dumps
docsgpt-backend-1 | **kw).encode(obj)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 201, in encode
docsgpt-backend-1 | chunks = list(chunks)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 431, in _iterencode
docsgpt-backend-1 | yield from _iterencode_dict(o, _current_indent_level)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
docsgpt-backend-1 | yield from chunks
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 438, in _iterencode
docsgpt-backend-1 | o = _default(o)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 122, in _default
docsgpt-backend-1 | raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
docsgpt-backend-1 | TypeError: Object of type PdfReadError is not JSON serializable
docsgpt-worker-1 | [2023-09-11 04:55:44,852: INFO/MainProcess] Task application.app.ingest[0aba0fe2-a64b-4462-8f6b-ce73cd2b3da9] received
docsgpt-worker-1 | [2023-09-11 04:55:44,860: ERROR/ForkPoolWorker-1] Task application.app.ingest[0aba0fe2-a64b-4462-8f6b-ce73cd2b3da9] raised unexpected: PdfReadError('EOF marker not found')
docsgpt-worker-1 | Traceback (most recent call last):
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/celery/app/trace.py", line 451, in trace_task
docsgpt-worker-1 | R = retval = fun(*args, **kwargs)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/celery/app/trace.py", line 734, in __protected_call__
docsgpt-worker-1 | return self.run(*args, **kwargs)
docsgpt-worker-1 | File "/app/application/app.py", line 164, in ingest
docsgpt-worker-1 | resp = ingest_worker(self, directory, formats, name_job, filename, user)
docsgpt-worker-1 | File "/app/application/worker.py", line 66, in ingest_worker
docsgpt-worker-1 | exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
docsgpt-worker-1 | File "/app/application/parser/file/bulk.py", line 146, in load_data
docsgpt-worker-1 | data = parser.parse_file(input_file, errors=self.errors)
docsgpt-worker-1 | File "/app/application/parser/file/docs_parser.py", line 28, in parse_file
docsgpt-worker-1 | pdf = PyPDF2.PdfReader(fp)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 319, in __init__
docsgpt-worker-1 | self.read(stream)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 1415, in read
docsgpt-worker-1 | self._find_eof_marker(stream)
docsgpt-worker-1 | File "/usr/local/lib/python3.10/site-packages/PyPDF2/_reader.py", line 1471, in _find_eof_marker
docsgpt-worker-1 | raise PdfReadError("EOF marker not found")
docsgpt-worker-1 | PyPDF2.errors.PdfReadError: EOF marker not found
docsgpt-backend-1 | [2023-09-11 04:55:49 +0000] [7] [ERROR] Error handling request /api/task_status?task_id=0aba0fe2-a64b-4462-8f6b-ce73cd2b3da9
docsgpt-backend-1 | Traceback (most recent call last):
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 136, in handle
docsgpt-backend-1 | self.handle_request(listener, req, client, addr)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 179, in handle_request
docsgpt-backend-1 | respiter = self.wsgi(environ, resp.start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2552, in __call__
docsgpt-backend-1 | return self.wsgi_app(environ, start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2532, in wsgi_app
docsgpt-backend-1 | response = self.handle_exception(e)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2529, in wsgi_app
docsgpt-backend-1 | response = self.full_dispatch_request()
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1826, in full_dispatch_request
docsgpt-backend-1 | return self.finalize_request(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1845, in finalize_request
docsgpt-backend-1 | response = self.make_response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2157, in make_response
docsgpt-backend-1 | rv = self.json.response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 309, in response
docsgpt-backend-1 | f"{self.dumps(obj, **dump_args)}\n", mimetype=mimetype
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 230, in dumps
docsgpt-backend-1 | return json.dumps(obj, **kwargs)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/__init__.py", line 238, in dumps
docsgpt-backend-1 | **kw).encode(obj)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 201, in encode
docsgpt-backend-1 | chunks = list(chunks)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 431, in _iterencode
docsgpt-backend-1 | yield from _iterencode_dict(o, _current_indent_level)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
docsgpt-backend-1 | yield from chunks
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 438, in _iterencode
docsgpt-backend-1 | o = _default(o)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 122, in _default
docsgpt-backend-1 | raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
docsgpt-backend-1 | TypeError: Object of type PdfReadError is not JSON serializable
docsgpt-backend-1 | [2023-09-11 04:55:50 +0000] [7] [ERROR] Error handling request /api/task_status?task_id=0aba0fe2-a64b-4462-8f6b-ce73cd2b3da9
docsgpt-backend-1 | Traceback (most recent call last):
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 136, in handle
docsgpt-backend-1 | self.handle_request(listener, req, client, addr)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/gunicorn/workers/sync.py", line 179, in handle_request
docsgpt-backend-1 | respiter = self.wsgi(environ, resp.start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2552, in __call__
docsgpt-backend-1 | return self.wsgi_app(environ, start_response)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2532, in wsgi_app
docsgpt-backend-1 | response = self.handle_exception(e)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2529, in wsgi_app
docsgpt-backend-1 | response = self.full_dispatch_request()
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1826, in full_dispatch_request
docsgpt-backend-1 | return self.finalize_request(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 1845, in finalize_request
docsgpt-backend-1 | response = self.make_response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/app.py", line 2157, in make_response
docsgpt-backend-1 | rv = self.json.response(rv)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 309, in response
docsgpt-backend-1 | f"{self.dumps(obj, **dump_args)}\n", mimetype=mimetype
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 230, in dumps
docsgpt-backend-1 | return json.dumps(obj, **kwargs)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/__init__.py", line 238, in dumps
docsgpt-backend-1 | **kw).encode(obj)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 201, in encode
docsgpt-backend-1 | chunks = list(chunks)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 431, in _iterencode
docsgpt-backend-1 | yield from _iterencode_dict(o, _current_indent_level)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
docsgpt-backend-1 | yield from chunks
docsgpt-backend-1 | File "/usr/local/lib/python3.10/json/encoder.py", line 438, in _iterencode
docsgpt-backend-1 | o = _default(o)
docsgpt-backend-1 | File "/usr/local/lib/python3.10/site-packages/flask/json/provider.py", line 122, in _default
docsgpt-backend-1 | raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
docsgpt-backend-1 | TypeError: Object of type PdfReadError is not JSON serializable
4条答案
按热度按时间kgqe7b3p1#
看起来PyPDF2在打开它时有问题,你能把pdf发给我吗?这样我可以检查一下。
mnowg1ta2#
它是否有文本,还是只是扫描?
bpsygsoo3#
我尝试了几个PDF文件,但都遇到了问题。
jgovgodb4#
这里也是。