regex 在python正则表达式匹配后提取下一行

x3naxklr  于 2023-03-20  发布在  Python
关注(0)|答案(3)|浏览(169)

我有一个基于图像的多页PDF,我必须提取包含EPC的行和与之关联的下一行
示例:

ENERGY PERFORMANCE CERTIFICATE
D(139)

我试了这个代码:

import os    
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import re

# Locations of the Poppler binaries (passed to pdf2image) and of the Tesseract
# executable (used by pytesseract).
poppler_path = "C:/Users/poddaral/temp/poppler-0.68.0/bin"
pytesseract.pytesseract.tesseract_cmd = r"C:/Users/poddaral/temp/tesseract.exe"

# The PDF to process.
pdf_path = "C:/Users/poddaral/temp/4-6 Etloe Road, Westbury Park, Bristol, BS6 7PF.pdf"

# Render the PDF to images (presumably one image per page -- confirm against
# the pdf2image documentation).
images = convert_from_path(pdf_path=pdf_path, poppler_path=poppler_path)

# Save every rendered image as page_<index>.png in the current directory.
for count, img in enumerate(images):
  img_name = f"page_{count}.png"  
  img.save(img_name, "PNG")

# NOTE(review): this collects every .png in the current directory, not only
# the page_*.png files written above, and os.listdir() order is arbitrary.
png_files = [f for f in os.listdir(".") if f.endswith(".png")]

# OCR each PNG and print its text. extracted_text is overwritten on every
# iteration, so after the loop it holds only the last file's text.
for png_file in png_files:
  extracted_text = pytesseract.image_to_string(Image.open(png_file))
  print(extracted_text)


pattern = re.compile('ENERGY PERFORMANCE CERTIFICATE')

def find_following_line(extracted_text):
    """Return the line two positions below the first line matching ``pattern``.

    The text is split with ``str.splitlines()``. Only the first matching line
    is considered.

    Returns:
        The line at index ``match_index + 2``, or ``None`` when no line
        matches.

    Raises:
        IndexError: if the first match is one of the last two lines.

    NOTE(review): an offset of 2 skips the line directly below the match.
    """
    rows = extracted_text.splitlines()
    # Lazy generator: only the first hit is ever evaluated by next().
    hits = (rows[pos + 2] for pos, row in enumerate(rows) if pattern.search(row))
    return next(hits, None)

# NOTE(review): extracted_text is whatever the last iteration of the OCR loop
# assigned, so only that one PNG's text is searched here.
print(find_following_line(extracted_text))
yzuktlbb

yzuktlbb1#

# Print every line matching `pattern` together with the line right after it.
# `fp` is an already-open text file and `pattern` a regex, both defined by the
# surrounding code (not shown in this fragment).
lines = fp.readlines()
for i, line in enumerate(lines):
    if re.search(pattern, line):
        print(line)
        # A match on the very last line has no following line to print.
        if i + 1 < len(lines):
            print(lines[i + 1])

在函数def find_following_line(extracted_text):中对索引做一个小改动就可以了

d4so4syb

d4so4syb2#

我把你代码中的索引从 2 改成了 1,这样返回的正是你要找的那一行:

import re

# Minimal sample: the header line directly followed by the line to extract.
extracted_text = """ENERGY PERFORMANCE CERTIFICATE
D(139)"""
# repr() makes the embedded newline visible in the printed output.
print(repr(extracted_text))

pattern = re.compile('ENERGY PERFORMANCE CERTIFICATE')

def find_following_line(extracted_text):
    """Return the line directly below the first line matching ``pattern``.

    Args:
        extracted_text: Multi-line text; it is split with ``str.splitlines()``.

    Returns:
        The line immediately after the first matching line, or ``None`` when
        no line matches or the match is the last line of the text.
    """
    lines = extracted_text.splitlines()
    for i, line in enumerate(lines):
        if pattern.search(line):
            # 2 changed to 1: the wanted line is the one right below the match.
            # Guard the index so a match on the final line yields None
            # instead of raising IndexError.
            return lines[i + 1] if i + 1 < len(lines) else None
    return None

print(find_following_line(extracted_text))

提供以下输出:

'ENERGY PERFORMANCE CERTIFICATE\nD(139)'
D(139)
xfb7svmp

xfb7svmp3#

您可以不必计算行号:如果前一行匹配,就返回当前行

import re

extracted_text = '''\
ppweiopo
epjijorerg
ENERGY PERFORMANCE CERTIFICATE
D(139)
owiffoij
oidevjoij
'''


def ffl(text, pattern, out=None):
    """Find following line: return the line right after the first match.

    Args:
        text: Multi-line string; it is split with ``str.splitlines()``.
        pattern: Regular expression handed to ``re.search`` for each line.
        out: Initial "previous line matched" state. When truthy, the first
            line of ``text`` is returned immediately.

    Returns:
        The line following the first matching line, or ``None`` when nothing
        matches or the match is on the last line.
    """
    previous_hit = out
    for current in text.splitlines():
        if previous_hit:
            return current
        # Remember whether this line matched; it is acted on next iteration.
        previous_hit = re.search(pattern, current)
    return None


print(ffl(extracted_text, '^ENERGY PERFORMANCE CERTIFICATE$'))  # prints: D(139)

如果有人希望***有时***也一并返回匹配到的那一行,可以这样写:

def ffl(text, pattern, both=False, out=False):
    """Return the line after the first match, optionally with the match itself.

    Args:
        text: Multi-line string; it is split with ``str.splitlines()``.
        pattern: Regular expression handed to ``re.search`` for each line.
        both: When true, return the matching line and the following line
            joined by a newline; otherwise return only the following line.
        out: Initial "previous line matched" state.

    Returns:
        The following line (or "match\\nfollowing" when ``both`` is true), or
        ``None`` when nothing matches or the match is on the last line.
    """
    for current in text.splitlines():
        if out:
            if both:
                return prevline + '\n' + current
            return current
        out = re.search(pattern, current)
        # Keep the matched line only when the caller asked for it back.
        if both and out:
            prevline = current

相关问题