我试图从一个文件夹中的每个PDF文件里提取特定信息,并写入同一个CSV文件。每个PDF都有多页内容。然而,我的循环或其实现方式似乎有问题,我不太清楚原因。输出的CSV包含全部六个标题,但只有其中两列有数据。这两列的信息位于每个PDF的第一页,而缺失的其他四列信息则全部位于第一页之后。由于某种原因,我无法让它遍历每个PDF的每一页。
编辑:这些正则表达式看起来不对(彼此相同),是因为我在发布前必须对它们做匿名化处理——我不能在这里贴出实际使用的文本。
我已经尝试了以下代码的几个不同的迭代,输出没有变化:
import os
import csv
import re
import fitz
# Regular-expression patterns for the six fields extracted from each PDF.
# NOTE: the literal text has been anonymized to "Lorem ipsum ..." placeholders,
# which is why R_P_pattern and R_T_pattern are identical here.
# Group 1: a word, 1-2 digits, a comma and 4 digits (presumably a date such as
# "January 5, 2020" -- confirm against the real documents).
I_T_Pattern = r'Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4}) \(Lorem ipsum dolor sit amet\)'
# Group 1: a run of digits sitting between two literal phrases.
R_P_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
# Group 1: a run of digits sitting between two literal phrases.
R_T_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
# Group 1: a run of digits immediately following the literal phrase.
NON_R_pattern = r'Lorem ipsum dolor sit amet (\d+)'
# Group 1: same word/digits/comma/year shape as I_T_Pattern, found after a
# parenthesised, curly-quoted phrase.
E_D_pattern = r'Lorem ipsum dolor sit amet \(“Lorem ipsum dolor sit amet”\), Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4})'
# Group 1: the whitespace after the closing `”).`; group 2: the run of word
# characters and whitespace up to the next comma (group 2 is the value used).
L_pattern = r'“Lorem ipsum dolor sit amet”\)\.(\s+)(\w+[\s\w]*),'
def extract_contract_data(pdf_file):
    """Extract the six contract fields from a single PDF.

    Every page is searched in order; for each field the first match found
    (lowest page number) wins and later pages never overwrite it.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.

    Returns:
        A dict with the keys ``"L"``, ``"E D"``, ``"I T"``, ``"R P"``,
        ``"R T"`` and ``"NON R"``. A value is the captured text, or ``None``
        when the corresponding pattern matched on no page.
    """
    # (output key, pattern, capture group holding the value). L_pattern keeps
    # its value in group 2 because group 1 only captures leading whitespace.
    field_specs = (
        ("L", L_pattern, 2),
        ("E D", E_D_pattern, 1),
        ("I T", I_T_Pattern, 1),
        ("R P", R_P_pattern, 1),
        ("R T", R_T_pattern, 1),
        ("NON R", NON_R_pattern, 1),
    )
    results = {key: None for key, _, _ in field_specs}

    doc = fitz.open(pdf_file)
    try:
        for page_num in range(doc.page_count):
            # NOTE(review): patterns are matched against one page's text at a
            # time, so a phrase split across two pages cannot match. The
            # patterns also use literal spaces; if the extracted text breaks
            # the phrase across lines they will not match either -- verify
            # against the real PDFs.
            text = doc.load_page(page_num).get_text()

            for key, pattern, group in field_specs:
                if results[key] is not None:
                    # Already found on an earlier page; keep the first hit.
                    continue
                match = re.search(pattern, text)
                if match:
                    results[key] = match.group(group)

            # Nothing left to look for: skip the remaining pages.
            if all(value is not None for value in results.values()):
                break
    finally:
        # Release the document even if a page fails to load or parse.
        doc.close()

    return results
def main():
    """Extract contract data from every PDF in a folder into one CSV file.

    Lists the PDF files in ``pdf_dir``, runs ``extract_contract_data`` on
    each, and writes one CSV row per PDF to ``output_file`` under a fixed
    six-column header. Prints the output path when done.
    """
    # Placeholder: directory that contains the PDFs.
    pdf_dir = r'C:\\path'

    # os.listdir returns names in arbitrary order, so sort for a reproducible
    # row order; compare the extension case-insensitively so ".PDF" files are
    # not silently skipped.
    pdf_files = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
    )

    # One dict of extracted fields per PDF.
    data = [extract_contract_data(pdf_file) for pdf_file in pdf_files]

    # Placeholder: path of the CSV file to write.
    output_file = r'C:\\path'
    fieldnames = ["L", "E D", "I T", "R P", "R T", "NON R"]

    # newline='' lets the csv module control line endings itself.
    # NOTE(review): no encoding is passed, so the platform default is used.
    with open(output_file, "w", newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"output: {output_file}")
# Script entry point: run the whole extraction. The call is unguarded, so it
# also runs if this file is imported as a module.
main()
1条答案
按热度按时间qyswt5oh1#
我觉得是你的正则表达式有问题。
我把你的逻辑用在了这个PDF上,下面这段代码在第1页上查找两个字面字符串,在第4页上查找一个字面字符串:
我得到了预期的CSV: