PyMuPDF:从多个PDF文件中提取信息并写入CSV文件,为什么这段代码只从每个PDF的第一页提取到数据?

3xiyfsfu  于 2023-06-27  发布在  其他
关注(0)|答案(1)|浏览(115)

我试图从文件夹中的每个PDF文件中提取特定信息,并写入同一个CSV文件。每个PDF都有多页内容。然而,我的循环(或者说它的实现方式)似乎有问题,我不太清楚原因。输出的CSV包含全部六个列标题,但只有其中两列有数据。这两列的信息都位于每个PDF的第一页,而缺失的另外四列的信息则全部位于第一页之后。由于某种原因,我无法让它遍历每个PDF的每一页。
编辑:这些正则表达式看起来不对劲(彼此相同),是因为我在发布前必须对它们做匿名处理——我不能在这里贴出实际使用的文本。
我已经尝试过以下代码的几个不同版本,输出都没有变化:

import os
import csv
import re
import fitz

# Regular expressions for the six fields pulled out of each PDF.
# Every pattern exposes the wanted value through a capture group.
# NOTE: the literal text was anonymized before posting, which is why
# R_P_pattern and R_T_pattern are the same string here.
I_T_Pattern = r'Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4}) \(Lorem ipsum dolor sit amet\)'
R_P_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
R_T_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
NON_R_pattern = r'Lorem ipsum dolor sit amet (\d+)'
E_D_pattern = r'Lorem ipsum dolor sit amet \(“Lorem ipsum dolor sit amet”\), Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4})'
# Two groups here: group 1 is the whitespace run, group 2 is the value.
L_pattern = r'“Lorem ipsum dolor sit amet”\)\.(\s+)(\w+[\s\w]*),'

def extract_contract_data(pdf_file):
    """Extract the six contract fields from all pages of one PDF.

    The text of each page is searched with the module-level patterns. For
    every field the first match (scanning pages in order) is kept; later
    matches are ignored. Matching is done page by page, so a phrase that
    is split across a page break is not found.

    Args:
        pdf_file: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict with the keys "L", "E D", "I T", "R P", "R T" and "NON R".
        Each value is the captured text, or None if no page matched.
    """
    # (output key, pattern, capture group holding the value) per field.
    # L_pattern keeps its value in group 2; group 1 is only whitespace.
    fields = (
        ("L", L_pattern, 2),
        ("E D", E_D_pattern, 1),
        ("I T", I_T_Pattern, 1),
        ("R P", R_P_pattern, 1),
        ("R T", R_T_pattern, 1),
        ("NON R", NON_R_pattern, 1),
    )
    extracted = {key: None for key, _, _ in fields}

    # The context manager closes the document even when a page raises.
    with fitz.open(pdf_file) as doc:
        for page_num in range(doc.page_count):
            # Extract the text from the current page.
            text = doc.load_page(page_num).get_text()

            for key, pattern, group in fields:
                if extracted[key] is not None:
                    # Already found on an earlier page; keep the first hit.
                    continue
                match = re.search(pattern, text)
                if match:
                    extracted[key] = match.group(group)

    return extracted

def main():
    """Extract data from every PDF in a folder and write it to one CSV file.

    Lists the ``.pdf`` files in ``pdf_dir``, runs ``extract_contract_data``
    on each of them, writes one CSV row per PDF and prints the output path.
    """
    # Directory that contains the PDFs (placeholder path from the post).
    pdf_dir = r'C:\\path'

    # Full paths of all the PDF files in the directory.
    pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]

    # One dict of extracted fields per PDF, in directory-listing order.
    data = [extract_contract_data(pdf_file) for pdf_file in pdf_files]

    # Output file path.
    # NOTE(review): this is the same placeholder as pdf_dir; it has to be
    # replaced with the path of the CSV file to create.
    output_file = r'C:\\path'

    # Explicit utf-8 so non-ASCII text taken from the PDFs (e.g. curly quotes)
    # can always be written, whatever the platform's default encoding is.
    with open(output_file, "w", newline='', encoding="utf-8") as file:
        fieldnames = ["L", "E D", "I T", "R P", "R T", "NON R"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(data)

    print(f"output: {output_file}")

# Script entry point: main() runs as soon as this file is executed.
main()
qyswt5oh

qyswt5oh1#

我认为是你的正则表达式有问题。
我用这个PDF测试了你的逻辑,下面这段代码在第1页上查找两个字面字符串,在第4页上查找一个字面字符串:

import csv
import os
import re

import fitz

# Literal strings to look for; they are passed to re.search as patterns.
# Per the accompanying answer text, the Pg1_* strings are looked up on
# page 1 of the sample PDF and the Pg4_* string on page 4.
# The apostrophe below is the typographic character ’ (U+2019), not ASCII '.
Pg1_Op_Manual = r"OPERATOR’S MANUAL"
Pg1_Model = r"BTS20R-1"
# NOTE: the trailing "." is unescaped, so as a regex it matches any character.
Pg4_Explosive = r"NEVER USE IN AN EXPLOSIVE ATMOSPHERE."

def extract_contract_data(pdf_file):
    """Search every page of one PDF for the three module-level patterns.

    For each pattern the first matching text (scanning pages in order) is
    kept; a pattern that never matches yields None.

    Args:
        pdf_file: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict with the keys "Op_Manual", "Model" and "Explosive".
    """
    doc = fitz.open(pdf_file)

    # Output column -> pattern that fills it.
    patterns = {
        "Op_Manual": Pg1_Op_Manual,
        "Model": Pg1_Model,
        "Explosive": Pg4_Explosive,
    }
    # Every column starts out as None (nothing found yet).
    found = dict.fromkeys(patterns)

    for index in range(doc.page_count):
        page_text = doc.load_page(index).get_text()

        for column, pattern in patterns.items():
            hit = re.search(pattern, page_text)
            # Only the first hit for a column is stored.
            if hit and not found[column]:
                found[column] = hit.group(0)

    doc.close()

    return found

def main():
    """Collect the data of all PDFs in the current directory into output.csv.

    Runs ``extract_contract_data`` on each ``.pdf`` file found, writes one
    CSV row per file and prints the output path.
    """
    pdf_dir = r"."
    output_file = r"output.csv"
    fieldnames = ["Op_Manual", "Model", "Explosive"]

    # Gather the paths of the PDF files in the directory.
    pdf_files = []
    for entry in os.listdir(pdf_dir):
        if entry.endswith(".pdf"):
            pdf_files.append(os.path.join(pdf_dir, entry))

    # One result dict per PDF, in listing order.
    data = [extract_contract_data(path) for path in pdf_files]

    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"output: {output_file}")

# Script entry point: main() runs as soon as this file is executed.
main()

我得到了预期的CSV:

Op_Manual,Model,Explosive
OPERATOR’S MANUAL,BTS20R-1,NEVER USE IN AN EXPLOSIVE ATMOSPHERE.

相关问题