json 重新运行代码时，文件观察器循环无法从停止的地方继续

2lpgd968 于 2023-08-08 发布在其他

关注(0)|答案(2)|浏览(95)

我创建了这个文件监视器循环，当我运行代码时，它会扫描特定文件夹中的.json文件并附加到'output.xls'文件中。然后，代码继续循环运行，扫描文件夹中的新文件，并重复该过程。这工作得很好，但是，当我停止代码时（笔记本电脑关闭或其他），新文件仍在添加到文件夹中，然后当我重新运行代码时，我不能继续我离开的地方，我必须删除output.xls文件并重新开始。
有没有一种方法可以在我停止代码时保存已经追加的文件的历史记录，并在我重新运行代码时继续添加尚未追加的文件？

import os
import glob
import json
import pandas as pd
import time
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

def jsonFilesInDirectory(my_dir: str):
    # Get a list of JSON files in the directory
    json_files = glob.glob(os.path.join(my_dir, "*.json"))
    return json_files

def clean_value(value):
    # Clean up numeric values by removing unnecessary characters
    return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))

def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    for file_name in fileDiff:
        file_path = os.path.join(my_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)

                # Extract data from the JSON file
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]

                audits = json_data["audits"]
                fcp_metric = audits["first-contentful-paint"]["id"]
                fcp_value = audits["first-contentful-paint"]["displayValue"]
                fcp_score = audits["first-contentful-paint"]["score"]
                lcp_metric = audits["largest-contentful-paint"]["id"]
                lcp_value = audits["largest-contentful-paint"]["displayValue"]
                lcp_score = audits["largest-contentful-paint"]["score"]
                fmp_metric = audits["first-meaningful-paint"]["id"]
                fmp_value = audits["first-meaningful-paint"]["displayValue"]
                fmp_score = audits["first-meaningful-paint"]["score"]
                si_metric = audits["speed-index"]["id"]
                si_value = audits["speed-index"]["displayValue"]
                si_score = audits["speed-index"]["score"]
                tbt_metric = audits["total-blocking-time"]["id"]
                tbt_value = audits["total-blocking-time"]["displayValue"]
                tbt_score = audits["total-blocking-time"]["score"]
                cls_metric = audits["cumulative-layout-shift"]["id"]
                cls_value = audits["cumulative-layout-shift"]["displayValue"]
                cls_score = audits["cumulative-layout-shift"]["score"]

                categories = json_data["categories"]
                perf_metric = categories["performance"]["id"]
                perf_value = 0
                perf_score = categories["performance"]["score"]

                # Clean up values and format the fetch time
                cleaned_fcp_value = clean_value(fcp_value)
                cleaned_lcp_value = clean_value(lcp_value)
                cleaned_fmp_value = clean_value(fmp_value)
                cleaned_si_value = clean_value(si_value)
                cleaned_tbt_value = clean_value(tbt_value)
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")

                # Create a data dictionary for the DataFrame
                data_dict = {
                    "fetch_time": [cleaned_fetch_time] * 7,
                    "url": [url] * 7,
                    "metric": ["performance","first_contentful_paint", "largest_contentful_paint",
                               "first-meaningful-paint", "speed-index", "total-blocking-time",
                               "cumulative-layout-shift"],
                    "value": [perf_value, cleaned_fcp_value, cleaned_lcp_value,
                              cleaned_fmp_value, cleaned_si_value, cleaned_tbt_value,
                              cls_value],
                    "score": [perf_score, fcp_score, lcp_score, fmp_score, si_score, tbt_score, cls_score]
                }

                df = pd.DataFrame(data_dict)

                # Append the DataFrame to the Excel file
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    startrow = sheet.max_row
                    for row in dataframe_to_rows(df, index=False, header=False):
                        sheet.append(row)
                else:
                    sheet = workbook.create_sheet(sheet_name)
                    for row in dataframe_to_rows(df, index=False, header=True):
                        sheet.append(row)

                print(f"Data extracted from {file_name} and appended to the Excel file")

            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")

def fileWatcher(my_dir: str, pollTime: int):
    excel_file_path = os.path.join(my_dir, 'output.xlsx')
    existingFiles = []

    # Check if the output file already exists
    if os.path.isfile(excel_file_path):
        try:
            workbook = openpyxl.load_workbook(excel_file_path)
            existingFiles = jsonFilesInDirectory(my_dir)
            # Process the existing JSON files and append data to the Excel file
            doThingsWithNewFiles(existingFiles, my_dir, workbook)
            print("Existing JSON files processed and data appended to the Excel file")
        except openpyxl.utils.exceptions.InvalidFileException:
            workbook = openpyxl.Workbook()
    else:
        workbook = openpyxl.Workbook()

    # Check for new files at startup
    newFileList = jsonFilesInDirectory(my_dir)
    fileDiff = listComparison(existingFiles, newFileList)
    existingFiles = newFileList

    if len(fileDiff) > 0:
        # Process the new files and append data to the Excel file
        doThingsWithNewFiles(fileDiff, my_dir, workbook)

        # Save the Excel file
        workbook.save(excel_file_path)
        print(f"DataFrame exported to {excel_file_path}")

    while True:
        time.sleep(pollTime)

        # Get the updated list of JSON files in the directory
        newFileList = jsonFilesInDirectory(my_dir)

        # Find the difference between the previous and new file lists
        fileDiff = listComparison(existingFiles, newFileList)
        existingFiles = newFileList

        if len(fileDiff) > 0:
            # Process the new files and append data to the Excel file
            doThingsWithNewFiles(fileDiff, my_dir, workbook)

            # Save the Excel file
            workbook.save(excel_file_path)
            print(f"DataFrame exported to {excel_file_path}")

def listComparison(originalList: list, newList: list):
    # Compare two lists and return the differences
    differencesList = [x for x in newList if x not in originalList]
    return differencesList

my_dir = r"Z:"
pollTime = 60

fileWatcher(my_dir, pollTime)

字符串

JSON

来源：https://stackoverflow.com/questions/76684076/file-watcher-loop-cannot-continue-where-it-left-off-when-re-running-the-code

2条答案

按热度按时间

kupeojn61#

您可以创建一个文本文件，其中存储了扫描文件的列表。
更新了你的代码，如果存在，读取并写入文本文件。

from datetime import datetime
import glob
import json
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
import os
import pandas as pd
import time

def jsonFilesInDirectory(my_dir: str):
    # Get a list of JSON files in the directory
    json_files = glob.glob(os.path.join(my_dir, "*.json"))
    
    return json_files

def clean_value(value):
    # Clean up numeric values by removing unnecessary characters
    return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))

def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    for file_name in fileDiff:
        file_path = os.path.join(my_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)

                # Extract data from the JSON file
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]

                audits = json_data["audits"]
                fcp_metric = audits["first-contentful-paint"]["id"]
                fcp_value = audits["first-contentful-paint"]["displayValue"]
                fcp_score = audits["first-contentful-paint"]["score"]
                lcp_metric = audits["largest-contentful-paint"]["id"]
                lcp_value = audits["largest-contentful-paint"]["displayValue"]
                lcp_score = audits["largest-contentful-paint"]["score"]
                fmp_metric = audits["first-meaningful-paint"]["id"]
                fmp_value = audits["first-meaningful-paint"]["displayValue"]
                fmp_score = audits["first-meaningful-paint"]["score"]
                si_metric = audits["speed-index"]["id"]
                si_value = audits["speed-index"]["displayValue"]
                si_score = audits["speed-index"]["score"]
                tbt_metric = audits["total-blocking-time"]["id"]
                tbt_value = audits["total-blocking-time"]["displayValue"]
                tbt_score = audits["total-blocking-time"]["score"]
                cls_metric = audits["cumulative-layout-shift"]["id"]
                cls_value = audits["cumulative-layout-shift"]["displayValue"]
                cls_score = audits["cumulative-layout-shift"]["score"]

                categories = json_data["categories"]
                perf_metric = categories["performance"]["id"]
                perf_value = 0
                perf_score = categories["performance"]["score"]

                # Clean up values and format the fetch time
                cleaned_fcp_value = clean_value(fcp_value)
                cleaned_lcp_value = clean_value(lcp_value)
                cleaned_fmp_value = clean_value(fmp_value)
                cleaned_si_value = clean_value(si_value)
                cleaned_tbt_value = clean_value(tbt_value)
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")

                # Create a data dictionary for the DataFrame
                data_dict = {
                    "fetch_time": [cleaned_fetch_time] * 7,
                    "url": [url] * 7,
                    "metric": [
                        "performance",
                        "first_contentful_paint",
                        "largest_contentful_paint",
                        "first-meaningful-paint",
                        "speed-index",
                        "total-blocking-time",
                        "cumulative-layout-shift"
                    ],
                    "value": [
                        perf_value, 
                        cleaned_fcp_value, 
                        cleaned_lcp_value,
                        cleaned_fmp_value,
                        cleaned_si_value,
                        cleaned_tbt_value,
                        cls_value
                    ],
                    "score": [
                        perf_score, 
                        fcp_score, 
                        lcp_score, 
                        fmp_score, 
                        si_score, 
                        tbt_score, 
                        cls_score]
                }

                df = pd.DataFrame(data_dict)

                # Append the DataFrame to the Excel file
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]

                else:
                    sheet = workbook.create_sheet(sheet_name)

                for row in dataframe_to_rows(df, index=False, header=True):
                    sheet.append(row)

                print(f"Data extracted from {file_name} and appended to the Excel file")

            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")

def fileWatcher(my_dir: str, pollTime: int):
    excel_file_path = os.path.join(my_dir, 'output.xlsx')
    
    existingFiles = []
    if os.path.exists(os.path.join(os.getcwd(), 'scanned_files.txt')):
        with open('scanned_files.txt', 'a+') as f:
            existingFiles = f.read().split('\n')

    # Check if the output file already exists
    if os.path.isfile(excel_file_path):
        try:
            workbook = openpyxl.load_workbook(excel_file_path)
    
        except openpyxl.utils.exceptions.InvalidFileException:
            workbook = openpyxl.Workbook()

    else:
        workbook = openpyxl.Workbook()
        
        
    # Process the existing JSON files and append data to the Excel file
    
    if not "Sheet1" in workbook.sheetnames:
        doThingsWithNewFiles(existingFiles, my_dir, workbook)
        print("Existing JSON files processed and data appended to the Excel file")
    
    # Check for new files at startup
    while True:
        time.sleep(pollTime)

        # Get the updated list of JSON files in the directory
        newFileList = jsonFilesInDirectory(my_dir)

        # Find the difference between the previous and new file lists
        fileDiff = listComparison(existingFiles, newFileList)
        existingFiles = newFileList

        if len(fileDiff) > 0:
            # Process the new files and append data to the Excel file
            doThingsWithNewFiles(fileDiff, my_dir, workbook)

            # Save the Excel file
            workbook.save(excel_file_path)
            print(f"DataFrame exported to {excel_file_path}")

        with open('scanned_files.txt', 'w') as f:
            f.write('\n'.join(existingFiles))

def listComparison(originalList: list, newList: list):
    # Compare two lists and return the differences
    differencesList = [x for x in newList if x not in originalList]
    
    return differencesList

my_dir = r"Z:"
pollTime = 60

fileWatcher(my_dir, pollTime)

字符串
无法测试代码，如果有任何问题请告诉我。

赞(0）回复(0）举报 2023-08-08

p8h8hvxi2#

最简单的想法：要获取自上次更新Excel文件以来更改的文件列表，请使用os.path.getmtime获取Excel文件和所有JSON文件的上次更改时间，并选择较新的JSON文件。如果Excel文件存在，请在启动时执行此操作，并处理每个选定的JSON文件，就像监视器检测到它们一样。
然而，这可能会引入关于在非常接近功率损耗时处理的文件的一些模糊性。所以，更准确的想法是：保存已处理JSON文件的列表，无论是在Excel文件中，还是在单独的位置（例如另一文件或数据库）。
一个更精细的想法是使用一个数据库，将数据保存到JSON文件中，使用数据库作为唯一的真实数据源，并根据需要从数据库生成Excel文件。
顺便说一句，覆盖Excel文件是一个可能的失败点。在这种情况下，一个好的做法是写入同一目录中的临时文件，然后执行os.rename，这将原子地用新文件替换旧文件。

赞(0）回复(0）举报 2023-08-08

我来回答

json 重新运行代码时，文件观察器循环无法从停止的地方继续

2条答案

相关问题

热门标签

最新问答