我创建了这个文件监视器循环,当我运行代码时,它会扫描特定文件夹中的.json文件并附加到'output.xlsx'文件中。然后,代码继续循环运行,扫描文件夹中的新文件,并重复该过程。这工作得很好,但是,当我停止代码时(笔记本电脑关闭或其他),新文件仍在添加到文件夹中,然后当我重新运行代码时,我不能继续我离开的地方,我必须删除output.xlsx文件并重新开始。
有没有一种方法可以在我停止代码时保存已经追加的文件的历史记录,并在我重新运行代码时继续添加尚未追加的文件?
import os
import glob
import json
import pandas as pd
import time
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
def jsonFilesInDirectory(my_dir: str):
    """Return the paths of all .json files directly inside *my_dir*.

    Paths come back joined with *my_dir* (glob returns the full pattern
    match), in whatever order the filesystem yields them.
    """
    pattern = os.path.join(my_dir, "*.json")
    return glob.glob(pattern)
def clean_value(value):
    """Parse a Lighthouse displayValue string into a float.

    Strips the non-breaking-space unit suffixes ("\xa0s" / "\xa0ms") and
    thousands-separator commas before conversion, e.g. "3,450\xa0ms" -> 3450.0.
    """
    for token in ('\xa0ms', '\xa0s', ','):
        value = value.replace(token, '')
    return float(value)
def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    """Extract Lighthouse metrics from each JSON report and append them to the workbook.

    For every file in *fileDiff*, reads the report, pulls the performance
    category score plus six audit metrics, and appends seven rows
    (fetch_time, url, metric, value, score) to "Sheet1". The workbook is
    mutated in place; the caller is responsible for saving it.

    Parameters:
        fileDiff: JSON file names/paths to process (glob already returns
            full paths, and os.path.join is a no-op for absolute paths).
        my_dir: directory containing the files.
        workbook: an openpyxl Workbook to append rows to.

    Errors in an individual file are reported and skipped so one bad
    report does not abort the batch.
    """
    # (key into json_data["audits"], label written to the "metric" column)
    # Labels intentionally preserved from the original output format.
    audit_specs = [
        ("first-contentful-paint", "first_contentful_paint"),
        ("largest-contentful-paint", "largest_contentful_paint"),
        ("first-meaningful-paint", "first-meaningful-paint"),
        ("speed-index", "speed-index"),
        ("total-blocking-time", "total-blocking-time"),
        ("cumulative-layout-shift", "cumulative-layout-shift"),
    ]
    for file_name in fileDiff:
        file_path = os.path.join(my_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]
                audits = json_data["audits"]
                categories = json_data["categories"]

                # "performance" is a category score with no displayValue;
                # 0 is kept as its placeholder value (original behavior).
                metrics = ["performance"]
                values = [0]
                scores = [categories["performance"]["score"]]
                for key, label in audit_specs:
                    audit = audits[key]
                    metrics.append(label)
                    # BUG FIX: cumulative-layout-shift was previously appended
                    # as its raw display string while every other metric was a
                    # cleaned float; clean_value() parses it (CLS carries no
                    # unit suffix) so the "value" column is uniformly numeric.
                    values.append(clean_value(audit["displayValue"]))
                    scores.append(audit["score"])

                # Normalize the ISO-8601 fetch time to "YYYY-MM-DD HH:MM:SS".
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")

                n = len(metrics)
                df = pd.DataFrame({
                    "fetch_time": [cleaned_fetch_time] * n,
                    "url": [url] * n,
                    "metric": metrics,
                    "value": values,
                    "score": scores,
                })

                # Append below existing data; write the header row only when
                # the sheet is created for the first time.
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    rows = dataframe_to_rows(df, index=False, header=False)
                else:
                    sheet = workbook.create_sheet(sheet_name)
                    rows = dataframe_to_rows(df, index=False, header=True)
                for row in rows:
                    sheet.append(row)
                print(f"Data extracted from {file_name} and appended to the Excel file")
            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")
def fileWatcher(my_dir: str, pollTime: int):
excel_file_path = os.path.join(my_dir, 'output.xlsx')
existingFiles = []
# Check if the output file already exists
if os.path.isfile(excel_file_path):
try:
workbook = openpyxl.load_workbook(excel_file_path)
existingFiles = jsonFilesInDirectory(my_dir)
# Process the existing JSON files and append data to the Excel file
doThingsWithNewFiles(existingFiles, my_dir, workbook)
print("Existing JSON files processed and data appended to the Excel file")
except openpyxl.utils.exceptions.InvalidFileException:
workbook = openpyxl.Workbook()
else:
workbook = openpyxl.Workbook()
# Check for new files at startup
newFileList = jsonFilesInDirectory(my_dir)
fileDiff = listComparison(existingFiles, newFileList)
existingFiles = newFileList
if len(fileDiff) > 0:
# Process the new files and append data to the Excel file
doThingsWithNewFiles(fileDiff, my_dir, workbook)
# Save the Excel file
workbook.save(excel_file_path)
print(f"DataFrame exported to {excel_file_path}")
while True:
time.sleep(pollTime)
# Get the updated list of JSON files in the directory
newFileList = jsonFilesInDirectory(my_dir)
# Find the difference between the previous and new file lists
fileDiff = listComparison(existingFiles, newFileList)
existingFiles = newFileList
if len(fileDiff) > 0:
# Process the new files and append data to the Excel file
doThingsWithNewFiles(fileDiff, my_dir, workbook)
# Save the Excel file
workbook.save(excel_file_path)
print(f"DataFrame exported to {excel_file_path}")
def listComparison(originalList: list, newList: list):
    """Return the items of *newList* that are absent from *originalList*.

    Order of *newList* is preserved. Membership is tested against a set,
    replacing the original O(n*m) list scan with O(n+m) — identical results
    for hashable items (file path strings here).
    """
    seen = set(originalList)
    return [item for item in newList if item not in seen]
# Directory to watch and polling interval in seconds.
my_dir = r"Z:"
pollTime = 60

if __name__ == "__main__":
    # Guard so importing this module does not start the infinite watcher loop.
    fileWatcher(my_dir, pollTime)
字符串
2条答案
按热度按时间kupeojn61#
您可以创建一个文本文件,其中存储了扫描文件的列表。
更新了你的代码,如果存在,读取并写入文本文件。
字符串
无法测试代码,如果有任何问题请告诉我。
p8h8hvxi2#
最简单的想法:要获取自上次更新Excel文件以来更改的文件列表,请使用
os.path.getmtime
获取Excel文件和所有JSON文件的上次更改时间,并选择较新的JSON文件。如果Excel文件存在,请在启动时执行此操作,并处理每个选定的JSON文件,就像监视器检测到它们一样。然而,这可能会引入关于在非常接近功率损耗时处理的文件的一些模糊性。所以,更准确的想法是:保存已处理JSON文件的列表,无论是在Excel文件中,还是在单独的位置(例如另一文件或数据库)。
一个更精细的想法是使用一个数据库,将数据保存到JSON文件中,使用数据库作为唯一的真实数据源,并根据需要从数据库生成Excel文件。
顺便说一句,覆盖Excel文件是一个可能的失败点。在这种情况下,一个好的做法是写入同一目录中的临时文件,然后执行
os.rename
,这将原子地用新文件替换旧文件。