PY -比较多个CSV文件

b4lqfgs4  于 12个月前  发布在  其他
关注(0)|答案(2)|浏览(94)

我有几个.csv文件,其中一些文件比其他文件有更多的头和更多的列。
我想把缺少的列填充到没有它的文件中
我试过这个:

import os
import csv

# Get the directory where the Python script is located
script_directory = os.path.dirname(__file__)

# Define the folder containing the CSV files
folder_path = script_directory

# Get a list of CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# Find the file with the most columns (maximum header length)
max_columns_file = max(csv_files, key=lambda file_name: len(next(csv.reader(open(os.path.join(folder_path, file_name), "r", newline=""), delimiter=";"))))

# Read the header from the file with the most columns
with open(os.path.join(folder_path, max_columns_file), "r", newline="") as file:
    header_to_copy = next(csv.reader(file, delimiter=";"))

# Iterate through each CSV file and copy the header
for file_name in csv_files:
    file_path = os.path.join(folder_path, file_name)
    data = []

    with open(file_path, "r", newline="") as file:
        reader = csv.reader(file, delimiter=";")
        for row in reader:
            data.append(row)

    # Update the header in the current file
    original_header = data[0]
    data[0] = header_to_copy

    # Print the differences between the original header and the copied header
    differences = set(original_header) ^ set(header_to_copy)
    print(f"Processed: {file_name}")
    print("Differences:")
    print("+----------------+----------------+")
    print("| Original Header | Copied Header  |")
    print("+----------------+----------------+")

    for item in differences:
        original_present = "Yes" if item in original_header else "No"
        copied_present = "Yes" if item in header_to_copy else "No"
        print(f"| {item:<16}| {original_present:<16}| {copied_present:<16}|")

    print("+----------------+----------------+")

    # Print the position of each element in the header if it was not in the original header
    position_dict = {element: position for position, element in enumerate(header_to_copy, start=1)}
    for element in differences:
        if element in header_to_copy:
            position = position_dict[element]
            print(f"Element '{element}' is at position {position} in the header.")

    # Add a new column with the value "0" from the second row onwards at the specified position
    for i in range(1, len(data)):
        for element in differences:
            if element in header_to_copy:
                position = position_dict[element]
                data[i].insert(position - 1, "0")

    # Write the modified data back to the CSV file
    with open(file_path, "w", newline="") as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerows(data)

还有一个例子。我成功地找到了使用的位置,但当有一个特定的情况下,它不工作:
Talbe1.csv |一|B| C| D| e| F| G||-|-|-|-|-|-|---||1| 2| 3| 5| 6| 7| 888|
Table2.csv |一|B| C| c(c)|D| e| F| G||-|-|-|-----|-|-|-|---||1| 2| 3| 4 |5| 6| 7| 888|
结果:Table1.csv|一|B| C| c(c)|D| e| F| G||-|-|-|-----|-|-|-|---||1| 2| 3| 4 |5| 6| 7| 888|
预期结果:Table1.csv|一|B| C| c(c)|D| e| F| G||-|-|-|-----|-|-|-|---||1| 2| 3| 5 |0| 6| 7| 888|
我不知道为什么,因为我用的是同一个职位号码

emeijp43

emeijp431#

我建议除了跟踪文件名之外,还可以构建一个头列表,这样就不必重新打开文件,可以使用该列表来构建输出。当再次迭代文件时,只需使用csv.DictReader而不是csv.reader,您可以使用默认值0row.get(header)来构建行的元素列表。然后将该行写入该文件的输出。您甚至不需要为此打开“主”文件。

headers_list = <the list of headers built during your first pass to find the longest number of headers>

for file in LIST_OF_FILES:
    if file != MAX_HEADER_FILE_NAME:
        data=list()
        data.append(headers)
        with open(os.join(folder_path, file), "r") as infile:
            mydata = list()
            csv.DictReader csv_file(infile)
            for row in csv_file:
                for header in headers:
                    mydata.append[row.get(header, 0)]
            data.append(mydata)
        //now open for writing as csv_write
        with open(os.join(folder_path, file), "w") as outfile:
            csv_writer = csv.writer(outfile, delimiter=',')
            for row in data:
                csv_writer.writerow(row);
vatpfxk5

vatpfxk52#

有没有具体的原因不使用Pandas?这样会更快/更有效。

import pandas as pd
import os

#get file path
folder_path  = os.path.dirname(__file__)

#get list of files
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

#create empty df
df = pd.DataFrame()

#loop through each file and add it to empty df we created
for file in csv_files:
    dfx = pd.read_csv(file, delim=';')
    dfx['filename'] = file
    df = pd.concat([df, dfx]) 

#fill nulls with 0s
df = df.fillna(0)
#reset index
df = df.reset_index(drop=True).copy()

#split into component dataframes
for file in list(df['filename'].unique()):
    dfx = df.loc[df['filename'] == file].reset_index(drop=True).copy()
    dfx.to_csv(file, index=False)

相关问题