import re
SKIP_WORDS = ['Id', 'Flag']
LAST_LINE_CRLF = r'"\r\n"'
with open('test.fmt') as input_file:
vers = next(input_file).strip()
_ = next(input_file)
new_lines, new_field_no = [], 0
for line in input_file:
if not any(s in line for s in SKIP_WORDS):
new_field_no += 1
data = re.split(r'( +(?=.*))', line.strip())
data[0] = str(new_field_no) # renumber field
data[10] = str(new_field_no) # renumber field
new_lines.append(data)
with open('test_new.fmt', 'w') as output_file:
output_file.write(vers + '\n')
output_file.write(str(new_field_no) + '\n')
for idx, line in enumerate(new_lines, start=1):
if idx == new_field_no: # adjust last line delimiter
line[8] = LAST_LINE_CRLF
line[9] = ' '
output_file.write(''.join(line) + '\n')
2条答案
按热度按时间tcomlyy61#
你们可以用正常的方式读前两行,其余的用
pandas.read_csv(..., sep="\s+")
我用io.StringIO
仅用于模拟文件,但您应该使用open()
```data = '''14.0
7
1 SQLCHAR 0 12 "," 1 Id ""
2 SQLCHAR 0 200 "," 2 value1 SQL_Latin1_General_CP1_CI_AS
3 SQLCHAR 0 200 "," 3 value2 SQL_Latin1_General_CP1_CI_AS
4 SQLCHAR 0 200 "," 4 value3 SQL_Latin1_General_CP1_CI_AS
5 SQLCHAR 0 200 "," 5 value4 SQL_Latin1_General_CP1_CI_AS
6 SQLCHAR 0 200 "," 6 value5 SQL_Latin1_General_CP1_CI_AS
7 SQLCHAR 0 12 "\r\n" 7 Flag ""'''
import pandas as pd
import io
open file for reading
fh = io.StringIO(data)
fh = open(filename)
read first two line
two_lines = [next(fh), next(fh)]
print(two_lines)
read rest as CSV file without headers
df = pd.read_csv(fh, sep="\s+", header=None)
print(df)
skip rows which have
Id
orFlag
in 7th columndf = df[ ~df[6].str.contains('Id|Flag') ]
assign new values in first column
df[0] = range(1, len(df)+1)
print(df)
open file for writing
fh = open('output.txt', 'w')
write first two lines
fh.write(two_lines[0])
fh.write(str(len(df)) + '\n')
if you don't need spaces then you can use this
df.to_csv(fh, sep=' ', index=False, header=False)
if you need spaces then you can use this
for row in df.itertuples():
# format line to keep spaces
line = '{:<8}{:<20}{:<8}{:<9}{:<8}{:<6}{:<28}{}'.format(*row[1:])
close file
fh.close()
wbgh16ku2#
这应该使用
re.split()
并根据要求跳过/重新编号字段: