I have multiple tsv files in one folder, such as 2018Q1.tsv, 2018Q2.tsv, 2018Q3.tsv and so on. In each tsv file the fields are separated by "\t" and the rows by "\n".
I want to merge all the tsv files in the folder into a single file, and every row should also carry its source file name as a new column of the merged file, in Python.
import os
import pandas as pd
#read the path
cwd = os.path.abspath(r'path/to/directory')
#list all the files from the directory
file_list = os.listdir(cwd)
file_list
columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
for file in file_list:
    (pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns)
                for f in file_list}, names=['Date'])
       .reset_index(0)
       .to_csv('output_file.tsv', index=False)
    )
Sample input:
2018Q1.tsv
------------
860585|RBS PARTNERS L P /CT|13FCONP|1993-02-11|edgar/data/860585/9999999997-04-035713.txt|edgar/data/860585/9999999997-04-035713-index.html
2018Q2.tsv
-------------
926688|SMITH THOMAS W|13F-HR|1993-02-12|edgar/data/926688/9999999997-05-015654.txt|edgar/data/926688/9999999997-05-015654-index.html
Sample consolidated output:
---------------
Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html
Error traceback:
FileNotFoundError Traceback (most recent call last)
Input In [25], in <cell line: 3>()
1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)
Input In [25], in <dictcomp>(.0)
1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
665 kwds_defaults = _refine_defaults_read(
666 dialect,
667 delimiter,
(...)
676 defaults={"delimiter": ","},
677 )
678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
572 _validate_names(kwds.get("names", None))
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
577 if chunksize or iterator:
578 return parser
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
930 self.options["has_index_names"] = kwds["has_index_names"]
932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:1217, in TextFileReader._make_engine(self, f, engine)
1213 mode = "rb"
1214 # error: No overload variant of "get_handle" matches argument types
1215 # "Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]"
1216 # , "str", "bool", "Any", "Any", "Any", "Any", "Any"
-> 1217 self.handles = get_handle( # type: ignore[call-overload]
1218 f,
1219 mode,
1220 encoding=self.options.get("encoding", None),
1221 compression=self.options.get("compression", None),
1222 memory_map=self.options.get("memory_map", False),
1223 is_text=is_text,
1224 errors=self.options.get("encoding_errors", "strict"),
1225 storage_options=self.options.get("storage_options", None),
1226 )
1227 assert self.handles is not None
1228 f = self.handles.handle
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\common.py:789, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
784 elif isinstance(handle, str):
785 # Check whether the filename is to be opened in binary mode.
786 # Binary mode does not support 'encoding' and 'newline'.
787 if ioargs.encoding and "b" not in ioargs.mode:
788 # Encoding
--> 789 handle = open(
790 handle,
791 ioargs.mode,
792 encoding=ioargs.encoding,
793 errors=errors,
794 newline="",
795 )
796 else:
797 # Binary mode
798 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: '1993-QTR1.tsv'
I am not able to consolidate the files, please help.
Thanks
1 Answer
You can read every file inside a dictionary comprehension, combine the results with concat, and chain to_csv on the end:
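A minimal sketch of that pattern (assuming columns is defined as in the question, file_list holds only the .tsv file names, and the script runs from the folder that contains them):

pd.concat({f.rsplit('.', 1)[0]: pd.read_csv(f, sep='|', header=None, names=columns)
           for f in file_list},
          names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)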
Update with the real example:
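A runnable sketch with the question's columns, assuming all the .tsv files live under the directory assigned to cwd (the r'path/to/directory' placeholder is the one from the question). Joining each name with os.path.join is what avoids the FileNotFoundError shown above, because os.listdir returns bare file names without the directory:

import os
import pandas as pd

cwd = os.path.abspath(r'path/to/directory')
columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

# keep only the .tsv files so anything else in the folder is ignored
file_list = [f for f in os.listdir(cwd) if f.endswith('.tsv')]

(pd.concat({f.rsplit('.', 1)[0]: pd.read_csv(os.path.join(cwd, f), sep='|',
                                             header=None, names=columns)
            for f in file_list}, names=['Date'])
   .reset_index(0)                      # the file-name keys become the 'Date' column
   .to_csv('output_file.tsv', index=False))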
Output file:
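With just the two sample files above, the merged output_file.tsv should match the sample consolidated output from the question:

Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html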