有时 Python 和/或 pandas 环境中默认的 csv 解析引擎设置会在格式错误的行(bad lines)上引发错误,例如:
dataset = load_dataset("alvations/aymara-english")
[out]:
---------------------------------------------------------------------------
ParserError Traceback (most recent call last)
Cell In[7], line 1
----> 1 dataset = load_dataset("alvations/aymara-english")
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1691, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1688 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1690 # Download and prepare data
-> 1691 builder_instance.download_and_prepare(
1692 download_config=download_config,
1693 download_mode=download_mode,
1694 ignore_verifications=ignore_verifications,
1695 try_from_hf_gcs=try_from_hf_gcs,
1696 use_auth_token=use_auth_token,
1697 )
1699 # Build dataset for splits
1700 keep_in_memory = (
1701 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1702 )
File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
603 logger.warning("HF google storage unreachable. Downloading and preparing it from source")
604 if not downloaded_from_gcs:
--> 605 self._download_and_prepare(
606 dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
607 )
608 # Sync info
609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:694, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
690 split_dict.add(split_generator.split_info)
692 try:
693 # Prepare split will record examples associated to the split
--> 694 self._prepare_split(split_generator, **prepare_split_kwargs)
695 except OSError as e:
696 raise OSError(
697 "Cannot find data file. "
698 + (self.manual_download_instructions or "")
699 + "\nOriginal error:\n"
700 + str(e)
701 ) from None
File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1151, in ArrowBasedBuilder._prepare_split(self, split_generator)
1149 generator = self._generate_tables(**split_generator.gen_kwargs)
1150 with ArrowWriter(features=self.info.features, path=fpath) as writer:
-> 1151 for key, table in logging.tqdm(
1152 generator, unit=" tables", leave=False, disable=True # not logging.is_progress_bar_enabled()
1153 ):
1154 writer.write_table(table)
1155 num_examples, num_bytes = writer.finalize()
File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:259, in tqdm_notebook.__iter__(self)
257 try:
258 it = super(tqdm_notebook, self).__iter__()
--> 259 for obj in it:
260 # return super(tqdm...) will not catch exception
261 yield obj
262 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1183, in tqdm.__iter__(self)
1180 # If the bar is disabled, then just walk the iterable
1181 # (note: keep this check outside the loop for performance)
1182 if self.disable:
-> 1183 for obj in iterable:
1184 yield obj
1185 return
File /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/csv/csv.py:156, in Csv._generate_tables(self, files)
154 csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
155 try:
--> 156 for batch_idx, df in enumerate(csv_file_reader):
157 pa_table = pa.Table.from_pandas(df, schema=schema)
158 # Uncomment for debugging (will print the Arrow table size and elements)
159 # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
160 # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1698, in TextFileReader.__next__(self)
1696 def __next__(self) -> DataFrame:
1697 try:
-> 1698 return self.get_chunk()
1699 except StopIteration:
1700 self.close()
File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1810, in TextFileReader.get_chunk(self, size)
1808 raise StopIteration
1809 size = min(size, self.nrows - self._currow)
-> 1810 return self.read(nrows=size)
File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1778, in TextFileReader.read(self, nrows)
1771 nrows = validate_integer("nrows", nrows)
1772 try:
1773 # error: "ParserBase" has no attribute "read"
1774 (
1775 index,
1776 columns,
1777 col_dict,
-> 1778 ) = self._engine.read( # type: ignore[attr-defined]
1779 nrows
1780 )
1781 except Exception:
1782 self.close()
File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py:230, in CParserWrapper.read(self, nrows)
228 try:
229 if self.low_memory:
--> 230 chunks = self._reader.read_low_memory(nrows)
231 # destructive to chunks
232 data = _concatenate_chunks(chunks)
File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:820, in pandas._libs.parsers.TextReader.read_low_memory()
File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:866, in pandas._libs.parsers.TextReader._read_rows()
File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:852, in pandas._libs.parsers.TextReader._tokenize_rows()
File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:1973, in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 625, saw 2
如何将 csv reader / pandas 的参数传入 Hugging Face datasets 的 load_dataset 函数?
1条答案
按热度按时间kyvafyod1#
load_dataset 函数允许你传入 pandas 加载数据时使用的 read_csv 关键字参数,因此你可以借此跳过默认 C 引擎无法正确解析的错误行。
需要注意的是:如果你尝试使用 Pandas dataframe read_csv on bad data 中建议的
error_bad_lines
选项,系统会提示该选项已被弃用,应改用 on_bad_lines 参数(例如 on_bad_lines="skip")。
[out]: