How to skip bad lines when loading csv/tsv files with HuggingFace datasets? ParserError: Error tokenizing data

Asked by cyvaqqii on 2023-05-11

Sometimes the default CSV engine settings in a Python and/or pandas environment raise an error on bad lines, e.g.

dataset = load_dataset("alvations/aymara-english")

[out]:

---------------------------------------------------------------------------
ParserError                               Traceback (most recent call last)
Cell In[7], line 1
----> 1 dataset = load_dataset("alvations/aymara-english")

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1691, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1688 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1690 # Download and prepare data
-> 1691 builder_instance.download_and_prepare(
   1692     download_config=download_config,
   1693     download_mode=download_mode,
   1694     ignore_verifications=ignore_verifications,
   1695     try_from_hf_gcs=try_from_hf_gcs,
   1696     use_auth_token=use_auth_token,
   1697 )
   1699 # Build dataset for splits
   1700 keep_in_memory = (
   1701     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1702 )

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    603         logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    604 if not downloaded_from_gcs:
--> 605     self._download_and_prepare(
    606         dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    607     )
    608 # Sync info
    609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:694, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    690 split_dict.add(split_generator.split_info)
    692 try:
    693     # Prepare split will record examples associated to the split
--> 694     self._prepare_split(split_generator, **prepare_split_kwargs)
    695 except OSError as e:
    696     raise OSError(
    697         "Cannot find data file. "
    698         + (self.manual_download_instructions or "")
    699         + "\nOriginal error:\n"
    700         + str(e)
    701     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1151, in ArrowBasedBuilder._prepare_split(self, split_generator)
   1149 generator = self._generate_tables(**split_generator.gen_kwargs)
   1150 with ArrowWriter(features=self.info.features, path=fpath) as writer:
-> 1151     for key, table in logging.tqdm(
   1152         generator, unit=" tables", leave=False, disable=True  # not logging.is_progress_bar_enabled()
   1153     ):
   1154         writer.write_table(table)
   1155     num_examples, num_bytes = writer.finalize()

File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:259, in tqdm_notebook.__iter__(self)
    257 try:
    258     it = super(tqdm_notebook, self).__iter__()
--> 259     for obj in it:
    260         # return super(tqdm...) will not catch exception
    261         yield obj
    262 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1183, in tqdm.__iter__(self)
   1180 # If the bar is disabled, then just walk the iterable
   1181 # (note: keep this check outside the loop for performance)
   1182 if self.disable:
-> 1183     for obj in iterable:
   1184         yield obj
   1185     return

File /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/csv/csv.py:156, in Csv._generate_tables(self, files)
    154 csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
    155 try:
--> 156     for batch_idx, df in enumerate(csv_file_reader):
    157         pa_table = pa.Table.from_pandas(df, schema=schema)
    158         # Uncomment for debugging (will print the Arrow table size and elements)
    159         # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
    160         # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1698, in TextFileReader.__next__(self)
   1696 def __next__(self) -> DataFrame:
   1697     try:
-> 1698         return self.get_chunk()
   1699     except StopIteration:
   1700         self.close()

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1810, in TextFileReader.get_chunk(self, size)
   1808         raise StopIteration
   1809     size = min(size, self.nrows - self._currow)
-> 1810 return self.read(nrows=size)

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1778, in TextFileReader.read(self, nrows)
   1771 nrows = validate_integer("nrows", nrows)
   1772 try:
   1773     # error: "ParserBase" has no attribute "read"
   1774     (
   1775         index,
   1776         columns,
   1777         col_dict,
-> 1778     ) = self._engine.read(  # type: ignore[attr-defined]
   1779         nrows
   1780     )
   1781 except Exception:
   1782     self.close()

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py:230, in CParserWrapper.read(self, nrows)
    228 try:
    229     if self.low_memory:
--> 230         chunks = self._reader.read_low_memory(nrows)
    231         # destructive to chunks
    232         data = _concatenate_chunks(chunks)

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:820, in pandas._libs.parsers.TextReader.read_low_memory()

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:866, in pandas._libs.parsers.TextReader._read_rows()

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:852, in pandas._libs.parsers.TextReader._tokenize_rows()

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:1973, in pandas._libs.parsers.raise_parser_error()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 625, saw 2

How do I pass csv reader / pandas parameters to HuggingFace datasets' load_dataset function?

Answer by kyvafyod:

The load_dataset function forwards extra keyword arguments to the pandas.read_csv call that the CSV builder uses to load the data, so you can do the following, and it should skip the bad lines that the default C engine cannot parse:

dataset = load_dataset("alvations/aymara-english", on_bad_lines='skip')
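
Since these extra keyword arguments are forwarded to pandas.read_csv, the same mechanism covers other parser options too, which is how you would handle the TSV case from the title. A minimal sketch using the generic "csv" builder on a local file (the path data.tsv is hypothetical):

from datasets import load_dataset

dataset = load_dataset(
    "csv",
    data_files="data.tsv",   # hypothetical local file, substitute your own
    sep="\t",                # parse columns as tab-separated
    on_bad_lines="skip",     # drop rows with the wrong number of fields
)

Note that on_bad_lines was introduced in pandas 1.3; on older pandas versions the equivalent (since-deprecated) flag is error_bad_lines=False.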

If you instead try the error_bad_lines option suggested in Pandas dataframe read_csv on bad data, you will be told to set only on_bad_lines, since error_bad_lines is deprecated:

dataset = load_dataset("alvations/aymara-english", error_bad_lines=False)

[out]:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[8], line 1
----> 1 dataset = load_dataset("alvations/aymara-english", error_bad_lines=False)

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1691, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1688 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1690 # Download and prepare data
-> 1691 builder_instance.download_and_prepare(
   1692     download_config=download_config,
   1693     download_mode=download_mode,
   1694     ignore_verifications=ignore_verifications,
   1695     try_from_hf_gcs=try_from_hf_gcs,
   1696     use_auth_token=use_auth_token,
   1697 )
   1699 # Build dataset for splits
   1700 keep_in_memory = (
   1701     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1702 )

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    603         logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    604 if not downloaded_from_gcs:
--> 605     self._download_and_prepare(
    606         dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    607     )
    608 # Sync info
    609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:694, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    690 split_dict.add(split_generator.split_info)
    692 try:
    693     # Prepare split will record examples associated to the split
--> 694     self._prepare_split(split_generator, **prepare_split_kwargs)
    695 except OSError as e:
    696     raise OSError(
    697         "Cannot find data file. "
    698         + (self.manual_download_instructions or "")
    699         + "\nOriginal error:\n"
    700         + str(e)
    701     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1151, in ArrowBasedBuilder._prepare_split(self, split_generator)
   1149 generator = self._generate_tables(**split_generator.gen_kwargs)
   1150 with ArrowWriter(features=self.info.features, path=fpath) as writer:
-> 1151     for key, table in logging.tqdm(
   1152         generator, unit=" tables", leave=False, disable=True  # not logging.is_progress_bar_enabled()
   1153     ):
   1154         writer.write_table(table)
   1155     num_examples, num_bytes = writer.finalize()

File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:259, in tqdm_notebook.__iter__(self)
    257 try:
    258     it = super(tqdm_notebook, self).__iter__()
--> 259     for obj in it:
    260         # return super(tqdm...) will not catch exception
    261         yield obj
    262 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1183, in tqdm.__iter__(self)
   1180 # If the bar is disabled, then just walk the iterable
   1181 # (note: keep this check outside the loop for performance)
   1182 if self.disable:
-> 1183     for obj in iterable:
   1184         yield obj
   1185     return

File /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/csv/csv.py:154, in Csv._generate_tables(self, files)
    152 dtype = {name: dtype.to_pandas_dtype() for name, dtype in zip(schema.names, schema.types)} if schema else None
    153 for file_idx, file in enumerate(files):
--> 154     csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
    155     try:
    156         for batch_idx, df in enumerate(csv_file_reader):

File /opt/conda/lib/python3.10/site-packages/pandas/util/_decorators.py:183, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    181     warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
    182     kwargs[old_arg_name] = old_arg_value
--> 183     return func(*args, **kwargs)
    185 elif mapping is not None:
    186     if callable(mapping):

File /opt/conda/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:935, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    932 del kwds["filepath_or_buffer"]
    933 del kwds["sep"]
--> 935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
    938     delim_whitespace,
    939     engine,
    940     sep,
    941     error_bad_lines,
    942     warn_bad_lines,
    943     on_bad_lines,
    944     names,
    945     prefix,
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
    950 return _read(filepath_or_buffer, kwds)

File /opt/conda/lib/python3.10/site-packages/pandas/io/parsers/readers.py:2088, in _refine_defaults_read(dialect, delimiter, delim_whitespace, engine, sep, error_bad_lines, warn_bad_lines, on_bad_lines, names, prefix, defaults)
   2086 if on_bad_lines is not None:
   2087     if error_bad_lines is not None or warn_bad_lines is not None:
-> 2088         raise ValueError(
   2089             "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
   2090             "Please only set on_bad_lines."
   2091         )
   2092     if on_bad_lines == "error":
   2093         kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR

ValueError: Both on_bad_lines and error_bad_lines/warn_bad_lines are set. Please only set on_bad_lines.
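
If you need more control than silently skipping rows, another option is to read and clean the file with pandas yourself and then build the dataset from the resulting DataFrame. A sketch, assuming you have a local copy of the raw file (data.csv is a placeholder path):

import pandas as pd
from datasets import Dataset

# on_bad_lines="warn" skips malformed rows but emits a warning for each,
# so you can see exactly what was dropped; "data.csv" is a placeholder.
df = pd.read_csv("data.csv", on_bad_lines="warn")

# Build a HuggingFace dataset directly from the cleaned DataFrame.
dataset = Dataset.from_pandas(df)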
