我试图用 Pandas 分块（chunk）读取一个很大的 CSV 文件（84 GB），筛选出需要的行，并把结果合并成一个 DataFrame。
import pandas as pd
chunk_size = 1000000 # Number of rows to read per chunk
my_df = pd.DataFrame()
i = 1
def convert_data(value):
    """Coerce a raw DATA cell to float.

    Returns the sentinel 0.778 when the value cannot be parsed as a
    number (empty string, malformed text, None, ...).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit;
        # only the two parse-failure exceptions float() can raise are expected.
        return 0.778
for chunk in pd.read_csv(path, delimiter='~', dtype={'FIELD': 'object', 'ID_TAXPAYER': 'object', 'PYEAR': 'object'}, usecols=['PYEAR', 'DATA', 'FIELD', 'ID_TAXPAYER'], chunksize=chunk_size, converters={'DATA': convert_data},engine='python'):
chunk = chunk[chunk['FIELD'].str.contains("field", na=False)]
chunk['FIELD'] = [i.replace('field_', '').replace('_', '.') for i in chunk['FIELD']]
filtered_df = chunk[chunk['FIELD'] == '910.00.001']
print(i)
i+=1
my_df = pd.concat([my_df, filtered_df], ignore_index=True)
# Print the resulting dataframe
print(my_df)
我的笔记本电脑有 16 GB 内存，CPU 主频 3.5 GHz、4 核。脚本运行一段时间后，当变量 `i` 达到 323 时就报错了。我知道是内存不够，但我原以为把 DataFrame 分块处理就能解决这个问题。而且我注意到，随着循环每次迭代，内存占用越来越高。我试过 `del chunk`，但仍然在 i=323 处报同样的错误。
有什么思路吗？提前感谢大家！
MemoryError Traceback (most recent call last)
Cell In[3], line 14
11 return float(0.778)
13 # Iterate over the chunks
---> 14 for chunk in pd.read_csv(path, delimiter='~', dtype={'FIELD': 'object', 'ID_TAXPAYER': 'object', 'PYEAR': 'object'}, usecols=['PYEAR', 'DATA', 'FIELD', 'ID_TAXPAYER'], chunksize=chunk_size, converters={'DATA': convert_data},engine='python'):
15 chunk = chunk[chunk['FIELD'].str.contains("field", na=False)]
16 chunk['FIELD'] = [i.replace('field_', '').replace('_', '.') for i in chunk['FIELD']]
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\io\parsers\readers.py:1624, in TextFileReader.__next__(self)
1622 def __next__(self) -> DataFrame:
1623 try:
-> 1624 return self.get_chunk()
1625 except StopIteration:
1626 self.close()
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\io\parsers\readers.py:1733, in TextFileReader.get_chunk(self, size)
1731 raise StopIteration
1732 size = min(size, self.nrows - self._currow)
-> 1733 return self.read(nrows=size)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\io\parsers\readers.py:1704, in TextFileReader.read(self, nrows)
1697 nrows = validate_integer("nrows", nrows)
1698 try:
1699 # error: "ParserBase" has no attribute "read"
1700 (
1701 index,
1702 columns,
1703 col_dict,
-> 1704 ) = self._engine.read( # type: ignore[attr-defined]
1705 nrows
1706 )
1707 except Exception:
1708 self.close()
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\io\parsers\python_parser.py:251, in PythonParser.read(self, rows)
245 def read(
246 self, rows: int | None = None
247 ) -> tuple[
248 Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
249 ]:
250 try:
--> 251 content = self._get_lines(rows)
252 except StopIteration:
253 if self._first_chunk:
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\io\parsers\python_parser.py:1114, in PythonParser._get_lines(self, rows)
1110 rows_to_skip = 0
1111 if self.skiprows is not None and self.pos is not None:
1112 # Only read additional rows if pos is in skiprows
1113 rows_to_skip = len(
-> 1114 set(self.skiprows) - set(range(self.pos))
1115 )
1117 for _ in range(rows + rows_to_skip):
1118 # assert for mypy, data is Iterator[str] or None, would
1119 # error in next
1120 assert self.data is not None
MemoryError:
1 条答案（按热度排序）
你的整体做法是对的。我唯一会改的是这一行：
my_df = pd.concat([my_df, filtered_df], ignore_index=True)
改为：先把每个 filtered_df 追加到一个列表里，循环结束后再做一次 pd.concat(列表, ignore_index=True)。
这样可以避免在每次迭代中都重新构造一个不断增大的 DataFrame，从而避免内存被逐步占满。