Describe the Bug
Paddle's DataLoader raises StopIteration when it encounters plain-text data.
It looks like Paddle's DataLoader does not support pure str data, whereas PyTorch's does.
Below is the sample code from the DataLoader documentation; all the reproduction code that follows is based on it.
- Sample code
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)

simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
                           parameters=simple_net.parameters())

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

for e in range(EPOCH_NUM):
    for i, (image, label) in enumerate(loader()):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()
        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
When the dataset returns str data instead of tensors, the data cannot be loaded successfully.
- Paddle:
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):  # essentially the only change: the original np arrays become str data
        # image = np.random.random([IMAGE_SIZE]).astype('float32')
        # label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        image = "This is a string format data"
        label = "Also a string format data"
        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)

simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
                           parameters=simple_net.parameters())

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True)

for e in range(EPOCH_NUM):
    import pdb
    pdb.set_trace()
    for i, (image, label) in enumerate(loader):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()
        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
When the inner for loop tries to read data from the dataloader, it fails:
(exp) coco@coco:~$ python mytest.py
> /home/coco/mytest.py(71)<module>()
-> for i, (image, label) in enumerate(loader):
(Pdb) n
StopIteration
- PyTorch
Simply swap the Paddle APIs in the example above for their torch equivalents:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        # image = np.random.random([IMAGE_SIZE]).astype('float32')
        # label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        image = "This is a string format data"
        label = "Also a string format data"
        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)

simple_net = SimpleNet()
opt = torch.optim.SGD(lr=1e-3, params=simple_net.parameters())

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

for e in range(EPOCH_NUM):
    import pdb
    pdb.set_trace()
    for i, (image, label) in enumerate(loader):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        # NOTE: the lines below were left over from the Paddle example and are not
        # valid torch code; only the data loading is exercised here (see the pdb session below)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()
        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
Output:
(exp) coco@coco:~$ python torch_mytest.py
> /home/coco/torch_mytest.py(52)<module>()
-> for i, (image, label) in enumerate(loader):
(Pdb) n
> /home/coco/torch_mytest.py(53)<module>()
-> out = simple_net(image)
(Pdb) l
 48
 49  for e in range(EPOCH_NUM):
 50      import pdb
 51      pdb.set_trace()
 52      for i, (image, label) in enumerate(loader):
 53  ->      out = simple_net(image)
 54          loss = F.cross_entropy(out, label)
 55          avg_loss = paddle.mean(loss)
 56          avg_loss.backward()
 57          opt.minimize(avg_loss)
 58          simple_net.clear_gradients()
(Pdb) image
('This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data')
(Pdb) label
('Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data')
(Pdb)
PyTorch loads the data without any problem.
The reproduction above only concerns the data-loading part; the subsequent computation is not considered.
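For reference, the tuples shown in the pdb session come from PyTorch's default collate, which deliberately passes str samples through instead of trying to convert them to tensors. A minimal illustration (in recent PyTorch versions this function is exposed as torch.utils.data.default_collate):

from torch.utils.data import default_collate

# a mini-batch of (str, int) samples, as a Dataset would yield them
batch = [("sample a", 0), ("sample b", 1)]

images, labels = default_collate(batch)
print(images)  # ('sample a', 'sample b') -- str fields are passed through as-is
print(labels)  # tensor([0, 1]) -- numeric fields are stacked into a tensor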
However, if the data part is changed like this:
def __getitem__(self, idx):
    # image = np.random.random([IMAGE_SIZE]).astype('float32')
    label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
    image = "This is a string format data"
    # label = "Also a string format data"
    return image, label
That is, when there is both text and an ndarray, the data can be read normally again.
So it looks like Paddle's dataloader has a problem with pure-str data, and the only output is a bare StopIteration, which makes the issue hard to locate and is unfriendly to users.
Could the cause be here?
Paddle/python/paddle/io/dataloader/dataloader_iter.py
Lines 219 to 277 in 0359685
def _thread_loop(self, legacy_expected_place):
    # NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
    # and it will call platform::SetDeviceId() in c++ internally.
    # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
    # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
    # APIs in this thread.
    core.set_current_thread_name("Dataloader_" + str(id(self)))
    _set_expected_place(legacy_expected_place)

    while not self._thread_done_event.is_set():
        try:
            indices = next(self._sampler_iter)

            # read data from dataset in mini-batch
            # with paddle.base.dygraph.guard(place=paddle.CPUPlace()):
            # read data from dataset in mini-batch
            batch = self._dataset_fetcher.fetch(
                indices, self._thread_done_event
            )
        except StopIteration:
            self._exit_thread_expectedly()
            return

        if batch is None or self._thread_done_event.is_set():
            break

        # flat batch and record structure infos
        batch, structure = _flatten_batch(batch)
        self._structure_infos.append(structure)

        if self._thread_done_event.is_set():
            break

        try:
            # pack as LoDTensorArray
            array = core.LoDTensorArray()
            for slot in batch:
                if isinstance(slot, (paddle.Tensor, core.eager.Tensor)):
                    slot = slot.value().get_tensor()
                elif not isinstance(slot, core.LoDTensor):
                    tmp = core.LoDTensor()
                    tmp.set(slot, core.CPUPlace())
                    slot = tmp

                array.append(slot)

            if self._thread_done_event.is_set():
                break

            try:
                self._blocking_queue.push(array)
            except:
                self._exit_thread_expectedly()

        except Exception as e:
            self._exit_thread_unexpectedly()
            raise e

    self._exit_thread_expectedly()
More specifically, in the _flatten_batch function called there:
Paddle/python/paddle/io/dataloader/flat.py
Lines 37 to 43 in 0359685
if isinstance(
    field,
    (np.ndarray, paddle.Tensor, paddle.base.core.eager.Tensor),
):
    structure.append(f'{FIELD_PREFIX}{field_idx}')
    flat_batch.append(field)
    field_idx += 1
This function recurses through the batch and extracts only the innermost fields of type (np.ndarray, paddle.Tensor, paddle.base.core.eager.Tensor) into flat_batch, while structure keeps everything that is not a tensor or ndarray, plus placeholders for the tensor/ndarray values. In other words, when the data is pure str, flat_batch stays empty the whole time, and I wonder whether that is what breaks things.
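To make the suspicion concrete, here is a minimal sketch of the flattening behavior described above (my own simplification, not Paddle's actual implementation); with pure-str samples, flat_batch stays empty while all the strings end up in structure:

import numpy as np

FIELD_PREFIX = "_paddle_field_"  # stand-in prefix; the real one lives in flat.py

def flatten_batch_sketch(batch, flat_batch, field_idx=0):
    # walk the (possibly nested) batch; pull ndarray fields into flat_batch,
    # keep everything else (including str) in the structure
    structure = []
    for field in batch:
        if isinstance(field, np.ndarray):  # paddle also checks its Tensor types here
            structure.append(f'{FIELD_PREFIX}{field_idx}')
            flat_batch.append(field)
            field_idx += 1
        elif isinstance(field, (list, tuple)):
            sub, field_idx = flatten_batch_sketch(field, flat_batch, field_idx)
            structure.append(sub)
        else:  # str (and anything else non-tensor) is stored only in the structure
            structure.append(field)
    return structure, field_idx

flat_batch = []
structure, _ = flatten_batch_sketch([("some text", "more text")], flat_batch)
print(flat_batch)  # [] -- nothing tensor-like was extracted
print(structure)   # [['some text', 'more text']] -- the strings live only here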
- The concrete scenario I encountered:
In some LLM models, the PyTorch implementation may not have written a proper processor or collator (especially the tokenization step) and instead puts those modules directly into the model itself. That means the data yielded by the dataloader is plain text, which triggers the problem above in Paddle's dataloader. With Paddle, you have to write a collator that tokenizes the plain text first, so that the innermost fields (input_ids, attention_mask, and so on) are all tensors or np arrays; only then can the data be read by iteration. A sketch of such a collator is given below.
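A minimal sketch of such a collator, passed via collate_fn (which paddle.io.DataLoader accepts); dummy_tokenize is a hypothetical stand-in for a real tokenizer, and dataset / BATCH_SIZE come from the repro above:

import numpy as np
from paddle.io import DataLoader

def dummy_tokenize(texts, max_len=16):
    # stand-in for a real tokenizer (e.g. a paddlenlp tokenizer): maps
    # characters to int64 codes and pads/truncates to max_len
    ids = np.zeros((len(texts), max_len), dtype='int64')
    for row, text in enumerate(texts):
        for col, ch in enumerate(text[:max_len]):
            ids[row, col] = ord(ch)
    return ids

def text_collate_fn(batch):
    # batch is a list of (str, str) samples as in the repro above; tokenize
    # both fields so the innermost data is np.ndarray instead of str
    images, labels = zip(*batch)
    return dummy_tokenize(images), dummy_tokenize(labels)

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    collate_fn=text_collate_fn)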
Still, I am puzzled as to why a StopIteration is raised. If you manage to reproduce this and locate the root cause, please let me know under this issue. Many thanks!
Additional Supplementary Information
No response
1 Answer
kyks70gy 1#
We have received your feedback and will look into it right away.