Paddle's DataLoader raises StopIteration on pure-text data

fdx2calv · posted 5 months ago in Other

Describe the Bug

Paddle's DataLoader raises a StopIteration when the data is pure text.

It looks like Paddle's DataLoader does not support samples that are plain str values, whereas PyTorch handles them fine.

Below is the sample code from the DataLoader documentation; all of the reproduction code that follows is based on it.

  • Sample code
import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4

IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)

simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
                            parameters=simple_net.parameters())

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

for e in range(EPOCH_NUM):
    for i, (image, label) in enumerate(loader()):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()
        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))

When the data returned by the dataset is not a tensor but a str, the data cannot be loaded:

  • Paddle:
import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4

IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):                             # the only real change is here: the original np arrays become str values
        # image = np.random.random([IMAGE_SIZE]).astype('float32')
        # label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        image = "This is a string format data"
        label = "Also a string format data"

        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)


simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
                            parameters=simple_net.parameters())

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,)


for e in range(EPOCH_NUM):
    import pdb
    pdb.set_trace()
    for i, (image, label) in enumerate(loader):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()
        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))

When the inner for loop tries to read data from the DataLoader, it fails with:

(exp) coco@coco:~$ python mytest.py
> /home/coco/mytest.py(71)<module>()
-> for i, (image, label) in enumerate(loader):
(Pdb) n
StopIteration
  • PyTorch

Simply swapping the Paddle APIs in the example above for the corresponding torch APIs:

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4

IMAGE_SIZE = 784
CLASS_NUM = 10

# define a random dataset
class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        # image = np.random.random([IMAGE_SIZE]).astype('float32')
        # label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        image = "This is a string format data"
        label = "Also a string format data"

        return image, label

    def __len__(self):
        return self.num_samples

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        return self.fc(image)

simple_net = SimpleNet()
opt = torch.optim.SGD(simple_net.parameters(), lr=1e-3)
loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

for e in range(EPOCH_NUM):
    import pdb
    pdb.set_trace()
    for i, (image, label) in enumerate(loader):
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = loss.mean()
        opt.zero_grad()
        avg_loss.backward()
        opt.step()
        print("Epoch {} batch {}: loss = {}".format(e, i, avg_loss.item()))

Output:

(exp) coco@coco:~$ python torch_mytest.py
> /home/coco/torch_mytest.py(52)<module>()
-> for i, (image, label) in enumerate(loader):
(Pdb) n
> /home/coco/torch_mytest.py(53)<module>()
-> out = simple_net(image)
(Pdb) l
 48
 49     for e in range(EPOCH_NUM):
 50         import pdb
 51         pdb.set_trace()
 52         for i, (image, label) in enumerate(loader):
 53  ->         out = simple_net(image)
 54             loss = F.cross_entropy(out, label)
 55             avg_loss = loss.mean()
 56             opt.zero_grad()
 57             avg_loss.backward()
 58             opt.step()
(Pdb) image
('This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data', 'This is a string format data')
(Pdb) label
('Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data', 'Also a string format data')
(Pdb)

PyTorch loads the data without any problem.
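
For context, this matches how PyTorch's default collation treats strings: default_collate has an explicit str/bytes branch that returns the batch unchanged instead of trying to stack it into a tensor. A minimal standalone sketch (assuming a recent PyTorch where default_collate is exported from torch.utils.data):

# Demonstrate PyTorch's default collation on string samples.
from torch.utils.data import default_collate

batch = ["This is a string format data", "Also a string format data"]
print(default_collate(batch))
# -> ['This is a string format data', 'Also a string format data']
# str/bytes samples are passed through as-is rather than converted to tensors,
# which is why the loop above receives tuples of strings for image and label.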

The reproduction above only targets the data-loading step; the subsequent computation is not the point.

However, if the data part is changed to this:

    def __getitem__(self, idx):
        # image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        image = "This is a string format data"
        # label = "Also a string format data"

        return image, label

That is, when a sample contains both text and an ndarray, the data can be read normally again.

So it looks like Paddle's DataLoader has a problem with pure-str data, and the only output is a bare StopIteration, which makes the problem hard to locate and is unfriendly to users.

Could this be the cause?

Paddle/python/paddle/io/dataloader/dataloader_iter.py

Lines 219 to 277 in 0359685

def _thread_loop(self, legacy_expected_place):
    # NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
    # and it will call platform::SetDeviceId() in c++ internally.
    # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
    # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
    # APIs in this thread.
    core.set_current_thread_name("Dataloader_" + str(id(self)))
    _set_expected_place(legacy_expected_place)

    while not self._thread_done_event.is_set():
        try:
            indices = next(self._sampler_iter)

            # read data from dataset in mini-batch
            # with paddle.base.dygraph.guard(place=paddle.CPUPlace()):
            # read data from dataset in mini-batch
            batch = self._dataset_fetcher.fetch(
                indices, self._thread_done_event
            )
        except StopIteration:
            self._exit_thread_expectedly()
            return

        if batch is None or self._thread_done_event.is_set():
            break

        # flat batch and record structure infos
        batch, structure = _flatten_batch(batch)
        self._structure_infos.append(structure)

        if self._thread_done_event.is_set():
            break

        try:
            # pack as LoDTensorArray
            array = core.LoDTensorArray()
            for slot in batch:
                if isinstance(slot, (paddle.Tensor, core.eager.Tensor)):
                    slot = slot.value().get_tensor()
                elif not isinstance(slot, core.LoDTensor):
                    tmp = core.LoDTensor()
                    tmp.set(slot, core.CPUPlace())
                    slot = tmp

                array.append(slot)

            if self._thread_done_event.is_set():
                break

            try:
                self._blocking_queue.push(array)
            except:
                self._exit_thread_expectedly()

        except Exception as e:
            self._exit_thread_unexpectedly()
            raise e

    self._exit_thread_expectedly()

and in the _flatten_batch function that it calls:

Paddle/python/paddle/io/dataloader/flat.py

Lines 37 to 43 in 0359685

if isinstance(
    field,
    (np.ndarray, paddle.Tensor, paddle.base.core.eager.Tensor),
):
    structure.append(f'{FIELD_PREFIX}{field_idx}')
    flat_batch.append(field)
    field_idx += 1

This function recurses through the sample structure and extracts only the innermost fields of type (np.ndarray, paddle.Tensor, paddle.base.core.eager.Tensor) into flat_batch, while structure keeps everything that is not a tensor/ndarray and leaves placeholders where the tensor/ndarray values were extracted. In other words, for a pure-str sample nothing is ever extracted, so flat_batch stays empty the whole time. I am not sure whether this is where the problem comes from.
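
To make that concrete, here is a minimal sketch that mirrors the flattening behavior in simplified form (this is not Paddle's actual code; toy_flatten is a stand-in for illustration):

import numpy as np

FIELD_PREFIX = "_paddle_field_"  # the prefix used in paddle/io/dataloader/flat.py

def toy_flatten(sample):
    # Simplified mirror of _flatten_batch: only ndarray/Tensor fields are
    # pulled out into flat_batch; anything else stays behind in structure.
    flat_batch, structure, field_idx = [], [], 0
    for field in sample:
        if isinstance(field, np.ndarray):  # the real check also covers paddle.Tensor
            structure.append(f'{FIELD_PREFIX}{field_idx}')
            flat_batch.append(field)
            field_idx += 1
        else:
            structure.append(field)        # a plain str falls through to here
    return flat_batch, structure

# Pure-string sample: nothing is extracted, flat_batch ends up empty.
print(toy_flatten(["This is a string format data", "Also a string format data"]))
# -> ([], ['This is a string format data', 'Also a string format data'])

# Mixed str + ndarray: flat_batch is non-empty, matching the observation
# that mixed samples load fine.
print(toy_flatten(["This is a string format data", np.zeros((1,), dtype='int64')]))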

  • The concrete scenario where I hit this:
    In some LLM codebases, the PyTorch implementation may not have a cleanly written processor or collator (especially for the tokenization step) and instead puts those modules directly into the model's forward pass. That means the batches the dataloader yields are plain text, which triggers the problem above in Paddle's DataLoader. With Paddle you have to write a collator that tokenizes the plain text so that all the innermost fields (input_ids, attention_mask, and so on) are tensors or np arrays; only then can the data be iterated successfully. A sketch of such a collator follows.
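
As a workaround sketch for that scenario, a collate_fn can tokenize the raw strings before Paddle's flattening step ever sees them, so every innermost field is an ndarray. Note that my_tokenizer below is a hypothetical stand-in (in practice it would be a real tokenizer, e.g. from paddlenlp), and text_collate_fn reuses the RandomDataset defined above:

import numpy as np
from paddle.io import DataLoader

def my_tokenizer(texts, max_len=16):
    # Hypothetical stand-in for a real tokenizer: maps each string to
    # fixed-length int64 ids plus an attention mask.
    ids = np.zeros((len(texts), max_len), dtype='int64')
    mask = np.zeros((len(texts), max_len), dtype='int64')
    for i, text in enumerate(texts):
        tokens = text.split()[:max_len]
        ids[i, :len(tokens)] = [hash(tok) % 30000 for tok in tokens]
        mask[i, :len(tokens)] = 1
    return ids, mask

def text_collate_fn(batch):
    # batch is a list of (image_str, label_str) samples; tokenize both fields
    # so the innermost values handed to Paddle are ndarrays, not str.
    images, labels = zip(*batch)
    input_ids, attention_mask = my_tokenizer(list(images))
    label_ids, _ = my_tokenizer(list(labels))
    return input_ids, attention_mask, label_ids

loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    collate_fn=text_collate_fn)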

But as for why a StopIteration is raised in the first place, I am still confused. If you manage to reproduce this and locate the cause, it would be great if you could follow up in this issue. Thanks a lot!

Additional Supplementary Information

No response

kyks70gy 1#

We have received your feedback and will look into it right away.
