PyTorch: why does using a DataLoader for the training data change the model's training?

dgsult0t · asked 12 months ago

I have a basic model in which the batches of training data are created manually, as shown below.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
import torch.nn as nn
import torch.optim as optim

SEED = 12345
BATCH_SIZE = 5
N_EPOCHS = 100

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

dataset = load_iris()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)

best_mse = np.inf
best_weights = None
history = []

def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            X_batch = X_train[start:start + BATCH_SIZE]
            y_batch = y_train[start:start + BATCH_SIZE]
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            bar.set_postfix(mse=float(loss))

def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)

model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()

The MSE decreases during training as shown in the plot below, converging to MSE = 0.07, RMSE = 0.26.


[plot: test MSE per epoch]
However, when I use a DataLoader to do the batching automatically, with the code below:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

SEED = 12345
BATCH_SIZE = 5
N_EPOCHS = 100

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

dataset = load_iris()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)

train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(list(zip(X_test, y_test)), shuffle=False, batch_size=BATCH_SIZE)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)

best_mse = np.inf
best_weights = None
history = []

def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            for X_train_batch, y_train_batch in train_dataloader:
                y_pred = model(X_train_batch)
                loss = loss_fn(y_pred, y_train_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                bar.set_postfix(mse=float(loss))

def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)

model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()


Two things were unexpected:
First, the second piece of code, the one using the DataLoader, runs much more slowly than the first. Second, the training now converges at MSE = 0.02, RMSE = 0.15, which is completely different.



So, can anyone explain why these two pieces of code produce different results?

eyh26e7m 1#

Edit:

A few updates:
First, in the DataLoader version you should not nest the loop over the dataloader under the tqdm bar (the for start in bar loop). Second, you should disable shuffle=True in the train dataloader. With these updates, the following code reproduces the initial result (MSE = 0.07, RMSE = 0.26).

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

SEED = 12345
BATCH_SIZE = 5
N_EPOCHS = 100

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

dataset = load_iris()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)

train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=False, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(list(zip(X_test, y_test)), shuffle=False, batch_size=BATCH_SIZE)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)

best_mse = np.inf
best_weights = None
history = []

def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        # note: the bar is only used for the description/postfix display here;
        # the actual batches come from train_dataloader below
        bar.set_description(f"Epoch {epoch}")

        for X_train_batch, y_train_batch in train_dataloader:
            y_pred = model(X_train_batch)
            loss = loss_fn(y_pred, y_train_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            bar.set_postfix(mse=float(loss))

def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)

model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()

Now let's get into the details.

(1) Why it is slower: the most important reason is that in your initial dataloader code you were actually training the model for many more iterations. You traverse the entire dataloader inside a loop nested under the tqdm bar, which itself steps through your original manual batch indices (a rough step count is sketched below). This also explains why your initial dataloader version reached a better (lower) MSE.
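To put rough numbers on it (a sketch, not from the original answer, assuming the 80/20 iris split used above, i.e. 120 training rows at BATCH_SIZE = 5):

# per-epoch optimizer-step count in each variant (hypothetical check based on the code above)
steps_manual = len(batch_start)                           # 24 steps per epoch (manual slicing)
steps_nested = len(batch_start) * len(train_dataloader)   # 24 * 24 = 576 steps per epoch (nested loops)
print(steps_manual, steps_nested)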

Also, you should be aware that the DataLoader itself adds some overhead. The first source is automatic collation (see collate_fn in the docs: https://pytorch.org/docs/stable/data.html). Automatic collation preprocesses the data (for example, converting numpy arrays to torch.Tensor and stacking them across the batch), and it does so recursively if the data is a nested dict. The second source is multiprocessing. You are not using it here, but multiprocessing in Python can be tricky, and spawning worker subprocesses is usually expensive. This is especially noticeable when you do not enable persistent_workers in a multi-process DataLoader: every time one pass over the DataLoader finishes and another begins, the worker subprocesses are re-created, adding extra overhead.
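As a side note (a sketch, not part of the original code): wrapping the tensors in a TensorDataset instead of list(zip(X_train, y_train)) gives the default collation less per-sample work to do, and num_workers / persistent_workers are the knobs that control the worker-process behaviour described above.

from torch.utils.data import DataLoader, TensorDataset

# index the tensors directly instead of a Python list of (x, y) tuples
train_ds = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,              # 0 = load in the main process, no worker subprocesses
    # persistent_workers=True,  # only valid with num_workers > 0; keeps workers alive across epochs
)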

(2) Another reason you could not reproduce the result: your manual version trains on a shuffled train/test split, but the batch order is deterministic (shuffling only happens at the split stage).

In the dataloader version you enabled shuffle=True, which means that in every epoch both the order and the actual composition of the batches are randomized, and that matters (see, for example, https://stats.stackexchange.com/questions/245502/why-should-we-shuffle-data-while-training-a-neural-network).
Now let's verify that the batch ordering matters: if you keep shuffle=True in the fixed dataloader example, the output becomes MSE = 0.06, RMSE = 0.25.
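If you do want shuffle=True but still need a reproducible batch order across runs, one option (a sketch, not from the original answer) is to pass a seeded torch.Generator to the DataLoader:

g = torch.Generator()
g.manual_seed(SEED)  # fixes the shuffling order produced by the sampler
train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=True,
                              batch_size=BATCH_SIZE, generator=g)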
