Paddle backward pass error: loss.backward() Segmentation fault: 11

Asked by iqih9akk on 2021-11-30

The forward pass of the network runs fine and produces a loss: Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
[106.43949890])
The backward pass loss.backward() fails with Segmentation fault: 11, and no other error message is printed.
Using the C++ logs I can locate the specific op that crashes, but when I work around that op some other way, a different op fails instead.

Environment:
Paddle 2.0
GPU environment, NVIDIA Corporation Device 1023m, CUDA 10.0.

nr9pn0ug 1#

Could you share the code and environment that reproduce the problem?

ybzsozfc 2#

import paddle
import numpy as np
import paddle.nn as nn
import paddle.nn.functional as F

paddle.set_device('gpu')

class MolTreeNode(object):
    def __init__(self, idx, nid, wid):
        self.idx = idx
        self.nid = nid
        self.wid = wid
        self.neighbors = []

class Tree(object):
    def __init__(self, nodes):
        self.nodes = nodes

def GRU(x, h_nei, W_z, W_r, U_r, W_h):
    hidden_size = x.shape[-1]
    sum_h = paddle.sum(h_nei, axis=1)
    z_input = paddle.concat([x, sum_h], axis=1)
    z = F.sigmoid(W_z(z_input))

    r_1 = paddle.reshape(W_r(x), shape=[-1, 1, hidden_size])
    r_2 = U_r(h_nei)
    r = F.sigmoid(r_1 + r_2)

    gated_h = r * h_nei
    sum_gated_h = paddle.sum(gated_h, axis=1)
    h_input = paddle.concat([x, sum_gated_h], axis=1)
    pre_h = F.tanh(W_h(h_input))
    new_h = (1.0 - z) * sum_h + z * pre_h
    return new_h

pred_loss = nn.CrossEntropyLoss(reduction='sum')
stop_loss = nn.BCEWithLogitsLoss(reduction='sum')
latent_size = 28
hidden_size = 450
MAX_NB = 15
embedding = nn.Embedding(531, hidden_size)
W_z = nn.Linear(2 * hidden_size, hidden_size)
U_r = nn.Linear(hidden_size, hidden_size, bias_attr=False)
W_r = nn.Linear(hidden_size, hidden_size)
W_h = nn.Linear(2 * hidden_size, hidden_size)
W = nn.Linear(hidden_size + latent_size, hidden_size)

# Stop Prediction Weights

U = nn.Linear(hidden_size + latent_size, hidden_size)
U_i = nn.Linear(2 * hidden_size, hidden_size)

# Output Weights

W_o = nn.Linear(hidden_size, 531)
U_o = nn.Linear(hidden_size, 1)

def dfs(stack, x, fa_idx):
    for y in x.neighbors:
        if y.idx == fa_idx: continue
        stack.append((x, y, 1))
        dfs(stack, y, x.idx)
        stack.append((y, x, 0))

def aggregate(hiddens, contexts, x_tree_vecs, mode):
    if mode == 'word':
        V, V_o = W, W_o
    elif mode == 'stop':
        V, V_o = U, U_o
    else:
        raise ValueError('aggregate mode is wrong')

    tree_contexts = paddle.index_select(axis=0, index=contexts, x=x_tree_vecs)
    input_vec = paddle.concat([hiddens, tree_contexts], axis=-1)
    output_vec = F.relu(V(input_vec))
    return V_o(output_vec)

n1 = MolTreeNode(0, 1, 133)
n2 = MolTreeNode(1, 2, 505)
n3 = MolTreeNode(2, 3, 133)
n1.neighbors.append(n2)
n2.neighbors.extend([n1, n3])
n3.neighbors.append(n2)

import copy

tree1 = Tree([n1, n2, n3])
tree2 = copy.deepcopy(tree1)
batches = [[tree1], [tree2]]

x_tree_vecs = paddle.randn([1, latent_size])
x_tree_vecs.stop_gradient = False

def forward(mol_batch, x_tree_vecs):
    global pred_loss, stop_loss, latent_size, hidden_size, MAX_NB, embedding, W_z, U_r, W_r, W_h, W, U, U_i, W_o, U_o
    pred_hiddens, pred_contexts, pred_targets = [], [], []

    traces = []
    for mol_tree in mol_batch:
        s = []
        dfs(s, mol_tree.nodes[0], -1)
        traces.append(s)
        for node in mol_tree.nodes:
            node.neighbors = []
    # Predict Root
    batch_size = len(mol_batch)

    pred_hiddens.append(paddle.zeros([len(mol_batch), hidden_size]))
    pred_targets.extend([mol_tree.nodes[0].wid for mol_tree in mol_batch])
    pred_contexts.append(paddle.to_tensor(list(range(batch_size))))

    max_iter = max([len(tr) for tr in traces])
    padding = paddle.zeros([hidden_size])
    padding.stop_gradient = True
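    # Note: `padding` is created with stop_gradient=True and is later stacked together
    # with trainable hidden states via paddle.stack; the maintainer reply further down
    # points to exactly this mix of inputs as the trigger for the stack_grad crash.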
    h = {}
    # print('max_iter', max_iter)

    for t in range(max_iter):
        prop_list = []
        batch_list = []
        for i, plist in enumerate(traces):
            if t < len(plist):
                prop_list.append(plist[t])
                batch_list.append(i)

        cur_x = []
        cur_h_nei, cur_o_nei = [], []

        for node_x, real_y, _ in prop_list:
            # Neighbors for message passing (target not included)
            cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors if node_y.idx != real_y.idx]
            pad_len = MAX_NB - len(cur_nei)
            cur_h_nei.extend(cur_nei)
            cur_h_nei.extend([padding] * pad_len)
            # Neighbors for stop prediction (all neighbors)
            cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
            pad_len = MAX_NB - len(cur_nei)

            # Current clique embedding
            cur_x.append(node_x.wid)
        # Clique embedding
        cur_x = paddle.to_tensor(cur_x)
        cur_x = embedding(cur_x)
        # Message passing
        cur_h_nei = paddle.reshape(paddle.stack(cur_h_nei, axis=0), shape=[-1, MAX_NB, hidden_size])
        new_h = GRU(cur_x, cur_h_nei, W_z, W_r, U_r, W_h)
        # Gather targets
        pred_list = []
        # stop_target = []
        for i, m in enumerate(prop_list):
            node_x, node_y, direction = m
            x, y = node_x.idx, node_y.idx
            h[(x, y)] = new_h[i]
            node_y.neighbors.append(node_x)
            if direction == 1:
                pred_list.append(i)

        # Hidden states for stop prediction
        cur_batch = paddle.to_tensor((batch_list))
        # Hidden states for clique prediction
        if len(pred_list) > 0:
            batch_list = [batch_list[i] for i in pred_list]
            cur_batch = paddle.to_tensor(batch_list)
            pred_contexts.append(cur_batch)

            cur_pred = paddle.to_tensor(pred_list)
            pred_hiddens.append(paddle.index_select(axis=0, index=cur_pred, x=new_h))
            # pred_targets.extend(pred_target)
    # Predict next clique
    pred_contexts = paddle.concat(pred_contexts, axis=0)
    pred_hiddens = paddle.concat(pred_hiddens, axis=0)
    pred_scores = aggregate(pred_hiddens, pred_contexts, x_tree_vecs, 'word')
    tmp = paddle.sum(pred_scores)
    return tmp

for i, batch in enumerate(batches):
    loss = forward(batch, x_tree_vecs)
    print('i:%s, loss:%s' % (i, loss))
    loss.backward()

Run output:

i:0, loss:Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
       [9.87923050])
Traceback (most recent call last):
  File "xxx.py", line 183, in <module>
    loss = forward(batch, x_tree_vecs)
  File "xxx.py", line 109, in forward
    pred_contexts.append(paddle.to_tensor(list(range(batch_size))))
  File "<decorator-gen-118>", line 2, in to_tensor
  File "/home/bio/hedonglong/paddle2.0/lib/python3.7/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__
    return wrapped_func(*args,**kwargs)
  File "/home/bio/hedonglong/paddle2.0/lib/python3.7/site-packages/paddle/fluid/framework.py", line 225, in __impl__
    return func(*args,**kwargs)
  File "/home/bio/hedonglong/paddle2.0/lib/python3.7/site-packages/paddle/tensor/creation.py", line 179, in to_tensor
    stop_gradient=stop_gradient)
OSError: (External)  Cuda error(77), an illegal memory access was encountered.
  [Advise: The device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. ] (at /paddle/paddle/fluid/platform/gpu_info.cc:301)

The first forward and backward passes run fine; the error is raised on the second forward pass.
A correction: the error above occurs in the GPU environment, while the problem described in the title occurs in the CPU environment.

piwo6bdm 3#

OK, we will reproduce and locate the issue. Thanks a lot for the feedback.

nwlqm0z1 4#

From the logs, the crash happens in the stack_grad op. Its implementation does not check its inputs properly: it is invoked without first verifying whether the output is a null pointer, which leads to the segmentation fault.

On one hand, we need to strengthen the input checks here, at the very least raising a proper error that says the output is empty.

On the other hand, you are most likely hitting this because some of the forward inputs to stack have stop_gradient=True, so no backward variables are created for them and the backward variable here is empty. Please check which tensors feed into your stack call and whether their stop_gradient attribute is set to False; if not, you need to set tensor.stop_gradient = False. Note that this attribute defaults to True for tensors created with paddle.to_tensor.
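
For illustration, a minimal standalone sketch of that audit and workaround (the tensor names are made up for the example and do not come from the reproduction script):

import paddle

hidden = paddle.randn([4, 8])            # trainable branch
hidden.stop_gradient = False
pad = paddle.zeros([4, 8])               # factory tensors default to stop_gradient=True

for name, t in [('hidden', hidden), ('pad', pad)]:
    print(name, 'stop_gradient =', t.stop_gradient)   # audit every input to stack

pad.stop_gradient = False                # the suggested workaround

out = paddle.stack([hidden, pad], axis=0)
loss = paddle.sum(out)
loss.backward()                          # backward no longer sees an empty gradient for pad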

p1iqtdky 5#

Hi, in my code it is intentional that some of the tensors fed into stack carry no gradient. The same logic runs without error in torch.

ddhy6vgd 6#

As an emergency fix, you can work around the problem by setting stop_gradient. At the same time we will harden the op implementation and improve the error message, so that the code also runs correctly without setting stop_gradient. Once the fix lands we will share the PR details here.
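
Applied to the reproduction script above, that emergency workaround amounts to flipping one flag (a sketch, assuming the zero padding tensor is the only stop_gradient=True input that reaches paddle.stack):

padding = paddle.zeros([hidden_size])
padding.stop_gradient = False   # instead of True, so every stack input produces a backward variable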
