Paddle's strided-slice op is ~700x slower than torch.as_strided

6qftjkof · posted 2022-04-21

I benchmarked the as_strided-style op here and found that Paddle's op is much slower than PyTorch's.
Could you help check whether my usage is correct?
And could this op be optimized?

| seconds / slice op | cpu | gpu |
| --- | --- | --- |
| paddle static graph, Python loop around `exe.run()` | 0.00308440 | 0.00219843 |
| paddle static graph, single `exe.run()` with in-graph loop | 0.00297328 | 0.00014069 |
| paddle dynamic graph | 0.00164805 | 0.00013993 |
| torch | 0.00000357 | 0.00000306 |

```python
# Script to benchmark paddle.strided_slice against torch.as_strided.

import time

import numpy as np
import paddle
import paddle as P
import torch
from paddle import fluid
from tqdm import tqdm

print("paddle version ", paddle.__version__)
print("torch version ", torch.__version__)

loop = 1000
drop_loop = 50
outer_loop = 3

# shape = [100, 200, 768]

# shape = [100, 20, 8]

shape = [100, 200, 50]
target_shape = shape.copy()
for i, v in enumerate(target_shape):
    target_shape[i] = v//2

data_np = np.random.rand(*shape).astype(np.float32)  # input data for all tests
slice_np = data_np[0::2, 0::2, 0::2]  # reference result to validate against
INT_MAX = 99999  # sentinel "end" index, larger than any axis length

def for_loop_measure(func, loop, drop_loop, out_test=None):
    # Time func() over the steady-state iterations only: the first and the
    # last `drop_loop` iterations are excluded as warmup / cooldown.
    for i in tqdm(range(loop)):
        if i == drop_loop: start_time = time.time()
        out = func()
        if i == (loop - drop_loop): end_time = time.time()
    if out_test is not None: out_test(out)
    return (end_time - start_time) / (loop - 2 * drop_loop)
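
# NOTE: CUDA kernel launches are asynchronous in both frameworks, so the
# wall-clock timings from for_loop_measure mostly capture launch overhead on
# GPU. Calling torch.cuda.synchronize() (and Paddle's device synchronize, if
# the installed version provides one) right before each time.time() would
# give more faithful GPU numbers.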

def main_paddle_dynamic(place):
    data = P.to_tensor(data_np, place=place)
    def test_func():
        # Equivalent to data[::2, ::2, ::2] on every axis.
        slice = P.strided_slice(
            data,
            axes=list(range(len(shape))),
            starts=[0,] * len(shape),
            ends=[INT_MAX, ] * len(shape),
            strides=[2, ] * len(shape),
        )
        return slice
    def out_test(out):
        assert (out.numpy() == slice_np).all()
    result_time = for_loop_measure(test_func, loop, drop_loop, out_test=out_test)
    return result_time

def main_paddle_static(place):
    # mp, sp = fluid.Program(), fluid.Program()
    # data = fluid.Tensor()
    # data.set(data_np, place)
    mp, sp = fluid.Program(), fluid.Program()
    with fluid.program_guard(mp, sp):
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        loop_len = fluid.layers.fill_constant(shape=[1],dtype='int64', value=loop)
        cond = fluid.layers.less_than(x=i, y=loop_len)
        while_op = fluid.layers.While(cond=cond)
        data = fluid.data(name="data", shape=shape, dtype='float32')
        slice = fluid.layers.create_tensor(dtype="float32")
        init_slice = fluid.layers.fill_constant(shape=target_shape, value=0, dtype='float32')  # dtype must match `slice`
        fluid.layers.assign(init_slice, slice)
        with while_op.block():
            slice_tmp = fluid.layers.strided_slice(
                data,
                axes=list(range(len(shape))),
                starts=[0,] * len(shape),
                ends=[INT_MAX, ] * len(shape),
                strides=[2, ] * len(shape),
            )
            fluid.layers.assign(slice_tmp, slice)
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond) # update condition
    exe = fluid.Executor(place)
    exe.run(sp)
    start_time = time.time()
    slice_out, i_out = exe.run(
        mp,
        feed={'data': data_np},
        fetch_list=[slice, i],
    )
    end_time = time.time()
    assert (slice_out == slice_np).all() and i_out.item() == loop
    return (end_time - start_time) / loop
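
    # A single exe.run() drives the in-graph While over all `loop` iterations,
    # presumably amortizing executor and feed/fetch overhead; this corresponds
    # to the faster "single exe.run() with in-graph loop" row in the table above.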

    # Alternative kept for reference: time one exe.run() call per slice (the
    # "Python loop around exe.run()" row in the table above).
    # def test_func():
    #     slice_out, = exe.run(
    #         mp,
    #         feed={'data': data_np},
    #         fetch_list=[slice],
    #     )
    #     return slice_out
    # def out_test(out):
    #     assert (out == slice_np).all()
    # return for_loop_measure(test_func, loop, drop_loop, out_test=out_test)

def main_torch(device):
    data = torch.tensor(data_np, device=device)  # honor `device`: without it the "gpu" run times a CPU tensor
    stride = data.stride()
    target_stride = list(stride)
    target_shape = list(data.shape)
    for i in range(len(target_stride)):
        target_stride[i] = target_stride[i] * 2
        target_shape[i] = target_shape[i] // 2

    def test_func():
        slice = torch.as_strided(data, target_shape, target_stride)
        return slice

    def out_test(out):
        assert (out.cpu().numpy() == slice_np).all()

    return for_loop_measure(test_func, loop, drop_loop, out_test=out_test)
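
# NOTE: torch.as_strided only creates a view (fresh size/stride metadata over
# the same storage); no elements are copied. Paddle's strided_slice appears to
# materialize a new output tensor on every call, which by itself can explain a
# large constant-factor gap.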

if __name__ == "__main__":
    static_test = True
    # static_test = False
    if static_test:
        P.enable_static()
        print("use static paddle for test")

    main_paddle = main_paddle_static if static_test else main_paddle_dynamic

    # Four timers: torch cpu, paddle cpu, torch gpu, paddle gpu.
    # Note: torch runs on cuda:1 while paddle runs on cuda:2, so the two GPU
    # columns are not measured on the same device (assumes a multi-GPU box).
    time_list = [0, 0, 0, 0]
    for i in range(outer_loop + 1):
        time_list[0] += main_torch(torch.device("cpu"))
        time_list[1] += main_paddle(P.CPUPlace())
        time_list[2] += main_torch(torch.device("cuda:1"))
        time_list[3] += main_paddle(P.CUDAPlace(2))
        if i == 0: time_list = [0, 0, 0, 0]  # drop the first outer iteration as warmup

    for i, v in enumerate(time_list):
        time_list[i] = v / outer_loop

    print(f"torch(cpu):     {time_list[0]:0.8f} seconds")
    print(f"paddle(cpu):    {time_list[1]:0.8f} seconds")
    print(f"torch(cuda:1):  {time_list[2]:0.8f} seconds")
    print(f"paddle(cuda:2): {time_list[3]:0.8f} seconds")
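
A side note on why the gap is so large: `torch.as_strided` never touches the data; it only builds new size/stride metadata over the existing storage. Below is a minimal sketch (plain PyTorch, with shapes mirroring the script above) demonstrating that the result is a view sharing the base tensor's storage:

```python
# Demonstrate that torch.as_strided returns a view, not a copy.
import numpy as np
import torch

base = torch.tensor(np.random.rand(100, 200, 50).astype(np.float32))
view = torch.as_strided(
    base,
    [d // 2 for d in base.shape],    # halve every dimension
    [s * 2 for s in base.stride()],  # double every stride
)

assert view.data_ptr() == base.data_ptr()  # same storage: nothing was copied
view[0, 0, 0] = 42.0                       # a write through the view...
assert base[0, 0, 0].item() == 42.0        # ...is visible in the base tensor
```

If Paddle's `strided_slice` allocates and fills a fresh output tensor on every call (which the timings suggest, though I have not inspected the kernel), the benchmark compares a metadata update against a real copy; a fairer torch baseline would force materialization, e.g. `torch.as_strided(...).contiguous()`.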

qpgpyjmq #1

Hi! We've received your issue; please be patient while we respond. We will arrange for technicians to answer your questions as soon as possible. Please double-check that you have provided a clear problem description, reproduction code, environment & version, and error messages. You may also check the official API docs, the FAQ, the GitHub issue history, and the AI community for an answer. Have a nice day!


xqkwcwgp #2

@OleNet Thanks for the feedback; we will benchmark this on our side.
