I benchmarked the as_strided op here and found that Paddle's op is considerably slower than PyTorch's.
Could you help check whether my usage is correct?
And could this op be optimized?
seconds per slice op | cpu | gpu |
---|---|---|
paddle static graph, loop over exe.run() | 0.00308440 | 0.00219843 |
paddle static graph, While loop inside one exe.run() | 0.00297328 | 0.00014069 |
paddle dygraph | 0.00164805 | 0.00013993 |
torch | 0.00000357 | 0.00000306 |
```python
# Script to compare paddle.strided_slice against torch.as_strided.
import time

import numpy as np
import paddle
import paddle as P
import torch
from paddle import fluid
from tqdm import tqdm

print("paddle version ", paddle.__version__)
print("torch version ", torch.__version__)

loop = 1000      # slice calls per measurement
drop_loop = 50   # warm-up / cool-down iterations excluded from the timed window
outer_loop = 3   # outer repetitions that get averaged in __main__

# shape = [100, 200, 768]
# shape = [100, 20, 8]
shape = [100, 200, 50]
target_shape = [v // 2 for v in shape]  # every axis is strided by 2

data_np = np.random.rand(*shape).astype(np.float32)  # data to test
slice_np = data_np[0::2, 0::2, 0::2]                 # reference result
INT_MAX = 99999  # "end" sentinel large enough for every axis


def for_loop_measure(func, loop, drop_loop, out_test=None):
    """Call func `loop` times and time only the middle loop - 2 * drop_loop calls."""
    for i in tqdm(range(loop)):
        if i == drop_loop:
            start_time = time.time()
        out = func()
        if i == (loop - drop_loop):
            end_time = time.time()
    if out_test is not None:
        out_test(out)
    return (end_time - start_time) / (loop - 2 * drop_loop)


def main_paddle_dynamic(place):
    data = P.to_tensor(data_np, place=place)

    def test_func():
        return P.strided_slice(
            data,
            axes=list(range(len(shape))),
            starts=[0] * len(shape),
            ends=[INT_MAX] * len(shape),
            strides=[2] * len(shape),
        )

    def out_test(out):
        assert (out.numpy() == slice_np).all()

    return for_loop_measure(test_func, loop, drop_loop, out_test=out_test)


def main_paddle_static(place):
    # Build a program whose While block runs strided_slice `loop` times, so a
    # single exe.run() amortizes the executor's per-call overhead.
    mp, sp = fluid.Program(), fluid.Program()
    with fluid.program_guard(mp, sp):
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        loop_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=loop)
        cond = fluid.layers.less_than(x=i, y=loop_len)
        while_op = fluid.layers.While(cond=cond)
        data = fluid.data(name="data", shape=shape, dtype='float32')
        slice = fluid.layers.create_tensor(dtype="float32")
        # Bug fix: init_slice must be float32 to match `slice`, not float64.
        init_slice = fluid.layers.fill_constant(shape=target_shape, value=0, dtype='float32')
        fluid.layers.assign(init_slice, slice)
        with while_op.block():
            slice_tmp = fluid.layers.strided_slice(
                data,
                axes=list(range(len(shape))),
                starts=[0] * len(shape),
                ends=[INT_MAX] * len(shape),
                strides=[2] * len(shape),
            )
            fluid.layers.assign(slice_tmp, slice)
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond)  # update loop condition
    exe = fluid.Executor(place)
    exe.run(sp)
    start_time = time.time()
    slice_out, i_out = exe.run(
        mp,
        feed={'data': data_np},
        fetch_list=[slice, i],
    )
    end_time = time.time()
    assert (slice_out == slice_np).all() and i_out.item() == loop
    return (end_time - start_time) / loop
    # Alternative measurement (the "loop over exe.run()" row of the table):
    # def test_func():
    #     slice_out, = exe.run(mp, feed={'data': data_np}, fetch_list=[slice])
    #     return slice_out
    # def out_test(out):
    #     assert (out == slice_np).all()
    # return for_loop_measure(test_func, loop, drop_loop, out_test=out_test)


def main_torch(device):
    data = torch.tensor(data_np, device=device)  # bug fix: the original never moved data to `device`
    target_stride = [s * 2 for s in data.stride()]  # stride doubles on every axis
    target_shape = [s // 2 for s in data.shape]     # shape halves on every axis

    def test_func():
        return torch.as_strided(data, target_shape, target_stride)

    def out_test(out):
        assert (out.cpu().numpy() == slice_np).all()

    return for_loop_measure(test_func, loop, drop_loop, out_test=out_test)


if __name__ == "__main__":
    static_test = True
    # static_test = False
    if static_test:
        P.enable_static()
        print("use static paddle for test")
    main_paddle = main_paddle_static if static_test else main_paddle_dynamic
    # Four timers: torch cpu, paddle cpu, torch gpu, paddle gpu.
    # Note: torch uses cuda:1 while paddle uses CUDAPlace(2) here.
    time_list = [0, 0, 0, 0]
    for i in range(outer_loop + 1):
        time_list[0] += main_torch(torch.device("cpu"))
        time_list[1] += main_paddle(P.CPUPlace())
        time_list[2] += main_torch(torch.device("cuda:1"))
        time_list[3] += main_paddle(P.CUDAPlace(2))
        if i == 0:
            time_list = [0, 0, 0, 0]  # drop the first (warm-up) round
    time_list = [v / outer_loop for v in time_list]
    print(f"torch(cpu): {time_list[0]:0.8f} seconds")
    print(f"paddle(cpu): {time_list[1]:0.8f} seconds")
    print(f"torch(cuda:1): {time_list[2]:0.8f} seconds")
    print(f"paddle(cuda:2): {time_list[3]:0.8f} seconds")
```
2 Answers

qpgpyjmq 1#
Hi! We've received your issue; please be patient while we respond. We will arrange technicians to answer your questions as soon as possible. Please make sure that you have posted enough information to demonstrate your request. You may also check out the API docs, FAQ, GitHub Issues and the AI community to get an answer. Have a nice day!
xqkwcwgp 2#
@OleNet Thanks for the feedback; we'll benchmark this on our side.
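As a possible workaround, and only an assumption on my part: newer Paddle releases (2.5+, if I recall the stride mechanism correctly) ship a `paddle.as_strided` view op with a `(x, shape, stride)`-style signature. If your build has it, it should be the closer equivalent of `torch.as_strided`; on older builds this sketch will simply fail at the `paddle.as_strided` call:

```python
import numpy as np
import paddle

data = paddle.to_tensor(np.random.rand(100, 200, 50).astype(np.float32))

# Contiguous element strides computed by hand from the shape, so the sketch
# does not depend on any Tensor stride attribute.
stride = [1]
for s in reversed(list(data.shape)[1:]):
    stride.insert(0, stride[0] * s)

target_shape = [s // 2 for s in data.shape]
target_stride = [s * 2 for s in stride]

# paddle.as_strided (assumed available in Paddle 2.5+) returns a view, no copy.
view = paddle.as_strided(data, target_shape, target_stride)
assert (view.numpy() == data.numpy()[0::2, 0::2, 0::2]).all()
```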