Describe the Bug
Version: develop 76b328b
The results are correct in CUDA mode, but wrong in CPU mode.
slice/transpose/split now take the STRIDED layout path by default, and there is no effective switch to control this.
FLAGS_use_stride_kernel is not exported externally.
Packaged reproduction code
Link: https://pan.baidu.com/s/1zvenzoS8iMdHnPcOpoV9vg?pwd=h54g  Extraction code: h54g
Reproduction code main.cxx
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "check_helper.h"
int slice_test() {
  // Build a CPU tensor and fill it with random data.
  auto x_cpu = paddle::experimental::empty(
      {5, 52, 312}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));
  // Slice axis 1 to the range [0, 1) on both CPU and GPU.
  std::vector<int64_t> axes = {1};
  std::vector<int64_t> starts = {0};
  std::vector<int64_t> ends = {1};
  auto out_cpu =
      paddle::experimental::slice(x_cpu, axes, starts, ends, {}, {1});
  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda =
      paddle::experimental::slice(x_cuda, axes, starts, ends, {}, {1});
  auto out_cuda_map =
      paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);
  // Compare the CPU result against the GPU result element by element.
  auto ret = true;
  for (int i = 0; i < out_cpu.numel(); i++) {
    if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                 out_cuda_map.data<float>()[i])) {
      ret = false;
    }
  }
  return ret;
}
int transpose_test() {
  // Build a CPU tensor and fill it with random data.
  auto x_cpu = paddle::experimental::empty(
      {3, 512, 9, 26}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));
  // Transpose with permutation {0, 2, 1, 3} on both CPU and GPU.
  auto out_cpu = paddle::experimental::transpose(x_cpu, {0, 2, 1, 3});
  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda = paddle::experimental::transpose(x_cuda, {0, 2, 1, 3});
  auto out_cuda_map =
      paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);
  // Compare the CPU result against the GPU result element by element.
  auto ret = true;
  for (int i = 0; i < out_cpu.numel(); i++) {
    if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                 out_cuda_map.data<float>()[i])) {
      ret = false;
    }
  }
  return ret;
}
int split_test() {
  // Build a CPU tensor and fill it with random data.
  auto x_cpu = paddle::experimental::empty(
      {128, 55, 256}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));
  // Split axis 1 into sections {1, 2, ..., 10} on both CPU and GPU.
  auto out_cpu_vec =
      paddle::experimental::split(x_cpu, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1);
  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda_vec =
      paddle::experimental::split(x_cuda, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1);
  // Compare each output tensor pair element by element.
  auto ret = true;
  for (size_t tensor_index = 0; tensor_index < out_cuda_vec.size();
       tensor_index++) {
    auto &out_cuda = out_cuda_vec[tensor_index];
    auto &out_cpu = out_cpu_vec[tensor_index];
    auto out_cuda_map =
        paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);
    for (int i = 0; i < out_cpu.numel(); i++) {
      if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                   out_cuda_map.data<float>()[i])) {
        ret = false;
      }
    }
  }
  return ret;
}
int main(int argc, char **argv) {
  paddle::framework::InitMemoryMethod();
  paddle::framework::InitDevices();
  int slice_result = slice_test();
  int transpose_result = transpose_test();
  int split_result = split_test();
  if (!slice_result) {
    std::cout << "slice error" << std::endl;
  }
  if (!transpose_result) {
    std::cout << "transpose error" << std::endl;
  }
  if (!split_result) {
    std::cout << "split error" << std::endl;
  }
  return EXIT_SUCCESS;
}
Header check_helper.h
#pragma once
#include <cstdlib>  // rand, RAND_MAX
#include <iostream>
#include <vector>
class CheckHelper {
public:
  // Fill a vector of `num` values with uniform random numbers in [min_value, max_value].
  template <class T>
  static std::vector<T> genRandNumber(size_t num, T min_value, T max_value) {
    std::vector<T> ret(num);
    for (size_t i = 0; i < num; i++) {
      ret[i] = static_cast<T>(static_cast<float>(rand()) / float(RAND_MAX) *
                                  (max_value - min_value) +
                              min_value);
    }
    return ret;
  }
  // Print the pair and return false when the two values differ.
  template <class T> static bool checkValue(T a, T b) {
    if (a != b) {
      std::cout << "different value " << a << "/" << b << std::endl;
      return false;
    }
    return true;
  }
};
Additional Supplementary Information
No response
5 Answers
ngynwnxp1#
@jzhang533
slmsl1lt2#
Temporary workaround: disable stride.
phi::GetExportedFlagInfoMap can be used to forcibly modify the gflags global variable.
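For reference, a minimal sketch of that workaround. It assumes the exported flag map is keyed by "use_stride_kernel" and that each entry exposes a writable value_ptr pointing at the bool flag storage; verify the header path and field names against your Paddle source tree before relying on this.
// Hedged sketch: force FLAGS_use_stride_kernel to false through the exported
// flag map before any slice/transpose/split call is made.
#include "paddle/phi/core/flags.h"  // assumed location of GetExportedFlagInfoMap

void DisableStrideKernels() {
  const auto &flags = phi::GetExportedFlagInfoMap();
  auto it = flags.find("use_stride_kernel");  // assumption: key has no FLAGS_ prefix
  if (it != flags.end()) {
    // Assumption: value_ptr is a raw pointer to the bool flag's storage.
    *static_cast<bool *>(it->second.value_ptr) = false;
  }
}
Calling this once after InitDevices() should make slice/transpose/split fall back to the non-strided kernels, which sidesteps the comparison problem described below.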
velaa5lx3#
auto out_cpu = paddle::experimental::slice(x_cpu, axes, starts, ends, {}, {1});
In strided mode, the out_cpu returned by this call shares memory with x_cpu; the offset is described by strides, so out_cpu is a non-contiguous Tensor.
auto out_cuda = paddle::experimental::slice(x_cuda, axes, starts, ends, {}, {1});
Likewise, in strided mode, the out_cuda returned by this call shares device memory with x_cuda; the offset is described by strides, so out_cuda is a non-contiguous Tensor.
However:
auto out_cuda_map = paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);
During copy_to, the framework checks whether out_cuda is a contiguous Tensor. If it is not, out_cuda is first converted into a contiguous Tensor (so it no longer shares device memory with x_cuda) and is then copied to the CPU as out_cuda_map.
As a result, out_cpu is a non-contiguous Tensor while out_cuda_map is a contiguous one; their buffer sizes differ, so the result can no longer be read element-wise through out_cpu.data<float>()[i].
You can run:
auto out_cpu_map = paddle::experimental::copy_to(out_cpu, phi::CPUPlace(), true);
to obtain a contiguous copy of out_cpu; comparing against that copy gives correct results.
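Applied to the slice_test() above, the corrected comparison would look roughly like this (the same change applies to transpose_test() and split_test()):
// Materialize the strided out_cpu view as a contiguous tensor before indexing.
auto out_cpu_map =
    paddle::experimental::copy_to(out_cpu, phi::CPUPlace(), true);
for (int i = 0; i < out_cpu_map.numel(); i++) {
  if (!CheckHelper::checkValue(out_cpu_map.data<float>()[i],
                               out_cuda_map.data<float>()[i])) {
    ret = false;
  }
}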
5t7ly7z54#
@wanghuancoder My input has a contiguous NCHW memory layout. Why must it be forcibly converted to a STRIDED layout, with the output also produced in STRIDED layout?
STRIDED layout = func(NCHW layout)
Isn't that logically unreasonable?
oo7oh9g95#
@wanghuancoder
Moreover, out_cpu.layout() also returns NCHW,
so there is no way to tell whether the tensor is STRIDED.
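A possible check, under the assumption that the develop branch's C++ paddle::Tensor API exposes is_contiguous() and contiguous() alongside the stride support (treat both method names as assumptions to verify): layout() only reports the logical data layout, while contiguity would reveal the strided view.
// Assumption: paddle::Tensor provides is_contiguous()/contiguous() on develop.
if (!out_cpu.is_contiguous()) {
  out_cpu = out_cpu.contiguous();  // materialize the strided view into dense memory
}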