Paddle [OP]新的STRIDED布局算子slice/transpose/split CPU模式结果出错

oiopk7p5  于 4个月前  发布在  其他
关注(0)|答案(5)|浏览(56)

bug描述 Describe the Bug

版本 develop 76b328b

Cuda模式下结果正确,但是CPU模式结果出错

现在slice/transpose/split 会默认走STRIDED布局,且没有有效任何控制开关。
FLAGS_use_stride_kernel 外部无导出

整合包复现代码
链接: https://pan.baidu.com/s/1zvenzoS8iMdHnPcOpoV9vg?pwd=h54g 提取码: h54g

复现代码main.cxx

#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/device_manager.h"

#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

#include "check_helper.h"

int slice_test() {
  auto x_cpu = paddle::experimental::empty(
      {5, 52, 312}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));
  std::vector<int64_t> axes = {1};
  std::vector<int64_t> starts = {0};
  std::vector<int64_t> ends = {1};
  auto out_cpu =
      paddle::experimental::slice(x_cpu, axes, starts, ends, {}, {1});

  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda =
      paddle::experimental::slice(x_cuda, axes, starts, ends, {}, {1});
  auto out_cuda_map =
      paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);

  auto ret = true;
  for (int i = 0; i < out_cpu.numel(); i++) {
    if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                 out_cuda_map.data<float>()[i])) {
      ret = false;
    }
  }
  return ret;
}

int transpose_test() {
  auto x_cpu = paddle::experimental::empty(
      {3, 512, 9, 26}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));

  auto out_cpu = paddle::experimental::transpose(x_cpu, {0, 2, 1, 3});

  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda = paddle::experimental::transpose(x_cuda, {0, 2, 1, 3});
  auto out_cuda_map =
      paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);

  auto ret = true;
  for (int i = 0; i < out_cpu.numel(); i++) {
    if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                 out_cuda_map.data<float>()[i])) {
      ret = false;
    }
  }
  return ret;
}

int split_test() {
  auto x_cpu = paddle::experimental::empty(
      {128, 55, 256}, paddle::DataType::FLOAT32, phi::CPUPlace());
  auto data = CheckHelper::genRandNumber<float>(x_cpu.numel(), -1.f, 1.f);
  memcpy(x_cpu.data<float>(), data.data(), data.size() * sizeof(float));

  auto out_cpu_vec =
      paddle::experimental::split(x_cpu, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1);

  auto x_cuda = paddle::experimental::copy_to(x_cpu, phi::GPUPlace(), true);
  auto out_cuda_vec =
      paddle::experimental::split(x_cuda, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1);
  auto ret = true;

  for (int tensor_index = 0; tensor_index < out_cuda_vec.size();
       tensor_index++) {
    auto &out_cuda = out_cuda_vec[tensor_index];
    auto &out_cpu = out_cpu_vec[tensor_index];
    auto out_cuda_map =
        paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);

    for (int i = 0; i < out_cpu.numel(); i++) {
      if (!CheckHelper::checkValue(out_cpu.data<float>()[i],
                                   out_cuda_map.data<float>()[i])) {
        ret = false;
      }
    }
  }
  return ret;
}

int main(int argc, char **argv) {
  paddle::framework::InitMemoryMethod();
  paddle::framework::InitDevices();

  int slice_result = slice_test();
  int transpose_result = transpose_test();
  int split_result = split_test();

  if (!slice_result) {
    std::cout << "slice error" << std::endl;
  }

  if (!transpose_result) {
    std::cout << "transpose error" << std::endl;
  }

  if (!split_result) {
    std::cout << "split error" << std::endl;
  }

  return EXIT_SUCCESS;
}

头文件check_helper.h

#pragma once

#include <iostream>
#include <vector>

class CheckHelper {
public:
  template <class T>
  static std::vector<T> genRandNumber(size_t num, T min_value, T max_value) {
    std::vector<T> ret(num);
    for (int i = 0; i < num; i++) {
      ret[i] = static_cast<T>((float)rand() / float(RAND_MAX) *
                                  (max_value - min_value) +
                              min_value);
    }
    return ret;
  }

  template <class T> static bool checkValue(T a, T b) {
    if (a != b) {
      std::cout << "different value" << a << "/" << b << std::endl;
      return false;
    } else {
      return true;
    }
  }
};

其他补充信息 Additional Supplementary Information

No response

slmsl1lt

slmsl1lt2#

临时解决方法 禁用stride
phi::GetExportedFlagInfoMap 可以强制修改gflags的全局变量

#include "paddle/phi/core/flags.h"
int main() {
    auto map = phi::GetExportedFlagInfoMap();
    auto* use_stride_kernel_flag = (bool*)map["use_stride_kernel"].value_ptr;
    *use_stride_kernel_flag = false;

   // code
}
velaa5lx

velaa5lx3#

auto out_cpu = paddle::experimental::slice(x_cpu, axes, starts, ends, {}, {1});
在strided模式下,这句话返回的out_cpu和x_cpu共享内存。通过stride描述偏移位置。out_cpu是一个不连续的Tensor。

auto out_cuda = paddle::experimental::slice(x_cuda, axes, starts, ends, {}, {1});
同样,在strided模式下,这句话返回的out_cuda和x_cuda共享显存。通过stride描述偏移位置。out_cuda是一个不连续的Tensor。
但是:
auto out_cuda_map = paddle::experimental::copy_to(out_cuda, phi::CPUPlace(), true);
copy_to时,会检查out_cuda是不是连续的Tensor,如果不是,则先把out_cuda重新转换为一个连续的Tensor,即再和x_cuda贡献显存。然后再拷贝到CPU给到out_cuda_map。

因此,out_cpu和out_cuda_map,一个是不连续的Tensor,一个是连续的Tensor。内存大小也是不一样的。out_cpu不能通过out_cpu.data()[i]来访问result了。
你可以执行:
auto out_cpu_map = paddle::experimental::copy_to(out_cpu, phi::CPUPlace(), true);
来获得一个out_cpu的连续的Tensor,再做比较就是对的了。

5t7ly7z5

5t7ly7z54#

@wanghuancoder 我输入的是NCHW连续内存布局,为什么一定要强制转为STRIDED布局呢?并以STRIDED布局输出?
STRIDED的布局 = func(NCHW布局)
这在逻辑上是不是不合理?

oo7oh9g9

oo7oh9g95#

@wanghuancoder
而且 out_cpu.layout() 返回的值也是NCHW
无法判断是不是STRIDED

相关问题