使用ctype执行矩阵乘法的正确方法是什么?
在我当前的实现中,数据来回传输消耗了大量时间,有没有什么方法可以优化它?通过传递数组地址并返回指针,而不是使用.contents
方法生成整个数组。
cpp_函数.cpp
使用g++ -shared -fPIC cpp_function.cpp -o cpp_function.so
编译
#include <iostream>
extern "C" {
double* mult_matrix(double *a1, double *a2, size_t a1_h, size_t a1_w,
size_t a2_h, size_t a2_w, int size)
{
double* ret_arr = new double[size];
for(size_t i = 0; i < a1_h; i++){
for (size_t j = 0; j < a2_w; j++) {
double val = 0;
for (size_t k = 0; k < a2_h; k++){
val += a1[i * a1_h + k] * a2[k * a2_h +j] ;
}
ret_arr[i * a1_h +j ] = val;
// printf("%f ", ret_arr[i * a1_h +j ]);
}
// printf("\n");
}
return ret_arr;
}
}
用于调用so文件的Python文件main.py
import ctypes
import numpy
from time import time
libmatmult = ctypes.CDLL("./cpp_function.so")
ND_POINTER_1 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
ndim=2,
flags="C")
ND_POINTER_2 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
ndim=2,
flags="C")
libmatmult.mult_matrix.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_size_t, ctypes.c_size_t]
def mult_matrix_cpp(a,b):
shape = a.shape[0] * a.shape[1]
libmatmult.mult_matrix.restype = ctypes.POINTER(ctypes.c_double * shape )
ret_cpp = libmatmult.mult_matrix(a, b, *a.shape, *b.shape , a.shape[0] * a.shape[1])
out_list_c = [i for i in ret_cpp.contents] # <---- regenrating list which is time consuming
return out_list_c
size_a = (300,300)
size_b = size_a
a = numpy.random.uniform(low=1, high=255, size=size_a)
b = numpy.random.uniform(low=1, high=255, size=size_b)
t2 = time()
out_cpp = mult_matrix_cpp(a,b)
print("cpp time taken:{:.2f} ms".format((time() - t2) * 1000))
out_cpp = numpy.array(out_cpp).reshape(size_a[0], size_a[1])
t3 = time()
out_np = numpy.dot(a,b)
# print(out_np)
print("Numpy dot() time taken:{:.2f} ms".format((time() - t3) * 1000))
此解决方案有效,但非常耗时有没有办法加快速度?
1条答案
按热度按时间0md85ypi1#
浪费时间的一个原因是没有使用
ndpointer
作为返回值并将其复制到Python列表中。而是使用下面的restype
。您也不需要后面的reshape
。但是请接受评论者的建议,不要重复发明轮子。