博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
C++ vs Python向量运算速度评测
阅读量:6081 次
发布时间:2019-06-20

本文共 11292 字,大约阅读时间需要 37 分钟。

本文的起源来自最近一个让我非常不爽的事。

我最近在改一个开源RNN工具包currennt(http://sourceforge.net/projects/currennt/),想用它实现RNNLM功能。

currennt使用了大量的面向对象的编程技巧,可以使用GPU,向量运算使用了thrust库(https://code.google.com/p/thrust/)。

RNNLM(http://rnnlm.org/)也有相应开源实现,非常算法风格的代码,向量运算就是自己使用数组实现的。

结果……大出我的意料,在不使用GPU的情况下,currennt慢成狗!我不断地修改,直到最后几乎完全在currennt里重写了一个RNNLM……速度才终于一致了。这花费了我大量时间,最关键的是我根本没打算花这些时间,算是计划外开销。

所以这里干脆对常用的几种向量运算做个评测,下回遇到至少心里有数。


参与评测的向量实现包括:

  1. C++ array
  2. C++ STL vector
  3. C++ thrust(CPU)
  4. C++ thrust(GPU)
  5. python
  6. python numpy

评测指标包括:

  • 创建、填充向量
  • 向量点乘,相乘
  • 矩阵相乘

测试环境:

Intel Xeon CPU E5649@2.53GHz x24

VS2010

python 2.7.6 (32bit)

thrust v1.5

numpy 1.8.1


C++ array

创建全0向量:0.000s,几乎不占用时间

int vector_size=100000000;float* vector=(float*)calloc(vector_size,sizeof(float));

创建+填充向量:0.140s

int vector_size=100000000;float* vector=(float*)calloc(vector_size,sizeof(float));for (int i=0;i

向量点乘:0.390s

float sum=0;for(int i=0;i

向量相乘:0.265s

float sum=0;for(int i=0;i

矩阵乘向量:0.344s

int matrix1_colnum=50000;int matrix1_rownum=2000;int matrix1_size=matrix1_colnum*matrix1_rownum;float* vector1=(float*)calloc(matrix1_size,sizeof(float));for (int i=0;i

矩阵乘矩阵:0.749s

(耗费时间与matrix1_rownum*matrix1_colnum*matrix2_colnum成正比)

int matrix1_rownum=200;int matrix1_colnum=5000;int matrix1_size=matrix1_colnum*matrix1_rownum;float* vector1=(float*)calloc(matrix1_size,sizeof(float));for (int i=0;i

C++ STL vector

创建全0向量:0.140s

int vect_size=100000000;
vector<float> vector(vect_size);

创建+填充向量:0.140s

int vect_size=100000000;
vector<float> vector(vect_size,0.01);

向量点乘:0.375s

int vect_size=100000000;vector
vector1(vect_size,0.01);vector
vector2(vect_size,0.02);start_t=clock();float sum=0;for(int i=0;i

向量相乘:0.250s

int vect_size=100000000;vector
vector1(vect_size,0.01);vector
vector2(vect_size,0.02);vector
vector3(vect_size);start_t=clock();for(int i=0;i

矩阵乘向量:0.390s

int matrix1_colnum=50000;int matrix1_rownum=2000;int matrix1_size=matrix1_colnum*matrix1_rownum;vector
vector1(matrix1_size,0.01);vector
vector2(matrix1_colnum,0.02);vector
vector3(matrix1_rownum);start_t=clock();for(int row=0;row

矩阵乘法:0.827s

int matrix1_rownum=200;int matrix1_colnum=5000;int matrix1_size=matrix1_colnum*matrix1_rownum;vector
vector1(matrix1_size,0.01);int matrix2_rownum=5000;int matrix2_colnum=200;int matrix2_size=matrix2_rownum*matrix2_colnum;vector
vector2(matrix2_size,0.02);int matrix3_size=matrix1_rownum*matrix2_colnum;vector
vector3(matrix3_size);start_t=clock();for(int row1=0;row1

C++ thrust(CPU)

创建全0向量:0.140s

int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size);

创建+填充向量:0.140s

int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,0.01);

填充向量:0.078s

thrust::fill(vector1.begin(),vector1.end(),0.01);

向量点乘:0.359s

int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
thrust::host_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::multiplies<float>());
end_t=clock();

向量相乘:0.187s

int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
thrust::host_vector<float> vector3(vect_size);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
end_t=clock();

矩阵乘向量:0.110s

struct matrixXvect_func{	thrust::host_vector
* matrix; thrust::host_vector
* vector; int matrix_rownum; int matrix_colnum; __host__ __device__ float operator()(const int& idx) const{ float t=0; for(int col=0;col
vector1(matrix1_size,(float)0.1);thrust::host_vector
vector2(matrix1_colnum,(float)0.2);thrust::host_vector
vector3(matrix1_rownum);start_t=clock();matrixXvect_func fn;fn.matrix=&vector1;fn.vector=&vector2;fn.matrix_rownum=matrix1_rownum;fn.matrix_colnum=matrix1_colnum;thrust::transform( thrust::counting_iterator
(0), thrust::counting_iterator
(0) + matrix1_rownum, vector3.begin(), fn );end_t=clock();

矩阵乘矩阵:0.655s

struct matrixXmatrix_func{	thrust::host_vector
* matrix1; thrust::host_vector
* matrix2; int matrix1_rownum; int matrix1_colnum; int matrix2_rownum; int matrix2_colnum; __host__ __device__ float operator()(const int& idx) const{ int rownum=idx/matrix2_colnum; int colnum=idx%matrix2_colnum; float t=0; for(int col=0;col
vector1(matrix1_size,(float)0.1);int matrix2_rownum=5000;int matrix2_colnum=200;int matrix2_size=matrix2_rownum*matrix2_colnum;thrust::host_vector
vector2(matrix2_size,(float)0.2);int matrix3_size=matrix1_rownum*matrix2_colnum;thrust::host_vector
vector3(matrix3_size);start_t=clock();matrixXmatrix_func fn;fn.matrix1=&vector1;fn.matrix2=&vector2;fn.matrix1_rownum=matrix1_rownum;fn.matrix1_colnum=matrix1_colnum;fn.matrix2_rownum=matrix2_rownum;fn.matrix2_colnum=matrix2_colnum;thrust::transform( thrust::counting_iterator
(0), thrust::counting_iterator
(0) + matrix3_size, vector3.begin(), fn );end_t=clock();

C++ thrust(GPU)

创建全0向量:0.140s

 

int vect_size=1000000;
thrust::device_vector<float> vector1(vect_size);

 

创建+填充向量:0.140s

 

 

int vect_size=1000000;
thrust::device_vector<float> vector1(vect_size,0.1);

 

CPU向量赋值:0.141s

int vect_size=1000000;
thrust::host_vector<float> vector1(vect_size,0.1);
start_t=clock();
thrust::device_vector<float> vector2=vector1;
end_t=clock();

填充向量:0.000s

int vect_size=1000000;
thrust::device_vector<float> vector(vect_size);
start_t=clock();
thrust::fill(vector.begin(),vector.end(),(float)0.1);
end_t=clock();

向量点乘:0.016s

int vect_size=100000000;
thrust::device_vector<float> vector1(vect_size,(float)0.1);
thrust::device_vector<float> vector2(vect_size,(float)0.2);
thrust::device_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::multiplies<float>());
end_t=clock();

向量相乘:0.000s

int vect_size=100000000;
thrust::device_vector<float> vector1(vect_size,(float)0.1);
thrust::device_vector<float> vector2(vect_size,(float)0.2);
thrust::device_vector<float> vector3(vect_size);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
end_t=clock();

矩阵乘向量(实现1):0.530s

int matrix1_rownum=2000;int matrix1_colnum=50000;int matrix1_size=matrix1_colnum*matrix1_rownum; thrust::device_vector
vector1(matrix1_size,(float)0.1);thrust::device_vector
vector2(matrix1_colnum,(float)0.2);thrust::device_vector
tmp(matrix1_colnum);thrust::device_vector
vector3(matrix1_rownum); start_t=clock();for(int row=0;row
()); vector3[row]=thrust::reduce(tmp.begin(),tmp.end(),(float)0,thrust::multiplies
());}end_t=clock();

矩阵乘向量(实现2)CUBLAS,待试

矩阵乘矩阵CUBLAS,待试

 

Python

直接使用python的list实现上述功能实在太慢……而且由于无法指定float类型,其默认使用64位的double类型来表示小数,使用10^8会超出list索引上限……故只使用10^7实验,速度差距可以自行换算。

大致估算python的向量运算比c++慢50倍,矩阵运算慢1000倍。

初始化向量并赋值:1.51s

vector_size=10000000
vector=[]
for i in range(vector_size):
	vector.append(0.1)

向量点乘:1.75s

vector_size=10000000
vector1=[]
for i in range(vector_size):
	vector1.append(0.1)
vector2=[]
for i in range(vector_size):
	vector2.append(0.1)
start_t=time.time()
sum=0
for i in range(vector_size):
	sum+=vector1[i]*vector2[i]
end_t=time.time()

向量相乘:2.39s

vector_size=10000000
vector1=[]
for i in range(vector_size):
	vector1.append(0.1)
vector2=[]
for i in range(vector_size):
	vector2.append(0.1)
vector3=[]
for i in range(vector_size):
	vector3.append(0.1)
start_t=time.time()
for i in range(vector_size):
	vector3[i]=vector1[i]*vector2[i]
end_t=time.time()

矩阵乘向量:3.06s

matrix1_rownum=2000
matrix1_colnum=5000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
	vector1.append(0.1)
vector2=[]
for i in range(matrix1_colnum):
	vector2.append(0.1)
vector3=[]
for i in range(matrix1_rownum):
	vector3.append(0.1)
start_t=time.time()
for row in range(matrix1_rownum):
	for col in range(matrix1_colnum):
		vector3[row]=vector1[row*matrix1_colnum+col]*vector2[col]
end_t=time.time()

矩阵相乘:11.37s

matrix1_rownum=200
matrix1_colnum=500
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
	vector1.append(0.1)
matrix2_rownum=500
matrix2_colnum=200
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=[]
for i in range(matrix2_size):
	vector2.append(0.1)
matrix3_size=matrix1_rownum*matrix2_colnum
vector3=[]
for i in range(matrix3_size):
	vector3.append(0.1)
start_t=time.time()
for row in range(matrix1_rownum):
	for col in range(matrix2_colnum):
		for i in range(matrix1_colnum):
			vector3[row*matrix2_colnum+col]+=vector1[row*matrix1_colnum+i]*vector2[i*matrix2_colnum+col]
end_t=time.time()

当然实际进行向量运算没人会拿python的list数据结构进行运算,这里只是好奇定量测一下list到底有多慢……

Python numpy

创建全0向量:0.0s

vector_size=100000000
vector=numpy.zeros(vector_size)

创建+填充向量:0.25s

vector_size=100000000
vector=numpy.zeros(vector_size)
vector.fill(0.01)

向量点乘:0.125s(由于python是32位……内存原因,数据规模减半)

vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
sum=numpy.inner(vector1,vector2)
end_t=time.time()

向量相乘:0.234s

vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
vector3=numpy.multiply(vector1,vector2)
end_t=time.time()

矩阵乘向量:0.094s

matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
vector2=numpy.zeros(matrix1_colnum)
vector2.fill(0.02)
start_t=time.time()
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix1_colnum,1)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()

矩阵乘矩阵:23.16s(numpy.dot出乎意料的慢,使用numpy.matrix类时间为11.73s,依旧很慢而且占用更大内存,在创建matrix对象时也要0.4s)

matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
matrix2_rownum=50000
matrix2_colnum=1000
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=numpy.zeros(matrix2_size)
vector2.fill(0.02)
start_t=time.time()
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix2_rownum,matrix2_colnum)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()

 

转载地址:http://ygqgx.baihongyu.com/

你可能感兴趣的文章
RxJava2 实战知识梳理(5) 简单及进阶的轮询操作
查看>>
js call,apply,bind总结
查看>>
Spring Boot 中使用 Java API 调用 lucene
查看>>
从 Java 层看 React-Native 通信机制
查看>>
来来来!关于iOS基础总结咱俩好好唠唠
查看>>
兑吧:从自建HBase迁移到阿里云HBase实战经验
查看>>
ECS 控制台诊断系统
查看>>
聊聊servicecomb-saga的alpha-server
查看>>
iOS多线程调研
查看>>
iOS多线程Pthreads篇
查看>>
萌新的node教程
查看>>
【活动】掘金技术征文丨给大家看的 Julia 教程
查看>>
推荐Android两种屏幕适配方案
查看>>
HTML5前端面试常见问题汇总
查看>>
HTTP2 基础入门
查看>>
让数据传输更安全
查看>>
实现一个requirejs原型demo
查看>>
画一个三角形
查看>>
node ( 5 ) -----process详解(这个标题不讨喜……)
查看>>
浅谈unicode编码和utf-8编码的关系
查看>>