- 主题:软件bug么
求解惑,double 与 float 的运算速度测试,为什么是这样?
重复执行4次一样的代码,只有第一次的时候,double与float有明显区别,后面三次区别就很小了
#include <iostream>
#include <math.h>
#include <chrono>
using namespace std;
/// Minimal stopwatch: starts timing on construction and, on toc(), prints the
/// elapsed time in whole milliseconds to std::cout, prefixed with a label.
class TicToc
{
public:
    /// Stores the label and immediately starts the timer.
    TicToc(const std::string& name) : m_name(name) { tic(); }

    /// (Re)starts the timer from the current instant.
    void tic() {
        start = std::chrono::steady_clock::now();
    }

    /// Prints "<name> cost <elapsed> ms." followed by a newline, where
    /// <elapsed> is the time since the last tic(), truncated to milliseconds.
    /// The timer is not reset, so toc() may be called repeatedly.
    void toc() const {
        const auto elapsed_ms =
            std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::steady_clock::now() - start);
        std::cout << m_name << " cost " << elapsed_ms.count() << " ms." << std::endl;
    }

private:
    std::string m_name;
    // steady_clock is monotonic. system_clock tracks wall-clock time and may be
    // adjusted while a measurement is running, which would corrupt the interval.
    std::chrono::steady_clock::time_point start;
};
// Number of elements in each result array.
constexpr size_t N = 20000000;
// Output buffers. They are declared at namespace scope, so they have static
// storage duration and start out zero-initialized.
double result_double[N];
float result_float[N];
// Benchmark driver: four rounds, each filling result_double with i * pi_double
// and then result_float with i * pi_float over all N elements. Every loop is
// timed with its own TicToc, which prints one line when toc() is called.
int main()
{
float pi_float = 3.1415926f;
double pi_double = 3.1415926;
// Round 1.
// NOTE(review): this loop is the first write to result_double in the program
// (and the next one the first write to result_float), so these two timings
// may include a one-time cost of first touching the arrays' memory rather
// than pure arithmetic/store time; confirm by writing to the arrays once
// before starting the timers.
TicToc double_time("double multiply ");
for (size_t i = 0; i < N; ++i) {
result_double[i] = double(i) * pi_double;
}
double_time.toc();
TicToc float_time("float multiply ");
for (size_t i = 0; i < N; ++i) {
result_float[i] = float(i) * pi_float;
}
float_time.toc();
/////////////////////////////////////////
// Round 2: same two loops again, now overwriting already-written arrays.
TicToc double_time2("double multiply 2");
for (size_t i = 0; i < N; ++i) {
result_double[i] = double(i) * pi_double;
}
double_time2.toc();
TicToc float_time2("float multiply 2");
for (size_t i = 0; i < N; ++i) {
result_float[i] = float(i) * pi_float;
}
float_time2.toc();
/////////////////////////////////////////
// Round 3: identical to round 2.
TicToc double_time3("double multiply 3");
for (size_t i = 0; i < N; ++i) {
result_double[i] = double(i) * pi_double;
}
double_time3.toc();
TicToc float_time3("float multiply 3");
for (size_t i = 0; i < N; ++i) {
result_float[i] = float(i) * pi_float;
}
float_time3.toc();
/////////////////////////////////////////
// Round 4: identical to round 2.
TicToc double_time4("double multiply 4");
for (size_t i = 0; i < N; ++i) {
result_double[i] = double(i) * pi_double;
}
double_time4.toc();
TicToc float_time4("float multiply 4");
for (size_t i = 0; i < N; ++i) {
result_float[i] = float(i) * pi_float;
}
float_time4.toc();
}
编译: g++ float_double.cpp -std=c++11 -O3
四次运行结果:
xxxx$ ./a.out
double multiply cost 73 ms.
float multiply cost 28 ms.
double multiply 2 cost 11 ms.
float multiply 2 cost 9 ms.
double multiply 3 cost 11 ms.
float multiply 3 cost 14 ms.
double multiply 4 cost 14 ms.
float multiply 4 cost 14 ms.
xxxx$ ./a.out
double multiply cost 51 ms.
float multiply cost 28 ms.
double multiply 2 cost 12 ms.
float multiply 2 cost 9 ms.
double multiply 3 cost 11 ms.
float multiply 3 cost 14 ms.
double multiply 4 cost 14 ms.
float multiply 4 cost 14 ms.
xxxx$ ./a.out
double multiply cost 51 ms.
float multiply cost 28 ms.
double multiply 2 cost 11 ms.
float multiply 2 cost 9 ms.
double multiply 3 cost 11 ms.
float multiply 3 cost 14 ms.
double multiply 4 cost 14 ms.
float multiply 4 cost 14 ms.
xxxx$ ./a.out
double multiply cost 78 ms.
float multiply cost 28 ms.
double multiply 2 cost 11 ms.
float multiply 2 cost 9 ms.
double multiply 3 cost 11 ms.
float multiply 3 cost 14 ms.
double multiply 4 cost 14 ms.
float multiply 4 cost 13 ms.
平台:Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
【 在 foliver 的大作中提到: 】
: 和寄存器指令有关。
: 最开始浮点数都是fpu计算,fpu寄存器都是80bits大小,不管float还是double,没有区别,指令也一样。
: 后来2000年前后,intel推广sse,引入xmm寄存器,遵从float/double的大小,使用不同的指令。这时的float才是真float。
: ...................
--
FROM 14.155.19.*
每次计时开始之前,加了两行
memset(result_double, 0, sizeof(result_double));
memset(result_float, 0, sizeof(result_float));
./a.out
double multiply 1 cost 14 ms.
float multiply 1 cost 9 ms.
double multiply 2 cost 14 ms.
float multiply 2 cost 9 ms.
double multiply 3 cost 14 ms.
float multiply 3 cost 14 ms.
double multiply 4 cost 12 ms.
float multiply 4 cost 11 ms.
之前第一次耗时特别大,看起来更像是数组第一次被使用时才实际分配内存导致的。然后,差不多每次运行都是前两轮 float 和 double 的时间有差距,后面就没有差距或者差距很小,不知道该怎么解释。反正我现在除了数据量很大、需要考虑存储空间的情况,或者库本身已经规定使用 float 格式的情况,自己平时用的一般数据都是无脑用 double。
又把float和double的顺序换了一下
./a.out
float multiply 1 cost 14 ms.
double multiply 1 cost 11 ms.
float multiply 2 cost 14 ms.
double multiply 2 cost 11 ms.
float multiply 3 cost 14 ms.
double multiply 3 cost 14 ms.
float multiply 4 cost 10 ms.
double multiply 4 cost 12 ms.
【 在 foliver 的大作中提到: 】
: 应该是你那两个数组缓存导致的。
: 你在测试之前,先用循环把那两个数组赋值一下。
--
修改:confinement FROM 14.155.19.*
FROM 14.155.19.*
汇编不怎么会玩。。。以前翻过几页而已。我测试 float 跟 double 的计算速度,好像没区别?我记得在什么资料上或者版上看到过,说现在的 cpu 上这两种类型没什么区别。
【 在 z16166 的大作中提到: 】
: 大胆推测,小心求证
: 你的推测是运行时分配内存,就不求证了?而且这个很容易求证,同样是看汇编代码。熟手可以不用看。
: 楼上也提示了,可能是cache问题
: ...................
--
修改:confinement FROM 14.155.19.*
FROM 14.155.19.*
这个需要专门的编码方式啊?我不会啊。。。有什么参考书吗?
【 在 foliver 的大作中提到: 】
: 你的代码没有发挥simd的功能。
: xmm一次可以装4个float,也就是同时可以进行4个float运算。
: 而double只能2个。
: ...................
--
FROM 14.155.19.*
是下面这样吗?测试结果几乎没有什么变化。。。编译需要加什么 flag 吗?
#include <iostream>
#include <math.h>
#include <chrono>
#include <string.h>
using namespace std;
/// Minimal stopwatch: starts timing on construction and, on toc(), prints the
/// elapsed time in whole milliseconds to std::cout, prefixed with a label.
class TicToc
{
public:
    /// Stores the label and immediately starts the timer.
    TicToc(const std::string& name) : m_name(name) { tic(); }

    /// (Re)starts the timer from the current instant.
    void tic() {
        start = std::chrono::steady_clock::now();
    }

    /// Prints "<name> cost <elapsed> ms." followed by a newline, where
    /// <elapsed> is the time since the last tic(), truncated to milliseconds.
    /// The timer is not reset, so toc() may be called repeatedly.
    void toc() const {
        const auto elapsed_ms =
            std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::steady_clock::now() - start);
        std::cout << m_name << " cost " << elapsed_ms.count() << " ms." << std::endl;
    }

private:
    std::string m_name;
    // steady_clock is monotonic. system_clock tracks wall-clock time and may be
    // adjusted while a measurement is running, which would corrupt the interval.
    std::chrono::steady_clock::time_point start;
};
// Number of elements in each result array.
constexpr size_t N = 20000000;
// Reduced element counts: N - 4 and N - 2.
constexpr size_t Nf = 20000000 - 4;
constexpr size_t Nd = 20000000 - 2;
// Output buffers. They are declared at namespace scope, so they have static
// storage duration and start out zero-initialized.
double result_double[N];
float result_float[N];
// Approximations of pi used as multipliers; mutable (non-const) globals.
double pi_double = 3.1415926;
float pi_float = 3.1415926f;
/// Benchmark driver: four rounds, each zeroing both result arrays and then
/// timing a float fill (result_float[i] = i * pi_float) followed by a double
/// fill (result_double[i] = i * pi_double). Rounds 1-2 use a plain one-element
/// loop; rounds 3-4 use manually unrolled loops (4 floats / 2 doubles per
/// iteration). Each timed loop prints one line through TicToc::toc().
int main()
{
    // The unrolled loops have no remainder loop, so they only cover the whole
    // array when N is a multiple of the unroll factor.
    static_assert(N % 4 == 0, "N must be a multiple of 4 for the unrolled loops");

    // Round 1: write to both arrays before timing so the measurements do not
    // include the first access to their memory.
    memset(result_double, 0, sizeof(result_double));
    memset(result_float, 0, sizeof(result_float));
    TicToc float_time1("float multiply 1");
    for (size_t i = 0; i < N; ++i) {
        result_float[i] = float(i) * pi_float;
    }
    float_time1.toc();
    TicToc double_time1("double multiply 1");
    for (size_t i = 0; i < N; ++i) {
        result_double[i] = double(i) * pi_double;
    }
    double_time1.toc();

    /////////////////////////////////////////
    // Round 2: identical to round 1.
    memset(result_double, 0, sizeof(result_double));
    memset(result_float, 0, sizeof(result_float));
    TicToc float_time2("float multiply 2");
    for (size_t i = 0; i < N; ++i) {
        result_float[i] = float(i) * pi_float;
    }
    float_time2.toc();
    TicToc double_time2("double multiply 2");
    for (size_t i = 0; i < N; ++i) {
        result_double[i] = double(i) * pi_double;
    }
    double_time2.toc();

    /////////////////////////////////////////
    // Round 3: unrolled by hand. The bound is N (not N - 4 / N - 2): stopping
    // early left the last 4 floats and the last 2 doubles uncomputed.
    memset(result_double, 0, sizeof(result_double));
    memset(result_float, 0, sizeof(result_float));
    TicToc float_time3("float multiply 3");
    for (size_t i = 0; i < N; i += 4) {
        result_float[i] = float(i) * pi_float;
        result_float[i+1] = float(i+1) * pi_float;
        result_float[i+2] = float(i+2) * pi_float;
        result_float[i+3] = float(i+3) * pi_float;
    }
    float_time3.toc();
    TicToc double_time3("double multiply 3");
    for (size_t i = 0; i < N; i += 2) {
        result_double[i] = double(i) * pi_double;
        result_double[i+1] = double(i+1) * pi_double;
    }
    double_time3.toc();

    /////////////////////////////////////////
    // Round 4: identical to round 3.
    memset(result_double, 0, sizeof(result_double));
    memset(result_float, 0, sizeof(result_float));
    TicToc float_time4("float multiply 4");
    for (size_t i = 0; i < N; i += 4) {
        result_float[i] = float(i) * pi_float;
        result_float[i+1] = float(i+1) * pi_float;
        result_float[i+2] = float(i+2) * pi_float;
        result_float[i+3] = float(i+3) * pi_float;
    }
    float_time4.toc();
    TicToc double_time4("double multiply 4");
    for (size_t i = 0; i < N; i += 2) {
        result_double[i] = double(i) * pi_double;
        result_double[i+1] = double(i+1) * pi_double;
    }
    double_time4.toc();
}
【 在 foliver 的大作中提到: 】
: 直接把i++变成i+=4。循环里面连续计算4个,编译器就会合并计算。
--
修改:confinement FROM 14.155.19.*
FROM 14.155.19.*
学习了,这个确实能加速 ~
// Modified round 4 (fragment of main). In this variant every store within one
// iteration uses float(i) / double(i), not float(i+1), float(i+2), ..., so
// the 4 (resp. 2) elements written per iteration all receive the same value.
// NOTE(review): this makes the workload different from the element-wise
// fills of the other rounds, so the timings are not directly comparable.
TicToc float_time4("float multiply 4");
for (size_t i = 0; i < Nf; i += 4) {
result_float[i] = float(i) * pi_float;
result_float[i+1] = float(i) * pi_float;
result_float[i+2] = float(i) * pi_float;
result_float[i+3] = float(i) * pi_float;
}
float_time4.toc();
TicToc double_time4("double multiply 4");
for (size_t i = 0; i < Nd; i += 2) {
result_double[i] = double(i) * pi_double;
result_double[i+1] = double(i) * pi_double;
}
double_time4.toc();
float multiply 1 cost 13 ms.
double multiply 1 cost 11 ms.
float multiply 2 cost 14 ms.
double multiply 2 cost 11 ms.
float multiply 3 cost 6 ms.
double multiply 3 cost 12 ms.
float multiply 4 cost 6 ms.
double multiply 4 cost 11 ms.
--
FROM 14.155.19.*