实际上你的C++代码做的都是double加法:浮点加法不满足结合律,编译器在默认选项下不能改变求和顺序,所以没法用AVX做向量化;而Python代码做的都是int加法,可以重排,所以可以启用。
#include <chrono>
#include <cstdint>
#include <format>
#include <functional>
#include <iostream>
#include <string_view>

#include "omp.h"
/// Folds `f` over the integers 0, 1, ..., n-1 using a type-erased callable.
///
/// @param n  Number of iterations; the loop body never runs when n <= 0.
/// @param f  Binary operation invoked as f(accumulator, i) in ascending i.
/// @return   The final accumulator, which starts at 0.
double run0(int n, std::function<double(double, double)>&& f) {
  double acc = 0;
  int idx = 0;
  while (idx < n) {
    acc = f(acc, idx);  // idx is converted to double at the call
    ++idx;
  }
  return acc;
}
/// Folds `f` over the integers 0, 1, ..., n-1; the callable's type is deduced
/// through an abbreviated function template.
///
/// @param n  Number of iterations; the loop body never runs when n <= 0.
/// @param f  Binary operation invoked as f(accumulator, i) in ascending i.
/// @return   The final accumulator, which starts at 0.
double run1(int n, auto&& f) {
  double acc = 0;
  for (int step = 0; step < n; ++step) {
    acc = f(acc, step);
  }
  return acc;
}
/// Folds `f` over the integers 0, 1, ..., n-1; the callable is a template
/// parameter taken by value.
///
/// @tparam F  Type of the callable.
/// @param n   Number of iterations; the loop body never runs when n <= 0.
/// @param f   Binary operation invoked as f(accumulator, i) in ascending i.
/// @return    The final accumulator, which starts at 0.
template <typename F>
double run2(int n, F f) {
  double acc = 0;
  int idx = 0;
  while (idx < n) {
    acc = f(acc, idx);
    ++idx;
  }
  return acc;
}
/// Sums the integers 0, 1, ..., n-1 as doubles through a local constexpr
/// lambda.
///
/// @param n  Number of terms; the result is 0 when n <= 0.
/// @return   The accumulated sum.
double run3(int n) {
  constexpr auto add = [](double lhs, double rhs) { return lhs + rhs; };
  double total = 0;
  for (int term = 0; term < n; ++term) {
    total = add(total, term);
  }
  return total;
}
/// Sums the integers 0, 1, ..., n-1 into a double with a plain loop.
///
/// @param n  Number of terms; the result is 0 when n <= 0.
/// @return   The accumulated sum.
double run4(int n) {
  double total = 0;
  int term = 0;
  while (term < n) {
    total += term;
    ++term;
  }
  return total;
}
/// Sums the integers 0, 1, ..., n-1 into a double using an OpenMP parallel
/// reduction.
///
/// When the translation unit is compiled with OpenMP, four threads are
/// requested before the loop. Without OpenMP the runtime call is compiled out
/// (matching the `_OPENMP` guard used in `main`), so the function still builds
/// and links.
///
/// @param n  Number of terms; the result is 0 when n <= 0.
/// @return   The accumulated sum.
double run5(int n) {
#ifdef _OPENMP
  omp_set_num_threads(4);
#endif
  double s = 0;
#pragma omp parallel for reduction(+ : s)
  for (int i = 0; i < n; ++i) s += i;
  return s;
}
/// Sums the integers 0, 1, ..., n-1 in 64-bit integer arithmetic using an
/// OpenMP parallel reduction, then converts the total to double.
///
/// When the translation unit is compiled with OpenMP, four threads are
/// requested before the loop. Without OpenMP the runtime call is compiled out
/// (matching the `_OPENMP` guard used in `main`), so the function still builds
/// and links.
///
/// @param n  Number of terms; the result is 0 when n <= 0.
/// @return   The integer sum converted to double.
double run6(int32_t n) {
#ifdef _OPENMP
  omp_set_num_threads(4);
#endif
  // 64-bit accumulator: the sum of 32-bit indices exceeds the int32_t range
  // once n is larger than about 65536.
  int64_t s = 0;
#pragma omp parallel for reduction(+ : s)
  for (int32_t i = 0; i < n; ++i) s += i;
  return static_cast<double>(s);
}
/// Writes one benchmark line to std::cerr: the label, the result with two
/// decimals, and the elapsed time between two time points in milliseconds.
///
/// @param name  Label printed at the start of the line.
/// @param re    Result value to report.
/// @param tic   Time point taken before the measured work.
/// @param toc   Time point taken after the measured work.
void print(std::string_view name, double re, auto tic, auto toc) {
  using millis = std::chrono::duration<double, std::milli>;
  const millis elapsed(toc - tic);
  // %Q formats only the tick count; the unit text comes from the literal.
  const auto line =
      std::format("{} resut is {:.2f}, done in {:%Q} ms!", name, re, elapsed);
  std::cerr << line << std::endl;
}
int main() {
#ifdef _OPENMP
omp_lock_t lock;
omp_init_lock(&lock);
std::cerr << "OpenMP init." << std::endl;
#endif
using hrc = std::chrono::high_resolution_clock;
auto ts = hrc::now();
auto re = run0(10000000, [](double a, double b) { return a + b; });
auto td = hrc::now();
print("Run0", re, ts, td);
ts = hrc::now();
re = run1(10000000, [](double a, double b) { return a + b; });
td = hrc::now();
print("Run1", re, ts, td);
ts = hrc::now();
re = run2(10000000, [](double a, double b) { return a + b; });
td = hrc::now();
print("Run2", re, ts, td);
ts = hrc::now();
re = run3(10000000);
td = hrc::now();
print("Run3", re, ts, td);
ts = hrc::now();
re = run4(10000000);
td = hrc::now();
print("Run4", re, ts, td);
ts = hrc::now();
re = run5(10000000);
td = hrc::now();
print("Run5", re, ts, td);
ts = hrc::now();
re = run6(10000000);
td = hrc::now();
print("Run6", re, ts, td);
return 0;
}
OpenMP init.
Run0 resut is 49999995000000.00, done in 22.4456 ms!
Run1 resut is 49999995000000.00, done in 9.1517 ms!
Run2 resut is 49999995000000.00, done in 9.165 ms!
Run3 resut is 49999995000000.00, done in 9.1878 ms!
Run4 resut is 49999995000000.00, done in 9.1732 ms!
Run5 resut is 49999995000000.00, done in 2.6916 ms!
Run6 resut is 49999995000000.00, done in 0.6204 ms!
【 在 finlab 的大作中提到: 】
: c++的0.14秒,是已经打开msvc的avx2编译选项的。
: 我对比过手动simd编程与编译器自动优化的结果,vc的simd优化效果不佳,不能充分发挥simd威力。
: python的numba底层LLVM,也是调用了icc_rt来充分利用simd加速。
: ...................
--
修改:ble FROM 222.129.52.*
FROM 222.129.52.*