gcc 编译选项-march=skylake -O2 -funroll-loops
结果:
1740 cycles used!
892 cycles used!
如果 f 是在循环中被调用的,应该可以用 SIMD 优化:f2 里生成 mask 和按位与(&)的操作都可以用简单的 SIMD 指令完成。
/*
 * Expand a sparse byte stream into 64 output bytes.
 *
 * Bit b of `flag` (LSB first) controls dst[b]:
 *   - bit set   -> dst[b] is written as 0 and no source byte is consumed;
 *   - bit clear -> dst[b] receives the next unread byte of `src`.
 *
 * Exactly 64 bytes are written to `dst`; one byte is read from `src` for
 * every clear bit in `flag`, in order.
 */
void f(uint64_t flag, const uint8_t *src, uint8_t *dst)
{
    const uint8_t *in = src;
    uint8_t *out = dst;
    uint8_t *const end = dst + 64;
    uint64_t bits = flag;

    while (out != end) {
        /* Only advance the input cursor when a byte is actually copied. */
        *out++ = (bits & 1u) ? 0 : *in++;
        bits >>= 1;
    }
}
/*
 * Branch-free variant of f(): expand a sparse byte stream into 64 bytes.
 *
 * Bit b of `flag` (LSB first) controls dst[b]:
 *   - bit set   -> dst[b] = 0, no source byte is consumed;
 *   - bit clear -> dst[b] = next unread byte of `src`.
 *
 * The previous version built the mask as (bit << 8) - bit, which is 0xFF
 * when the bit is SET (the opposite polarity of f()), and it advanced the
 * source index on every iteration. Both are corrected here so that f2()
 * produces the same output as f().
 *
 * Note: src[srcByteOff] is loaded on every iteration, including those whose
 * result is masked to zero. The index never exceeds 63, so `src` must have
 * 64 readable bytes (the same requirement the previous version had).
 */
void f2(uint64_t flag, const uint8_t *src, uint8_t *dst)
{
    int srcByteOff = 0;

    for (int b = 0; b < 64; ++b) {
        uint64_t bit = (flag >> b) & UINT64_C(1);
        /* bit == 1 -> 0x00 (force zero), bit == 0 -> 0xFF (pass through). */
        uint8_t mask = (uint8_t)(bit - 1);

        dst[b] = src[srcByteOff] & mask;
        /* Consume a source byte only when one was actually emitted. */
        srcByteOff += (int)(bit ^ 1);
    }
}
/*
 * Micro-benchmark driver: times one call of f() and one call of f2() on the
 * same 64-byte buffers and prints the elapsed counter difference for each.
 *
 * RDTSCP is a macro defined elsewhere in this file; it is used here as
 * "store the current timestamp into the argument" -- NOTE(review): confirm
 * against its definition.
 *
 * Returns 0 on success, 1 if the buffers cannot be allocated.
 */
int main (void)
{
    enum { NBYTES = 64 };
    uint64_t flag = UINT64_C(0xaaaaaaaaaaaaaaaa);
    uint64_t t_begin, t_end;
    uint8_t *src = malloc(NBYTES * sizeof *src);
    uint8_t *dst = malloc(NBYTES * sizeof *dst);

    if (src == NULL || dst == NULL) {
        fprintf(stderr, "out of memory\n");
        free(src);
        free(dst);
        return 1;
    }

    /*
     * Give the input defined contents instead of reading uninitialised heap
     * memory; writing both buffers also touches them before timing starts.
     */
    for (int i = 0; i < NBYTES; ++i) {
        src[i] = (uint8_t)i;
        dst[i] = 0;
    }

    RDTSCP(t_begin);
    f(flag, src, dst);
    RDTSCP(t_end);
    /* uint64_t is not guaranteed to be unsigned long; print via %llu. */
    printf("%llu cycles used!\n", (unsigned long long)(t_end - t_begin));

    RDTSCP(t_begin);
    f2(flag, src, dst);
    RDTSCP(t_end);
    printf("%llu cycles used!\n", (unsigned long long)(t_end - t_begin));

    free(src);
    free(dst);
    return 0;
}
【 在 allegro 的大作中提到: 】
: 代码如下:
: [code=c]
: void f(uint64_t flag, const uint8_t *src, uint8_t *dst)
: ...................
--
修改:lambdago FROM 111.183.5.*
FROM 111.183.5.*