Open
Description
I tested the following code using -O3 -march=znver4
and -O3 -march=znver3
on an AMD 7950X. The -march=znver4
version is currently approximately 20% slower, but I expected the -march=znver4
build to be faster.
Details
#include <stddef.h>
#include <stdint.h>
/*
 * Rotate the 64-bit unsigned value `val` left by `shift` bits.
 *
 * Every macro parameter is parenthesized so that expression arguments
 * (e.g. `a ^ b` or `n + 1`) are not re-associated by operator precedence.
 * `shift` must be in the range 1..63: with 0 the right-shift count would be
 * 64, which is undefined for a 64-bit operand.
 * Both arguments are evaluated twice, so they must be free of side effects.
 */
#define rotate_left(val, shift) (((val) << (shift)) | ((val) >> (64 - (shift))))
/*
 * Left-rotation amounts used by the "Rho and pi" loop of keccak_p:
 * entry x is the rotation applied at step x of that loop.
 */
const uint32_t RHO[24] = {
     1,  3,  6, 10, 15, 21,
    28, 36, 45, 55,  2, 14,
    27, 41, 56,  8, 25, 43,
    62, 18, 39, 61, 20, 44,
};
/*
 * State indices visited by the "Rho and pi" loop of keccak_p, in order.
 * Step x reads and then overwrites state[PI[x]]; the chain is seeded from
 * state[1] and its final step writes back to state[1].
 */
const uint64_t PI[24] = {
    10,  7, 11, 17, 18,  3,
     5, 16,  8, 21, 24,  4,
    15, 23, 19, 13, 12,  2,
    20, 14, 22,  9,  6,  1,
};
/*
 * Per-round constants: keccak_p XORs RC[i] into state[0] at the end of
 * round i (the "Iota" step).
 */
const uint64_t RC[24] = {
    UINT64_C(0x0000000000000001), /* round  0 */
    UINT64_C(0x0000000000008082), /* round  1 */
    UINT64_C(0x800000000000808a), /* round  2 */
    UINT64_C(0x8000000080008000), /* round  3 */
    UINT64_C(0x000000000000808b), /* round  4 */
    UINT64_C(0x0000000080000001), /* round  5 */
    UINT64_C(0x8000000080008081), /* round  6 */
    UINT64_C(0x8000000000008009), /* round  7 */
    UINT64_C(0x000000000000008a), /* round  8 */
    UINT64_C(0x0000000000000088), /* round  9 */
    UINT64_C(0x0000000080008009), /* round 10 */
    UINT64_C(0x000000008000000a), /* round 11 */
    UINT64_C(0x000000008000808b), /* round 12 */
    UINT64_C(0x800000000000008b), /* round 13 */
    UINT64_C(0x8000000000008089), /* round 14 */
    UINT64_C(0x8000000000008003), /* round 15 */
    UINT64_C(0x8000000000008002), /* round 16 */
    UINT64_C(0x8000000000000080), /* round 17 */
    UINT64_C(0x000000000000800a), /* round 18 */
    UINT64_C(0x800000008000000a), /* round 19 */
    UINT64_C(0x8000000080008081), /* round 20 */
    UINT64_C(0x8000000000008080), /* round 21 */
    UINT64_C(0x0000000080000001), /* round 22 */
    UINT64_C(0x8000000080008008), /* round 23 */
};
/*
 * Applies 24 rounds of the permutation to `state` in place. Each round runs
 * the steps labelled Theta, Rho and pi, Chi and Iota below.
 *
 * state: 25 64-bit words, indexed as state[5 * y + x] with x, y in 0..4.
 *        The contents are overwritten with the permuted state.
 *
 * Uses the file-level tables RC, PI and RHO and the rotate_left macro.
 */
void keccak_p(uint64_t state[25]) {
for (int i = 0; i < 24; ++i) {
uint64_t current_rc = RC[i];
// Scratch buffer, re-zeroed every round so the XOR accumulation below
// starts from 0. Reused as a temporary by the later steps.
uint64_t array[5] = {0};
// Theta: array[x] becomes the XOR of the five words state[5 * y + x]
// (y = 0..4) that share the same x.
for (int x = 0; x < 5; ++x) {
for (int y = 0; y < 5; ++y) {
array[x] ^= state[5 * y + x];
}
}
// Theta (continued): every word with index x is XORed with the parity of
// x - 1 and the parity of x + 1 rotated left by one bit (indices mod 5).
for (int x = 0; x < 5; ++x) {
uint64_t t1 = array[(x + 4) % 5];
uint64_t t2 = rotate_left(array[(x + 1) % 5], 1);
for (int y = 0; y < 5; ++y) {
state[5 * y + x] ^= t1 ^ t2;
}
}
// Rho and pi: walk the index chain given by PI, starting with the value of
// state[1]. At step x the word at state[PI[x]] is saved (array[0] is used
// as the temporary), replaced by the previously carried word rotated left
// by RHO[x], and the saved word becomes the carried word for the next step.
uint64_t last = state[1];
for (int x = 0; x < 24; ++x) {
array[0] = state[PI[x]];
state[PI[x]] = rotate_left(last, RHO[x]);
last = array[0];
}
// Chi: processed five consecutive words at a time (y is the base index).
for (int y_step = 0; y_step < 5; ++y_step) {
int y = 5 * y_step;
// Snapshot the five words first, because the update of each word reads
// the original values of its two successors.
for (int x = 0; x < 5; ++x) {
array[x] = state[y + x];
}
// state[y + x] = a[x] ^ (~a[x + 1] & a[x + 2]), indices mod 5.
for (int x = 0; x < 5; ++x) {
uint64_t t1 = ~array[(x + 1) % 5];
uint64_t t2 = array[(x + 2) % 5];
state[y + x] = array[x] ^ (t1 & t2);
}
}
// Iota: mix this round's constant into the first word.
state[0] ^= current_rc;
}
}
/*
 * Benchmark driver: runs keccak_p one million times over a single
 * zero-initialized 25-word state, then exits with status 0.
 */
int main() {
    uint64_t lanes[25] = {0};
    int remaining = 1000000;

    while (remaining-- > 0) {
        keccak_p(lanes);
    }
    return 0;
}
$ perf stat -r 5 ./znver3
Performance counter stats for './znver3' (5 runs):
408.52 msec task-clock:u # 0.999 CPUs utilized ( +- 0.36% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
49 page-faults:u # 119.946 /sec ( +- 0.41% )
5,107,180,232 instructions:u # 2.21 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.00% )
2,310,598,646 cycles:u # 5.656 GHz ( +- 0.36% )
481,736 stalled-cycles-frontend:u # 0.02% frontend cycles idle ( +- 2.01% )
27,039,388 branches:u # 66.189 M/sec ( +- 0.00% )
5,051 branch-misses:u # 0.02% of all branches ( +- 0.59% )
0.40895 +- 0.00147 seconds time elapsed ( +- 0.36% )
$ perf stat -r 5 ./znver4
Performance counter stats for './znver4' (5 runs):
576.39 msec task-clock:u # 0.999 CPUs utilized ( +- 0.20% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
50 page-faults:u # 86.747 /sec ( +- 0.89% )
4,062,180,550 instructions:u # 1.25 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.00% )
3,261,654,058 cycles:u # 5.659 GHz ( +- 0.20% )
630,199 stalled-cycles-frontend:u # 0.02% frontend cycles idle ( +- 0.44% )
27,039,720 branches:u # 46.912 M/sec ( +- 0.00% )
5,970 branch-misses:u # 0.02% of all branches ( +- 0.36% )
0.57676 +- 0.00113 seconds time elapsed ( +- 0.20% )
cc @RKSimon @alexey-bataev (as it relates to AMD and SLP)