[SLP] -march=znver4 is slower than -march=znver3 on the znver4 series CPUs.

I compiled the following code with `-O3 -march=znver4` and with `-O3 -march=znver3` and ran both builds on an AMD 7950X. The `-march=znver4` build is currently approximately 20% slower, whereas I would expect it to be the faster of the two.

<details><summary>Details</summary>
<p>

```c
#include <stddef.h>
#include <stdint.h>

// Rotates `val` left by `shift` bits, treating `val` as a 64-bit unsigned
// quantity (the right shift uses 64 - shift).
// `shift` must be in the range 1..63: with 0 or 64 one of the two shifts
// would be a shift by 64, which is undefined for a 64-bit operand.
// Both arguments are fully parenthesized so that compound expressions such
// as `a | b` or `n + 1` expand with the intended precedence. Each argument
// is still evaluated twice, so avoid arguments with side effects.
#define rotate_left(val, shift) (((val) << (shift)) | ((val) >> (64 - (shift))))

// Left-rotation amounts, in bits, for the "Rho and pi" loop of keccak_p:
// entry x is the rotation applied to the word stored into state[PI[x]].
// Every entry lies in 1..62, so rotate_left is never asked to shift by 0
// or by 64.
const uint32_t RHO[24] = {
    1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
    27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44,
};

// State indices visited, in order, by the "Rho and pi" loop of keccak_p.
// The 24 entries are the values 1..24, each appearing exactly once; index 0
// does not appear, so that loop never reads or writes state[0].
const uint64_t PI[24] = {
    10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
    15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1,
};

// Per-round constants: RC[i] is XORed into state[0] by the Iota step at the
// end of round i of keccak_p.
const uint64_t RC[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
    0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};

// Applies 24 rounds of the Theta, Rho/pi, Chi and Iota steps to `state`,
// updating it in place. The 25 words are addressed as a 5x5 grid in which
// the word at column x, row y is state[5 * y + x].
void keccak_p(uint64_t state[25]) {
  for (int i = 0; i < 24; ++i) {
    uint64_t current_rc = RC[i];
    // Five-word scratch buffer, zeroed at the start of every round and
    // reused by each of the steps below.
    uint64_t array[5] = {0};

    // Theta, part 1: array[x] becomes the XOR of the five words in column x.
    for (int x = 0; x < 5; ++x) {
      for (int y = 0; y < 5; ++y) {
        array[x] ^= state[5 * y + x];
      }
    }

    // Theta, part 2: XOR every word of column x with the column-(x - 1)
    // parity and with the column-(x + 1) parity rotated left by one bit.
    // Column indices wrap modulo 5, hence (x + 4) % 5 for x - 1.
    for (int x = 0; x < 5; ++x) {
      uint64_t t1 = array[(x + 4) % 5];
      uint64_t t2 = rotate_left(array[(x + 1) % 5], 1);
      for (int y = 0; y < 5; ++y) {
        state[5 * y + x] ^= t1 ^ t2;
      }
    }

    // Rho and pi: follow the index chain in PI, starting from the word at
    // state[1]. At each step the word currently at state[PI[x]] is saved and
    // replaced by the previously saved word rotated left by RHO[x] bits.
    // array[0] serves only as a temporary for the saved word here.
    uint64_t last = state[1];
    for (int x = 0; x < 24; ++x) {
      array[0] = state[PI[x]];
      state[PI[x]] = rotate_left(last, RHO[x]);
      last = array[0];
    }

    // Chi: process the state one row of five words at a time.
    for (int y_step = 0; y_step < 5; ++y_step) {
      // Index of the first word of the current row.
      int y = 5 * y_step;

      // Snapshot the row first: the loop below overwrites state[y + x] while
      // later iterations still need the original neighbouring words.
      for (int x = 0; x < 5; ++x) {
        array[x] = state[y + x];
      }

      // Each word is XORed with (NOT next word) AND (word after next), with
      // positions within the row wrapping modulo 5.
      for (int x = 0; x < 5; ++x) {
        uint64_t t1 = ~array[(x + 1) % 5];
        uint64_t t2 = array[(x + 2) % 5];
        state[y + x] = array[x] ^ (t1 & t2);
      }
    }

    // Iota: XOR this round's constant into state[0].
    state[0] ^= current_rc;
  }
}

// Benchmark driver: calls keccak_p 1,000,000 times on a state that starts
// all-zero, so each call operates on the output of the previous one.
// NOTE(review): `state` is never read after the loop and the program always
// returns 0, so the computed result is not observable; confirm the optimizer
// still keeps the work if this is reused as a benchmark elsewhere.
int main() {
  uint64_t state[25] = {0};
  for (int i = 0; i < 1000000; ++i) {
    keccak_p(state);
  }
  return 0;
}
```

</p>
</details> 

```
$ perf stat -r 5 ./znver3

 Performance counter stats for './znver3' (5 runs):

            408.52 msec task-clock:u                     #    0.999 CPUs utilized               ( +-  0.36% )
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                49      page-faults:u                    #  119.946 /sec                        ( +-  0.41% )
     5,107,180,232      instructions:u                   #    2.21  insn per cycle
                                                  #    0.00  stalled cycles per insn     ( +-  0.00% )
     2,310,598,646      cycles:u                         #    5.656 GHz                         ( +-  0.36% )
           481,736      stalled-cycles-frontend:u        #    0.02% frontend cycles idle        ( +-  2.01% )
        27,039,388      branches:u                       #   66.189 M/sec                       ( +-  0.00% )
             5,051      branch-misses:u                  #    0.02% of all branches             ( +-  0.59% )

           0.40895 +- 0.00147 seconds time elapsed  ( +-  0.36% )

$ perf stat -r 5 ./znver4

 Performance counter stats for './znver4' (5 runs):

            576.39 msec task-clock:u                     #    0.999 CPUs utilized               ( +-  0.20% )
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                50      page-faults:u                    #   86.747 /sec                        ( +-  0.89% )
     4,062,180,550      instructions:u                   #    1.25  insn per cycle
                                                  #    0.00  stalled cycles per insn     ( +-  0.00% )
     3,261,654,058      cycles:u                         #    5.659 GHz                         ( +-  0.20% )
           630,199      stalled-cycles-frontend:u        #    0.02% frontend cycles idle        ( +-  0.44% )
        27,039,720      branches:u                       #   46.912 M/sec                       ( +-  0.00% )
             5,970      branch-misses:u                  #    0.02% of all branches             ( +-  0.36% )

           0.57676 +- 0.00113 seconds time elapsed  ( +-  0.20% )

```

cc @RKSimon @alexey-bataev (as it relates to AMD and SLP)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SLP] -march=znver4 is slower than -march=znver3 on the znver4 series CPUs. #137808

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[SLP] -march=znver4 is slower than -march=znver3 on the znver4 series CPUs. #137808

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions