Skip to content

[SLP] -march=znver4 is slower than -march=znver3 on the znver4 series CPUs. #137808

Open
@dianqk

Description

@dianqk

I compiled the following code with -O3 -march=znver4 and with -O3 -march=znver3 and ran both builds on an AMD 7950X. The -march=znver4 build is currently approximately 20% slower, whereas I would expect -march=znver4 to be faster.

Details

#include <stddef.h>
#include <stdint.h>

/*
 * Rotate the 64-bit value `val` left by `shift` bits.
 *
 * Every argument is parenthesized so that compound expressions such as
 * `a | b` or `n + 1` expand with the intended precedence.
 *
 * `val` must have a 64-bit unsigned type and `shift` must be in the range
 * 1..63: a shift of 0 (or 64) would shift by the full width, which is
 * undefined behavior. Both arguments are evaluated twice, so they must not
 * have side effects.
 */
#define rotate_left(val, shift) \
  (((val) << (shift)) | ((val) >> (64 - (shift))))

/*
 * Left-rotation amounts used by the "Rho and pi" step of keccak_p.
 * Entry x is the rotation applied to the lane that is stored at PI[x].
 */
const uint32_t RHO[24] = {
    1,  3,  6,  10, 15, 21,
    28, 36, 45, 55, 2,  14,
    27, 41, 56, 8,  25, 43,
    62, 18, 39, 61, 20, 44,
};

/*
 * Lane indices visited by the "Rho and pi" step of keccak_p.
 * Step x reads the lane at state[PI[x]] and overwrites it with the rotated
 * value carried over from the previous step.
 */
const uint64_t PI[24] = {
    10, 7,  11, 17, 18, 3,
    5,  16, 8,  21, 24, 4,
    15, 23, 19, 13, 12, 2,
    20, 14, 22, 9,  6,  1,
};

/*
 * Per-round constants for keccak_p: RC[i] is XORed into state[0] at the end
 * of round i (the "Iota" step).
 */
const uint64_t RC[24] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808AULL,
    0x8000000080008000ULL,
    0x000000000000808BULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008AULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000AULL,
    0x000000008000808BULL,
    0x800000000000008BULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800AULL,
    0x800000008000000AULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL,
};

/*
 * Run 24 rounds of the permutation over the 25 64-bit lanes of `state`,
 * updating it in place. The lane at column x, row y lives at state[5 * y + x].
 *
 * Each round performs, in order: Theta, Rho and pi, Chi, Iota. The round
 * constants come from RC, the rotation amounts from RHO and the lane
 * visiting order from PI.
 */
void keccak_p(uint64_t state[25]) {
  int round = 0;
  while (round < 24) {
    const uint64_t round_constant = RC[round];
    uint64_t lanes[5] = {0};

    // Theta, part 1: XOR together the five lanes of every column.
    for (int col = 0; col < 5; ++col) {
      for (int row = 0; row < 25; row += 5) {
        lanes[col] ^= state[row + col];
      }
    }

    // Theta, part 2: fold the parity of the previous column and the rotated
    // parity of the next column into every lane of the current column.
    for (int col = 0; col < 5; ++col) {
      const uint64_t prev = lanes[(col + 4) % 5];
      const uint64_t next = rotate_left(lanes[(col + 1) % 5], 1);
      const uint64_t mix = prev ^ next;
      for (int row = 0; row < 25; row += 5) {
        state[row + col] ^= mix;
      }
    }

    // Rho and pi: walk the lanes in PI order; each visited lane receives the
    // rotated value displaced from the previously visited lane (starting
    // with lane 1).
    uint64_t carry = state[1];
    for (int step = 0; step < 24; ++step) {
      const uint64_t displaced = state[PI[step]];
      state[PI[step]] = rotate_left(carry, RHO[step]);
      carry = displaced;
    }

    // Chi: combine each lane with its two right-hand neighbours in the same
    // row. The row is snapshotted first so updates do not feed each other.
    for (int base = 0; base < 25; base += 5) {
      for (int k = 0; k < 5; ++k) {
        lanes[k] = state[base + k];
      }

      for (int k = 0; k < 5; ++k) {
        const uint64_t inverted = ~lanes[(k + 1) % 5];
        const uint64_t masked = inverted & lanes[(k + 2) % 5];
        state[base + k] = lanes[k] ^ masked;
      }
    }

    // Iota: mix the round constant into lane 0.
    state[0] ^= round_constant;

    ++round;
  }
}

/*
 * Benchmark driver: apply keccak_p one million times to a state that starts
 * out all zero, then exit with status 0. The final state is not inspected.
 */
int main() {
  uint64_t state[25] = {0};
  int remaining = 1000000;

  while (remaining > 0) {
    keccak_p(state);
    --remaining;
  }

  return 0;
}

$ perf stat -r 5 ./znver3

 Performance counter stats for './znver3' (5 runs):

            408.52 msec task-clock:u                     #    0.999 CPUs utilized               ( +-  0.36% )
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                49      page-faults:u                    #  119.946 /sec                        ( +-  0.41% )
     5,107,180,232      instructions:u                   #    2.21  insn per cycle
                                                  #    0.00  stalled cycles per insn     ( +-  0.00% )
     2,310,598,646      cycles:u                         #    5.656 GHz                         ( +-  0.36% )
           481,736      stalled-cycles-frontend:u        #    0.02% frontend cycles idle        ( +-  2.01% )
        27,039,388      branches:u                       #   66.189 M/sec                       ( +-  0.00% )
             5,051      branch-misses:u                  #    0.02% of all branches             ( +-  0.59% )

           0.40895 +- 0.00147 seconds time elapsed  ( +-  0.36% )

$ perf stat -r 5 ./znver4

 Performance counter stats for './znver4' (5 runs):

            576.39 msec task-clock:u                     #    0.999 CPUs utilized               ( +-  0.20% )
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                50      page-faults:u                    #   86.747 /sec                        ( +-  0.89% )
     4,062,180,550      instructions:u                   #    1.25  insn per cycle
                                                  #    0.00  stalled cycles per insn     ( +-  0.00% )
     3,261,654,058      cycles:u                         #    5.659 GHz                         ( +-  0.20% )
           630,199      stalled-cycles-frontend:u        #    0.02% frontend cycles idle        ( +-  0.44% )
        27,039,720      branches:u                       #   46.912 M/sec                       ( +-  0.00% )
             5,970      branch-misses:u                  #    0.02% of all branches             ( +-  0.36% )

           0.57676 +- 0.00113 seconds time elapsed  ( +-  0.20% )

cc @RKSimon @alexey-bataev (as it relates to AMD and SLP)

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions