Clang vec_pack_to_short_fp32 implementation generates incorrect results

Here is the current Clang implementation of `vec_pack_to_short_fp32` (the linked lines are reproduced after the test results below):
https://github.com/llvm/llvm-project/blob/5f5cf6029852d703e850c5c16b386284d048dd91/clang/lib/Headers/altivec.h#L7518-L7527

Here is the corrected implementation of `vec_pack_to_short_fp32`:
```cpp
/* Convert the single-precision elements of __a and __b to half precision
 * with xvcvsphp and pack the results into one vector of 16-bit elements. */
static __inline__ vector unsigned short __ATTRS_o_ai
vec_pack_to_short_fp32(vector float __a, vector float __b) {
  vector float __cvta = __builtin_vsx_xvcvsphp(__a);
  vector float __cvtb = __builtin_vsx_xvcvsphp(__b);
  /* Reinterpret each result as 32-bit words so that vec_pack narrows every
   * word to a single 16-bit element instead of interleaving whole words. */
  return vec_pack((vector unsigned int)__cvta, (vector unsigned int)__cvtb);
}
```

Here is a test program (which needs to be compiled with the `-std=c11 -mcpu=power9` options) that can be used to check the results of the `vec_pack_to_short_fp32` operation:
```c
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdalign.h>

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

/*
 * Widen one half-precision value, given as its raw 16-bit pattern, to double
 * by executing the xscvhpdp instruction.
 *
 * f16_bits: raw bit pattern of the half-precision value.
 * Returns:  the instruction's result as a double.
 *
 * Declared static: a plain C11 `inline` definition provides no external
 * definition, so the program would fail to link if a call were ever not
 * inlined.
 */
static inline __attribute__((__always_inline__, __artificial__))
double Float16BitsToDouble(uint16_t f16_bits) {
  double dbl_result;
  /* Replicate the bits into every halfword of the vector so the value is
   * present in whichever halfword the instruction reads. */
  __vector unsigned short f16_vect = vec_splats(f16_bits);
  /* "wa" places both operands in VSX registers; %x prints the VSX register
   * number. */
  __asm__("xscvhpdp %x0,%x1"
          : "=wa" (dbl_result)
          : "wa" (f16_vect));
  return dbl_result;
}

int main(int argc, char** argv) {
  alignas(16) float input_vals[8];
  alignas(16) uint16_t result_vals[8];
  for(int i = 1; i < argc; ) {
    input_vals[0] = strtof(argv[i++], NULL);
    input_vals[1] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[2] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[3] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[4] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[5] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[6] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[7] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    
    __vector float src_vect_a =
      *((const __vector float*)input_vals);
    __vector float src_vect_b =
      *((const __vector float*)(input_vals + 4));
    __vector unsigned short result_vect =
      vec_pack_to_short_fp32(src_vect_a, src_vect_b);
    *((__vector unsigned short*)result_vals) = result_vect;
    
    for(int j = 0; j < 8; j++)
      printf("Float32ToFloat16(%g) = %g\n",
             input_vals[j], Float16BitsToDouble(result_vals[j]));
  }
  return 0;
}
```

Here are the results of running the above test program when compiled with GCC 12 (this is the expected output: each result is the corresponding input rounded to half precision):
```console
$ ./vsx_vec_pack_to_short_fp32_test_021623_gcc 1.518 2.4447 3.3932 6.4842 -1.4912 -3.3938 -7.532 6.6662
Float32ToFloat16(1.518) = 1.51758
Float32ToFloat16(2.4447) = 2.44531
Float32ToFloat16(3.3932) = 3.39258
Float32ToFloat16(6.4842) = 6.48438
Float32ToFloat16(-1.4912) = -1.49121
Float32ToFloat16(-3.3938) = -3.39453
Float32ToFloat16(-7.532) = -7.53125
Float32ToFloat16(6.6662) = 6.66797
```

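Here are the results of running the above test program when compiled with Clang 15 or Clang 17 (incorrect: every odd-indexed result is 0, and results 2 and 4 hold the conversions of inputs 4 and 2 respectively):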
```console
$ ./vsx_vec_pack_to_short_fp32_test_021623_clang 1.518 2.4447 3.3932 6.4842 -1.4912 -3.3938 -7.532 6.6662
Float32ToFloat16(1.518) = 1.51758
Float32ToFloat16(2.4447) = 0
Float32ToFloat16(3.3932) = -1.49121
Float32ToFloat16(6.4842) = 0
Float32ToFloat16(-1.4912) = 3.39258
Float32ToFloat16(-3.3938) = 0
Float32ToFloat16(-7.532) = -7.53125
Float32ToFloat16(6.6662) = 0
```

	/* Current Clang implementation (the version this report describes as
	 * producing incorrect results): converts both inputs with xvcvsphp, then
	 * merges the two results and reinterprets the merged vector as 16-bit
	 * elements. */
	static __inline__ vector unsigned short __ATTRS_o_ai
	vec_pack_to_short_fp32(vector float __a, vector float __b) {
	vector float __resa = __builtin_vsx_xvcvsphp(__a);
	vector float __resb = __builtin_vsx_xvcvsphp(__b);
	/* NOTE(review): __resa and __resb are vector float, so vec_mergee and
	 * vec_mergeo interleave whole 32-bit elements rather than narrowing them
	 * to 16 bits. Only every other element of each input reaches the result,
	 * and each surviving element still occupies 32 bits. This matches the
	 * Clang output in this report: zeros in the odd 16-bit slots, and the
	 * conversions of inputs 0, 4, 2 and 6 in slots 0, 2, 4 and 6. */
	#ifdef __LITTLE_ENDIAN__
	return (vector unsigned short)vec_mergee(__resa, __resb);
	#else
	return (vector unsigned short)vec_mergeo(__resa, __resb);
	#endif
	}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Clang vec_pack_to_short_fp32 implementation generates incorrect results #60822

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Clang vec_pack_to_short_fp32 implementation generates incorrect results #60822

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions