Skip to content

Commit f7ff9d4

Browse files
isuruf authored and
kiranchandramohan committed
build avx, sse4 versions of dfloor dceil floor
1 parent 9808a3f commit f7ff9d4

File tree

6 files changed

+112
-40
lines changed

6 files changed

+112
-40
lines changed

runtime/libpgmath/lib/common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ elseif(${LIBPGMATH_SYSTEM_PROCESSOR} MATCHES "ppc64le")
127127
mth_128defs_init.c
128128
mth_128defs_stats.c)
129129
libmath_add_object_library("${MTH_INTRINSICS_SRCS}" "${FLAGS}" "${DEFINITIONS}" "mth_intrinsics")
130-
130+
131131
set(SRCS
132132
kidnnt.c
133133
sincos.c
@@ -454,7 +454,7 @@ else()
454454
libmath_add_object_library("${TARGET_NAME}.c" "${FLAGS}" "${DEFINITIONS}" "${TARGET_NAME}_build")
455455
add_dependencies("${TARGET_NAME}_build" ${TARGET_NAME})
456456
endif()
457-
457+
458458
# Generate tmp-mth_statsdefs.h
459459
set(TARGET_NAME "tmp-mth_statsdefs")
460460
add_custom_command(OUTPUT ${TARGET_NAME}.h PRE_BUILD
@@ -466,6 +466,6 @@ add_dependencies(dispatch ${TARGET_NAME} tmp-mth_alldefs)
466466
add_dependencies(dispatch ${TARGET_NAME} tmp-mth_statsdefs)
467467
target_include_directories(dispatch
468468
BEFORE
469-
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
469+
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
470470
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/../${PROCESSOR}/math_tables
471471
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/../generic/math_tables)

runtime/libpgmath/lib/common/dceil.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,44 @@
66
*/
77

88
#include "mthdecls.h"
9-
#if defined(__SSE4_1__) || defined(__AVX__)
10-
#include <immintrin.h>
11-
#endif
129

13-
#if defined(__AVX__)
10+
#if defined(TARGET_X8664)
11+
/*
12+
* For X8664, implement both SSE and AVX versions of __mth_i_dceil using ISA
13+
* instruction extensions.
14+
*
15+
* Using inline assembly allows both the SSE and AVX versions of the routine
16+
* to be compiled in a single unit.
17+
*
18+
* The following asm statement is equivalent to:
19+
* return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
20+
* But without the need for separate compilations for SSE4.1 and AVX ISA
21+
* extensions.
22+
*/
23+
1424
double
15-
__mth_i_dceil_avx(double x)
25+
__mth_i_dceil_sse(double x)
1626
{
17-
return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
27+
__asm__(
28+
"roundsd $0x2,%0,%0"
29+
:"+x"(x)
30+
);
31+
return x;
1832
}
19-
#elif defined(__SSE4_1__)
33+
2034
double
21-
__mth_i_dceil_sse(double x)
35+
__mth_i_dceil_avx(double x)
2236
{
23-
return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
37+
__asm__(
38+
"vroundsd $0x2,%0,%0,%0"
39+
:"+x"(x)
40+
);
41+
return x;
2442
}
25-
#else
43+
#endif
44+
2645
double
2746
__mth_i_dceil(double x)
2847
{
2948
return ceil(x);
3049
}
31-
#endif

runtime/libpgmath/lib/common/dfloor.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,44 @@
66
*/
77

88
#include "mthdecls.h"
9-
#if defined(__SSE4_1__) || defined(__AVX__)
10-
#include <immintrin.h>
11-
#endif
129

13-
#if defined(__AVX__)
10+
#if defined(TARGET_X8664)
11+
/*
12+
* For X8664, implement both SSE and AVX versions of __mth_i_dfloor using ISA
13+
* instruction extensions.
14+
*
15+
* Using inline assembly allows both the SSE and AVX versions of the routine
16+
* to be compiled in a single unit.
17+
*
18+
* The following asm statement is equivalent to:
19+
* return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
20+
* But without the need for separate compilations for SSE4.1 and AVX ISA
21+
* extensions.
22+
*/
23+
1424
double
15-
__mth_i_dfloor_avx(double x)
25+
__mth_i_dfloor_sse(double x)
1626
{
17-
return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
27+
__asm__(
28+
"roundsd $0x1,%0,%0"
29+
:"+x"(x)
30+
);
31+
return x;
1832
}
19-
#elif defined(__SSE4_1__)
33+
2034
double
21-
__mth_i_dfloor_sse(double x)
35+
__mth_i_dfloor_avx(double x)
2236
{
23-
return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
37+
__asm__(
38+
"vroundsd $0x1,%0,%0,%0"
39+
:"+x"(x)
40+
);
41+
return x;
2442
}
25-
#else
43+
#endif
44+
2645
double
2746
__mth_i_dfloor(double x)
2847
{
2948
return floor(x);
3049
}
31-
#endif

runtime/libpgmath/lib/common/floor.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,44 @@
66
*/
77

88
#include "mthdecls.h"
9-
#if defined(__SSE4_1__) || defined(__AVX__)
10-
#include <immintrin.h>
11-
#endif
129

13-
#if defined(__AVX__)
10+
#if defined(TARGET_X8664)
11+
/*
12+
* For X8664, implement both SSE and AVX versions of __mth_i_floor using ISA
13+
* instruction extensions.
14+
*
15+
* Using inline assembly allows both the SSE and AVX versions of the routine
16+
* to be compiled in a single unit.
17+
*
18+
* The following asm statement is equivalent to:
19+
* return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
20+
* But without the need for separate compilations for SSE4.1 and AVX ISA
21+
* extensions.
22+
*/
23+
1424
float
15-
__mth_i_floor_avx(float x)
25+
__mth_i_floor_sse(float x)
1626
{
17-
return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
27+
__asm__(
28+
"roundss $0x1,%0,%0"
29+
:"+x"(x)
30+
);
31+
return x;
1832
}
19-
#elif defined(__SSE4_1__)
33+
2034
float
21-
__mth_i_floor_sse(float x)
35+
__mth_i_floor_avx(float x)
2236
{
23-
return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
37+
__asm__(
38+
"vroundss $0x1,%0,%0,%0"
39+
:"+x"(x)
40+
);
41+
return x;
2442
}
25-
#else
43+
#endif
44+
2645
float
2746
__mth_i_floor(float x)
2847
{
2948
return floorf(x);
3049
}
31-
#endif

runtime/libpgmath/lib/x86_64/math_tables/mth_ceildefs.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,14 @@
55
*
66
*/
77

8-
MTHINTRIN(ceil , ss , any , __mth_i_ceil , __mth_i_ceil , __mth_i_ceil ,__math_dispatch_error)
9-
MTHINTRIN(ceil , ds , any , __mth_i_dceil , __mth_i_dceil , __mth_i_dceil ,__math_dispatch_error)
8+
MTHINTRIN(ceil , ss , any , __mth_i_ceil , __mth_i_ceil , __mth_i_ceil ,__math_dispatch_error)
9+
MTHINTRIN(ceil , ds , em64t , __mth_i_dceil , __mth_i_dceil , __mth_i_dceil ,__math_dispatch_error)
10+
MTHINTRIN(ceil , ds , sse4 , __mth_i_dceil_sse , __mth_i_dceil_sse , __mth_i_dceil_sse ,__math_dispatch_error)
11+
MTHINTRIN(ceil , ds , avx , __mth_i_dceil_avx , __mth_i_dceil_avx , __mth_i_dceil_avx ,__math_dispatch_error)
12+
MTHINTRIN(ceil , ds , avxfma4 , __mth_i_dceil_avx , __mth_i_dceil_avx , __mth_i_dceil_avx ,__math_dispatch_error)
13+
MTHINTRIN(ceil , ds , avx2 , __mth_i_dceil_avx , __mth_i_dceil_avx , __mth_i_dceil_avx ,__math_dispatch_error)
14+
MTHINTRIN(ceil , ds , avx512knl , __mth_i_dceil_avx , __mth_i_dceil_avx , __mth_i_dceil_avx ,__math_dispatch_error)
15+
MTHINTRIN(ceil , ds , avx512 , __mth_i_dceil_avx , __mth_i_dceil_avx , __mth_i_dceil_avx ,__math_dispatch_error)
1016
MTHINTRIN(ceil , sv4 , any , __gs_ceil_4_f , __gs_ceil_4_r , __gs_ceil_4_p ,__math_dispatch_error)
1117
MTHINTRIN(ceil , dv2 , any , __gd_ceil_2_f , __gd_ceil_2_r , __gd_ceil_2_p ,__math_dispatch_error)
1218
MTHINTRIN(ceil , sv8 , any , __gs_ceil_8_f , __gs_ceil_8_r , __gs_ceil_8_p ,__math_dispatch_error)

runtime/libpgmath/lib/x86_64/math_tables/mth_floordefs.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,20 @@
55
*
66
*/
77

8-
MTHINTRIN(floor , ss , any , __mth_i_floor , __mth_i_floor , __mth_i_floor ,__math_dispatch_error)
9-
MTHINTRIN(floor , ds , any , __mth_i_dfloor , __mth_i_dfloor , __mth_i_dfloor ,__math_dispatch_error)
8+
MTHINTRIN(floor , ss , em64t , __mth_i_floor , __mth_i_floor , __mth_i_floor ,__math_dispatch_error)
9+
MTHINTRIN(floor , ds , em64t , __mth_i_dfloor , __mth_i_dfloor , __mth_i_dfloor ,__math_dispatch_error)
10+
MTHINTRIN(floor , ss , sse4 , __mth_i_floor_sse , __mth_i_floor_sse , __mth_i_floor_sse ,__math_dispatch_error)
11+
MTHINTRIN(floor , ds , sse4 , __mth_i_dfloor_sse , __mth_i_dfloor_sse , __mth_i_dfloor_sse ,__math_dispatch_error)
12+
MTHINTRIN(floor , ss , avx , __mth_i_floor_avx , __mth_i_floor_avx , __mth_i_floor_avx ,__math_dispatch_error)
13+
MTHINTRIN(floor , ds , avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx ,__math_dispatch_error)
14+
MTHINTRIN(floor , ss , avxfma4 , __mth_i_floor_avx , __mth_i_floor_avx , __mth_i_floor_avx ,__math_dispatch_error)
15+
MTHINTRIN(floor , ds , avxfma4 , __mth_i_dfloor_avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx ,__math_dispatch_error)
16+
MTHINTRIN(floor , ss , avx2 , __mth_i_floor_avx , __mth_i_floor_avx , __mth_i_floor_avx ,__math_dispatch_error)
17+
MTHINTRIN(floor , ds , avx2 , __mth_i_dfloor_avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx ,__math_dispatch_error)
18+
MTHINTRIN(floor , ss , avx512knl , __mth_i_floor_avx , __mth_i_floor_avx , __mth_i_floor_avx ,__math_dispatch_error)
19+
MTHINTRIN(floor , ds , avx512knl , __mth_i_dfloor_avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx ,__math_dispatch_error)
20+
MTHINTRIN(floor , ss , avx512 , __mth_i_floor_avx , __mth_i_floor_avx , __mth_i_floor_avx ,__math_dispatch_error)
21+
MTHINTRIN(floor , ds , avx512 , __mth_i_dfloor_avx , __mth_i_dfloor_avx , __mth_i_dfloor_avx ,__math_dispatch_error)
1022
MTHINTRIN(floor , sv4 , any , __gs_floor_4_f , __gs_floor_4_r , __gs_floor_4_p ,__math_dispatch_error)
1123
MTHINTRIN(floor , dv2 , any , __gd_floor_2_f , __gd_floor_2_r , __gd_floor_2_p ,__math_dispatch_error)
1224
MTHINTRIN(floor , sv8 , any , __gs_floor_8_f , __gs_floor_8_r , __gs_floor_8_p ,__math_dispatch_error)

0 commit comments

Comments
 (0)