Build AVX and SSE4 versions of dfloor, dceil, and floor

isuruf · kiranchandramohan · commit f7ff9d42bf2a · 2022-01-12T11:42:37.000Z
diff --git a/runtime/libpgmath/lib/common/CMakeLists.txt b/runtime/libpgmath/lib/common/CMakeLists.txt
@@ -127,7 +127,7 @@ elseif(${LIBPGMATH_SYSTEM_PROCESSOR} MATCHES "ppc64le")
     mth_128defs_init.c
     mth_128defs_stats.c)
   libmath_add_object_library("${MTH_INTRINSICS_SRCS}" "${FLAGS}" "${DEFINITIONS}" "mth_intrinsics")
- 
+
   set(SRCS
     kidnnt.c
     sincos.c
@@ -454,7 +454,7 @@ else()
   libmath_add_object_library("${TARGET_NAME}.c" "${FLAGS}" "${DEFINITIONS}" "${TARGET_NAME}_build")
   add_dependencies("${TARGET_NAME}_build" ${TARGET_NAME})
 endif()
-  
+
 # Generate tmp-mth_statsdefs.h
 set(TARGET_NAME "tmp-mth_statsdefs")
 add_custom_command(OUTPUT ${TARGET_NAME}.h PRE_BUILD
@@ -466,6 +466,6 @@ add_dependencies(dispatch ${TARGET_NAME} tmp-mth_alldefs)
 add_dependencies(dispatch ${TARGET_NAME} tmp-mth_statsdefs)
 target_include_directories(dispatch
   BEFORE
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR} 
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
   PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/../${PROCESSOR}/math_tables
   PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/../generic/math_tables)
diff --git a/runtime/libpgmath/lib/common/dceil.c b/runtime/libpgmath/lib/common/dceil.c
@@ -6,26 +6,44 @@
  */
 
 #include "mthdecls.h"
-#if     defined(__SSE4_1__) || defined(__AVX__)
-#include    <immintrin.h>
-#endif
 
-#if     defined(__AVX__)
+#if     defined(TARGET_X8664)
+/*
+ * For X8664, implement both SSE and AVX versions of __mth_i_dceil using ISA
+ * instruction extensions.
+ *
+ * Using inline assembly allows both the SSE and AVX versions of the routine
+ * to be compiled in a single unit.
+ *
+ * Each of the following asm statements is equivalent to:
+ *      return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+ * but without the need for separate compilations for the SSE4.1 and AVX ISA
+ * extensions.
+ */
+
 double
-__mth_i_dceil_avx(double x)
+__mth_i_dceil_sse(double x)
 {
-  return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+  __asm__(
+    "roundsd $0x2,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#elif   defined(__SSE4_1__)
+
 double
-__mth_i_dceil_sse(double x)
+__mth_i_dceil_avx(double x)
 {
-  return _mm_cvtsd_f64(_mm_ceil_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+  __asm__(
+    "vroundsd $0x2,%0,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#else
+#endif
+
 double
 __mth_i_dceil(double x)
 {
   return ceil(x);
 }
-#endif
diff --git a/runtime/libpgmath/lib/common/dfloor.c b/runtime/libpgmath/lib/common/dfloor.c
@@ -6,26 +6,44 @@
  */
 
 #include "mthdecls.h"
-#if     defined(__SSE4_1__) || defined(__AVX__)
-#include    <immintrin.h>
-#endif
 
-#if     defined(__AVX__)
+#if     defined(TARGET_X8664)
+/*
+ * For X8664, implement both SSE and AVX versions of __mth_i_dfloor using ISA
+ * instruction extensions.
+ *
+ * Using inline assembly allows both the SSE and AVX versions of the routine
+ * to be compiled in a single unit.
+ *
+ * Each of the following asm statements is equivalent to:
+ *      return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+ * but without the need for separate compilations for the SSE4.1 and AVX ISA
+ * extensions.
+ */
+
 double
-__mth_i_dfloor_avx(double x)
+__mth_i_dfloor_sse(double x)
 {
-  return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+  __asm__(
+    "roundsd $0x1,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#elif   defined(__SSE4_1__)
+
 double
-__mth_i_dfloor_sse(double x)
+__mth_i_dfloor_avx(double x)
 {
-  return _mm_cvtsd_f64(_mm_floor_sd(_mm_set1_pd(x), _mm_set1_pd(x)));
+  __asm__(
+    "vroundsd $0x1,%0,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#else
+#endif
+
 double
 __mth_i_dfloor(double x)
 {
   return floor(x);
 }
-#endif
diff --git a/runtime/libpgmath/lib/common/floor.c b/runtime/libpgmath/lib/common/floor.c
@@ -6,26 +6,44 @@
  */
 
 #include "mthdecls.h"
-#if     defined(__SSE4_1__) || defined(__AVX__)
-#include    <immintrin.h>
-#endif
 
-#if     defined(__AVX__)
+#if     defined(TARGET_X8664)
+/*
+ * For X8664, implement both SSE and AVX versions of __mth_i_floor using ISA
+ * instruction extensions.
+ *
+ * Using inline assembly allows both the SSE and AVX versions of the routine
+ * to be compiled in a single unit.
+ *
+ * Each of the following asm statements is equivalent to:
+ *      return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
+ * but without the need for separate compilations for the SSE4.1 and AVX ISA
+ * extensions.
+ */
+
 float
-__mth_i_floor_avx(float x)
+__mth_i_floor_sse(float x)
 {
-  return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
+  __asm__(
+    "roundss $0x1,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#elif   defined(__SSE4_1__)
+
 float
-__mth_i_floor_sse(float x)
+__mth_i_floor_avx(float x)
 {
-  return _mm_cvtss_f32(_mm_floor_ss(_mm_set1_ps(x), _mm_set1_ps(x)));
+  __asm__(
+    "vroundss $0x1,%0,%0,%0"
+    :"+x"(x)
+    );
+  return x;
 }
-#else
+#endif
+
 float
 __mth_i_floor(float x)
 {
   return floorf(x);
 }
-#endif
diff --git a/runtime/libpgmath/lib/x86_64/math_tables/mth_ceildefs.h b/runtime/libpgmath/lib/x86_64/math_tables/mth_ceildefs.h
@@ -5,8 +5,14 @@
  *
  */
 
-MTHINTRIN(ceil  , ss   , any        ,  __mth_i_ceil         ,  __mth_i_ceil         , __mth_i_ceil          ,__math_dispatch_error)
-MTHINTRIN(ceil  , ds   , any        ,  __mth_i_dceil        , __mth_i_dceil         , __mth_i_dceil         ,__math_dispatch_error)
+MTHINTRIN(ceil  , ss   , any        ,  __mth_i_ceil         , __mth_i_ceil          , __mth_i_ceil          ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , em64t      ,  __mth_i_dceil        , __mth_i_dceil         , __mth_i_dceil         ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , sse4       ,  __mth_i_dceil_sse    , __mth_i_dceil_sse     , __mth_i_dceil_sse     ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , avx        ,  __mth_i_dceil_avx    , __mth_i_dceil_avx     , __mth_i_dceil_avx     ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , avxfma4    ,  __mth_i_dceil_avx    , __mth_i_dceil_avx     , __mth_i_dceil_avx     ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , avx2       ,  __mth_i_dceil_avx    , __mth_i_dceil_avx     , __mth_i_dceil_avx     ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , avx512knl  ,  __mth_i_dceil_avx    , __mth_i_dceil_avx     , __mth_i_dceil_avx     ,__math_dispatch_error)
+MTHINTRIN(ceil  , ds   , avx512     ,  __mth_i_dceil_avx    , __mth_i_dceil_avx     , __mth_i_dceil_avx     ,__math_dispatch_error)
 MTHINTRIN(ceil  , sv4  , any        ,  __gs_ceil_4_f        ,  __gs_ceil_4_r        , __gs_ceil_4_p         ,__math_dispatch_error)
 MTHINTRIN(ceil  , dv2  , any        ,  __gd_ceil_2_f        ,  __gd_ceil_2_r        , __gd_ceil_2_p         ,__math_dispatch_error)
 MTHINTRIN(ceil  , sv8  , any        ,  __gs_ceil_8_f        ,  __gs_ceil_8_r        , __gs_ceil_8_p         ,__math_dispatch_error)
diff --git a/runtime/libpgmath/lib/x86_64/math_tables/mth_floordefs.h b/runtime/libpgmath/lib/x86_64/math_tables/mth_floordefs.h
@@ -5,8 +5,20 @@
  *
  */
 
-MTHINTRIN(floor  , ss   , any        ,  __mth_i_floor         ,  __mth_i_floor         , __mth_i_floor          ,__math_dispatch_error)
-MTHINTRIN(floor  , ds   , any        ,  __mth_i_dfloor        ,  __mth_i_dfloor        , __mth_i_dfloor         ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , em64t      ,  __mth_i_floor         ,  __mth_i_floor         , __mth_i_floor          ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , em64t      ,  __mth_i_dfloor        ,  __mth_i_dfloor        , __mth_i_dfloor         ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , sse4       ,  __mth_i_floor_sse     ,  __mth_i_floor_sse     , __mth_i_floor_sse      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , sse4       ,  __mth_i_dfloor_sse    ,  __mth_i_dfloor_sse    , __mth_i_dfloor_sse     ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , avx        ,  __mth_i_floor_avx     ,  __mth_i_floor_avx     , __mth_i_floor_avx      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , avx        ,  __mth_i_dfloor_avx    ,  __mth_i_dfloor_avx    , __mth_i_dfloor_avx     ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , avxfma4    ,  __mth_i_floor_avx     ,  __mth_i_floor_avx     , __mth_i_floor_avx      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , avxfma4    ,  __mth_i_dfloor_avx    ,  __mth_i_dfloor_avx    , __mth_i_dfloor_avx     ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , avx2       ,  __mth_i_floor_avx     ,  __mth_i_floor_avx     , __mth_i_floor_avx      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , avx2       ,  __mth_i_dfloor_avx    ,  __mth_i_dfloor_avx    , __mth_i_dfloor_avx     ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , avx512knl  ,  __mth_i_floor_avx     ,  __mth_i_floor_avx     , __mth_i_floor_avx      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , avx512knl  ,  __mth_i_dfloor_avx    ,  __mth_i_dfloor_avx    , __mth_i_dfloor_avx     ,__math_dispatch_error)
+MTHINTRIN(floor  , ss   , avx512     ,  __mth_i_floor_avx     ,  __mth_i_floor_avx     , __mth_i_floor_avx      ,__math_dispatch_error)
+MTHINTRIN(floor  , ds   , avx512     ,  __mth_i_dfloor_avx    ,  __mth_i_dfloor_avx    , __mth_i_dfloor_avx     ,__math_dispatch_error)
 MTHINTRIN(floor  , sv4  , any        ,  __gs_floor_4_f        ,  __gs_floor_4_r        , __gs_floor_4_p         ,__math_dispatch_error)
 MTHINTRIN(floor  , dv2  , any        ,  __gd_floor_2_f        ,  __gd_floor_2_r        , __gd_floor_2_p         ,__math_dispatch_error)
 MTHINTRIN(floor  , sv8  , any        ,  __gs_floor_8_f        ,  __gs_floor_8_r        , __gs_floor_8_p         ,__math_dispatch_error)