Skip to content

Commit 886cbaf

Browse files
committed
Support AMD Piledriver by bulldozer kernels.
1 parent 0c4074e commit 886cbaf

30 files changed

+377
-80
lines changed

Makefile.system

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,14 +311,14 @@ ifeq ($(ARCH), x86)
311311
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
312312
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
313313
ifneq ($(NO_AVX), 1)
314-
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
314+
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
315315
endif
316316
endif
317317

318318
ifeq ($(ARCH), x86_64)
319319
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
320320
ifneq ($(NO_AVX), 1)
321-
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
321+
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
322322
endif
323323
endif
324324

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Please read GotoBLAS_01Readme.txt
4848
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
4949
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
5050
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
51+
- **AMD Piledriver**: Used Bulldozer codes.
5152

5253
#### MIPS64:
5354
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.

common_x86.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
171171
#define MMXSTORE movd
172172
#endif
173173

174+
#if defined(PILEDRIVER) || defined(BULLDOZER)
175+
//Enable some optimization for barcelona.
176+
#define BARCELONA_OPTIMIZATION
177+
#endif
178+
174179
#if defined(HAVE_3DNOW)
175180
#define EMMS femms
176181
#elif defined(HAVE_MMX)

common_x86_64.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
218218

219219
#ifdef ASSEMBLER
220220

221+
#if defined(PILEDRIVER) || defined(BULLDOZER)
222+
//Enable some optimization for barcelona.
223+
#define BARCELONA_OPTIMIZATION
224+
#endif
225+
221226
#if defined(HAVE_3DNOW)
222227
#define EMMS femms
223228
#elif defined(HAVE_MMX)

cpuid.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@
106106
#define CORE_SANDYBRIDGE 20
107107
#define CORE_BOBCAT 21
108108
#define CORE_BULLDOZER 22
109+
#define CORE_PILEDRIVER 23
109110
#define CORE_HASWELL CORE_SANDYBRIDGE
110111

111112
#define HAVE_SSE (1 << 0)
@@ -128,6 +129,7 @@
128129
#define HAVE_FASTMOVU (1 << 17)
129130
#define HAVE_AVX (1 << 18)
130131
#define HAVE_FMA4 (1 << 19)
132+
#define HAVE_FMA3 (1 << 20)
131133

132134
#define CACHE_INFO_L1_I 1
133135
#define CACHE_INFO_L1_D 2
@@ -197,6 +199,7 @@ typedef struct {
197199
#define CPUTYPE_SANDYBRIDGE 44
198200
#define CPUTYPE_BOBCAT 45
199201
#define CPUTYPE_BULLDOZER 46
202+
#define CPUTYPE_PILEDRIVER 47
200203
// this define is because BLAS doesn't have haswell specific optimizations yet
201204
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
202205

cpuid_x86.c

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
#define CORE_SANDYBRIDGE CORE_NEHALEM
4848
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
4949
#define CORE_BULLDOZER CORE_BARCELONA
50+
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
51+
#define CORE_PILEDRIVER CORE_BARCELONA
5052
#endif
5153

5254
#ifndef CPUIDEMU
@@ -228,6 +230,7 @@ int get_cputype(int gettype){
228230
#ifndef NO_AVX
229231
if (support_avx()) feature |= HAVE_AVX;
230232
#endif
233+
if ((ecx & (1 << 20)) != 0) feature |= HAVE_FMA3;
231234

232235
if (have_excpuid() >= 0x01) {
233236
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
@@ -1100,11 +1103,21 @@ int get_cpuname(void){
11001103
case 1:
11011104
case 10:
11021105
return CPUTYPE_BARCELONA;
1103-
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
1104-
if(support_avx())
1105-
return CPUTYPE_BULLDOZER;
1106-
else
1107-
return CPUTYPE_BARCELONA; //OS don't support AVX.
1106+
case 6:
1107+
switch (model) {
1108+
case 1:
1109+
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
1110+
if(support_avx())
1111+
return CPUTYPE_BULLDOZER;
1112+
else
1113+
return CPUTYPE_BARCELONA; //OS doesn't support AVX.
1114+
case 2:
1115+
if(support_avx())
1116+
return CPUTYPE_PILEDRIVER;
1117+
else
1118+
return CPUTYPE_BARCELONA; //OS doesn't support AVX.
1119+
}
1120+
break;
11081121
case 5:
11091122
return CPUTYPE_BOBCAT;
11101123
}
@@ -1229,6 +1242,7 @@ static char *cpuname[] = {
12291242
"SANDYBRIDGE",
12301243
"BOBCAT",
12311244
"BULLDOZER",
1245+
"PILEDRIVER",
12321246
};
12331247

12341248
static char *lowercpuname[] = {
@@ -1278,6 +1292,7 @@ static char *lowercpuname[] = {
12781292
"sandybridge",
12791293
"bobcat",
12801294
"bulldozer",
1295+
"piledriver",
12811296
};
12821297

12831298
static char *corename[] = {
@@ -1304,6 +1319,7 @@ static char *corename[] = {
13041319
"SANDYBRIDGE",
13051320
"BOBCAT",
13061321
"BULLDOZER",
1322+
"PILEDRIVER",
13071323
};
13081324

13091325
static char *corename_lower[] = {
@@ -1330,6 +1346,7 @@ static char *corename_lower[] = {
13301346
"sandybridge",
13311347
"bobcat",
13321348
"bulldozer",
1349+
"piledriver",
13331350
};
13341351

13351352

@@ -1472,11 +1489,19 @@ int get_coretype(void){
14721489
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
14731490
else if (exfamily == 5) return CORE_BOBCAT;
14741491
else if (exfamily == 6) {
1475-
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
1476-
if(support_avx())
1477-
return CORE_BULLDOZER;
1478-
else
1479-
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
1492+
switch (model) {
1493+
case 1:
1494+
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
1495+
if(support_avx())
1496+
return CORE_BULLDOZER;
1497+
else
1498+
return CORE_BARCELONA; //OS doesn't support AVX.
1499+
case 2:
1500+
if(support_avx())
1501+
return CORE_PILEDRIVER;
1502+
else
1503+
return CORE_BARCELONA; //OS doesn't support AVX.
1504+
}
14801505
}else return CORE_BARCELONA;
14811506
}
14821507
}
@@ -1564,6 +1589,7 @@ void get_cpuconfig(void){
15641589
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
15651590
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
15661591
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
1592+
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
15671593
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
15681594
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
15691595
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@@ -1631,5 +1657,6 @@ void get_sse(void){
16311657
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
16321658
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
16331659
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
1660+
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
16341661

16351662
}

driver/others/dynamic.c

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ extern gotoblas_t gotoblas_BOBCAT;
6464
#ifndef NO_AVX
6565
extern gotoblas_t gotoblas_SANDYBRIDGE;
6666
extern gotoblas_t gotoblas_BULLDOZER;
67+
extern gotoblas_t gotoblas_PILEDRIVER;
6768
#else
6869
//Use NEHALEM kernels for sandy bridge
6970
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
7071
#define gotoblas_BULLDOZER gotoblas_BARCELONA
72+
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
7173
#endif
7274
//Use sandy bridge kernels for haswell.
7375
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
@@ -228,13 +230,23 @@ static gotoblas_t *get_coretype(void){
228230
} else if (exfamily == 5) {
229231
return &gotoblas_BOBCAT;
230232
} else if (exfamily == 6) {
231-
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
233+
if(model == 1){
234+
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
232235
if(support_avx())
233236
return &gotoblas_BULLDOZER;
234237
else{
235238
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
236239
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
237-
}
240+
}
241+
}else if(model == 2){
242+
//AMD Piledriver Opteron 6300 / Opteron 4300 / Opteron 3300
243+
if(support_avx())
244+
return &gotoblas_PILEDRIVER;
245+
else{
246+
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
247+
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
248+
}
249+
}
238250
} else {
239251
return &gotoblas_BARCELONA;
240252
}
@@ -272,6 +284,7 @@ static char *corename[] = {
272284
"Sandybridge",
273285
"Bobcat",
274286
"Bulldozer",
287+
"Piledriver",
275288
};
276289

277290
char *gotoblas_corename(void) {
@@ -294,6 +307,7 @@ char *gotoblas_corename(void) {
294307
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
295308
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
296309
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
310+
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
297311

298312
return corename[0];
299313
}

getarch.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
106106
/* #define FORCE_ISTANBUL */
107107
/* #define FORCE_BOBCAT */
108108
/* #define FORCE_BULLDOZER */
109+
/* #define FORCE_PILEDRIVER */
109110
/* #define FORCE_SSE_GENERIC */
110111
/* #define FORCE_VIAC3 */
111112
/* #define FORCE_NANO */
@@ -398,6 +399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
398399
#define CORENAME "BULLDOZER"
399400
#endif
400401

402+
/* Forced-target configuration used when FORCE_PILEDRIVER is defined.
 * ARCHCONFIG is assembled from adjacent string literals, which the compiler
 * concatenates with nothing in between. Every fragment except the last must
 * therefore end in a space; otherwise neighbouring -D flags fuse into one
 * bogus token (e.g. "-DHAVE_SSE4_2-DHAVE_SSE4A"). */
#if defined (FORCE_PILEDRIVER)
403+
#define FORCE
404+
#define FORCE_INTEL
405+
#define ARCHITECTURE "X86"
406+
#define SUBARCHITECTURE "PILEDRIVER"
407+
#define ARCHCONFIG "-DPILEDRIVER " \
408+
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
409+
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
410+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
411+
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
412+
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
413+
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
414+
#define LIBNAME "piledriver"
415+
#define CORENAME "PILEDRIVER"
416+
#endif
417+
401418
#ifdef FORCE_SSE_GENERIC
402419
#define FORCE
403420
#define FORCE_INTEL

kernel/setparam-ref.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,22 @@ static void init_parameter(void) {
826826
#endif
827827
#endif
828828

829+
#ifdef PILEDRIVER
830+
831+
#ifdef DEBUG
832+
fprintf(stderr, "Piledriver\n");
833+
#endif
834+
835+
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
836+
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
837+
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
838+
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
839+
#ifdef EXPRECISION
840+
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
841+
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
842+
#endif
843+
#endif
844+
829845
#ifdef NANO
830846

831847
#ifdef DEBUG

kernel/x86/KERNEL.PILEDRIVER

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Kernel source selection for the PILEDRIVER target (kernel/x86 directory).
# Each *KERNEL / *COPY variable names the source file used for that routine;
# the matching *OBJ variable names the object built from it. Variables left
# empty (e.g. SGEMMINCOPY) assign no source here.
# The GEMM kernels all use the *_barcelona.S sources; the copy routines use
# the ../generic C implementations.
1+
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
2+
SGEMMINCOPY =
3+
SGEMMITCOPY =
4+
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
5+
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
6+
SGEMMINCOPYOBJ =
7+
SGEMMITCOPYOBJ =
8+
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
9+
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
10+
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
11+
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
12+
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
13+
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
14+
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
15+
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
16+
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
17+
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
18+
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
19+
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
20+
CGEMMINCOPY =
21+
CGEMMITCOPY =
22+
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
23+
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
24+
CGEMMINCOPYOBJ =
25+
CGEMMITCOPYOBJ =
26+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
27+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
28+
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
29+
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
30+
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
31+
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
32+
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
33+
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
34+
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
35+
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
36+
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
37+
# TRSM kernels: the S and C variants use the *_sse.S sources, the D and Z
# variants use the *_sse2.S sources.
# NOTE(review): every *_RN variable points at the corresponding LT source, and
# ZTRSMKERNEL_LN also points at the LT source -- presumably deliberate source
# sharing; confirm against the Barcelona kernel list before changing.
38+
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
39+
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
40+
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
41+
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
42+
43+
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
44+
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
45+
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
46+
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
47+
48+
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
49+
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
50+
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
51+
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
52+
53+
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
54+
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
55+
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
56+
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
57+
# GEMM3M kernels for the C and Z variants, again from the *_barcelona.S sources.
58+
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
59+
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

0 commit comments

Comments
 (0)