1
1
/*******************************************************************************
2
- Copyright (c) 2015 , The OpenBLAS Project
2
+ Copyright (c) 2015 , 2024 The OpenBLAS Project
3
3
All rights reserved.
4
4
Redistribution and use in source and binary forms, with or without
5
5
modification, are permitted provided that the following conditions are
@@ -170,39 +170,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
170
170
171
171
.macro KERNEL_F32_FINALIZE
172
172
#if !defined(DOUBLE)
173
- fadd v1.4s , v1.4s , v2.4s
173
+ // F8 only has 2 accumulators
174
+ // so add into those pairs
174
175
fadd v1.4s , v1.4s , v3.4s
175
- fadd v1.4s , v1.4s , v4.4s
176
- #else
177
- fadd v1.2d , v1.2d , v2.2d
178
- fadd v1.2d , v1.2d , v3.2d
179
- fadd v1.2d , v1.2d , v4.2d
176
+ fadd v2.4s , v2.4s , v4.4s
180
177
#endif
181
178
.endm
182
179
183
- .macro KERNEL_F4
180
+ .macro KERNEL_F8
184
181
#if !defined(DOUBLE)
185
- ld1 {v2 .4s} , [ A_PTR ], # 16
186
- ld1 {v3 .4s} , [ X_PTR ], # 16
187
- fmla v1.4s , v2 .4s , v3 .4s
188
- #else
189
- ld1 {v2.2d} , [ A_PTR ], # 16
190
- ld1 {v3 .2d} , [ X_PTR ], # 16
191
- fmla v1 .2d , v2 .2d , v3 .2d
192
-
193
- ld1 {v4 .2d} , [ A_PTR ], # 16
194
- ld1 {v5 .2d} , [ X_PTR ], # 16
195
- fmla v1 .2d , v4 .2d , v5 .2d
182
+ ld1 {v13 .4s , v14.4s }, [ A_PTR ], # 32
183
+ ld1 {v17 .4s , v18.4s }, [ X_PTR ], # 32
184
+ fmla v1.4s , v13 .4s , v17 .4s
185
+ fmla v2.4s , v14.4s , v18.4s
186
+ #else
187
+ ld1 {v13 .2d , v14.2d , v15.2d , v16.2d }, [ A_PTR ], # 64
188
+ ld1 {v17 .2d , v18 .2d , v19 .2d , v20.2d} , [ X_PTR ], # 64
189
+ fmla v1.2d , v13.2d , v17.2d
190
+ fmla v2 .2d , v14.2d , v18.2d
191
+ fmla v3 .2d , v15.2d , v19.2d
192
+ fmla v4 .2d , v16 .2d , v20 .2d
196
193
#endif
197
194
.endm
198
195
199
- .macro KERNEL_F4_FINALIZE
196
+ .macro KERNEL_F8_FINALIZE
200
197
#if !defined(DOUBLE)
201
- ext v2.16b , v1.16b , v1.16b , # 8
198
+ // Take the top two elements of v1 and
199
+ // put them into the first two lanes of v3
200
+ ext v3.16b , v1.16b , v1.16b , # 8
201
+ fadd v1.2s , v1.2s , v3.2s
202
+ ext v4.16b , v2.16b , v2.16b , # 8
203
+ fadd v2.2s , v2.2s , v4.2s
204
+ // Final pair
202
205
fadd v1.2s , v1.2s , v2.2s
203
206
faddp TEMP , v1.2s
204
207
#else
205
208
faddp TEMP , v1.2d
209
+ faddp TEMP1 , v2.2d
210
+ faddp TEMP2 , v3.2d
211
+ faddp TEMP3 , v4.2d
212
+ fadd TEMP , TEMP , TEMP1
213
+ fadd TEMP2 , TEMP2 , TEMP3
214
+ fadd TEMP , TEMP , TEMP2
206
215
#endif
207
216
.endm
208
217
@@ -258,7 +267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
258
267
259
268
asr I , M , # 5
260
269
cmp I , xzr
261
- beq .Lgemv_t_kernel_F4
270
+ beq .Lgemv_t_kernel_F8
262
271
263
272
.Lgemv_t_kernel_F320:
264
273
@@ -269,24 +278,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
269
278
270
279
KERNEL_F32_FINALIZE
271
280
272
- .Lgemv_t_kernel_F4 :
281
+ .Lgemv_t_kernel_F8 :
273
282
ands I , M , # 31
274
- asr I , I , # 2
283
+ asr I , I , # 3
275
284
cmp I , xzr
276
285
beq .Lgemv_t_kernel_F1
277
286
278
- .Lgemv_t_kernel_F40 :
287
+ .Lgemv_t_kernel_F80 :
279
288
280
- KERNEL_F4
289
+ KERNEL_F8
281
290
282
291
subs I , I , # 1
283
- bne .Lgemv_t_kernel_F40
292
+ bne .Lgemv_t_kernel_F80
284
293
285
294
.Lgemv_t_kernel_F1:
286
295
287
- KERNEL_F4_FINALIZE
296
+ KERNEL_F8_FINALIZE
288
297
289
- ands I , M , # 3
298
+ ands I , M , # 7
290
299
ble .Lgemv_t_kernel_F_END
291
300
292
301
.Lgemv_t_kernel_F10:
0 commit comments