Open
Description
While investigating a performance issue in SPEC CPU2006 481.wrf (WRF version 2.0.2) on Zen4, we discovered that the vectorized code produced by LoopVectorize for the loops in the SINT subroutine (sint.f90:95-212) is slower than the equivalent scalar code. This appears to be caused by how masked loads/stores and gathers/scatters are costed during LoopVectorize.
It's also worth noting that AOCC 5.0.0 appears to avoid this issue through BOSCC (branch-on-superword-condition-code). Is there still interest in upstreaming that work? It would be interesting to see whether it helps here.
Here's an IR reproducer for one particular masked gather/scatter issue in SINT (there are others):
; opt -passes=loop-vectorize -pass-remarks=loop-vectorize repro.ll -S -o repro.llvm
; ModuleID = 'FIRModule'
; Module-level target description: the reproducer is built for 64-bit x86 Linux
; (see the triple below); the CPU itself is selected per function through the
; "target-cpu" attribute in attribute group #0.
source_filename = "FIRModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; foo_ - reproducer kernel: a triple loop nest over 1..n whose innermost loops
; read and write through indices loaded from memory (gather/scatter shape).
;
; Arguments (names taken from the TBAA tags attached to the accesses below):
;   %0 = n        i32 trip count, loaded once on entry
;   %1 = arr1     float destination (only stored to)
;   %2 = arr2     float source
;   %3 = arr3     float source
;   %4 = x, %5 = y, %6 = z    i32 index arrays
;   %7 = icmask   i32 flag array, read once per (outer, middle) iteration
;
; Equivalent pseudo-code (0-based element offsets, loops run for 1..n):
;   for k in 1..n:
;     for j in 1..n:
;       flag = icmask[(k-1)*n + (j-1)]
;       for i in 1..n:
;         e = (k-1)*n*n + (j-1)*n + (i-1)
;         t = (x[e]-1) + (y[e]-1)*n + (z[e]-1)*n*n
;         arr1[t] = (flag != 0) ? arr3[t] + arr2[t] : arr2[t] - arr3[t]
;
; The innermost loop appears twice (already unswitched on `flag`): once with
; fadd and once with fsub. Nothing is executed when n <= 0.
;
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
define void @foo_(ptr nocapture readonly %0, ptr nocapture writeonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3, ptr nocapture readonly %4, ptr nocapture readonly %5, ptr nocapture readonly %6, ptr nocapture readonly %7) local_unnamed_addr #0 !dbg !8 {
  ; Entry: load n, sign-extend it, and clamp to >= 0 for use as an array extent.
  %9 = load i32, ptr %0, align 4, !tbaa !23
  %10 = sext i32 %9 to i64
  %11 = tail call i64 @llvm.smax.i64(i64 %10, i64 0)
  ; Skip the whole nest when n <= 0.
  %12 = icmp sgt i32 %9, 0
  br i1 %12, label %.preheader13.lr.ph, label %._crit_edge

.preheader13.lr.ph: ; preds = %8
  ; %13 = n*n, the stride of the outermost dimension.
  %13 = mul nuw nsw i64 %11, %11
  br label %.preheader13.us

; ---- Outer loop (k = %indvars.iv27, starting at 1) ----
.preheader13.us: ; preds = %._crit_edge14.split.us.us, %.preheader13.lr.ph
  %indvars.iv27 = phi i64 [ %indvars.iv.next28, %._crit_edge14.split.us.us ], [ 1, %.preheader13.lr.ph ]
  %14 = add nsw i64 %indvars.iv27, -1
  ; %16 = &icmask[(k-1)*n]: start of this k's row of flags.
  %15 = mul nuw nsw i64 %14, %11
  %16 = getelementptr i32, ptr %7, i64 %15
  ; %18 = (k-1)*n*n - 1; the -1 cancels against the 1-based inner index added later.
  %17 = mul nsw i64 %14, %13
  %18 = add i64 %17, -1
  br label %.preheader.us.us

; ---- Middle loop (j = %indvars.iv23, starting at 1) ----
.preheader.us.us: ; preds = %._crit_edge.us.us, %.preheader13.us
  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %._crit_edge.us.us ], [ 1, %.preheader13.us ]
  %19 = add nsw i64 %indvars.iv23, -1
  ; Load the flag for this (j, k) and test it against zero.
  %20 = getelementptr i32, ptr %16, i64 %19
  %21 = load i32, ptr %20, align 4, !tbaa !73
  %.not.us.us = icmp eq i32 %21, 0
  ; %23 = (k-1)*n*n + (j-1)*n - 1: base offset shared by both inner loops.
  %22 = mul nuw nsw i64 %19, %11
  %23 = add i64 %18, %22
  ; flag == 0 selects the fsub loop, flag != 0 selects the fadd loop.
  br i1 %.not.us.us, label %.lr.ph.split.us.us.us.preheader, label %.lr.ph.split.us16.us.preheader

.lr.ph.split.us16.us.preheader: ; preds = %.preheader.us.us
  br label %.lr.ph.split.us16.us

.lr.ph.split.us.us.us.preheader: ; preds = %.preheader.us.us
  br label %.lr.ph.split.us.us.us

; ---- Inner loop, flag != 0: arr1[t] = arr3[t] + arr2[t] ----
.lr.ph.split.us16.us: ; preds = %.lr.ph.split.us16.us.preheader, %.lr.ph.split.us16.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph.split.us16.us ], [ 1, %.lr.ph.split.us16.us.preheader ]
  ; %24 = linear element offset e for (i, j, k).
  %24 = add i64 %23, %indvars.iv
  ; Load the three index values x[e], y[e], z[e] and widen them to i64.
  %25 = getelementptr i32, ptr %4, i64 %24
  %26 = load i32, ptr %25, align 4, !tbaa !79
  %27 = sext i32 %26 to i64
  %28 = getelementptr i32, ptr %5, i64 %24
  %29 = load i32, ptr %28, align 4, !tbaa !81
  %30 = sext i32 %29 to i64
  %31 = getelementptr i32, ptr %6, i64 %24
  %32 = load i32, ptr %31, align 4, !tbaa !83
  %33 = sext i32 %32 to i64
  ; %40 = (x-1) + (y-1)*n + (z-1)*n*n: data-dependent target offset t.
  %34 = add nsw i64 %27, -1
  %35 = add nsw i64 %30, -1
  %36 = mul nsw i64 %35, %11
  %37 = add nsw i64 %34, %36
  %38 = add nsw i64 %33, -1
  %39 = mul nsw i64 %38, %13
  %40 = add nsw i64 %37, %39
  ; Indexed loads from arr2 and arr3, then an indexed store to arr1.
  %41 = getelementptr float, ptr %2, i64 %40
  %42 = load float, ptr %41, align 4, !tbaa !85
  %43 = getelementptr float, ptr %3, i64 %40
  %44 = load float, ptr %43, align 4, !tbaa !87
  %45 = fadd fast float %44, %42
  %46 = getelementptr float, ptr %1, i64 %40
  ; Tagged with the arr1 TBAA tag, matching the store in the fsub loop, so both
  ; copies of the loop give alias analysis the same information.
  store float %45, ptr %46, align 4, !tbaa !89
  ; Exit test compares the pre-increment index with n, so i covers 1..n.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv, %10
  br i1 %exitcond.not, label %._crit_edge.us.us.loopexit32, label %.lr.ph.split.us16.us

._crit_edge.us.us.loopexit: ; preds = %.lr.ph.split.us.us.us
  br label %._crit_edge.us.us

._crit_edge.us.us.loopexit32: ; preds = %.lr.ph.split.us16.us
  br label %._crit_edge.us.us

; Middle-loop latch: advance j, exit after the j == n iteration.
._crit_edge.us.us: ; preds = %._crit_edge.us.us.loopexit32, %._crit_edge.us.us.loopexit
  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
  %exitcond26.not = icmp eq i64 %indvars.iv23, %10
  br i1 %exitcond26.not, label %._crit_edge14.split.us.us, label %.preheader.us.us

; ---- Inner loop, flag == 0: arr1[t] = arr2[t] - arr3[t] ----
.lr.ph.split.us.us.us: ; preds = %.lr.ph.split.us.us.us.preheader, %.lr.ph.split.us.us.us
  %indvars.iv19 = phi i64 [ %indvars.iv.next20, %.lr.ph.split.us.us.us ], [ 1, %.lr.ph.split.us.us.us.preheader ]
  ; Same address computation as the fadd loop above.
  %47 = add i64 %23, %indvars.iv19
  %48 = getelementptr i32, ptr %4, i64 %47
  %49 = load i32, ptr %48, align 4, !tbaa !79
  %50 = sext i32 %49 to i64
  %51 = getelementptr i32, ptr %5, i64 %47
  %52 = load i32, ptr %51, align 4, !tbaa !81
  %53 = sext i32 %52 to i64
  %54 = getelementptr i32, ptr %6, i64 %47
  %55 = load i32, ptr %54, align 4, !tbaa !83
  %56 = sext i32 %55 to i64
  %57 = add nsw i64 %50, -1
  %58 = add nsw i64 %53, -1
  %59 = mul nsw i64 %58, %11
  %60 = add nsw i64 %57, %59
  %61 = add nsw i64 %56, -1
  %62 = mul nsw i64 %61, %13
  %63 = add nsw i64 %60, %62
  %64 = getelementptr float, ptr %2, i64 %63
  %65 = load float, ptr %64, align 4, !tbaa !85
  %66 = getelementptr float, ptr %3, i64 %63
  %67 = load float, ptr %66, align 4, !tbaa !87
  %68 = fsub fast float %65, %67
  %69 = getelementptr float, ptr %1, i64 %63
  store float %68, ptr %69, align 4, !tbaa !89
  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
  %exitcond22.not = icmp eq i64 %indvars.iv19, %10
  br i1 %exitcond22.not, label %._crit_edge.us.us.loopexit, label %.lr.ph.split.us.us.us

; Outer-loop latch: advance k, exit after the k == n iteration.
._crit_edge14.split.us.us: ; preds = %._crit_edge.us.us
  %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1
  %exitcond30.not = icmp eq i64 %indvars.iv27, %10
  br i1 %exitcond30.not, label %._crit_edge.loopexit, label %.preheader13.us

._crit_edge.loopexit: ; preds = %._crit_edge14.split.us.us
  br label %._crit_edge

._crit_edge: ; preds = %._crit_edge.loopexit, %8
  ret void
}
; Signed 64-bit maximum intrinsic. @foo_ calls it once on entry to clamp the
; sign-extended trip count to a non-negative value.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smax.i64(i64, i64) #1
; #0 is the attribute group attached to @foo_: it restricts memory effects to
; the pointer arguments, sets the fast-math style string attributes, and
; selects "znver4" as the target CPU.
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "approx-func-fp-math"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-cpu"="znver4" "unsafe-fp-math"="true" }
; #1 is the attribute group attached to the @llvm.smax.i64 declaration.
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; Named module-level metadata.
; !llvm.dbg.cu must list every DICompileUnit reachable from the module: @foo_
; carries !dbg !8, whose `unit:` field is the compile unit !5, so !5 is
; registered here. Without this entry the IR verifier reports the debug info
; as invalid.
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.dbg.cu = !{!5}
!llvm.ident = !{!7}
; ---- Module flags (operands of !llvm.module.flags) ----
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 1, !"Code Model", i32 4}
!4 = !{i32 1, !"Large Data Threshold", i64 0}
; ---- Compile unit, source file and producer string ----
; !5 is the compile unit referenced by the `unit:` field of subprogram !8.
!5 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !6, producer: "flang version 20.1.0 (https://github.com/llvm/llvm-project.git 24a30daaa559829ad079f2ff7f73eb4e18095f88)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!6 = !DIFile(filename: "test.f90", directory: "/home/cmcinally")
!7 = !{!"flang version 20.1.0 (https://github.com/llvm/llvm-project.git 24a30daaa559829ad079f2ff7f73eb4e18095f88)"}
; ---- Subprogram for @foo_ (attached via !dbg !8) and its subroutine type ----
!8 = distinct !DISubprogram(name: "foo", linkageName: "foo_", scope: !6, file: !6, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !5)
!9 = !DISubroutineType(cc: DW_CC_normal, types: !10)
!10 = !{null, !11, !12, !12, !12, !16, !16, !16, !17}
!11 = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
!12 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !14)
!13 = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
!14 = !{!15, !15, !15}
!15 = !DISubrange()
!16 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !14)
!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, elements: !19)
!18 = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean)
!19 = !{!15, !15}
; ---- Local-variable descriptor for argument 1 (n) ----
; NOTE(review): no instruction in @foo_ as shown references the DILocalVariable
; nodes in this file; they appear to be leftovers from the original compile.
!20 = !DILocalVariable(name: "n", arg: 1, scope: !8, file: !6, line: 3, type: !11)
; ---- TBAA: access tag for n (!23) and the type-tree chain up to the root (!28) ----
!23 = !{!24, !24, i64 0}
!24 = !{!"dummy arg data/_QFfooEn", !25, i64 0}
!25 = !{!"dummy arg data", !26, i64 0}
!26 = !{!"any data access", !27, i64 0}
!27 = !{!"any access", !28, i64 0}
!28 = !{!"Flang function root _QPfoo"}
; ---- Local-variable descriptors for the array arguments ----
; Each array argument has an artificial 64-bit integer variable that is used
; as the `count:` of its DISubrange entries.
!29 = !DILocalVariable(name: "._QFfooEarr11", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!30 = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed)
!31 = !DILocalVariable(name: "arr1", arg: 2, scope: !8, file: !6, line: 4, type: !32)
!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !33)
!33 = !{!34, !34, !34}
!34 = !DISubrange(count: !29)
!35 = !DILocalVariable(name: "._QFfooEarr21", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!37 = !DILocalVariable(name: "arr2", arg: 3, scope: !8, file: !6, line: 4, type: !38)
!38 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !39)
!39 = !{!40, !40, !40}
!40 = !DISubrange(count: !35)
!41 = !DILocalVariable(name: "._QFfooEarr31", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!43 = !DILocalVariable(name: "arr3", arg: 4, scope: !8, file: !6, line: 4, type: !44)
!44 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, elements: !45)
!45 = !{!46, !46, !46}
!46 = !DISubrange(count: !41)
!47 = !DILocalVariable(name: "._QFfooEicmask1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!49 = !DILocalVariable(name: "icmask", arg: 8, scope: !8, file: !6, line: 6, type: !50)
!50 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, elements: !51)
!51 = !{!52, !52}
!52 = !DISubrange(count: !47)
!53 = !DILocalVariable(name: "._QFfooEx1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!55 = !DILocalVariable(name: "x", arg: 5, scope: !8, file: !6, line: 5, type: !56)
!56 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !57)
!57 = !{!58, !58, !58}
!58 = !DISubrange(count: !53)
!59 = !DILocalVariable(name: "._QFfooEy1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!61 = !DILocalVariable(name: "y", arg: 6, scope: !8, file: !6, line: 5, type: !62)
!62 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !63)
!63 = !{!64, !64, !64}
!64 = !DISubrange(count: !59)
!65 = !DILocalVariable(name: "._QFfooEz1", scope: !8, file: !6, type: !30, flags: DIFlagArtificial)
!67 = !DILocalVariable(name: "z", arg: 7, scope: !8, file: !6, line: 5, type: !68)
!68 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, elements: !69)
!69 = !{!70, !70, !70}
!70 = !DISubrange(count: !65)
; ---- TBAA access tags used by the loads/stores in @foo_ ----
; Each pair is (access tag, type node); every type node is a child of
; "dummy arg data" (!25), one per dummy argument.
; icmask: the flag load in the middle loop.
!73 = !{!74, !74, i64 0}
!74 = !{!"dummy arg data/_QFfooEicmask", !25, i64 0}
; x, y, z: the three index loads in each inner loop.
!79 = !{!80, !80, i64 0}
!80 = !{!"dummy arg data/_QFfooEx", !25, i64 0}
!81 = !{!82, !82, i64 0}
!82 = !{!"dummy arg data/_QFfooEy", !25, i64 0}
!83 = !{!84, !84, i64 0}
!84 = !{!"dummy arg data/_QFfooEz", !25, i64 0}
; arr2, arr3: the two indexed float loads in each inner loop.
!85 = !{!86, !86, i64 0}
!86 = !{!"dummy arg data/_QFfooEarr2", !25, i64 0}
!87 = !{!88, !88, i64 0}
!88 = !{!"dummy arg data/_QFfooEarr3", !25, i64 0}
; arr1: the indexed float store.
!89 = !{!90, !90, i64 0}
!90 = !{!"dummy arg data/_QFfooEarr1", !25, i64 0}