@@ -31,83 +31,6 @@ func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi
 
 // -----
 
-#map = affine_map<() -> ()>
-func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = []
-  } outs(%init : tensor<f32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<f32>
-
-  return %res : tensor<f32>
-}
-
-// CHECK-LABEL: func.func @extract_scalar_from_0d_into_0d(
-// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
-// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK: vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
-
-// -----
-
-#map = affine_map<(n) -> (n)>
-func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel"]
-  } outs(%init : tensor<1xf32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<1xf32>
-
-  return %res : tensor<1xf32>
-}
-// CHECK-LABEL: func.func @extract_scalar_from_0d_into_1d(
-// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
-// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %c0 = arith.constant 1 : index
-  %c1 = arith.constant 2 : index
-
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%init : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
-    linalg.yield %1 : f32
-  } -> tensor<1x1x3xf32>
-
-  return %res : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_scalar_broadcast(
-// CHECK-SAME: %[[SRC:.*]]: tensor<3x3xf32>,
-// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
-// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
-// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
-
-// -----
-
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @vectorize_nd_tensor_extract_transfer_read_basic(
     %arg0: tensor<3x3x3xf32>,
@@ -144,37 +67,6 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic(
 // CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[IDX3]]], %[[CST]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
 // CHECK: vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
 
-// Same as example above, but reading into a column tensor.
-
-// TODO: Currently this fails to vectorise when the indices are non-constant.
-
-func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-    %input: tensor<3x3x3xf32>,
-    %output: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
-
-  %c0 = arith.constant 0 : index
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%output : tensor<3x1x1xf32>) {
-  ^bb0(%out: f32):
-    %5 = tensor.extract %input[%c0, %c0, %c0] : tensor<3x3x3xf32>
-    linalg.yield %5 : f32
-  } -> tensor<3x1x1xf32>
-
-  return %res : tensor<3x1x1xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-// CHECK-SAME: %[[INPUT:.*]]: tensor<3x3x3xf32>,
-// CHECK-SAME: %[[OUTPUT:.*]]: tensor<3x1x1xf32>)
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
-// CHECK: %[[RES:.*]] = vector.transfer_write %[[BCAST]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>
-// CHECK: return %[[RES]] : tensor<3x1x1xf32>
-
 // -----
 
 func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
@@ -620,26 +512,6 @@ func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1:
 
 // -----
 
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_0d_tensor_extract(%arg0: tensor<f32>, %arg2: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %2 = linalg.generic {
-    indexing_maps = [#map1],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%arg2 : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %7 = tensor.extract %arg0[] : tensor<f32>
-    linalg.yield %7 : f32
-  } -> tensor<1x1x3xf32>
-  return %2 : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
-// CHECK-SAME: %[[ARG_0:.*]]: tensor<f32>
-// CHECK: %[[EXTRACT:.*]] = vector.transfer_read %[[ARG_0]][], %{{.+}} : tensor<f32>
-// CHECK: vector.broadcast %[[EXTRACT]] : vector<f32> to vector<1x1x3xf32>
-
-// -----
-
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
 func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
@@ -674,17 +546,118 @@ func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1
 // CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
 // CHECK: vector.transfer_write %[[GATHER]]
 
+//===----------------------------------------------------------------------===//
+// Scalar load + broadcast
+//===----------------------------------------------------------------------===//
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %c0 = arith.constant 1 : index
+  %c1 = arith.constant 2 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_scalar_broadcast(
+// CHECK-SAME: %[[SRC:.*]]: tensor<3x3xf32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
+
 // -----
 
-func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
+#map = affine_map<() -> ()>
+func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = []
+  } outs(%init : tensor<f32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<f32>
+
+  return %res : tensor<f32>
+}
+
+// CHECK-LABEL: func.func @extract_scalar_from_0d_into_0d(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK: vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
+
+// -----
+
+#map = affine_map<(n) -> (n)>
+func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel"]
+  } outs(%init : tensor<1xf32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1xf32>
+
+  return %res : tensor<1xf32>
+}
+// CHECK-LABEL: func.func @extract_scalar_from_0d_into_1d(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_0d_tensor_extract(%src: tensor<f32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map1],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
+// CHECK-SAME: %[[SRC:.*]]: tensor<f32>
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %{{.+}} : tensor<f32>
+// CHECK: vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+
+// -----
+
+func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
   %c4 = arith.constant 4 : index
   %c0 = arith.constant 0 : index
   %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
 
   %res = linalg.generic {
     indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-    iterator_types = ["parallel", "parallel", "parallel"]}
-    outs(%init : tensor<1x1x4xi32>) {
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x4xi32>) {
 
   ^bb0(%out: i32):
     %idx = linalg.index 0 : index
@@ -695,13 +668,45 @@ func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor
   return %res : tensor<1x1x4xi32>
 }
 
-// CHECK-LABEL: func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(
+// CHECK-LABEL: func.func @scalar_read_with_broadcast_from_column_tensor
 // CHECK-SAME: %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
-// CHECK: %[[PAD:.*]] = arith.constant 0 : i32
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
-// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
+// CHECK-DAG: %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
 // CHECK: %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
 // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
 // CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
 // CHECK: %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
+
+// -----
+
+// TODO: Currently this fails to vectorise when the indices are non-constant.
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+    %src: tensor<3x3x3xf32>,
+    %init: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
+
+  %c0 = arith.constant 0 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<3x1x1xf32>) {
+  ^bb0(%out: f32):
+    %1 = tensor.extract %src[%c0, %c0, %c0] : tensor<3x3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<3x1x1xf32>
+
+  return %res : tensor<3x1x1xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+// CHECK-SAME: %[[SRC:.*]]: tensor<3x3x3xf32>,
+// CHECK-SAME: %[[INIT:.*]]: tensor<3x1x1xf32>)
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
+// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
+// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>