xref: /llvm-project/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir (revision c42512436b23ab50e7637f239abe8371407104a1)
1// RUN: mlir-opt %s -convert-scf-to-cf -test-arm-sme-tile-allocation -split-input-file -verify-diagnostics | FileCheck %s
2// RUN: mlir-opt %s -convert-scf-to-cf -test-arm-sme-tile-allocation=dump-tile-live-ranges -mlir-disable-threading -split-input-file -verify-diagnostics 2>&1 >/dev/null | FileCheck %s --check-prefix=CHECK-LIVE-RANGE
3
4// This file tests some simple aspects of using liveness in the SME tile allocator.
5// Note: We use -convert-scf-to-cf first as the tile allocator expects CF, but
6// some of these tests are written in SCF (to make things easier to follow).
7
8//  CHECK-LIVE-RANGE-LABEL: @constant_with_multiple_users
9//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
10//        CHECK-LIVE-RANGE: ^bb0:
11//        CHECK-LIVE-RANGE: S  arm_sme.zero
12//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.insert_tile_slice
13//   CHECK-LIVE-RANGE-NEXT: || arm_sme.insert_tile_slice
14//   CHECK-LIVE-RANGE-NEXT: |E test.some_use
15//   CHECK-LIVE-RANGE-NEXT: E  test.some_use
16
17// CHECK-LABEL: @constant_with_multiple_users(
18// CHECK-SAME:                                %[[VECTOR_A:.*]]: vector<[4]xf32>, %[[VECTOR_B:.*]]: vector<[4]xf32>
// A single `arm_sme.zero` is the tile operand of two `arm_sme.insert_tile_slice`
// ops. The expected output below contains two `arm_sme.zero` ops (tile IDs 0
// and 1), and each insert carries the same tile ID as the zero it consumes.
19func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
20  // CHECK-NEXT: %[[ZERO_TILE_0:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
21  // CHECK-NEXT: %[[ZERO_TILE_1:.*]] = arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32>
22  // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.insert_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]][%{{.*}}] {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
23  // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.insert_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]][%{{.*}}] {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
24  %zero = arm_sme.zero : vector<[4]x[4]xf32>
25  %tile_a = arm_sme.insert_tile_slice %a, %zero[%index] : vector<[4]xf32> into vector<[4]x[4]xf32>
26  %tile_b = arm_sme.insert_tile_slice %b, %zero[%index] : vector<[4]xf32> into vector<[4]x[4]xf32>
27  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
28  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
29  return
30}
31
32// -----
33
34//  CHECK-LIVE-RANGE-LABEL: @value_with_multiple_users
35//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
36//        CHECK-LIVE-RANGE: ^bb0:
37//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.insert_tile_slice
38//   CHECK-LIVE-RANGE-NEXT: || arm_sme.insert_tile_slice
39//   CHECK-LIVE-RANGE-NEXT: |E test.some_use
40//   CHECK-LIVE-RANGE-NEXT: E  test.some_use
41
42// expected-note@below {{tile operand is: <block argument> of type 'vector<[4]x[4]xf32>'}}
43func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
  // Both inserts use the block argument %tile as their tile operand, and both
  // results are used afterwards. The first insert is the op flagged by the
  // error below.
44  // expected-error@below {{op tile operand allocated to different SME virtial tile (move required)}}
45  %tile_a = arm_sme.insert_tile_slice %a, %tile[%index] : vector<[4]xf32> into vector<[4]x[4]xf32>
46  %tile_b = arm_sme.insert_tile_slice %b, %tile[%index] : vector<[4]xf32> into vector<[4]x[4]xf32>
47  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
48  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
49  return
50}
51
52// -----
53
54//  CHECK-LIVE-RANGE-LABEL: @reuse_tiles_after_initial_use
55//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
56//        CHECK-LIVE-RANGE: ^bb0:
57//   CHECK-LIVE-RANGE-NEXT: S        arm_sme.get_tile
58//   CHECK-LIVE-RANGE-NEXT: |S       arm_sme.get_tile
59//   CHECK-LIVE-RANGE-NEXT: ||S      arm_sme.get_tile
60//   CHECK-LIVE-RANGE-NEXT: |||S     arm_sme.get_tile
61//   CHECK-LIVE-RANGE-NEXT: ||||     test.dummy
62//   CHECK-LIVE-RANGE-NEXT: ||||     test.dummy
63//   CHECK-LIVE-RANGE-NEXT: ||||     test.dummy
64//   CHECK-LIVE-RANGE-NEXT: E|||     test.some_use
65//   CHECK-LIVE-RANGE-NEXT:  E||     test.some_use
66//   CHECK-LIVE-RANGE-NEXT:   E|     test.some_use
67//   CHECK-LIVE-RANGE-NEXT:    E     test.some_use
68//   CHECK-LIVE-RANGE-NEXT:     S    arm_sme.zero
69//   CHECK-LIVE-RANGE-NEXT:     |S   arm_sme.zero
70//   CHECK-LIVE-RANGE-NEXT:     ||S  arm_sme.zero
71//   CHECK-LIVE-RANGE-NEXT:     |||S arm_sme.zero
72//   CHECK-LIVE-RANGE-NEXT:     |||| test.dummy
73//   CHECK-LIVE-RANGE-NEXT:     |||| test.dummy
74//   CHECK-LIVE-RANGE-NEXT:     |||| test.dummy
75//   CHECK-LIVE-RANGE-NEXT:     E||| test.some_use
76//   CHECK-LIVE-RANGE-NEXT:      E|| test.some_use
77//   CHECK-LIVE-RANGE-NEXT:       E| test.some_use
78//   CHECK-LIVE-RANGE-NEXT:        E test.some_use
79
80// CHECK-LABEL: @reuse_tiles_after_initial_use
// Four `arm_sme.get_tile` values are created and each used once; afterwards
// four `arm_sme.zero` values are created and each used once. Both groups are
// expected to be assigned tile IDs 0-3, i.e. the second group reuses the IDs
// of the first.
81func.func @reuse_tiles_after_initial_use() {
82  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
83  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
84  // CHECK: arm_sme.get_tile {tile_id = 2 : i32}
85  // CHECK: arm_sme.get_tile {tile_id = 3 : i32}
86  %tile_a = arm_sme.get_tile : vector<[4]x[4]xf32>
87  %tile_b = arm_sme.get_tile : vector<[4]x[4]xf32>
88  %tile_c = arm_sme.get_tile : vector<[4]x[4]xf32>
89  %tile_d = arm_sme.get_tile : vector<[4]x[4]xf32>
90  "test.dummy"(): () -> ()
91  "test.dummy"(): () -> ()
92  "test.dummy"(): () -> ()
93  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
94  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
95  "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
96  "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
97  // CHECK: arm_sme.zero {tile_id = 0 : i32}
98  // CHECK: arm_sme.zero {tile_id = 1 : i32}
99  // CHECK: arm_sme.zero {tile_id = 2 : i32}
100  // CHECK: arm_sme.zero {tile_id = 3 : i32}
101  %tile_1 = arm_sme.zero : vector<[4]x[4]xf32>
102  %tile_2 = arm_sme.zero : vector<[4]x[4]xf32>
103  %tile_3 = arm_sme.zero : vector<[4]x[4]xf32>
104  %tile_4 = arm_sme.zero : vector<[4]x[4]xf32>
105  "test.dummy"(): () -> ()
106  "test.dummy"(): () -> ()
107  "test.dummy"(): () -> ()
108  "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> ()
109  "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> ()
110  "test.some_use"(%tile_3) : (vector<[4]x[4]xf32>) -> ()
111  "test.some_use"(%tile_4) : (vector<[4]x[4]xf32>) -> ()
112  return
113}
114
115// -----
116
117//  CHECK-LIVE-RANGE-LABEL: @tile_live_ins
118//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
119//        CHECK-LIVE-RANGE: ^bb0:
120//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
121//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.zero
122//   CHECK-LIVE-RANGE-NEXT: EE cf.br
123//   CHECK-LIVE-RANGE-NEXT: ^bb1:
124//   CHECK-LIVE-RANGE-NEXT: || test.dummy
125//   CHECK-LIVE-RANGE-NEXT: || test.dummy
126//   CHECK-LIVE-RANGE-NEXT: EE cf.br
127//   CHECK-LIVE-RANGE-NEXT: ^bb2:
128//   CHECK-LIVE-RANGE-NEXT: || test.dummy
129//   CHECK-LIVE-RANGE-NEXT: || test.dummy
130//   CHECK-LIVE-RANGE-NEXT: EE cf.br
131//   CHECK-LIVE-RANGE-NEXT: ^bb3:
132//   CHECK-LIVE-RANGE-NEXT: E| test.some_use
133//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
134
135// CHECK-LABEL: @tile_live_ins
// Two tiles are defined in the entry block and only used in ^bb3, which is
// reached through ^bb1 and ^bb2 (neither of which touches the tiles). They are
// expected to get tile IDs 0 and 1.
136func.func @tile_live_ins()
137{
138  // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32>
139  // CHECK: arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32>
140  %tile_1 = arm_sme.get_tile : vector<[4]x[4]xf32>
141  %tile_2 = arm_sme.zero : vector<[4]x[4]xf32>
142  cf.br ^bb1
143^bb1:
144  "test.dummy"(): () -> ()
145  "test.dummy"(): () -> ()
146  cf.br ^bb2
147^bb2:
148  "test.dummy"(): () -> ()
149  "test.dummy"(): () -> ()
150  cf.br ^bb3
151^bb3:
152  "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> ()
153  "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> ()
154  return
155}
156
157// -----
158
159// This is basically the same test as tile_live_ins but shows that the order of
160// the blocks within the source does not relate to the liveness, which is based
161// on successors and predecessors (not textual order).
162//
163// So %tile_1 is live on the path bb0 -> bb2 -> bb1 (and dies in bb1). The
164// 'hole' when looking at the live range dump comes from the textual order
165// (and would disappear if bb1 was moved before bb2 in the source).
166//
167// When looking at the live range dump (outside of straight-line code) it
168// normally makes more sense to consider blocks in isolation (and how they
169// relate to the CFG).
170
171//  CHECK-LIVE-RANGE-LABEL: @non_sequential_live_ins
172//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
173//        CHECK-LIVE-RANGE: ^bb0:
174//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
175//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
176//   CHECK-LIVE-RANGE-NEXT: E  cf.br
177//   CHECK-LIVE-RANGE-NEXT: ^bb1:
178//   CHECK-LIVE-RANGE-NEXT: E| test.some_use
179//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
180//   CHECK-LIVE-RANGE-NEXT:  E cf.br
181//   CHECK-LIVE-RANGE-NEXT: ^bb2:
182//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.zero
183//   CHECK-LIVE-RANGE-NEXT: || test.dummy
184//   CHECK-LIVE-RANGE-NEXT: EE cf.cond_br
185//   CHECK-LIVE-RANGE-NEXT: ^bb3:
186//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
187//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
188//   CHECK-LIVE-RANGE-NEXT:    func.return
189
190// CHECK-LABEL: @non_sequential_live_ins
// CFG edges: bb0 -> bb2, bb2 -> {bb1, bb3}, bb1 -> bb3. The blocks are written
// in the order bb1, bb2, bb3, so the textual order differs from the control
// flow. %tile_1 is used in bb1 and %tile_2 (defined in bb2) is used in bb3.
191func.func @non_sequential_live_ins(%cond: i1) {
192  // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32>
193  // CHECK: arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32>
194  %tile_1 = arm_sme.get_tile : vector<[4]x[4]xf32>
195  "test.dummy"(): () -> ()
196  cf.br ^bb2
197^bb1:
198  "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> ()
199  "test.dummy"(): () -> ()
200  cf.br ^bb3
201^bb2:
202  %tile_2 = arm_sme.zero : vector<[4]x[4]xf32>
203  "test.dummy"(): () -> ()
204  cf.cond_br %cond, ^bb1, ^bb3
205^bb3:
206  "test.dummy"(): () -> ()
207  "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> ()
208  return
209}
210
211// -----
212
213//  CHECK-LIVE-RANGE-LABEL: @non_overlapping_branches
214//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
215//        CHECK-LIVE-RANGE: ^bb1:
216//   CHECK-LIVE-RANGE-NEXT: S arm_sme.zero
217//   CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile
218//   CHECK-LIVE-RANGE-NEXT: E cf.br
219//   CHECK-LIVE-RANGE-NEXT: ^bb2:
220//   CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile
221//   CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile
222//   CHECK-LIVE-RANGE-NEXT: E cf.br
223
224// CHECK-LABEL: @non_overlapping_branches
// Each branch of the `scf.if` creates its own tile and yields it as the result.
// Both the `arm_sme.zero` and the `arm_sme.get_tile` are expected to be
// assigned tile ID 0.
225func.func @non_overlapping_branches(%cond: i1) {
226  // CHECK: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
227  // CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32>
228  %tile = scf.if %cond -> vector<[4]x[4]xf32> {
229    // ^bb1:
230    %zero = arm_sme.zero : vector<[4]x[4]xf32>
231    scf.yield %zero : vector<[4]x[4]xf32>
232  } else {
233    // ^bb2:
234    %undef = arm_sme.get_tile : vector<[4]x[4]xf32>
235    scf.yield %undef : vector<[4]x[4]xf32>
236  }
237  "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
238  return
239}
240
241// -----
242
243// Here %vecA and %vecB are not merged into the same live range (as they are unknown values).
244// This means that %vecA and %vecB are allocated to different tiles (which is not legal).
245
246// expected-note@below {{tile operand is: <block argument> of type 'vector<[4]x[4]xf32>'}}
247func.func @overlapping_branches(%cond: i1, %vecA: vector<[4]x[4]xf32>, %vecB: vector<[4]x[4]xf32>) {
  // The two branches yield different function arguments (%vecA / %vecB) as the
  // result tile; the `scf.if` is the op flagged by the error below.
248  // expected-error@below {{op tile operand allocated to different SME virtial tile (move required)}}
249  %tile = scf.if %cond -> vector<[4]x[4]xf32> {
250    scf.yield %vecA : vector<[4]x[4]xf32>
251  } else {
252    scf.yield %vecB : vector<[4]x[4]xf32>
253  }
254  "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
255  return
256}
257
258// -----
259
260//  CHECK-LIVE-RANGE-LABEL: @run_out_of_tiles_but_avoid_spill
261//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
262//        CHECK-LIVE-RANGE: ^bb2:
263//   CHECK-LIVE-RANGE-NEXT: |S    arm_sme.copy_tile
264//   CHECK-LIVE-RANGE-NEXT: ||S   arm_sme.copy_tile
265//   CHECK-LIVE-RANGE-NEXT: |||S  arm_sme.copy_tile
266//   CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.copy_tile
267//   CHECK-LIVE-RANGE-NEXT: EEEEE cf.br
268
269// Note in the live ranges (above) there are five tile values, but we only have four tiles.
270// There is no 'real' spill as we spill the `arm_sme.zero` but are then able to clone it
271// at each of its uses.
272
273// CHECK-LABEL: @run_out_of_tiles_but_avoid_spill
// %init (an `arm_sme.zero` defined outside both loops) is the initial value of
// all four iter_args of the inner loop. The expected output contains four
// `arm_sme.zero` ops with tile IDs 1, 2, 3 and 0, and the four inserts carry
// the same tile IDs in the same order.
274func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
275  %init = arm_sme.zero : vector<[4]x[4]xf32>
276  %c0 = arith.constant 0 : index
277  %c1 = arith.constant 1 : index
278  %c10 = arith.constant 10 : index
279  // Live = %init
280  scf.for %i = %c0 to %c10 step %c1 {
281    // CHECK: arm_sme.zero {tile_id = 1 : i32}
282    // CHECK: arm_sme.zero {tile_id = 2 : i32}
283    // CHECK: arm_sme.zero {tile_id = 3 : i32}
284    // CHECK: arm_sme.zero {tile_id = 0 : i32}
285    %tile_a, %tile_b, %tile_c, %tile_d = scf.for %j = %c0 to %c10 step %c1
286      iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init)
287        -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) {
288        // ^bb2:
289        // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
290        // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
291        // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
292        // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
293        %new_a = arm_sme.insert_tile_slice %a, %iter_a[%i] : vector<[4]xf32> into vector<[4]x[4]xf32>
294        %new_b = arm_sme.insert_tile_slice %b, %iter_b[%i] : vector<[4]xf32> into vector<[4]x[4]xf32>
295        %new_c = arm_sme.insert_tile_slice %c, %iter_c[%i] : vector<[4]xf32> into vector<[4]x[4]xf32>
296        %new_d = arm_sme.insert_tile_slice %d, %iter_d[%i] : vector<[4]xf32> into vector<[4]x[4]xf32>
297        scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>
298    }
299    // Live = %init, %tile_a, %tile_b, %tile_c, %tile_d (out of tiles!)
300    // This should be resolved by duplicating the arm_sme.zero (from folding
301    // arm_sme.copy_tile operations inserted by the tile allocator).
302    "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
303    "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
304    "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
305    "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
306  }
307  return
308}
309
310// -----
311
312// We should be able to avoid spills like this, but logic handling this case is
313// not implemented yet. Note tile ID >= 16 means a spill/in-memory tile.
314
315//  CHECK-LIVE-RANGE-LABEL: @avoidable_spill
316//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
317//        CHECK-LIVE-RANGE: ^bb2:
318//   CHECK-LIVE-RANGE-NEXT: ||     test.some_use
319//   CHECK-LIVE-RANGE-NEXT: ||S    arm_sme.insert_tile_slice
320//   CHECK-LIVE-RANGE-NEXT: |||S   arm_sme.insert_tile_slice
321//   CHECK-LIVE-RANGE-NEXT: ||||S  arm_sme.insert_tile_slice
322//   CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.insert_tile_slice
323//   CHECK-LIVE-RANGE-NEXT: ||E||| test.some_use
324//   CHECK-LIVE-RANGE-NEXT: || E|| test.some_use
325//   CHECK-LIVE-RANGE-NEXT: ||  E| test.some_use
326//   CHECK-LIVE-RANGE-NEXT: ||   E test.some_use
327//   CHECK-LIVE-RANGE-NEXT: ||     arith.addi
328//   CHECK-LIVE-RANGE-NEXT: EE     cf.br
329
330// Note in the live ranges (above) there are two constant live-ins (first two ranges),
331// which gives six overlapping live ranges (at the point where %tile_d is defined).
332// The allocator currently will spill the first constant (which results in a real
333// spill at its use), however, this could be avoided by using the knowledge that
334// at the first "test.some_use" there are actually only two live ranges (so we can
335// fix this by duplicating the constant).
336
337// CHECK-LABEL: @avoidable_spill
// %zero and %tile are defined before the loop and used inside it; the loop
// body additionally creates four tile values from %tile. The `arm_sme.zero` is
// expected to be assigned tile ID 16.
338func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
339  // CHECK: arm_sme.zero {tile_id = 16 : i32} : vector<[4]x[4]xf32>
340  %zero = arm_sme.zero : vector<[4]x[4]xf32>
341  %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
342  %c0 = arith.constant 0 : index
343  %c1 = arith.constant 1 : index
344  %c10 = arith.constant 10 : index
345  scf.for %i = %c0 to %c10 step %c1 {
346    // So spilled here (unnecessarily).
347    // The arm_sme.zero op could be moved into the loop to avoid this.
348    "test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> ()
349    %tile_a = arm_sme.insert_tile_slice %a, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32>
350    %tile_b = arm_sme.insert_tile_slice %b, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32>
351    %tile_c = arm_sme.insert_tile_slice %c, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32>
352    %tile_d = arm_sme.insert_tile_slice %d, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32>
353    // %zero is still live here (due to the backedge)
354    "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
355    "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
356    "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
357    "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
358  }
359  return
360}
361
362// -----
363
364// This test is a follow up to the test of the same name in `tile-allocation-copies.mlir`.
365// This shows the live ranges (which are why we need to split the conditional branch).
366
367//  CHECK-LIVE-RANGE-LABEL: @cond_branch_with_backedge
368//        CHECK-LIVE-RANGE: ^bb1:
369//   CHECK-LIVE-RANGE-NEXT:  ||| |           arith.cmpi
370//   CHECK-LIVE-RANGE-NEXT:  EEE E           cf.cond_br
371//
372//   CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES:[[:alnum:]]+]]:
373//   CHECK-LIVE-RANGE-NEXT:  ||| ES          arm_sme.copy_tile
374//   CHECK-LIVE-RANGE-NEXT:  E||  |S         arm_sme.copy_tile
375//   CHECK-LIVE-RANGE-NEXT:   E|  ||S        arm_sme.copy_tile
376//   CHECK-LIVE-RANGE-NEXT:    E  |||S       arm_sme.copy_tile
377//   CHECK-LIVE-RANGE-NEXT:       EEEE       cf.br
378//
379// It is important to note that the first three live ranges in ^bb1 do not end
380// at the `cf.cond_br` they are live-out via the backedge bb1 -> bb2 -> bb1.
381// This means that if we placed the `arm_sme.copy_tile` ops before the `cf.cond_br`
382// then those live ranges would not end at the copies, resulting in unwanted
383// overlapping live ranges (and hence tile spills).
384//
385// With the conditional branch split and the copies placed in the BB3_COPIES
386// block the first three live ranges end at the copy operations (as the
387// BB3_COPIES block is on the path out of the loop and has no backedge). This
388// means there are no overlaps and the live ranges all merge, as shown below.
389//
390//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
391//        CHECK-LIVE-RANGE: ^bb1:
392//   CHECK-LIVE-RANGE-NEXT: |||| arith.cmpi
393//   CHECK-LIVE-RANGE-NEXT: EEEE cf.cond_br
394//
395//   CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES]]:
396//   CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
397//   CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
398//   CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
399//   CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
400//   CHECK-LIVE-RANGE-NEXT: EEEE cf.br
401
402// CHECK-LABEL: @cond_branch_with_backedge
403// CHECK-NOT: tile_id = 16
404// CHECK: arm_sme.get_tile {tile_id = 0 : i32} : vector<[4]x[4]xf32>
405// CHECK: arm_sme.get_tile {tile_id = 1 : i32} : vector<[4]x[4]xf32>
406// CHECK: arm_sme.get_tile {tile_id = 2 : i32} : vector<[4]x[4]xf32>
407// CHECK: arm_sme.get_tile {tile_id = 3 : i32} : vector<[4]x[4]xf32>
408// CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
409// CHECK-NOT: tile_id = 16
// Hand-written CF loop: ^bb1 is the loop header (compares the index against
// %c10), ^bb2 is the body (inserts a slice into the iterated tile and branches
// back to ^bb1), and ^bb3 is the exit, which receives the iterated tile plus
// %tileB, %tileC and %tileD as block arguments.
410func.func @cond_branch_with_backedge(%slice: vector<[4]xf32>) {
411  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
412  %tileB = arm_sme.get_tile : vector<[4]x[4]xf32>
413  %tileC = arm_sme.get_tile : vector<[4]x[4]xf32>
414  %tileD = arm_sme.get_tile : vector<[4]x[4]xf32>
415  %c0 = arith.constant 0 : index
416  %c1 = arith.constant 1 : index
417  %c10 = arith.constant 10 : index
418  // Live here: %tileA, %tileB, %tileC, %tileD
419  cf.br ^bb1(%c0, %tileA : index, vector<[4]x[4]xf32>)
420^bb1(%currentIndex: index, %iterTile: vector<[4]x[4]xf32>):
421  %continueLoop = arith.cmpi slt, %currentIndex, %c10 : index
422  // Live here: %iterTile, %tileB, %tileC, %tileD
423  cf.cond_br %continueLoop, ^bb2, ^bb3(%iterTile, %tileB, %tileC, %tileD : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>)
424^bb2:
425  // Live here: %iterTile, %tileB, %tileC, %tileD
426  %nextTile = arm_sme.insert_tile_slice %slice, %iterTile[%currentIndex] : vector<[4]xf32> into vector<[4]x[4]xf32>
427  %nextIndex = arith.addi %currentIndex, %c1 : index
428  cf.br ^bb1(%nextIndex, %nextTile : index, vector<[4]x[4]xf32>)
429^bb3(%finalTileA: vector<[4]x[4]xf32>, %finalTileB: vector<[4]x[4]xf32>, %finalTileC: vector<[4]x[4]xf32>, %finalTileD: vector<[4]x[4]xf32>):
430  // Live here: %finalTileA, %finalTileB, %finalTileC, %finalTileD
431  return
432}
433
434// -----
435
436//  CHECK-LIVE-RANGE-LABEL: @fill_holes_in_tile_liveness
437//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
438//        CHECK-LIVE-RANGE: ^bb0:
439//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
440//   CHECK-LIVE-RANGE-NEXT: E  cf.cond_br
441//   CHECK-LIVE-RANGE-NEXT: ^bb1:
442//   CHECK-LIVE-RANGE-NEXT:  S arm_sme.get_tile
443//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
444//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
445//   CHECK-LIVE-RANGE-NEXT:    cf.br
446//   CHECK-LIVE-RANGE-NEXT: ^bb2:
447//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
448//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
449//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
450//   CHECK-LIVE-RANGE-NEXT: E  test.some_use
451//   CHECK-LIVE-RANGE-NEXT:    cf.br
452
453// Here there's a 'hole' in the liveness of %tileA (in bb1) where another value
454// can reuse the tile ID assigned to %tileA. The liveness for %tileB is
455// entirely within the 'hole' in %tileA's live range, so %tileB should get the
456// same tile ID as %tileA.
457
458// CHECK-LABEL: @fill_holes_in_tile_liveness
// %tileA is defined in the entry block and used only in ^bb2; %tileB is defined
// and used only in ^bb1. Both `arm_sme.get_tile` ops are expected to carry the
// same tile ID (captured as TILE_ID_A).
459func.func @fill_holes_in_tile_liveness(%cond: i1) {
460  // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A:.*]] : i32}
461  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
462  cf.cond_br %cond, ^bb2, ^bb1
463^bb1:
464  // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A]] : i32}
465  %tileB = arm_sme.get_tile : vector<[4]x[4]xf32>
466  "test.dummy"(): () -> ()
467  "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> ()
468  cf.br ^bb3
469^bb2:
470  "test.dummy"(): () -> ()
471  "test.dummy"(): () -> ()
472  "test.dummy"(): () -> ()
473  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
474  cf.br ^bb3
475^bb3:
476  return
477}
478
479// -----
480
481//  CHECK-LIVE-RANGE-LABEL: @holes_in_tile_liveness_inactive_overlaps
482//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
483//        CHECK-LIVE-RANGE: ^bb0:
484//   CHECK-LIVE-RANGE-NEXT: S  arm_sme.get_tile
485//   CHECK-LIVE-RANGE-NEXT: E  cf.cond_br
486//   CHECK-LIVE-RANGE-NEXT: ^bb1:
487//   CHECK-LIVE-RANGE-NEXT:  S arm_sme.get_tile
488//   CHECK-LIVE-RANGE-NEXT:  | test.dummy
489//   CHECK-LIVE-RANGE-NEXT:  | test.some_use
490//   CHECK-LIVE-RANGE-NEXT:  | arm_sme.copy_tile
491//   CHECK-LIVE-RANGE-NEXT:  E cf.br
492//   CHECK-LIVE-RANGE-NEXT: ^bb2:
493//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
494//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
495//   CHECK-LIVE-RANGE-NEXT: |  test.dummy
496//   CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile
497//   CHECK-LIVE-RANGE-NEXT: E| test.some_use
498//   CHECK-LIVE-RANGE-NEXT:  | arm_sme.copy_tile
499//   CHECK-LIVE-RANGE-NEXT:  E cf.br
500//   CHECK-LIVE-RANGE-NEXT: ^bb3:
501//   CHECK-LIVE-RANGE-NEXT:  E test.some_use
502//   CHECK-LIVE-RANGE-NEXT:    func.return
503
504// This tests an edge case in inactive live ranges. The first live range is
505// inactive at the start of ^bb1. If the tile allocator did not check if the
506// second live range overlapped the first it would wrongly re-use tile ID 0
507// (as the first live range is inactive so tile ID 0 is free). This would mean
508// in ^bb2 two overlapping live ranges would have the same tile ID (bad!).
509
510// CHECK-LABEL: @holes_in_tile_liveness_inactive_overlaps
// %tileB (from ^bb1) and %tileC (from ^bb2) are both forwarded to the block
// argument of ^bb3. %tileC is defined in ^bb2 before the use of %tileA there.
// %tileA is expected to get tile ID 0, and %tileB / %tileC tile ID 1.
511func.func @holes_in_tile_liveness_inactive_overlaps(%cond: i1) {
512  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
513  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
514  cf.cond_br %cond, ^bb2, ^bb1
515^bb1:
516  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
517  %tileB = arm_sme.get_tile : vector<[4]x[4]xf32>
518  "test.dummy"(): () -> ()
519  "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> ()
520  cf.br ^bb3(%tileB: vector<[4]x[4]xf32>)
521^bb2:
522  "test.dummy"(): () -> ()
523  "test.dummy"(): () -> ()
524  "test.dummy"(): () -> ()
525  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
526  %tileC = arm_sme.get_tile : vector<[4]x[4]xf32>
527  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
528  cf.br ^bb3(%tileC: vector<[4]x[4]xf32>)
529^bb3(%tile: vector<[4]x[4]xf32>):
530  "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
531  return
532}
533
534// -----
535
536// This is the same as the previous example, but changes the tile types to
537// vector<[16]x[16]xi8>. This means in bb1 the allocator will need to spill the
538// first live range (which is inactive).
539
540// Note: The live ranges are the same as the previous example (so are not checked).
541
542// CHECK-LABEL: @spill_inactive_live_range
// All tiles here have type vector<[16]x[16]xi8>. %tileA is expected to be
// assigned tile ID 16, while %tileB and %tileC (both forwarded to the block
// argument of ^bb3) are expected to get tile ID 0.
543func.func @spill_inactive_live_range(%cond: i1) {
544  // CHECK: arm_sme.get_tile {tile_id = 16 : i32}
545  %tileA = arm_sme.get_tile : vector<[16]x[16]xi8>
546  cf.cond_br %cond, ^bb2, ^bb1
547^bb1:
548  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
549  %tileB = arm_sme.get_tile : vector<[16]x[16]xi8>
550  "test.dummy"(): () -> ()
551  "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> ()
552  cf.br ^bb3(%tileB: vector<[16]x[16]xi8>)
553^bb2:
554  "test.dummy"(): () -> ()
555  "test.dummy"(): () -> ()
556  "test.dummy"(): () -> ()
557  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
558  %tileC = arm_sme.get_tile : vector<[16]x[16]xi8>
559  "test.some_use"(%tileA) : (vector<[16]x[16]xi8>) -> ()
560  cf.br ^bb3(%tileC: vector<[16]x[16]xi8>)
561^bb3(%tile: vector<[16]x[16]xi8>):
562  "test.some_use"(%tile) : (vector<[16]x[16]xi8>) -> ()
563  return
564}
565
566// -----
567
568//  CHECK-LIVE-RANGE-LABEL: @reactivate_inactive_live_range
569//        CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
570//        CHECK-LIVE-RANGE: ^bb0:
571//   CHECK-LIVE-RANGE-NEXT: S   arm_sme.get_tile
572//   CHECK-LIVE-RANGE-NEXT: E   cf.cond_br
573//   CHECK-LIVE-RANGE-NEXT: ^bb1:
574//   CHECK-LIVE-RANGE-NEXT:  S  arm_sme.get_tile
575//   CHECK-LIVE-RANGE-NEXT:  |  test.dummy
576//   CHECK-LIVE-RANGE-NEXT:  E  test.some_use
577//   CHECK-LIVE-RANGE-NEXT:     cf.br
578//   CHECK-LIVE-RANGE-NEXT: ^bb2:
579//   CHECK-LIVE-RANGE-NEXT: | S arm_sme.get_tile
580//   CHECK-LIVE-RANGE-NEXT: | | test.dummy
581//   CHECK-LIVE-RANGE-NEXT: | | test.dummy
582//   CHECK-LIVE-RANGE-NEXT: | E test.some_use
583//   CHECK-LIVE-RANGE-NEXT: E   test.some_use
584//   CHECK-LIVE-RANGE-NEXT:     cf.br
585
586// Here the live range for %tileA becomes inactive in bb1 (so %tileB gets tile
587// ID 0 too). Then in bb2 the live range for %tileA is reactivated as it overlaps
588// with the start of %tileC's live range (which means %tileC gets tile ID 1).
589
// %tileA (vector<[4]x[4]xf32>) is defined in the entry block and used last in
// ^bb2; %tileB (vector<[16]x[16]xi8>) lives only in ^bb1; %tileC is defined at
// the top of ^bb2, before the use of %tileA. Expected tile IDs: %tileA = 0,
// %tileB = 0, %tileC = 1.
//
// The label below anchors the following tile ID checks to this function's
// output (matching every other test in this file), so they cannot be satisfied
// by output belonging to a neighbouring test.
// CHECK-LABEL: @reactivate_inactive_live_range
590func.func @reactivate_inactive_live_range(%cond: i1) {
591  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
592  %tileA = arm_sme.get_tile : vector<[4]x[4]xf32>
593  cf.cond_br %cond, ^bb2, ^bb1
594^bb1:
595  // CHECK: arm_sme.get_tile {tile_id = 0 : i32}
596  %tileB = arm_sme.get_tile : vector<[16]x[16]xi8>
597  "test.dummy"(): () -> ()
598  "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> ()
599  cf.br ^bb3
600^bb2:
601  // CHECK: arm_sme.get_tile {tile_id = 1 : i32}
602  %tileC = arm_sme.get_tile : vector<[4]x[4]xf32>
603  "test.dummy"(): () -> ()
604  "test.dummy"(): () -> ()
605  "test.some_use"(%tileC) : (vector<[4]x[4]xf32>) -> ()
606  "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> ()
607  cf.br ^bb3
608^bb3:
609  return
610}
611