xref: /llvm-project/mlir/test/Dialect/SCF/loop-pipelining.mlir (revision 8da5aa16f65bc297663573bacd3030f975b9fcde)
1// RUN: mlir-opt %s -test-scf-pipelining -split-input-file -verify-diagnostics | FileCheck %s
2// RUN: mlir-opt %s -test-scf-pipelining=annotate -split-input-file | FileCheck %s --check-prefix ANNOTATE
3// RUN: mlir-opt %s -test-scf-pipelining=no-epilogue-peeling -split-input-file | FileCheck %s --check-prefix NOEPILOGUE
4
5// CHECK-LABEL: simple_pipeline(
6//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
7//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
8//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
9//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
10// Prologue:
11//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
12// Kernel:
13//  CHECK-NEXT:   %[[L1:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
14//  CHECK-SAME:     step %[[C1]] iter_args(%[[LARG:.*]] = %[[L0]]) -> (f32) {
15//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
16//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
17//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
18//  CHECK-NEXT:     %[[LR:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
19//  CHECK-NEXT:     scf.yield %[[LR]] : f32
20//  CHECK-NEXT:   }
21// Epilogue:
22//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]], %{{.*}} : f32
23//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C3]]] : memref<?xf32>
// Baseline two-stage input: a loop over [0, 4) with step 1 whose load is tagged
// stage 0 and whose add and store are tagged stage 1. The expectations above
// look for the first load ahead of the loop, a kernel running from 0 to 3, and
// the last add/store after the loop.
24func.func @simple_pipeline(%A: memref<?xf32>, %result: memref<?xf32>) {
25  %c0 = arith.constant 0 : index
26  %c1 = arith.constant 1 : index
27  %c4 = arith.constant 4 : index
28  %cf = arith.constant 1.0 : f32
29  scf.for %i0 = %c0 to %c4 step %c1 {
  // The stage / op_order attributes are presumably what the test pass reads
  // to build its schedule (confirm against the pass implementation).
30    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
31    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
32    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : memref<?xf32>
  // The unit attribute on the loop marks it as the one under test.
33  }  { __test_pipelining_loop__ }
34  return
35}
36
37
38// -----
39
40// CHECK-LABEL: simple_pipeline_region(
41//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
42//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
43//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
44//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
45// Prologue:
46//       CHECK:   %[[L0:.*]] = scf.execute_region
47//  CHECK-NEXT:     memref.load %[[A]][%[[C0]]] : memref<?xf32>
48// Kernel:
49//       CHECK:   %[[L1:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
50//  CHECK-SAME:     step %[[C1]] iter_args(%[[LARG:.*]] = %[[L0]]) -> (f32) {
51//  CHECK-NEXT:     %[[ADD0:.*]] = scf.execute_region
52//  CHECK-NEXT:       arith.addf %[[LARG]], %{{.*}} : f32
53//       CHECK:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
54//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
55//  CHECK-NEXT:     %[[LR:.*]] = scf.execute_region
56//  CHECK-NEXT:       memref.load %[[A]][%[[IV1]]] : memref<?xf32>
57//       CHECK:     scf.yield %[[LR]] : f32
58//  CHECK-NEXT:   }
59// Epilogue:
60//  CHECK-NEXT:   %[[ADD1:.*]] = scf.execute_region
61//  CHECK-NEXT:     arith.addf %[[L1]], %{{.*}} : f32
62//       CHECK:   memref.store %[[ADD1]], %[[R]][%[[C3]]] : memref<?xf32>
// Same shape as the basic two-stage case, but the load and the add are each
// wrapped in an scf.execute_region. The stage / op_order attributes sit on the
// region op itself, not on the op nested inside it.
63func.func @simple_pipeline_region(%A: memref<?xf32>, %result: memref<?xf32>) {
64  %c0 = arith.constant 0 : index
65  %c1 = arith.constant 1 : index
66  %c4 = arith.constant 4 : index
67  %cf = arith.constant 1.0 : f32
68  scf.for %i0 = %c0 to %c4 step %c1 {
69
    // Stage 0: region producing the loaded element; it uses the loop
    // induction variable from the enclosing scope.
70    %A_elem = scf.execute_region -> f32 {
71      %A_elem1 = memref.load %A[%i0]  : memref<?xf32>
72      scf.yield %A_elem1 : f32
73    } { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 }
74
    // Stage 1: region consuming the stage-0 result.
75    %A1_elem = scf.execute_region -> f32 {
76      %A1_elem1 = arith.addf %A_elem, %cf  : f32
77      scf.yield %A1_elem1 : f32
78    } { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 }
79
80    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : memref<?xf32>
81  }  { __test_pipelining_loop__ }
82  return
83}
84
85// -----
86
87// CHECK-LABEL: simple_pipeline_step(
88//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
89//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
90//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
91//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5 : index
92//   CHECK-DAG:   %[[C6:.*]] = arith.constant 6 : index
93//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
94// Prologue:
95//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
96//       CHECK:   %[[L1:.*]] = memref.load %[[A]][%[[C3]]] : memref<?xf32>
97// Kernel:
98//  CHECK-NEXT:   %[[L2:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C5]]
99//  CHECK-SAME:     step %[[C3]] iter_args(%[[LARG0:.*]] = %[[L0]], %[[LARG1:.*]] = %[[L1]]) -> (f32, f32) {
100//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG0]], %{{.*}} : f32
101//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
102//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C6]] : index
103//  CHECK-NEXT:     %[[LR:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
104//  CHECK-NEXT:     scf.yield %[[LARG1]], %[[LR]] : f32, f32
105//  CHECK-NEXT:   }
106// Epilogue:
107//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L2]]#0, %{{.*}} : f32
108//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C6]]] : memref<?xf32>
109//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[L2]]#1, %{{.*}} : f32
110//  CHECK-NEXT:   memref.store %[[ADD2]], %[[R]][%[[C9]]] : memref<?xf32>
// Non-unit step input: the loop runs from 0 to 11 with step 3. The load is
// tagged stage 0 while the add and store are tagged stage 2 (no op uses
// stage 1). The expectations above look for two loads before the loop
// (indices 0 and 3), a kernel bounded by 5, and two add/store pairs after the
// loop writing to indices 6 and 9.
111func.func @simple_pipeline_step(%A: memref<?xf32>, %result: memref<?xf32>) {
112  %c0 = arith.constant 0 : index
113  %c3 = arith.constant 3 : index
114  %c11 = arith.constant 11 : index
115  %cf = arith.constant 1.0 : f32
116  scf.for %i0 = %c0 to %c11 step %c3 {
117    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
118    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
119    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 1 } : memref<?xf32>
120  }  { __test_pipelining_loop__ }
121  return
122}
123
124// -----
125
126// CHECK-LABEL: three_stage(
127//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
128//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
129//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
130//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
131//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
132// Prologue:
133//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
134//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
135//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
136// Kernel:
137//  CHECK-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C2]]
138//  CHECK-SAME:     step %[[C1]] iter_args(%[[ADDARG:.*]] = %[[ADD0]],
139//  CHECK-SAME:     %[[LARG:.*]] = %[[L1]]) -> (f32, f32) {
140//  CHECK-NEXT:     memref.store %[[ADDARG]], %[[R]][%[[IV]]] : memref<?xf32>
141//  CHECK-NEXT:     %[[ADD1:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
142//  CHECK-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
143//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
144//  CHECK-NEXT:     scf.yield %[[ADD1]], %[[L3]] : f32, f32
145//  CHECK-NEXT:   }
146// Epilogue:
147//  CHECK-NEXT:   memref.store %[[LR]]#0, %[[R]][%[[C2]]] : memref<?xf32>
148//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
149//  CHECK-NEXT:   memref.store %[[ADD2]], %[[R]][%[[C3]]] : memref<?xf32>
150
151// Prologue:
152//  ANNOTATE:   memref.load {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "prologue"}
153//  ANNOTATE:   memref.load {{.*}} {__test_pipelining_iteration = 1 : i32, __test_pipelining_part = "prologue"}
154// Kernel:
155//  ANNOTATE:   scf.for
156//  ANNOTATE:     memref.store {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
157//  ANNOTATE:     arith.addf {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
158//  ANNOTATE:     memref.load {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
159//  ANNOTATE:     scf.yield
160//  ANNOTATE:   }
161// Epilogue:
162//  ANNOTATE:   memref.store {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "epilogue"}
163//  ANNOTATE:   arith.addf {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "epilogue"}
164//  ANNOTATE:   memref.store {{.*}} {__test_pipelining_iteration = 1 : i32, __test_pipelining_part = "epilogue"}
165
166// NOEPILOGUE-LABEL: three_stage(
167//  NOEPILOGUE-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
168//   NOEPILOGUE-DAG:   %[[C0:.*]] = arith.constant 0 : index
169//   NOEPILOGUE-DAG:   %[[C1:.*]] = arith.constant 1 : index
170//   NOEPILOGUE-DAG:   %[[C2:.*]] = arith.constant 2 : index
171//   NOEPILOGUE-DAG:   %[[C3:.*]] = arith.constant 3 : index
172//   NOEPILOGUE-DAG:   %[[C4:.*]] = arith.constant 4 : index
173//   NOEPILOGUE-DAG:   %[[CF:.*]] = arith.constant 0.000000e+00 : f32
174// Prologue:
175//       NOEPILOGUE:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
176//  NOEPILOGUE-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
177//  NOEPILOGUE-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
178// Kernel:
179//  NOEPILOGUE-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]]
180//  NOEPILOGUE-SAME:     step %[[C1]] iter_args(%[[ADDARG:.*]] = %[[ADD0]],
181//  NOEPILOGUE-SAME:     %[[LARG:.*]] = %[[L1]]) -> (f32, f32) {
182//   NOEPILOGUE-DAG:     %[[S0:.*]] = arith.cmpi slt, %[[IV]], %[[C2]] : index
183//   NOEPILOGUE-DAG:     %[[S1:.*]] = arith.cmpi slt, %[[IV]], %[[C3]] : index
184//  NOEPILOGUE-NEXT:     memref.store %[[ADDARG]], %[[R]][%[[IV]]] : memref<?xf32>
185//  NOEPILOGUE-NEXT:     %[[ADD1:.*]] = scf.if %[[S1]] -> (f32) {
186//  NOEPILOGUE-NEXT:       %[[PADD:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
187//  NOEPILOGUE-NEXT:       scf.yield %[[PADD]] : f32
188//  NOEPILOGUE-NEXT:     } else {
189//  NOEPILOGUE-NEXT:       scf.yield %[[CF]] : f32
190//  NOEPILOGUE-NEXT:     }
191//  NOEPILOGUE-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
192//  NOEPILOGUE-NEXT:     %[[L3:.*]] = scf.if %[[S0]] -> (f32) {
193//  NOEPILOGUE-NEXT:       %[[PL:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
194//  NOEPILOGUE-NEXT:       scf.yield %[[PL]] : f32
195//  NOEPILOGUE-NEXT:     } else {
196//  NOEPILOGUE-NEXT:       scf.yield %[[CF]] : f32
197//  NOEPILOGUE-NEXT:     }
198//  NOEPILOGUE-NEXT:     scf.yield %[[ADD1]], %[[L3]] : f32, f32
199//  NOEPILOGUE-NEXT:   }
200// No epilogue should be generated.
201//   NOEPILOGUE-NOT:   memref.store
202//       NOEPILOGUE:   return
203
// Three-stage input: load in stage 0, add in stage 1, store in stage 2, over a
// loop from 0 to 4 with step 1. This is the only function in this section
// with expectations for all three run lines at the top of the file (default,
// annotate, and no-epilogue-peeling).
204func.func @three_stage(%A: memref<?xf32>, %result: memref<?xf32>) {
205  %c0 = arith.constant 0 : index
206  %c1 = arith.constant 1 : index
207  %c4 = arith.constant 4 : index
208  %cf = arith.constant 1.0 : f32
209  scf.for %i0 = %c0 to %c4 step %c1 {
  // Note the op_order values run opposite to program order here: the store
  // has order 0, the add order 1, the load order 2.
210    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
211    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
212    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : memref<?xf32>
213  } { __test_pipelining_loop__ }
214  return
215}
216
217// -----
218// CHECK-LABEL: long_liverange(
219//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
220//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
221//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
222//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
223//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
224//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
225//   CHECK-DAG:   %[[C6:.*]] = arith.constant 6 : index
226//   CHECK-DAG:   %[[C7:.*]] = arith.constant 7 : index
227//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
228//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
229// Prologue:
230//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
231//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
232//  CHECK-NEXT:   %[[L2:.*]] = memref.load %[[A]][%[[C2]]] : memref<?xf32>
233//  CHECK-NEXT:   %[[L3:.*]] = memref.load %[[A]][%[[C3]]] : memref<?xf32>
234// Kernel:
235//  CHECK-NEXT:   %[[LR:.*]]:4 = scf.for %[[IV:.*]] = %[[C0]] to %[[C6]]
236//  CHECK-SAME:     step %[[C1]] iter_args(%[[LA0:.*]] = %[[L0]],
237//  CHECK-SAME:     %[[LA1:.*]] = %[[L1]], %[[LA2:.*]] = %[[L2]],
238//  CHECK-SAME:     %[[LA3:.*]] = %[[L3]]) -> (f32, f32, f32, f32) {
239//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LA0]], %{{.*}} : f32
240//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
241//  CHECK-NEXT:     %[[IV4:.*]] = arith.addi %[[IV]], %[[C4]] : index
242//  CHECK-NEXT:     %[[L4:.*]] = memref.load %[[A]][%[[IV4]]] : memref<?xf32>
243//  CHECK-NEXT:     scf.yield %[[LA1]], %[[LA2]], %[[LA3]], %[[L4]] : f32, f32, f32, f32
244//  CHECK-NEXT:   }
245// Epilogue:
246//  CHECK-NEXT:  %[[ADD1:.*]] = arith.addf %[[LR]]#0, %{{.*}} : f32
247//  CHECK-NEXT:  memref.store %[[ADD1]], %[[R]][%[[C6]]] : memref<?xf32>
248//  CHECK-NEXT:  %[[ADD2:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
249//  CHECK-NEXT:  memref.store %[[ADD2]], %[[R]][%[[C7]]] : memref<?xf32>
250//  CHECK-NEXT:  %[[ADD3:.*]] = arith.addf %[[LR]]#2, %{{.*}} : f32
251//  CHECK-NEXT:  memref.store %[[ADD3]], %[[R]][%[[C8]]] : memref<?xf32>
252//  CHECK-NEXT:  %[[ADD4:.*]] = arith.addf %[[LR]]#3, %{{.*}} : f32
253//  CHECK-NEXT:  memref.store %[[ADD4]], %[[R]][%[[C9]]] : memref<?xf32>
// Long live-range input: the load is tagged stage 0 and its consumers (add and
// store) are tagged stage 4, over a loop from 0 to 10 with step 1. The
// expectations above look for four loads before the loop carried through four
// iter_args, a kernel bounded by 6, and four add/store pairs after the loop.
254func.func @long_liverange(%A: memref<?xf32>, %result: memref<?xf32>) {
255  %c0 = arith.constant 0 : index
256  %c1 = arith.constant 1 : index
257  %c10 = arith.constant 10 : index
258  %cf = arith.constant 1.0 : f32
259  scf.for %i0 = %c0 to %c10 step %c1 {
260    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
261    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 4, __test_pipelining_op_order__ = 0 } : f32
262    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 4, __test_pipelining_op_order__ = 1 } : memref<?xf32>
263  } { __test_pipelining_loop__ }
264  return
265}
266
267// -----
268
269// CHECK-LABEL: multiple_uses(
270//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
271//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
272//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
273//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
274//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
275//   CHECK-DAG:   %[[C7:.*]] = arith.constant 7 : index
276//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
277//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
278// Prologue:
279//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
280//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
281//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
282//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]], %{{.*}} : f32
283//  CHECK-NEXT:   %[[MUL0:.*]] = arith.mulf %[[ADD0]], %[[L0]] : f32
284//  CHECK-NEXT:   %[[L2:.*]] = memref.load %[[A]][%[[C2]]] : memref<?xf32>
285// Kernel:
286//  CHECK-NEXT:   %[[LR:.*]]:4 = scf.for %[[IV:.*]] = %[[C0]] to %[[C7]]
287//  CHECK-SAME:     step %[[C1]] iter_args(%[[LA1:.*]] = %[[L1]],
288//  CHECK-SAME:     %[[LA2:.*]] = %[[L2]], %[[ADDARG1:.*]] = %[[ADD1]],
289//  CHECK-SAME:     %[[MULARG0:.*]] = %[[MUL0]]) -> (f32, f32, f32, f32) {
290//  CHECK-NEXT:     %[[ADD2:.*]] = arith.addf %[[LA2]], %{{.*}} : f32
291//  CHECK-NEXT:     %[[MUL1:.*]] = arith.mulf %[[ADDARG1]], %[[LA1]] : f32
292//  CHECK-NEXT:     memref.store %[[MULARG0]], %[[R]][%[[IV]]] : memref<?xf32>
293//  CHECK-NEXT:     %[[IV3:.*]] = arith.addi %[[IV]], %[[C3]] : index
294//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV3]]] : memref<?xf32>
295//  CHECK-NEXT:     scf.yield %[[LA2]], %[[L3]], %[[ADD2]], %[[MUL1]] : f32, f32, f32, f32
296//  CHECK-NEXT:   }
297// Epilogue:
298//  CHECK-NEXT:   %[[ADD3:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
299//  CHECK-NEXT:   %[[MUL2:.*]] = arith.mulf %[[LR]]#2, %[[LR]]#0 : f32
300//  CHECK-NEXT:   memref.store %[[LR]]#3, %[[R]][%[[C7]]] : memref<?xf32>
301//  CHECK-NEXT:   %[[MUL3:.*]] = arith.mulf %[[ADD3]], %[[LR]]#1 : f32
302//  CHECK-NEXT:   memref.store %[[MUL2]], %[[R]][%[[C8]]] : memref<?xf32>
303//  CHECK-NEXT:   memref.store %[[MUL3]], %[[R]][%[[C9]]] : memref<?xf32>
// Input where one value has users in different stages: the loaded element
// (stage 0) feeds both the add (stage 1) and the multiply (stage 2); the store
// is tagged stage 3. The loop runs from 0 to 10 with step 1.
304func.func @multiple_uses(%A: memref<?xf32>, %result: memref<?xf32>) {
305  %c0 = arith.constant 0 : index
306  %c1 = arith.constant 1 : index
307  %c10 = arith.constant 10 : index
308  %cf = arith.constant 1.0 : f32
309  scf.for %i0 = %c0 to %c10 step %c1 {
310    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32>
311    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    // Second use of the loaded element, one stage later than the first.
312    %A2_elem = arith.mulf %A1_elem, %A_elem { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 1 } : f32
313    memref.store %A2_elem, %result[%i0] { __test_pipelining_stage__ = 3, __test_pipelining_op_order__ = 2 } : memref<?xf32>
314  } { __test_pipelining_loop__ }
315  return
316}
317
317
318// -----
319
320// CHECK-LABEL: region_multiple_uses(
321//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
322//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
323//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
324//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
325//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
326//   CHECK-DAG:   %[[C7:.*]] = arith.constant 7 : index
327//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
328//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
329// Prologue:
330//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
331//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
332//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
333//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]], %{{.*}} : f32
334//  CHECK-NEXT:   %[[MUL0:.*]] = scf.execute_region
335// arith.mulf %[[ADD0]], %[[L0]] : f32
336//  CHECK:   %[[L2:.*]] = memref.load %[[A]][%[[C2]]] : memref<?xf32>
337// Kernel:
338//  CHECK-NEXT:   %[[LR:.*]]:4 = scf.for %[[IV:.*]] = %[[C0]] to %[[C7]]
339//  CHECK-SAME:     step %[[C1]] iter_args(%[[LA1:.*]] = %[[L1]],
340//  CHECK-SAME:     %[[LA2:.*]] = %[[L2]], %[[ADDARG1:.*]] = %[[ADD1]],
341//  CHECK-SAME:     %[[MULARG0:.*]] = %[[MUL0]]) -> (f32, f32, f32, f32) {
342//  CHECK-NEXT:     %[[ADD2:.*]] = arith.addf %[[LA2]], %{{.*}} : f32
343//  CHECK-NEXT:     %[[MUL1:.*]] = scf.execute_region
344// arith.mulf %[[ADDARG1]], %[[LA1]] : f32
345//       CHECK:     memref.store %[[MULARG0]], %[[R]][%[[IV]]] : memref<?xf32>
346//  CHECK-NEXT:     %[[IV3:.*]] = arith.addi %[[IV]], %[[C3]] : index
347//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV3]]] : memref<?xf32>
348//  CHECK-NEXT:     scf.yield %[[LA2]], %[[L3]], %[[ADD2]], %[[MUL1]] : f32, f32, f32, f32
349//  CHECK-NEXT:   }
350// Epilogue:
351//  CHECK-NEXT:   %[[ADD3:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
352//  CHECK-NEXT:   %[[MUL2:.*]] = scf.execute_region
353// arith.mulf %[[LR]]#2, %[[LR]]#0 : f32
354//       CHECK:   memref.store %[[LR]]#3, %[[R]][%[[C7]]] : memref<?xf32>
355//  CHECK-NEXT:   %[[MUL3:.*]] = scf.execute_region
356// arith.mulf %[[ADD3]], %[[LR]]#1 : f32
357//       CHECK:   memref.store %[[MUL2]], %[[R]][%[[C8]]] : memref<?xf32>
358//  CHECK-NEXT:   memref.store %[[MUL3]], %[[R]][%[[C9]]] : memref<?xf32>
359
// Variant of the multiple-uses input where the stage-2 multiply lives inside
// an scf.execute_region. The region body reads both the stage-1 add result and
// the stage-0 loaded element from the enclosing loop body.
360func.func @region_multiple_uses(%A: memref<?xf32>, %result: memref<?xf32>) {
361  %c0 = arith.constant 0 : index
362  %c1 = arith.constant 1 : index
363  %c10 = arith.constant 10 : index
364  %cf = arith.constant 1.0 : f32
365  scf.for %i0 = %c0 to %c10 step %c1 {
366    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32>
367    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    // The stage / op_order attributes are attached to the region op; the
    // nested multiply carries none.
368    %A2_elem = scf.execute_region -> f32 {
369      %A2_elem1 = arith.mulf %A1_elem, %A_elem : f32
370      scf.yield %A2_elem1 : f32
371    } { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 1 }
372    memref.store %A2_elem, %result[%i0] { __test_pipelining_stage__ = 3, __test_pipelining_op_order__ = 2 } : memref<?xf32>
373  } { __test_pipelining_loop__ }
374  return
375}
376
376
377// -----
378
379// CHECK-LABEL: loop_carried(
380//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
381//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
382//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
383//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
384//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 1.000000e+00 : f32
385// Prologue:
386//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
387// Kernel:
388//  CHECK-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
389//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
390//  CHECK-SAME:     %[[LARG:.*]] = %[[L0]]) -> (f32, f32) {
391//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %[[C]] : f32
392//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
393//  CHECK-NEXT:     %[[L1:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
394//  CHECK-NEXT:     scf.yield %[[ADD0]], %[[L1]] : f32, f32
395//  CHECK-NEXT:   }
396// Epilogue:
397//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[LR]]#1, %[[LR]]#0 : f32
398//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C0]]] : memref<?xf32>
// Input with a loop-carried value: the loop has one iter_arg initialised to
// the f32 constant, the stage-1 add consumes it together with the stage-0
// load, and the add result is yielded back. The loop result is stored to
// index 0 of %result after the loop.
399func.func @loop_carried(%A: memref<?xf32>, %result: memref<?xf32>) {
400  %c0 = arith.constant 0 : index
401  %c1 = arith.constant 1 : index
402  %c4 = arith.constant 4 : index
403  %cf = arith.constant 1.0 : f32
404  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
405    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 } : memref<?xf32>
406    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
407    scf.yield %A1_elem : f32
408  }  { __test_pipelining_loop__ }
  // This store is outside the loop and carries no pipelining attributes.
409  memref.store %r, %result[%c0] : memref<?xf32>
410  return
411}
412
412
413// -----
414
415// CHECK-LABEL: backedge_different_stage
416//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
417//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
418//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
419//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
420//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 2.000000e+00 : f32
421// Prologue:
422//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
423//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %[[CSTF]] : f32
424//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
425// Kernel:
426//  CHECK-NEXT:   %[[R:.*]]:3 = scf.for %[[IV:.*]] = %[[C0]] to %[[C2]]
427//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
428//  CHECK-SAME:     %[[ADDARG:.*]] = %[[ADD0]], %[[LARG:.*]] = %[[L1]]) -> (f32, f32, f32) {
429//  CHECK-NEXT:     %[[MUL0:.*]] = arith.mulf %[[ADDARG]], %[[CSTF]] : f32
430//  CHECK-NEXT:     %[[ADD1:.*]] = arith.addf %[[LARG]], %[[MUL0]] : f32
431//  CHECK-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
432//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
433//  CHECK-NEXT:     scf.yield %[[MUL0]], %[[ADD1]], %[[L2]] : f32, f32, f32
434//  CHECK-NEXT:   }
435// Epilogue:
436//  CHECK-NEXT:   %[[MUL1:.*]] = arith.mulf %[[R]]#1, %[[CSTF]] : f32
437//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[R]]#2, %[[MUL1]] : f32
438//  CHECK-NEXT:   %[[MUL2:.*]] = arith.mulf %[[ADD2]], %[[CSTF]] : f32
439//  CHECK-NEXT:   return %[[MUL2]] : f32
// Input where the back edge crosses stages: the yielded value is produced by
// the multiply in stage 2, and it is consumed (through the iter_arg) by the
// add in stage 1. The loop runs from 0 to 4 with step 1 and its result is
// returned.
440func.func @backedge_different_stage(%A: memref<?xf32>) -> f32 {
441  %c0 = arith.constant 0 : index
442  %c1 = arith.constant 1 : index
443  %c4 = arith.constant 4 : index
444  %cf = arith.constant 2.0 : f32
445  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
446    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
447    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
448    %A2_elem = arith.mulf %cf, %A1_elem { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
449    scf.yield %A2_elem : f32
450  }  { __test_pipelining_loop__ }
451  return %r : f32
452}
453
453
454// -----
455
456// CHECK-LABEL: region_backedge_different_stage
457//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
458//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
459//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
460//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
461//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 2.000000e+00 : f32
462// Prologue:
463//       CHECK:   %[[L0:.*]] = scf.execute_region
464//  CHECK-NEXT:     memref.load %[[A]][%[[C0]]] : memref<?xf32>
465//       CHECK:   %[[ADD0:.*]] = scf.execute_region
466//  CHECK-NEXT:   arith.addf %[[L0]], %[[CSTF]] : f32
467//       CHECK:   %[[L1:.*]] = scf.execute_region
468//  CHECK-NEXT:     memref.load %[[A]][%[[C1]]] : memref<?xf32>
469// Kernel:
470//       CHECK:   %[[R:.*]]:3 = scf.for %[[IV:.*]] = %[[C0]] to %[[C2]]
471//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
472//  CHECK-SAME:     %[[ADDARG:.*]] = %[[ADD0]], %[[LARG:.*]] = %[[L1]]) -> (f32, f32, f32) {
473//       CHECK:     %[[MUL0:.*]] = arith.mulf %[[ADDARG]], %[[CSTF]] : f32
474//       CHECK:     %[[ADD1:.*]] = scf.execute_region
475//  CHECK-NEXT:       arith.addf %[[LARG]], %[[MUL0]] : f32
476//       CHECK:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
477//       CHECK:     %[[L2:.*]] = scf.execute_region
478//  CHECK-NEXT:       memref.load %[[A]][%[[IV2]]] : memref<?xf32>
479//       CHECK:     scf.yield %[[MUL0]], %[[ADD1]], %[[L2]] : f32, f32, f32
480//  CHECK-NEXT:   }
481// Epilogue:
482//       CHECK:   %[[MUL1:.*]] = arith.mulf %[[R]]#1, %[[CSTF]] : f32
483//       CHECK:   %[[ADD2:.*]] = scf.execute_region
484//  CHECK-NEXT:    arith.addf %[[R]]#2, %[[MUL1]] : f32
485//       CHECK:   %[[MUL2:.*]] = arith.mulf %[[ADD2]], %[[CSTF]] : f32
486//       CHECK:   return %[[MUL2]] : f32
487
// Variant of the cross-stage back-edge input where the stage-0 load and the
// stage-1 add are each wrapped in an scf.execute_region; the stage-2 multiply
// stays a plain op. The add region reads the iter_arg from the enclosing loop.
488func.func @region_backedge_different_stage(%A: memref<?xf32>) -> f32 {
489  %c0 = arith.constant 0 : index
490  %c1 = arith.constant 1 : index
491  %c4 = arith.constant 4 : index
492  %cf = arith.constant 2.0 : f32
493  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
494    %A_elem = scf.execute_region -> f32 {
495      %A_elem1 = memref.load %A[%i0] : memref<?xf32>
496      scf.yield %A_elem1 : f32
497    } { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 }
498    %A1_elem = scf.execute_region -> f32 {
499      %inner = arith.addf %A_elem, %arg0 : f32
500      scf.yield %inner : f32
501    }  { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 }
502    %A2_elem = arith.mulf %cf, %A1_elem { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
503    scf.yield %A2_elem : f32
504  }  { __test_pipelining_loop__ }
505  return %r : f32
506}
507
507
508
509// -----
510
511// CHECK-LABEL: backedge_same_stage
512//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
513//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
514//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
515//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
516//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 2.000000e+00 : f32
517// Prologue:
518//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
519// Kernel:
520//  CHECK-NEXT:   %[[R:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
521//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
522//  CHECK-SAME:     %[[LARG:.*]] = %[[L0]]) -> (f32, f32) {
523//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %[[C]] : f32
524//  CHECK-NEXT:     %[[MUL0:.*]] = arith.mulf %[[ADD0]], %[[CSTF]] : f32
525//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
526//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
527//  CHECK-NEXT:     scf.yield %[[MUL0]], %[[L2]] : f32, f32
528//  CHECK-NEXT:   }
529// Epilogue:
530//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[R]]#1, %[[R]]#0 : f32
531//  CHECK-NEXT:   %[[MUL1:.*]] = arith.mulf %[[ADD1]], %[[CSTF]] : f32
532//  CHECK-NEXT:   return %[[MUL1]] : f32
// Input where the back edge stays within one stage: the add that consumes the
// iter_arg and the multiply that produces the yielded value are both tagged
// stage 1; only the load is in stage 0.
533func.func @backedge_same_stage(%A: memref<?xf32>) -> f32 {
534  %c0 = arith.constant 0 : index
535  %c1 = arith.constant 1 : index
536  %c4 = arith.constant 4 : index
537  %cf = arith.constant 2.0 : f32
538  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
539    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
540    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
541    %A2_elem = arith.mulf %cf, %A1_elem { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
542    scf.yield %A2_elem : f32
543  }  { __test_pipelining_loop__ }
544  return %r : f32
545}
546
546
547// -----
548
549// CHECK: @pipeline_op_with_region(%[[ARG0:.+]]: memref<?xf32>, %[[ARG1:.+]]: memref<?xf32>, %[[ARG2:.+]]: memref<?xf32>, %[[CF:.*]]: f32) {
550// CHECK-DAG: %[[C0:.+]] = arith.constant 0 :
551// CHECK-DAG: %[[C3:.+]] = arith.constant 3 :
552// CHECK-DAG: %[[C1:.+]] = arith.constant 1 :
553// CHECK:   %[[APRO:.+]] = memref.alloc() :
554// CHECK:   %[[BPRO:.+]] = memref.alloc() :
555// CHECK:   %[[ASV0:.+]] = memref.subview %[[ARG0]][%[[C0]]] [8] [1] :
556// CHECK:   %[[BSV0:.+]] = memref.subview %[[ARG1]][%[[C0]]] [8] [1] :
557
558// Prologue:
559// CHECK:   %[[PAV0:.+]] = memref.subview %[[APRO]][%[[C0]], 0] [1, 8] [1, 1] :
560// CHECK:   %[[PBV0:.+]] = memref.subview %[[BPRO]][%[[C0]], 0] [1, 8] [1, 1] :
561// CHECK:   memref.copy %[[ASV0]], %[[PAV0]] :
562// CHECK:   memref.copy %[[BSV0]], %[[PBV0]] :
563
564// Kernel:
565// CHECK:   %[[R:.+]]:2 = scf.for %[[IV:.+]] = %[[C0]] to %[[C3]] step %[[C1]]
566// CHECK-SAME: iter_args(%[[IA:.+]] = %[[PAV0]], %[[IB:.+]] = %[[PBV0]])
567// CHECK:     %[[CV:.+]] = memref.subview %[[ARG2]]
568// CHECK:     linalg.generic
569// CHECK-SAME:  ins(%[[IA]], %[[IB]], %{{.*}} : {{.*}}) outs(%[[CV]] :
570// CHECK:     %[[NEXT:.+]] = arith.addi %[[IV]], %[[C1]]
571// CHECK:     %[[ASV:.+]] = memref.subview %[[ARG0]][%[[NEXT]]] [8] [1] :
572// CHECK:     %[[NEXT:.+]] = arith.addi %[[IV]], %[[C1]] :
573// CHECK:     %[[BSV:.+]] = memref.subview %[[ARG1]][%[[NEXT]]] [8] [1] :
574// CHECK:     %[[NEXT:.+]] = arith.addi %[[IV]], %[[C1]] :
575// CHECK:     %[[BUFIDX:.+]] = affine.apply
576// CHECK:     %[[APROSV:.+]] = memref.subview %[[APRO]][%[[BUFIDX]], 0] [1, 8] [1, 1] :
577// CHECK:     %[[BPROSV:.+]] = memref.subview %[[BPRO]][%[[BUFIDX]], 0] [1, 8] [1, 1] :
578// CHECK:     memref.copy %[[ASV]], %[[APROSV]] :
579// CHECK:     memref.copy %[[BSV]], %[[BPROSV]] :
580// CHECK:     scf.yield %[[APROSV]], %[[BPROSV]] :
581// CHECK:   }
582// CHECK:   %[[CV:.+]] = memref.subview %[[ARG2]][%[[C3]]] [8] [1] :
583// CHECK:   linalg.generic
584// CHECK-SAME: ins(%[[R]]#0, %[[R]]#1, %{{.*}} : {{.*}}) outs(%[[CV]] :
585
586
// Attribute aliases shared by the test function below.
// #map is the layout used for every 8-element view; #map1/#map2 are the
// identity and scalar (rank-0 result) indexing maps of the linalg.generic.
587#map = affine_map<(d0)[s0]->(d0 + s0)>
588#map1 = affine_map<(d0)->(d0)>
589#map2 = affine_map<(d0)->()>
// The pipelining stage/order attributes are part of this alias, so they end up
// on the linalg.generic op that is built with it (stage 1, order 2).
590#linalg_attrs = {
591  indexing_maps = [
592      #map1,
593      #map1,
594      #map2,
595      #map1
596    ],
597  iterator_types = ["parallel"],
598  __test_pipelining_stage__ = 1,
599  __test_pipelining_op_order__ = 2
600}
// Test input: pipelining a loop that contains an op with a region.
// The loop runs from 0 to 4 with step 1. Stage 0 copies 8-element slices of %A
// and %B into row (%i0 mod 2) of the two 2x8 buffers; stage 1 takes a view of
// %result, computes %scalar, and runs the linalg.generic on the buffered views.
// The region of the linalg.generic reads %scalar directly from the enclosing
// loop body in addition to receiving it as the block argument %s.
601func.func @pipeline_op_with_region(%A: memref<?xf32>, %B: memref<?xf32>, %result: memref<?xf32>, %cf: f32) {
602  %c0 = arith.constant 0 : index
603  %c1 = arith.constant 1 : index
604  %c4 = arith.constant 4 : index
605  %a_buf = memref.alloc() : memref<2x8xf32>
606  %b_buf = memref.alloc() : memref<2x8xf32>
607  scf.for %i0 = %c0 to %c4 step %c1 {
608    %A_view = memref.subview %A[%i0][8][1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32> to memref<8xf32, #map>
609    %B_view = memref.subview %B[%i0][8][1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 4 } : memref<?xf32> to memref<8xf32, #map>
610    %buf_idx = affine.apply  affine_map<(d0)->(d0 mod 2)> (%i0)[] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 5 }
611    %a_buf_view = memref.subview %a_buf[%buf_idx,0][1,8][1,1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 6 } : memref<2x8xf32> to memref<8xf32, #map>
612    %b_buf_view = memref.subview %b_buf[%buf_idx,0][1,8][1,1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 7 } : memref<2x8xf32> to memref<8xf32, #map>
613    memref.copy %A_view , %a_buf_view {__test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 8} : memref<8xf32, #map> to memref<8xf32, #map>
614    memref.copy %B_view , %b_buf_view {__test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 9} : memref<8xf32, #map> to memref<8xf32, #map>
615    %C_view = memref.subview %result[%i0][8][1] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : memref<?xf32> to memref<8xf32, #map>
616    %scalar = arith.addf %cf, %cf {__test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1} : f32
617    linalg.generic #linalg_attrs ins(%a_buf_view, %b_buf_view, %scalar : memref<8xf32, #map>, memref<8xf32, #map>, f32)
618      outs(%C_view: memref<8xf32, #map>) {
619      ^bb0(%a: f32, %b: f32, %s: f32, %c: f32):
620        %add = arith.addf %a, %b : f32
621        %accum = arith.addf %add, %c : f32
622        %accum1 = arith.addf %scalar, %accum : f32
623        %accum2 = arith.addf %s, %accum1 : f32
624        linalg.yield %accum2 : f32
625    }
626    scf.yield
627  }  { __test_pipelining_loop__ }
628  return
629}
630
631// -----
632
633// CHECK-LABEL: @backedge_mix_order
634//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
635//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
636//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
637//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
638//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 2.000000e+00 : f32
639// Prologue:
640//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
641//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
642// Kernel:
643//  CHECK-NEXT:   %[[R:.*]]:3 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
644//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
645//  CHECK-SAME:     %[[ARG1:.*]] = %[[L0]], %[[ARG2:.*]] = %[[L1]]) -> (f32, f32, f32) {
646//  CHECK-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C1]] : index
647//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
648//  CHECK-NEXT:     %[[MUL0:.*]] = arith.mulf %[[C]], %[[ARG1]] : f32
649//  CHECK-NEXT:     %[[IV3:.*]] = arith.addi %[[IV]], %[[C1]] : index
650//  CHECK-NEXT:     %[[IV4:.*]] = arith.addi %[[IV3]], %[[C1]] : index
651//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV4]]] : memref<?xf32>
652//  CHECK-NEXT:     %[[MUL1:.*]] = arith.mulf %[[ARG2]], %[[MUL0]] : f32
653//  CHECK-NEXT:     scf.yield %[[MUL1]], %[[L2]], %[[L3]] : f32, f32, f32
654//  CHECK-NEXT:   }
655// Epilogue:
656//  CHECK-NEXT:   %[[MUL1:.*]] = arith.mulf %[[R]]#0, %[[R]]#1 : f32
657//  CHECK-NEXT:   %[[MUL2:.*]] = arith.mulf %[[R]]#2, %[[MUL1]] : f32
658//  CHECK-NEXT:   return %[[MUL2]] : f32
// Test input: a loop-carried value combined with an interleaved op order.
// The loop runs from 0 to 4 with step 1 and carries one f32 (%arg0, initially
// 2.0). Stage 0 holds the two loads (at %i0 and at %i0 + 1) and the addi;
// stage 1 holds the two mulf ops. The requested op order alternates between
// the stages (orders 0, 2, 3 are stage 0; orders 1, 4 are stage 1), and the
// second mulf is yielded as the next value of %arg0.
659func.func @backedge_mix_order(%A: memref<?xf32>) -> f32 {
660  %c0 = arith.constant 0 : index
661  %c1 = arith.constant 1 : index
662  %c4 = arith.constant 4 : index
663  %cf = arith.constant 2.0 : f32
664  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
665    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 0 } : memref<?xf32>
666    %A2_elem = arith.mulf %arg0, %A_elem { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
667    %i1 = arith.addi %i0, %c1 { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : index
668    %A1_elem = memref.load %A[%i1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32>
669    %A3_elem = arith.mulf %A1_elem, %A2_elem { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 4 } : f32
670    scf.yield %A3_elem : f32
671  }  { __test_pipelining_loop__ }
672  return %r : f32
673}
674
675// -----
676
677// CHECK-LABEL: @distance_1_use
678//  CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
679//  CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
680//  CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
681// Prologue:
682//  CHECK: %[[L0:.+]] = memref.load %{{.*}}[%[[C0]]] : memref<?xf32>
683//  CHECK: %[[L1:.+]] = memref.load %{{.*}}[%[[C1]]] : memref<?xf32>
684//  CHECK: %[[R:.+]]:5 = scf.for {{.*}} iter_args(%[[IDX0:.+]] = %[[C2]], %[[L2:.+]] = %[[L0]], %[[L3:.+]] = %[[L1]]
685//  CHECK:   %[[L4:.+]] = memref.load %{{.*}}[%[[IDX0]]] : memref<?xf32>
686//  CHECK:   %[[IDX1:.+]] = arith.addi %[[IDX0]], %[[C1]] : index
687//  CHECK:   memref.store %[[L2]]
688//  CHECK:   scf.yield %[[IDX1]], %[[L3]], %[[L4]]
// Test input: a loop-carried index used by ops in different stages.
// %idx starts at 0 and is incremented by 1 in stage 0. It is read by the
// stage-0 load and again by the stage-2 store; no op is assigned to stage 1.
// NOTE(review): %cf is defined but not referenced by any op in this function.
689func.func @distance_1_use(%A: memref<?xf32>, %result: memref<?xf32>) {
690  %c0 = arith.constant 0 : index
691  %c1 = arith.constant 1 : index
692  %c4 = arith.constant 4 : index
693  %cf = arith.constant 1.0 : f32
694  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%idx = %c0) -> (index) {
695    %A_elem = memref.load %A[%idx] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 0 } : memref<?xf32>
696    %idx1 = arith.addi %idx, %c1 { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 } : index
697    memref.store %A_elem, %result[%idx] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 2 } : memref<?xf32>
698    scf.yield %idx1 : index
699  }  { __test_pipelining_loop__ }
700  return
701}
702
703// -----
704
705// NOEPILOGUE-LABEL: stage_0_value_escape(
// Test input: a value computed inside the pipelined loop is used after it.
// The upper bound %ub is a function argument. The loop carries one f32
// (initially 1.0); the addf result is both stored by the stage-2 store and
// yielded, and the loop result %r is stored to %result[1] after the loop.
// The directives interleaved with the body use the NOEPILOGUE prefix, i.e.
// they apply to the RUN line that passes no-epilogue-peeling.
706func.func @stage_0_value_escape(%A: memref<?xf32>, %result: memref<?xf32>, %ub: index) {
707  %c0 = arith.constant 0 : index
708  %c1 = arith.constant 1 : index
709  %cf = arith.constant 1.0 : f32
710// NOEPILOGUE: %[[UB:[^,]+]]: index)
711// NOEPILOGUE-DAG: %[[C0:.+]] = arith.constant 0 : index
712// NOEPILOGUE-DAG: %[[C1:.+]] = arith.constant 1 : index
713// NOEPILOGUE-DAG: %[[CF:.+]] = arith.constant 1.000000e+00
714// NOEPILOGUE: %[[CND0:.+]] = arith.cmpi sgt, %[[UB]], %[[C0]]
715// NOEPILOGUE: scf.if
716// NOEPILOGUE: %[[IF:.+]] = scf.if %[[CND0]]
717// NOEPILOGUE:   %[[A:.+]] = arith.addf
718// NOEPILOGUE:   scf.yield %[[A]]
719// NOEPILOGUE: %[[S0:.+]] = arith.select %[[CND0]], %[[IF]], %[[CF]]
720// NOEPILOGUE: scf.for %[[IV:.+]] = {{.*}} iter_args(%[[ARG:.+]] = %[[S0]],
721// NOEPILOGUE:   %[[UB_1:.+]] = arith.subi %[[UB]], %[[C1]] : index
722// NOEPILOGUE:   %[[CND1:.+]] = arith.cmpi slt, %[[IV]], %[[UB_1]] : index
723// NOEPILOGUE:   %[[S1:.+]] = arith.select %[[CND1]], %{{.+}}, %[[ARG]] : f32
724// NOEPILOGUE:   scf.yield %[[S1]]
725  %r = scf.for %i0 = %c0 to %ub step %c1 iter_args(%arg0 = %cf) -> (f32) {
726    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 } : memref<?xf32>
727    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
728    memref.store %A1_elem, %result[%c0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 2 } : memref<?xf32>
729    scf.yield %A1_elem : f32
730  }  { __test_pipelining_loop__ }
731  memref.store %r, %result[%c1] : memref<?xf32>
732  return
733}
734
735// -----
736
737// NOEPILOGUE-LABEL: dynamic_loop(
738//  NOEPILOGUE-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>, %[[LB:.+]]: index, %[[UB:.+]]: index, %[[STEP:.+]]: index) {
739//  NOEPILOGUE-DAG: %[[C2:.+]] = arith.constant 2 : index
740//  NOEPILOGUE-DAG: %[[CSTF:.+]] = arith.constant 1.000000e+00 : f32
741// Prologue:
742//      NOEPILOGUE: %[[P_I0:.+]] = arith.cmpi slt, %[[LB]], %[[UB]] : index
743//      NOEPILOGUE: %[[L0:.+]] = scf.if %[[P_I0]] -> (f32) {
744// NOEPILOGUE-NEXT:   memref.load %[[A]][%[[LB]]] : memref<?xf32>
745//      NOEPILOGUE: %[[IV1:.+]] = arith.addi %[[LB]], %[[STEP]] : index
746//      NOEPILOGUE: %[[P_I1:.+]] = arith.cmpi slt, %[[IV1]], %[[UB]] : index
747//      NOEPILOGUE: %[[IV1_2:.+]] = arith.addi %[[LB]], %[[STEP]] : index
748//      NOEPILOGUE: %[[V0:.+]] = scf.if %[[P_I0]] -> (f32) {
749// NOEPILOGUE-NEXT:   arith.addf %[[L0]], %[[CSTF]] : f32
750//      NOEPILOGUE: %[[L1:.+]] = scf.if %[[P_I1]] -> (f32) {
751// NOEPILOGUE-NEXT:   memref.load %[[A]][%[[IV1_2]]] : memref<?xf32>
752//  NOEPILOGUE: scf.for %[[IV2:.+]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args(%[[V1:.+]] = %[[V0]], %[[L2:.+]] = %[[L1]]) -> (f32, f32) {
753//  NOEPILOGUE-DAG:   %[[S2:.+]] = arith.muli %[[STEP]], %[[C2]] : index
754//  NOEPILOGUE-DAG:   %[[IT2:.+]] = arith.subi %[[UB]], %[[S2]] : index
755//  NOEPILOGUE-DAG:   %[[P_I2:.+]] = arith.cmpi slt, %[[IV2]], %[[IT2]] : index
756//  NOEPILOGUE-DAG:   %[[IT3:.+]] = arith.subi %[[UB]], %[[STEP]] : index
757//  NOEPILOGUE-DAG:   %[[P_I3:.+]] = arith.cmpi slt, %[[IV2]], %[[IT3]] : index
758//      NOEPILOGUE:   memref.store %[[V1]], %[[R]][%[[IV2]]] : memref<?xf32>
759//      NOEPILOGUE:   %[[V2:.+]] = scf.if %[[P_I3]] -> (f32) {
760//      NOEPILOGUE:     arith.addf %[[L2]], %[[CSTF]] : f32
761//      NOEPILOGUE:   %[[IT4:.+]] = arith.muli %[[STEP]], %[[C2]] : index
762//      NOEPILOGUE:   %[[IV3:.+]] = arith.addi %[[IV2]], %[[IT4]] : index
763//      NOEPILOGUE:   %[[L3:.+]] = scf.if %[[P_I2]] -> (f32) {
764//      NOEPILOGUE:     memref.load %[[A]][%[[IV3]]] : memref<?xf32>
765//      NOEPILOGUE:   scf.yield %[[V2]], %[[L3]] : f32, f32
766
767// Check for predicated epilogue for dynamic loop.
768// CHECK-LABEL: dynamic_loop(
769//    CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
770//    CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
771//    CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
772//    CHECK-DAG:   %[[CM1:.*]] = arith.constant -1 : index
773//        CHECK:   %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}}
774//        CHECK:   %{{.*}}:2 = scf.for %[[ARG5:.*]] = %[[LB:.*]] to %[[UBM]] step %[[STEP:.*]] iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}})
775//        CHECK:       memref.store %[[ARG6]], %{{.*}}[%[[ARG5]]]
776//        CHECK:       %[[ADDF_24:.*]] = arith.addf %[[ARG7]], %{{.*}}
777//        CHECK:       %[[MULI_25:.*]] = arith.muli %{{.*}}, %{{.*}}
778//        CHECK:       %[[ADDI_26:.*]] = arith.addi %[[ARG5]], %[[MULI_25]]
779//        CHECK:       %[[LOAD_27:.*]] = memref.load %{{.*}}[%[[ADDI_26]]]
780//        CHECK:       scf.yield %[[ADDF_24]], %[[LOAD_27]]
781//        CHECK:   }
782//        CHECK:   %[[CMPI_10:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]]
783//        CHECK:   %[[SELECT_11:.*]] = arith.select %[[CMPI_10]], %[[C1]], %[[CM1]]
784//        CHECK:   %[[SUBI_12:.*]] = arith.subi %[[UB]], %[[LB]]
785//        CHECK:   %[[ADDI_13:.*]] = arith.addi %[[SUBI_12]], %[[STEP]]
786//        CHECK:   %[[ADDI_14:.*]] = arith.addi %[[ADDI_13]], %[[SELECT_11]]
787//        CHECK:   %[[DIVSI_15:.*]] = arith.divsi %[[ADDI_14]], %[[STEP]]
788//        CHECK:   %[[SUBI_17:.*]] = arith.subi %[[DIVSI_15]], %[[C2]]
789//        CHECK:   %[[MAXSI_18:.*]] = arith.maxsi %[[SUBI_17]], %[[C0]]
790//        CHECK:   %[[MULI_19:.*]] = arith.muli %[[STEP]], %[[MAXSI_18]]
791//        CHECK:   %[[ADDI_20:.*]] = arith.addi %[[LB]], %[[MULI_19]]
792//        CHECK:   %[[ADDI_21:.*]] = arith.addi %[[MAXSI_18]], %[[C1]]
793//        CHECK:   %[[CMPI_22:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C1]]
794//        CHECK:   %[[MULI_23:.*]] = arith.muli %[[STEP]], %[[ADDI_21]]
795//        CHECK:   %[[ADDI_24:.*]] = arith.addi %[[LB]], %[[MULI_23]]
796//        CHECK:   %[[CMPI_25:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C2]]
797//        CHECK:   scf.if %[[CMPI_22]] {
798//        CHECK:     memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_20]]]
799//        CHECK:   } else {
800//        CHECK:   }
801//        CHECK:   %[[IF_26:.*]] = scf.if %[[CMPI_25]]
802//        CHECK:     %[[ADDF_27:.*]] = arith.addf %{{.*}}#1, %{{.*}}
803//        CHECK:     scf.yield %[[ADDF_27]]
804//        CHECK:   } else {
805//        CHECK:     scf.yield %{{.*}}
806//        CHECK:   }
807//        CHECK:   scf.if %[[CMPI_25]] {
808//        CHECK:     memref.store %[[IF_26]], %{{.*}}[%[[ADDI_24]]]
809//        CHECK:   } else {
810//        CHECK:   }
811//        CHECK:   return
// Test input: a three-stage loop whose lower bound, upper bound and step are
// all function arguments. The load is stage 0, the addf stage 1 and the store
// stage 2; the requested op order is the reverse of program order (store = 0,
// addf = 1, load = 2). The loop has no results and no explicit terminator.
812func.func @dynamic_loop(%A: memref<?xf32>, %result: memref<?xf32>, %lb: index, %ub: index, %step: index) {
813  %cf = arith.constant 1.0 : f32
814  scf.for %i0 = %lb to %ub step %step {
815    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
816    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
817    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : memref<?xf32>
818  } { __test_pipelining_loop__ }
819  return
820}
821
822// -----
823
824// NOEPILOGUE-LABEL:   func.func @dynamic_loop_result
825//       NOEPILOGUE:     %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}})
826//       NOEPILOGUE:       %[[SUBI_3:.*]] = arith.subi %{{.*}}, %{{.*}}
827//       NOEPILOGUE:       %[[CMPI_4:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_3]]
828//       NOEPILOGUE:       %[[ADDF_5:.*]] = arith.addf %[[ARG7]], %[[ARG6]]
829//       NOEPILOGUE:       %[[MULF_6:.*]] = arith.mulf %[[ADDF_5]], %{{.*}}
830//       NOEPILOGUE:       %[[ADDI_7:.*]] = arith.addi %[[ARG5]], %{{.*}}
831//       NOEPILOGUE:       %[[IF_8:.*]] = scf.if %[[CMPI_4]]
832//       NOEPILOGUE:         %[[LOAD_9:.*]] = memref.load %{{.*}}[%[[ADDI_7]]]
833//       NOEPILOGUE:         scf.yield %[[LOAD_9]]
834//       NOEPILOGUE:       } else {
835//       NOEPILOGUE:         scf.yield %{{.*}}
836//       NOEPILOGUE:       }
837//       NOEPILOGUE:       scf.yield %[[MULF_6]], %[[IF_8]]
838//       NOEPILOGUE:     }
839//       NOEPILOGUE:     memref.store %{{.*}}#0, %{{.*}}[%{{.*}}]
840
841// Check for predicated epilogue for dynamic loop.
842// CHECK-LABEL:   func.func @dynamic_loop_result
843//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
844//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
845//   CHECK-DAG:   %[[CM1:.*]] = arith.constant -1 : index
846//   CHECK-DAG:   %[[CF0:.*]] = arith.constant 0.000000e+00
847//       CHECK:   %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}}
848//       CHECK:   %{{.*}}:2 = scf.for %[[ARG5:.*]] = %[[LB:.*]] to %[[UBM]] step %[[STEP:.*]] iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}})
849//       CHECK:       %[[ADDF_13:.*]] = arith.addf %[[ARG7]], %[[ARG6]]
850//       CHECK:       %[[MULF_14:.*]] = arith.mulf %[[ADDF_13]], %{{.*}}
851//       CHECK:       %[[ADDI_15:.*]] = arith.addi %[[ARG5]], %{{.*}}
852//       CHECK:       %[[LOAD_16:.*]] = memref.load %{{.*}}[%[[ADDI_15]]]
853//       CHECK:       scf.yield %[[MULF_14]], %[[LOAD_16]]
854//       CHECK:     }
855//       CHECK:     %[[CMPI_4:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]]
856//       CHECK:     %[[SELECT_5:.*]] = arith.select %[[CMPI_4]], %[[C1]], %[[CM1]]
857//       CHECK:     %[[SUBI_6:.*]] = arith.subi %[[UB]], %[[LB]]
858//       CHECK:     %[[ADDI_7:.*]] = arith.addi %[[SUBI_6]], %[[STEP]]
859//       CHECK:     %[[ADDI_8:.*]] = arith.addi %[[ADDI_7]], %[[SELECT_5]]
860//       CHECK:     %[[DIVSI_9:.*]] = arith.divsi %[[ADDI_8]], %[[STEP]]
861//       CHECK:     %[[CMPI_10:.*]] = arith.cmpi sge, %[[DIVSI_9]], %[[C1]]
862//       CHECK:     %[[IF_11:.*]] = scf.if %[[CMPI_10]]
863//       CHECK:       %[[ADDF_14:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0
864//       CHECK:       scf.yield %[[ADDF_14]]
865//       CHECK:     } else {
866//       CHECK:       scf.yield %[[CF0]]
867//       CHECK:     }
868//       CHECK:     %[[IF_12:.*]] = scf.if %[[CMPI_10]]
869//       CHECK:       %[[MULF_14:.*]] = arith.mulf %[[IF_11]], %{{.*}}
870//       CHECK:       scf.yield %[[MULF_14]]
871//       CHECK:     } else {
872//       CHECK:       scf.yield %[[CF0]]
873//       CHECK:     }
874//       CHECK:     %[[SELECT_13:.*]] = arith.select %[[CMPI_10]], %[[IF_12]], %{{.*}}#0
875//       CHECK:     memref.store %[[SELECT_13]], %{{.*}}[%[[C0]]]
// Test input: a two-stage loop with dynamic bounds and step that produces a
// result. The loop carries one f32 (initially 1.0); the load is stage 0 and
// the addf/mulf pair is stage 1. The mulf result is yielded, and the loop
// result is stored to %result[0] after the loop.
876func.func @dynamic_loop_result(%A: memref<?xf32>, %result: memref<?xf32>, %lb: index, %ub: index, %step: index) {
877  %cf0 = arith.constant 1.0 : f32
878  %cf1 = arith.constant 33.0 : f32
879  %cst = arith.constant 0 : index
880  %res:1 = scf.for %i0 = %lb to %ub step %step iter_args (%arg0 = %cf0) -> (f32) {
881    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
882    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
883    %A2_elem = arith.mulf %A1_elem, %cf1 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
884    scf.yield %A2_elem : f32
885  } { __test_pipelining_loop__ }
886  memref.store %res#0, %result[%cst] : memref<?xf32>
887  return
888}
889
890// -----
891
892// CHECK-LABEL: yield_constant_loop(
893//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>) -> f32 {
894//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
895//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
896//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
897//   CHECK-DAG:   %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
898//   CHECK-DAG:   %[[CST2:.*]] = arith.constant 2.000000e+00 : f32
899// Prologue:
900//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
901// Kernel:
902//  CHECK-NEXT:   %[[L1:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
903//  CHECK-SAME:     step %[[C1]] iter_args(%[[ARG0:.*]] = %[[CST2]], %[[ARG1:.*]] = %[[L0]]) -> (f32, f32) {
904//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[ARG1]], %[[ARG0]] : f32
905//  CHECK-NEXT:     %[[MUL0:.*]] = arith.mulf %[[ADD0]], %[[CST0]] : f32
906//  CHECK-NEXT:     memref.store %[[MUL0]], %[[A]][%[[IV]]] : memref<?xf32>
907//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
908//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
909//  CHECK-NEXT:     scf.yield %[[CST0]], %[[L2]] : f32, f32
910//  CHECK-NEXT:   }
911// Epilogue:
912//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]]#1, %[[CST0]] : f32
913//  CHECK-NEXT:   %[[MUL1:.*]] = arith.mulf %[[ADD1]], %[[CST0]] : f32
914//  CHECK-NEXT:   memref.store %[[MUL1]], %[[A]][%[[C3]]] : memref<?xf32>
915//  CHECK-NEXT:   return %[[L1]]#0 : f32
916
// Test input: the loop yields a value defined outside of it.
// The loop runs from 0 to 4 with step 1 and carries one f32 (%arg0, initially
// 2.0) that feeds the stage-1 addf. The yielded value is the constant %cf0
// (0.0), not a value computed in the loop body. The load is stage 0; the addf,
// mulf and store are stage 1.
917func.func @yield_constant_loop(%A: memref<?xf32>) -> f32 {
918  %c0 = arith.constant 0 : index
919  %c1 = arith.constant 1 : index
920  %c4 = arith.constant 4 : index
921  %cf0 = arith.constant 0.0 : f32
922  %cf2 = arith.constant 2.0 : f32
923  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf2) -> f32 {
924    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32>
925    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
926    %A2_elem = arith.mulf %cf0, %A1_elem { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
927    memref.store %A2_elem, %A[%i0] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 2 } : memref<?xf32>
928    scf.yield %cf0: f32
929  }  { __test_pipelining_loop__ }
930  return %r : f32
931}
932
933// -----
934
// Negative test: the store is assigned to stage 1 while the addf that produces
// its operand is assigned to stage 2, so the consumer is scheduled in an
// earlier stage than its producer. The diagnostic is expected on the store.
935func.func @invalid_schedule(%A: memref<?xf32>, %result: memref<?xf32>) {
936  %c0 = arith.constant 0 : index
937  %c1 = arith.constant 1 : index
938  %c4 = arith.constant 4 : index
939  %cf = arith.constant 1.0 : f32
940  scf.for %i0 = %c0 to %c4 step %c1 {
941    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
942    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
943    // expected-error@+1 {{operation scheduled before its operands}}
944    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : memref<?xf32>
945  }  { __test_pipelining_loop__ }
946  return
947}
948
949// -----
950
// Negative test: the stage-0 load reads the loop-carried %idx, whose next
// value (%idx1) is computed by an addi assigned to stage 1. The diagnostic is
// expected on the load.
// NOTE(review): %cf is defined but not referenced by any op in this function.
951func.func @invalid_schedule2(%A: memref<?xf32>, %result: memref<?xf32>) {
952  %c0 = arith.constant 0 : index
953  %c1 = arith.constant 1 : index
954  %c4 = arith.constant 4 : index
955  %cf = arith.constant 1.0 : f32
956  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%idx = %c0) -> (index) {
957    // expected-error@+1 {{operation scheduled before its operands}}
958    %A_elem = memref.load %A[%idx] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 0 } : memref<?xf32>
959    %idx1 = arith.addi %idx, %c1 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : index
960    memref.store %A_elem, %result[%idx] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 2 } : memref<?xf32>
961    scf.yield %idx1 : index
962  }  { __test_pipelining_loop__ }
963  return
964}
965
966// -----
967
// Negative test: same kind of violation as a direct operand, but the
// loop-carried %idx is only read inside the regions of the stage-0 scf.if
// (both branches), not as an operand of the scf.if itself. The next value of
// %idx (%idx2) is computed by an addi assigned to stage 1. The diagnostic is
// expected on the scf.if.
968func.func @invalid_schedule3(%A: memref<?xf32>, %result: memref<?xf32>, %ext: index) {
969  %c0 = arith.constant 0 : index
970  %c1 = arith.constant 1 : index
971  %c4 = arith.constant 4 : index
972  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%idx = %c0) -> (index) {
973    %cnd = arith.cmpi slt, %ext, %c4 { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 0 } : index
974    // expected-error@+1 {{operation scheduled before its operands}}
975    %idx1 = scf.if %cnd -> (index) {
976      %idxinc = arith.addi %idx, %c1 : index
977      scf.yield %idxinc : index
978    } else {
979      scf.yield %idx : index
980    } { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 }
981    %A_elem = memref.load %A[%idx1] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
982    %idx2 = arith.addi %idx1, %c1 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 3 } : index
983    memref.store %A_elem, %result[%idx1] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 4 } : memref<?xf32>
984    scf.yield %idx2 : index
985  }  { __test_pipelining_loop__ }
986  return
987}
988