// RUN: mlir-opt %s --transform-interpreter -canonicalize --split-input-file --verify-diagnostics | FileCheck %s

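// Pipelining the copy to shared memory without peeling the epilogue requires
// predicating the loop body; vector.transfer_read/write cannot be predicated,
// so the transform is expected to fail with a diagnostic.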
func.func @simple_depth_2_unpeeled(%global: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // Predication is not currently implemented for transfer_read/write, so this is expected to fail.
  // expected-note @below {{couldn't predicate}}
  scf.for %i = %c0 to %c100 step %c4 iter_args(%accum = %c0f) -> f32 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    %0 = arith.addf %accum, %accum : f32
    scf.yield %0 : f32
  }
  return
}

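// The transform script below matches the single scf.for above and requests a
// two-stage pipeline without peeling the epilogue; since the loop body cannot
// be predicated, this fails with the expected "irreversible pipelining failure".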
!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    // expected-error @below {{irreversible pipelining failure}}
    // expected-note @below {{try setting "peel_epilogue"}}
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}

// -----

// The loop pipeliner is tested separately; here we only verify the overall shape of the IR.
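// With depth = 2 and peel_epilogue, the first two transfer_reads are hoisted
// into the prologue and carried as iter_args, the loop body writes a value
// read two iterations earlier, and the last two writes and calls form the
// epilogue emitted after the loop.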

func.func private @body(index, memref<?xf32, #gpu.address_space<workgroup>>)

// CHECK-LABEL: @simple_depth_2_peeled
// CHECK-SAME: %[[ARG:.+]]: memref
func.func @simple_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c200 = arith.constant 200 : index
  %c4 = arith.constant 4 : index
  // CHECK: memref.alloc
  %shared = memref.alloc(%c200) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[LOADED1:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOADED2:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[IA1:.+]] = %[[LOADED1]], %[[IA2:.+]] = %[[LOADED2]])
  // CHECK:   vector.transfer_write %[[IA1]]
  // CHECK:   func.call @body
  // CHECK:   %[[LOCAL_LOADED:.+]] = vector.transfer_read %[[ARG]]
  // CHECK:   scf.yield %[[IA2]], %[[LOCAL_LOADED]]
  scf.for %i = %c0 to %c100 step %c4 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    func.call @body(%i, %shared) : (index, memref<?xf32, #gpu.address_space<workgroup>>) -> ()
  }
  // CHECK: vector.transfer_write %[[LOOP]]#0
  // CHECK: call @body
  // CHECK: vector.transfer_write %[[LOOP]]#1
  // CHECK: call @body
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}

// -----

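// nvgpu.device_async_copy takes the number of source elements to read as an
// SSA operand, so the pipeliner can predicate the copies instead of peeling an
// epilogue: for the iterations past the loop bound, a generated arith.select
// forces that element count to zero.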
// CHECK-LABEL: @async_depth_2_predicated
// CHECK-SAME: %[[GLOBAL:.+]]: memref
func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  // CHECK-DAG: %[[C4:.+]] = arith.constant 4
  // CHECK-DAG:   %[[C90:.+]] = arith.constant 90
  // CHECK-DAG:   %[[C96:.+]] = arith.constant 96
  // CHECK-DAG:   %[[C8:.+]] = arith.constant 8
  // CHECK-DAG:   %[[C2:.+]] = arith.constant 2
  // CHECK-DAG:   %[[C0:.+]] = arith.constant 0
  %c4 = arith.constant 4 : index
  // CHECK: %[[SHARED:.+]] = memref.alloc{{.*}} #gpu.address_space<workgroup>
  %shared = memref.alloc(%alloc_size) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
  // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
  // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
  // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
  // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
  scf.for %i = %c0 to %c98 step %c4 {
    // Condition for the predication "select" below.
    // CHECK:   %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
    // CHECK:   nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
    // Original "select" with updated induction variable.
    // CHECK:   %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK:   %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
    // CHECK:   %[[SELECTED0:.+]] = arith.select %[[CMP1]], %[[C4]], %[[C2]]
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index

    // Updated induction variables (two more) for the device_async_copy below.
    // These are generated repeatedly by the pipeliner.
    // CHECK:   %[[I_PLUS_8_2:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK:   %[[I_PLUS_8_3:.+]] = arith.addi %[[I]], %[[C8]]

    // The second "select" is generated by predication and selects 0 for
    // the last two iterations.
    // CHECK:   %[[SELECTED1:.+]] = arith.select %[[CMP0]], %[[SELECTED0]], %[[C0]]
    // CHECK:   %[[ASYNC_TOKEN:.+]] = nvgpu.device_async_copy %[[GLOBAL]][%[[I_PLUS_8_3]]], %[[SHARED]][%[[I_PLUS_8_2]]], 4, %[[SELECTED1]]
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>

    nvgpu.device_async_wait %token

    // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
  }
  // There is no need to wait for the last copies as they were fully predicated
  // out and do not load the original data.
  // CHECK-NOT: nvgpu.device_async_wait
  return
}


!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}

// -----

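// Same async copies, but with peel_epilogue: the copies still in flight after
// the last iteration are drained by device_async_wait ops emitted after the
// loop rather than being predicated away.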
// CHECK-LABEL: @async_depth_2_peeled
func.func @async_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: nvgpu.device_async_copy
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.for
  // CHECK:   nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK:   arith.select
  // CHECK:   nvgpu.device_async_copy
  // CHECK:   scf.yield
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
  scf.for %i = %c0 to %c98 step %c4 {
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
    nvgpu.device_async_wait %token
  }
  return
}


!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}