xref: /llvm-project/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir (revision bb6d5c220004a5d7e466a669324001285a688918)
1e4384149SOleksandr "Alex" Zinenko// RUN: mlir-opt %s -transform-interpreter -split-input-file --verify-diagnostics | FileCheck %s
2db393288SMatthias Springer
3db393288SMatthias Springer// Check that we produce async copies from the vector.transfer_xxx operations.
4db393288SMatthias Springerbuiltin.module {
  // Payload: two transfer_read/transfer_write pairs (4 x f32, then 1 x f32)
  // that copy from %a into a buffer in the workgroup address space. The
  // FileCheck directives require one async copy per pair, a single group that
  // takes both copies, and a wait on that group.
5db393288SMatthias Springer  // CHECK-LABEL: @copies_to_asyncs
6db393288SMatthias Springer  func.func @copies_to_asyncs(%a: memref<1024x1024xf32>) {
7db393288SMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
8db393288SMatthias Springer    %c0 = arith.constant 0 : index
9db393288SMatthias Springer    %c4 = arith.constant 4 : index
10db393288SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
11db393288SMatthias Springer    // Make sure we emit the bypassL1.
12db393288SMatthias Springer    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4  {bypassL1} :
13db393288SMatthias Springer    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
14db393288SMatthias Springer    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // The negative directive below sits between the two copy patterns, so no
    // group may be created after the first copy and before the second one.
15db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_create_group
16db393288SMatthias Springer
    // The pattern for the 1-element copy ends right after the element count,
    // so it does not constrain whether bypassL1 is present on that op.
    // NOTE(review): presumably the bypass only applies to the 4 x f32 copy --
    // confirm against the transform implementation.
17db393288SMatthias Springer    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
18db393288SMatthias Springer    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
19db393288SMatthias Springer    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
20db393288SMatthias Springer    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
21db393288SMatthias Springer    // CHECK: nvgpu.device_async_wait %[[G]]
22db393288SMatthias Springer    return
23db393288SMatthias Springer  }
24db393288SMatthias Springer
  // Transform script: match every func.func under the root op and run
  // create_async_groups on the matches with the bypass_l1 attribute set.
25e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
26e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
27db393288SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
28db393288SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
29e4384149SOleksandr "Alex" Zinenko      transform.yield
30e4384149SOleksandr "Alex" Zinenko    }
31db393288SMatthias Springer  }
32db393288SMatthias Springer}
33db393288SMatthias Springer
34db393288SMatthias Springer// -----
35db393288SMatthias Springer
36db393288SMatthias Springer// Check that we properly take `bypass_l1 = false` into account.
37db393288SMatthias Springer// I.e., we shouldn't be generating bypassL1 attributes.
38db393288SMatthias Springerbuiltin.module {
  // Payload: a 4 x f32 and a 1 x f32 transfer_read/transfer_write pair into a
  // workgroup-address-space buffer. The transform op at the bottom of this
  // module is invoked without the bypass_l1 attribute.
39db393288SMatthias Springer  // CHECK-LABEL: @copies_to_asyncs_no_mma
40db393288SMatthias Springer  func.func @copies_to_asyncs_no_mma(%a: memref<1024x1024xf32>) {
41db393288SMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
42db393288SMatthias Springer    %c0 = arith.constant 0 : index
43db393288SMatthias Springer    %c4 = arith.constant 4 : index
44db393288SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
45db393288SMatthias Springer    // Make sure we don't emit the bypassL1.
    // Both copy patterns require " :" directly after the element count, which
    // leaves no room for an attribute dictionary on the matched op.
46db393288SMatthias Springer    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
47db393288SMatthias Springer    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
48db393288SMatthias Springer    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // No group may be created between the first and the second copy.
49db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_create_group
50db393288SMatthias Springer
51db393288SMatthias Springer    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
52db393288SMatthias Springer    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
53db393288SMatthias Springer    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // Both copies must feed one group, and that group must be waited on.
54db393288SMatthias Springer    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
55db393288SMatthias Springer    // CHECK: nvgpu.device_async_wait %[[G]]
56db393288SMatthias Springer    return
57db393288SMatthias Springer  }
58db393288SMatthias Springer
  // Transform script: match every func.func under the root op and run
  // create_async_groups on the matches; no bypass_l1 attribute is passed.
59e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
60e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
61db393288SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
62db393288SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
63e4384149SOleksandr "Alex" Zinenko      transform.yield
64e4384149SOleksandr "Alex" Zinenko    }
65db393288SMatthias Springer  }
66db393288SMatthias Springer}
67db393288SMatthias Springer
68db393288SMatthias Springer// -----
69db393288SMatthias Springer
70db393288SMatthias Springer// Check that pattern works with vector.load/vector.store.
71db393288SMatthias Springerbuiltin.module {
  // Payload: the copies are written with vector.load/vector.store instead of
  // transfer ops (4 x f32, then 1 x f32) into a workgroup-address-space buffer.
  // The directives require two async copies without an attribute dictionary,
  // one group taking both, and a wait on that group.
72db393288SMatthias Springer  // CHECK-LABEL: @copies_to_asyncs_load_store
73db393288SMatthias Springer  func.func @copies_to_asyncs_load_store(%a: memref<1024x1024xf32>) {
74db393288SMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
75db393288SMatthias Springer    %c0 = arith.constant 0 : index
76db393288SMatthias Springer    %c4 = arith.constant 4 : index
    // NOTE(review): %cst_0 is not referenced by any op in this function.
77db393288SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
78db393288SMatthias Springer    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
79db393288SMatthias Springer    %1 = vector.load %a[%c0, %c0] : memref<1024x1024xf32>, vector<4xf32>
80db393288SMatthias Springer    vector.store %1, %0[%c0, %c0, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<4xf32>
    // No group may be created between the first and the second copy.
81db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_create_group
82db393288SMatthias Springer
83db393288SMatthias Springer    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
84db393288SMatthias Springer    %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
85db393288SMatthias Springer    vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
86db393288SMatthias Springer    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
87db393288SMatthias Springer    // CHECK: nvgpu.device_async_wait %[[G]]
88db393288SMatthias Springer    return
89db393288SMatthias Springer  }
90db393288SMatthias Springer
  // Transform script: match every func.func under the root op and run
  // create_async_groups on the matches; no bypass_l1 attribute is passed.
91e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
92e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
93db393288SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
94db393288SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
95e4384149SOleksandr "Alex" Zinenko      transform.yield
96e4384149SOleksandr "Alex" Zinenko    }
97db393288SMatthias Springer  }
98db393288SMatthias Springer}
99db393288SMatthias Springer
100db393288SMatthias Springer// -----
101db393288SMatthias Springer
102db393288SMatthias Springer// Check that pattern skips unaligned and unsupported sizes.
103db393288SMatthias Springerbuiltin.module {
  // Negative test: both load/store pairs below must be left untouched, i.e.
  // no async copy and no async group may show up in the output.
  // NOTE(review): the function name is the same as in the preceding section.
  // The RUN line uses -split-input-file, so the two definitions live in
  // separate modules, but the label directive is then not unique in the file.
104db393288SMatthias Springer  // CHECK-LABEL: @copies_to_asyncs_load_store
105db393288SMatthias Springer  func.func @copies_to_asyncs_load_store(%a: memref<1024x1024xf32>, %b: memref<1024x1024xf16>) {
106db393288SMatthias Springer    %alloc = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
107db393288SMatthias Springer    %alloc_1 = memref.alloc() : memref<4x32x16xf16, #gpu.address_space<workgroup>>
108db393288SMatthias Springer    %c0 = arith.constant 0 : index
109db393288SMatthias Springer    %c4 = arith.constant 4 : index
    // NOTE(review): %cst_0 is not referenced by any op in this function.
110db393288SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
111db393288SMatthias Springer
112db393288SMatthias Springer    // Requires 1-D vector load
    // Case 1: the loaded/stored vector is 2-D (2x2 x f32), so the original
    // vector.load and vector.store are required to remain.
113db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_copy
114db393288SMatthias Springer    //     CHECK: vector.load
115db393288SMatthias Springer    //     CHECK: vector.store
116db393288SMatthias Springer    %1 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<2x2xf32>
117db393288SMatthias Springer    vector.store %1, %alloc[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<2x2xf32>
118db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_create_group
119db393288SMatthias Springer
    // Case 2: a 1-D vector holding a single f16 element.
    // NOTE(review): presumably rejected because one f16 is smaller than the
    // smallest supported copy size -- confirm against the implementation.
120db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_copy
121db393288SMatthias Springer    //     CHECK: vector.load
122db393288SMatthias Springer    //     CHECK: vector.store
123db393288SMatthias Springer    %2 = vector.load %b[%c0, %c4] : memref<1024x1024xf16>, vector<1xf16>
124db393288SMatthias Springer    vector.store %2, %alloc_1[%c0, %c4, %c0] : memref<4x32x16xf16, #gpu.address_space<workgroup>>, vector<1xf16>
125db393288SMatthias Springer    // CHECK-NOT: nvgpu.device_async_create_group
126db393288SMatthias Springer    return
127db393288SMatthias Springer  }
128db393288SMatthias Springer
  // Transform script: match every func.func under the root op and run
  // create_async_groups on the matches; no bypass_l1 attribute is passed.
129e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
130e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
131db393288SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
132db393288SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
133e4384149SOleksandr "Alex" Zinenko      transform.yield
134e4384149SOleksandr "Alex" Zinenko    }
135db393288SMatthias Springer  }
136db393288SMatthias Springer}
137db393288SMatthias Springer
138db393288SMatthias Springer// -----
139db393288SMatthias Springer
140db393288SMatthias Springer// vector.transfer_read with a mask.
141db393288SMatthias Springerbuiltin.module {
  // Payload: a masked 1-D transfer_read of 4 x f32 followed by a
  // transfer_write into a workgroup-address-space buffer. The mask comes from
  // vector.create_mask on the function argument %sz.
142db393288SMatthias Springer  // CHECK-LABEL: @read_with_mask(
143db393288SMatthias Springer  // CHECK-SAME: %{{.*}}: memref<1024x1024xf32>, %[[sz:.*]]: index
144db393288SMatthias Springer  func.func @read_with_mask(%a: memref<1024x1024xf32>, %sz: index) {
145db393288SMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
146db393288SMatthias Springer    %c0 = arith.constant 0 : index
147db393288SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
    // The async copy must carry the captured %sz argument as an extra operand
    // after the element count 4, together with the bypassL1 attribute.
148db393288SMatthias Springer    // CHECK: nvgpu.device_async_copy {{.*}}, {{.*}}, 4, %[[sz]] {bypassL1} :
149db393288SMatthias Springer    %mask = vector.create_mask %sz : vector<4xi1>
150db393288SMatthias Springer    %1 = vector.transfer_read %a[%c0, %c0], %cst_0, %mask {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
151db393288SMatthias Springer    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
152db393288SMatthias Springer
153db393288SMatthias Springer    return
154db393288SMatthias Springer  }
155db393288SMatthias Springer
  // Transform script: match every func.func under the root op and run
  // create_async_groups on the matches with the bypass_l1 attribute set.
156e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
157e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
158db393288SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
159db393288SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
160e4384149SOleksandr "Alex" Zinenko      transform.yield
161e4384149SOleksandr "Alex" Zinenko    }
162db393288SMatthias Springer  }
163db393288SMatthias Springer}
16439d8876dSMatthias Springer
16539d8876dSMatthias Springer// -----
16639d8876dSMatthias Springer
16739d8876dSMatthias Springer// 2D vector.transfer_read with a mask.
16839d8876dSMatthias Springerbuiltin.module {
  // Payload: a masked 2-D transfer_read/transfer_write of a 3x4 x f32 vector.
  // The mask comes from vector.create_mask on %sz0 (rows) and %sz1 (columns).
  // The directives require three async copies, one per row index 0, 1 and 2.
16939d8876dSMatthias Springer  // CHECK-LABEL: @read_2d_with_mask(
17039d8876dSMatthias Springer  //  CHECK-SAME:     %[[sz0:.*]]: index, %[[sz1:.*]]: index, %[[a:.*]]: memref<1024x1024xf32>
17139d8876dSMatthias Springer  func.func @read_2d_with_mask(%sz0: index, %sz1: index, %a: memref<1024x1024xf32>) {
    // The three index constants are captured order-independently; they are
    // used below as row indices, and the zero also serves as the fallback
    // value of each select.
172*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
173*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
174*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
17539d8876dSMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
17639d8876dSMatthias Springer    %c0 = arith.constant 0 : index
17739d8876dSMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
17839d8876dSMatthias Springer
    // For each row r, the extra operand of the copy is a select that yields
    // %sz1 when r < %sz0 and the zero constant otherwise.
    // Row 0:
17939d8876dSMatthias Springer    // CHECK: %[[cmpi0:.*]] = arith.cmpi slt, %[[c0]], %[[sz0]]
18039d8876dSMatthias Springer    // CHECK: %[[s0:.*]] = arith.select %[[cmpi0]], %[[sz1]], %[[c0]]
18139d8876dSMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c0]]], {{.*}}, 4, %[[s0]] {bypassL1}
18239d8876dSMatthias Springer
    // Row 1:
18339d8876dSMatthias Springer    // CHECK: %[[cmpi1:.*]] = arith.cmpi slt, %[[c1]], %[[sz0]]
18439d8876dSMatthias Springer    // CHECK: %[[s1:.*]] = arith.select %[[cmpi1]], %[[sz1]], %[[c0]]
18539d8876dSMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c0]]], {{.*}}, 4, %[[s1]] {bypassL1}
18639d8876dSMatthias Springer
    // Row 2:
18739d8876dSMatthias Springer    // CHECK: %[[cmpi2:.*]] = arith.cmpi slt, %[[c2]], %[[sz0]]
18839d8876dSMatthias Springer    // CHECK: %[[s2:.*]] = arith.select %[[cmpi2]], %[[sz1]], %[[c0]]
18939d8876dSMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c2]], %[[c0]]], {{.*}}, 4, %[[s2]] {bypassL1}
19039d8876dSMatthias Springer    %mask = vector.create_mask %sz0, %sz1 : vector<3x4xi1>
19139d8876dSMatthias Springer    %1 = vector.transfer_read %a[%c0, %c0], %cst_0, %mask {in_bounds = [true, true]} : memref<1024x1024xf32>, vector<3x4xf32>
19239d8876dSMatthias Springer    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<3x4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
19339d8876dSMatthias Springer
19439d8876dSMatthias Springer    return
19539d8876dSMatthias Springer  }
19639d8876dSMatthias Springer
  // Transform script, in order:
  //   1. apply the transfer_to_scf patterns (max_transfer_rank = 1,
  //      full_unroll = true) to the matched functions,
  //   2. run create_async_groups with the bypass_l1 attribute set,
  //   3. match the functions again and run CSE on them.
197e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
198e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
19939d8876dSMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
20039d8876dSMatthias Springer      transform.apply_patterns to %top_level_func {
20139d8876dSMatthias Springer        transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
20239d8876dSMatthias Springer      } : !transform.any_op
20339d8876dSMatthias Springer      transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
20439d8876dSMatthias Springer      %top_level_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
20539d8876dSMatthias Springer      transform.apply_cse to %top_level_func_2 : !transform.any_op
206e4384149SOleksandr "Alex" Zinenko      transform.yield
207e4384149SOleksandr "Alex" Zinenko    }
20839d8876dSMatthias Springer  }
20939d8876dSMatthias Springer}
21015ea2306SMatthias Springer
21115ea2306SMatthias Springer// -----
21215ea2306SMatthias Springer
21315ea2306SMatthias Springer// 3D vector.transfer_read with a mask.
21415ea2306SMatthias Springerbuiltin.module {
  // Payload: a masked 3-D transfer_read/transfer_write of a 2x3x4 x f32
  // vector. The mask comes from vector.create_mask on %sz0, %sz1 and %sz2.
  // The directives require six async copies, one per index pair (i, j) with
  // i in {0, 1} and j in {0, 1, 2}, each reading from %a at [i, j, 0].
21515ea2306SMatthias Springer  // CHECK-LABEL: @read_3d_with_mask(
21615ea2306SMatthias Springer  //  CHECK-SAME:     %[[sz0:.*]]: index, %[[sz1:.*]]: index, %[[sz2:.*]]: index, %[[a:.*]]: memref<1024x1024x1024xf32>
21715ea2306SMatthias Springer  func.func @read_3d_with_mask(%sz0: index, %sz1: index, %sz2: index, %a: memref<1024x1024x1024xf32>) {
    // The three index constants are captured order-independently; they are
    // used below as indices, and the zero also serves as the fallback value
    // of each select.
218*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
219*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
220*bb6d5c22SMatthias Springer    // CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
22115ea2306SMatthias Springer    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
22215ea2306SMatthias Springer    %c0 = arith.constant 0 : index
22315ea2306SMatthias Springer    %cst_0 = arith.constant 0.000000e+00 : f32
22415ea2306SMatthias Springer
    // For each (i, j), the extra operand of the copy is a select that yields
    // %sz2 when (j < %sz1) and (i < %sz0) both hold, and zero otherwise.
    // (i, j) = (0, 0):
22515ea2306SMatthias Springer    // CHECK: %[[cmpi0:.*]] = arith.cmpi slt, %[[c0]], %[[sz0]]
22615ea2306SMatthias Springer    // CHECK: %[[cmpi1:.*]] = arith.cmpi slt, %[[c0]], %[[sz1]]
22715ea2306SMatthias Springer    // CHECK: %[[cond0:.*]] = arith.andi %[[cmpi1]], %[[cmpi0]]
22815ea2306SMatthias Springer    // CHECK: %[[s0:.*]] = arith.select %[[cond0]], %[[sz2]], %[[c0]]
22915ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c0]], %[[c0]]], {{.*}}, 4, %[[s0]] {bypassL1}
23015ea2306SMatthias Springer
    // (i, j) = (0, 1):
23115ea2306SMatthias Springer    // CHECK: %[[cmpi2:.*]] = arith.cmpi slt, %[[c1]], %[[sz1]]
23215ea2306SMatthias Springer    // CHECK: %[[cond1:.*]] = arith.andi %[[cmpi2]], %[[cmpi0]]
23315ea2306SMatthias Springer    // CHECK: %[[s1:.*]] = arith.select %[[cond1]], %[[sz2]], %[[c0]]
23415ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c1]], %[[c0]]], {{.*}}, 4, %[[s1]] {bypassL1}
23515ea2306SMatthias Springer
    // (i, j) = (0, 2):
23615ea2306SMatthias Springer    // CHECK: %[[cmpi3:.*]] = arith.cmpi slt, %[[c2]], %[[sz1]]
23715ea2306SMatthias Springer    // CHECK: %[[cond2:.*]] = arith.andi %[[cmpi3]], %[[cmpi0]]
23815ea2306SMatthias Springer    // CHECK: %[[s2:.*]] = arith.select %[[cond2]], %[[sz2]], %[[c0]]
23915ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c2]], %[[c0]]], {{.*}}, 4, %[[s2]] {bypassL1}
24015ea2306SMatthias Springer
    // (i, j) = (1, 0): only the comparison against %sz0 is new; the %sz1
    // comparisons captured above are reused for i = 1 (presumably thanks to
    // the CSE step at the end of the transform script).
24115ea2306SMatthias Springer    // CHECK: %[[cmpi4:.*]] = arith.cmpi slt, %[[c1]], %[[sz0]]
24215ea2306SMatthias Springer    // CHECK: %[[cond3:.*]] = arith.andi %[[cmpi1]], %[[cmpi4]]
24315ea2306SMatthias Springer    // CHECK: %[[s3:.*]] = arith.select %[[cond3]], %[[sz2]], %[[c0]]
24415ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c0]], %[[c0]]], {{.*}}, 4, %[[s3]] {bypassL1}
24515ea2306SMatthias Springer
    // (i, j) = (1, 1):
24615ea2306SMatthias Springer    // CHECK: %[[cond4:.*]] = arith.andi %[[cmpi2]], %[[cmpi4]]
24715ea2306SMatthias Springer    // CHECK: %[[s4:.*]] = arith.select %[[cond4]], %[[sz2]], %[[c0]]
24815ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c1]], %[[c0]]], {{.*}}, 4, %[[s4]] {bypassL1}
24915ea2306SMatthias Springer
    // (i, j) = (1, 2):
25015ea2306SMatthias Springer    // CHECK: %[[cond5:.*]] = arith.andi %[[cmpi3]], %[[cmpi4]]
25115ea2306SMatthias Springer    // CHECK: %[[s5:.*]] = arith.select %[[cond5]], %[[sz2]], %[[c0]]
25215ea2306SMatthias Springer    // CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c2]], %[[c0]]], {{.*}}, 4, %[[s5]] {bypassL1}
25315ea2306SMatthias Springer    %mask = vector.create_mask %sz0, %sz1, %sz2 : vector<2x3x4xi1>
25415ea2306SMatthias Springer    %1 = vector.transfer_read %a[%c0, %c0, %c0], %cst_0, %mask {in_bounds = [true, true, true]} : memref<1024x1024x1024xf32>, vector<2x3x4xf32>
25515ea2306SMatthias Springer    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<2x3x4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
25615ea2306SMatthias Springer
25715ea2306SMatthias Springer    return
25815ea2306SMatthias Springer  }
25915ea2306SMatthias Springer
  // Transform script, in order:
  //   1. apply the transfer_to_scf patterns (max_transfer_rank = 1,
  //      full_unroll = true) to the matched functions,
  //   2. run create_async_groups with the bypass_l1 attribute set,
  //   3. match the functions again and run CSE on them.
260e4384149SOleksandr "Alex" Zinenko  module attributes {transform.with_named_sequence} {
261e4384149SOleksandr "Alex" Zinenko    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
26215ea2306SMatthias Springer      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
26315ea2306SMatthias Springer      transform.apply_patterns to %top_level_func {
26415ea2306SMatthias Springer        transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
26515ea2306SMatthias Springer      } : !transform.any_op
26615ea2306SMatthias Springer      transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
26715ea2306SMatthias Springer      %top_level_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
26815ea2306SMatthias Springer      transform.apply_cse to %top_level_func_2 : !transform.any_op
269e4384149SOleksandr "Alex" Zinenko      transform.yield
270e4384149SOleksandr "Alex" Zinenko    }
27115ea2306SMatthias Springer  }
27215ea2306SMatthias Springer}
273