// RUN: mlir-opt %s -transform-interpreter -split-input-file --verify-diagnostics | FileCheck %s

// Check that we produce async copies from the vector.transfer_xxx operations.
builtin.module {
  // The trailing `(` anchors the label to this function only; without it the
  // label text is a prefix of the longer `@copies_to_asyncs_*` names that
  // appear later in the output.
  // CHECK-LABEL: @copies_to_asyncs(
  func.func @copies_to_asyncs(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Make sure we emit the bypassL1.
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 {bypassL1} :
    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // No group may be created between the two copies: both are expected to
    // end up in the single group verified below.
    // CHECK-NOT: nvgpu.device_async_create_group

    // Second copy: a single f32 element.
    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // One group holding both copy tokens, followed by a wait on that group.
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
      transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
      transform.yield
    }
  }
}

// -----

// Check that we properly take `bypass_l1 = false` into account.
// I.e., we shouldn't be generating bypassL1 attributes.
builtin.module {
  // CHECK-LABEL: @copies_to_asyncs_no_mma(
  func.func @copies_to_asyncs_no_mma(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Make sure we don't emit the bypassL1.
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK-NOT: nvgpu.device_async_create_group

    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

  // Same transform as the previous case, but without the `bypass_l1` unit
  // attribute on create_async_groups.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
      transform.yield
    }
  }
}

// -----

// Check that pattern works with vector.load/vector.store.
builtin.module {
  // The trailing `(` keeps this label from also matching the longer
  // `@copies_to_asyncs_load_store_unsupported` name used by the next case.
  // CHECK-LABEL: @copies_to_asyncs_load_store(
  func.func @copies_to_asyncs_load_store(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
    %1 = vector.load %a[%c0, %c0] : memref<1024x1024xf32>, vector<4xf32>
    vector.store %1, %0[%c0, %c0, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<4xf32>
    // CHECK-NOT: nvgpu.device_async_create_group

    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
    %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
    vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
      transform.yield
    }
  }
}

// -----

// Check that pattern skips unaligned and unsupported sizes.
builtin.module {
  // This function has its own name (distinct from the positive load/store
  // case) so that each label identifies exactly one function in the output.
  // CHECK-LABEL: @copies_to_asyncs_load_store_unsupported(
  func.func @copies_to_asyncs_load_store_unsupported(%a: memref<1024x1024xf32>, %b: memref<1024x1024xf16>) {
    %alloc = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %alloc_1 = memref.alloc() : memref<4x32x16xf16, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index

    // Requires 1-D vector load
    // CHECK-NOT: nvgpu.device_async_copy
    // CHECK: vector.load
    // CHECK: vector.store
    %1 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<2x2xf32>
    vector.store %1, %alloc[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<2x2xf32>
    // CHECK-NOT: nvgpu.device_async_create_group

    // Single f16 element. NOTE(review): presumably left untouched because a
    // 2-byte copy is not a supported async copy size - confirm against the
    // pattern implementation.
    // CHECK-NOT: nvgpu.device_async_copy
    // CHECK: vector.load
    // CHECK: vector.store
    %2 = vector.load %b[%c0, %c4] : memref<1024x1024xf16>, vector<1xf16>
    vector.store %2, %alloc_1[%c0, %c4, %c0] : memref<4x32x16xf16, #gpu.address_space<workgroup>>, vector<1xf16>
    // CHECK-NOT: nvgpu.device_async_create_group
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
      %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
      transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
      transform.yield
    }
  }
}

// -----

// vector.transfer_read with a mask.
builtin.module {
  // CHECK-LABEL: @read_with_mask(
  // CHECK-SAME: %{{.*}}: memref<1024x1024xf32>, %[[SIZE:.*]]: index
  func.func @read_with_mask(%src: memref<1024x1024xf32>, %size: index) {
    %smem = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %zero = arith.constant 0 : index
    %pad = arith.constant 0.000000e+00 : f32
    // The mask size is expected to show up as the trailing operand of the
    // async copy.
    // CHECK: nvgpu.device_async_copy {{.*}}, {{.*}}, 4, %[[SIZE]] {bypassL1} :
    %m = vector.create_mask %size : vector<4xi1>
    %vec = vector.transfer_read %src[%zero, %zero], %pad, %m {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %vec, %smem[%zero, %zero, %zero] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>

    return
  }

  // Run create_async_groups with bypass_l1 on every func.func in the payload.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
      %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
      transform.nvgpu.create_async_groups %func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
      transform.yield
    }
  }
}

// -----

// 2D vector.transfer_read with a mask.
builtin.module {
  // CHECK-LABEL: @read_2d_with_mask(
  // CHECK-SAME: %[[SZ0:.*]]: index, %[[SZ1:.*]]: index, %[[SRC:.*]]: memref<1024x1024xf32>
  func.func @read_2d_with_mask(%dim0: index, %dim1: index, %src: memref<1024x1024xf32>) {
    // Index constants referenced by the expectations below; they may be
    // printed in any order.
    // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
    // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
    %smem = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %zero = arith.constant 0 : index
    %pad = arith.constant 0.000000e+00 : f32

    // Copy at source indices [0, 0].
    // CHECK: %[[CMP0:.*]] = arith.cmpi slt, %[[C0]], %[[SZ0]]
    // CHECK: %[[SEL0:.*]] = arith.select %[[CMP0]], %[[SZ1]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C0]], %[[C0]]], {{.*}}, 4, %[[SEL0]] {bypassL1}

    // Copy at source indices [1, 0].
    // CHECK: %[[CMP1:.*]] = arith.cmpi slt, %[[C1]], %[[SZ0]]
    // CHECK: %[[SEL1:.*]] = arith.select %[[CMP1]], %[[SZ1]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C1]], %[[C0]]], {{.*}}, 4, %[[SEL1]] {bypassL1}

    // Copy at source indices [2, 0].
    // CHECK: %[[CMP2:.*]] = arith.cmpi slt, %[[C2]], %[[SZ0]]
    // CHECK: %[[SEL2:.*]] = arith.select %[[CMP2]], %[[SZ1]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C2]], %[[C0]]], {{.*}}, 4, %[[SEL2]] {bypassL1}
    %m = vector.create_mask %dim0, %dim1 : vector<3x4xi1>
    %vec = vector.transfer_read %src[%zero, %zero], %pad, %m {in_bounds = [true, true]} : memref<1024x1024xf32>, vector<3x4xf32>
    vector.transfer_write %vec, %smem[%zero, %zero, %zero] {in_bounds = [true, true]} : vector<3x4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>

    return
  }

  // Pipeline: apply the transfer_to_scf patterns (max rank 1, full unroll),
  // then create_async_groups with bypass_l1, then CSE on a freshly matched
  // function handle.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
      %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
      transform.apply_patterns to %func {
        transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
      } : !transform.any_op
      transform.nvgpu.create_async_groups %func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
      %func_for_cse = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
      transform.apply_cse to %func_for_cse : !transform.any_op
      transform.yield
    }
  }
}

// -----

// 3D vector.transfer_read with a mask.
builtin.module {
  // CHECK-LABEL: @read_3d_with_mask(
  // CHECK-SAME: %[[SZ0:.*]]: index, %[[SZ1:.*]]: index, %[[SZ2:.*]]: index, %[[SRC:.*]]: memref<1024x1024x1024xf32>
  func.func @read_3d_with_mask(%dim0: index, %dim1: index, %dim2: index, %src: memref<1024x1024x1024xf32>) {
    // Index constants referenced by the expectations below; they may be
    // printed in any order.
    // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
    // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
    %smem = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %zero = arith.constant 0 : index
    %pad = arith.constant 0.000000e+00 : f32

    // Copy at source indices [0, 0, 0].
    // CHECK: %[[CMP0:.*]] = arith.cmpi slt, %[[C0]], %[[SZ0]]
    // CHECK: %[[CMP1:.*]] = arith.cmpi slt, %[[C0]], %[[SZ1]]
    // CHECK: %[[AND0:.*]] = arith.andi %[[CMP1]], %[[CMP0]]
    // CHECK: %[[SEL0:.*]] = arith.select %[[AND0]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], {{.*}}, 4, %[[SEL0]] {bypassL1}

    // Copy at source indices [0, 1, 0].
    // CHECK: %[[CMP2:.*]] = arith.cmpi slt, %[[C1]], %[[SZ1]]
    // CHECK: %[[AND1:.*]] = arith.andi %[[CMP2]], %[[CMP0]]
    // CHECK: %[[SEL1:.*]] = arith.select %[[AND1]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C0]], %[[C1]], %[[C0]]], {{.*}}, 4, %[[SEL1]] {bypassL1}

    // Copy at source indices [0, 2, 0].
    // CHECK: %[[CMP3:.*]] = arith.cmpi slt, %[[C2]], %[[SZ1]]
    // CHECK: %[[AND2:.*]] = arith.andi %[[CMP3]], %[[CMP0]]
    // CHECK: %[[SEL2:.*]] = arith.select %[[AND2]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C0]], %[[C2]], %[[C0]]], {{.*}}, 4, %[[SEL2]] {bypassL1}

    // Copy at source indices [1, 0, 0].
    // CHECK: %[[CMP4:.*]] = arith.cmpi slt, %[[C1]], %[[SZ0]]
    // CHECK: %[[AND3:.*]] = arith.andi %[[CMP1]], %[[CMP4]]
    // CHECK: %[[SEL3:.*]] = arith.select %[[AND3]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C1]], %[[C0]], %[[C0]]], {{.*}}, 4, %[[SEL3]] {bypassL1}

    // Copy at source indices [1, 1, 0]; the comparisons captured earlier are
    // reused here.
    // CHECK: %[[AND4:.*]] = arith.andi %[[CMP2]], %[[CMP4]]
    // CHECK: %[[SEL4:.*]] = arith.select %[[AND4]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C1]], %[[C1]], %[[C0]]], {{.*}}, 4, %[[SEL4]] {bypassL1}

    // Copy at source indices [1, 2, 0].
    // CHECK: %[[AND5:.*]] = arith.andi %[[CMP3]], %[[CMP4]]
    // CHECK: %[[SEL5:.*]] = arith.select %[[AND5]], %[[SZ2]], %[[C0]]
    // CHECK: nvgpu.device_async_copy %[[SRC]][%[[C1]], %[[C2]], %[[C0]]], {{.*}}, 4, %[[SEL5]] {bypassL1}
    %m = vector.create_mask %dim0, %dim1, %dim2 : vector<2x3x4xi1>
    %vec = vector.transfer_read %src[%zero, %zero, %zero], %pad, %m {in_bounds = [true, true, true]} : memref<1024x1024x1024xf32>, vector<2x3x4xf32>
    vector.transfer_write %vec, %smem[%zero, %zero, %zero] {in_bounds = [true, true, true]} : vector<2x3x4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>

    return
  }

  // Pipeline: apply the transfer_to_scf patterns (max rank 1, full unroll),
  // then create_async_groups with bypass_l1, then CSE on a freshly matched
  // function handle.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
      %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
      transform.apply_patterns to %func {
        transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
      } : !transform.any_op
      transform.nvgpu.create_async_groups %func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
      %func_for_cse = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
      transform.apply_cse to %func_for_cse : !transform.any_op
      transform.yield
    }
  }
}