// RUN: mlir-opt %s | FileCheck %s
// Verify the printed output can be parsed.
// RUN: mlir-opt %s | mlir-opt | FileCheck %s
// Verify the generic form can be parsed.
// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s

// CHECK-LABEL: gpu.module @test {
gpu.module @test {
// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  gpu.return
}

// CHECK: gpu.func @test_create_nd_tdesc_with_sg_map(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_with_sg_map(%src: memref<24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
      !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  gpu.return
}

// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
  //CHECK: %[[C:.*]] = arith.constant 1 : index
  %c1 = arith.constant 1 : index
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
  gpu.return
}

// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
  gpu.return
}

// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  gpu.return
}

// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
  gpu.return
}
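
// A tensor_desc may carry a block_tdesc_attr and an sg_map together; the test
// below combines array_length = 2 with a [1, 16] work-item layout.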
// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  gpu.return
}

// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
  gpu.return
}

// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
  %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
  gpu.return
}

// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
  gpu.return
}

// load_nd args may have different shapes, validated against sg_map
// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
      !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
  gpu.return
}
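
// store_nd is the write counterpart of load_nd: it stores a vector value
// through a tensor_desc, with the same cache-hint attributes.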
// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
  %1 = arith.constant dense<1.0>: vector<24x32xf16>
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
  gpu.return
}

// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
  %1 = arith.constant dense<1.0>: vector<32xf16>
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
  gpu.return
}

// store_nd args may have different shapes, validated against sg_map
// CHECK: func @test_store_nd_vc_3(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_store_nd_vc_3(%src: memref<24x32xf16>) {
  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x2xf16>
  %1 = arith.constant dense<1.0>: vector<24x2xf16>
  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
      !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
  gpu.return
}

// CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
  gpu.return
}

// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
gpu.func @test_create_tdesc_vc(%src: ui64) {
  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  gpu.return
}
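
// create_tdesc also accepts a memref source; below, the descriptor targets
// shared local memory (memory_space = slm) to match the memref's address
// space 3.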
// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>>
  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
  gpu.return
}

// CHECK: gpu.func @test_create_tdesc_vc_with_sg_map(%[[arg0:.*]]: ui64) {
gpu.func @test_create_tdesc_vc_with_sg_map(%src: ui64) {
  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
  gpu.return
}

// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
gpu.func @test_prefetch_vc(%src: ui64) {
  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  gpu.return
}

// CHECK: gpu.func @test_load_gather_vc(%[[arg0:.*]]: ui64) {
gpu.func @test_load_gather_vc(%src: ui64) {
  //CHECK: %[[cst:.*]] = arith.constant dense<true> : vector<4xi1>
  %0 = arith.constant dense<1>: vector<4xi1>
  //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  %1 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
  //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2x4xf32>
  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
       : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2x4xf32>
  gpu.return
}
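
// The scattered store below mirrors the gather above: the vector<4xi1> mask
// selects active lanes, and transpose pairs the vector<2x4xf32> payload with
// the 4x2 descriptor, as in the load.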
// CHECK: gpu.func @test_store_scatter_vc(%[[arg0:.*]]: ui64) {
gpu.func @test_store_scatter_vc(%src: ui64) {
  //CHECK: %[[c0:.*]] = arith.constant dense<true> : vector<4xi1>
  %0 = arith.constant dense<1>: vector<4xi1>
  //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32>
  %1 = arith.constant dense<2.9>: vector<2x4xf32>
  //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  %2 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
  //CHECK-SAME: vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
  xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
       : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
  gpu.return
}

// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_vc(%src: ui64) {
  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
  //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xindex>
  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
  gpu.return
}

// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
  // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
  %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
  gpu.return
}

// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
  // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
  %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
  gpu.return
}
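
// atomic_rmw applies a masked read-modify-write (addf below) through a
// scattered descriptor created with default scatter_tdesc_attr values.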
// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
  //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
  %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
  %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
  //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
  xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
  gpu.return
}

// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
gpu.func @alloc_nbarrier() {
  // CHECK: xegpu.alloc_nbarrier
  xegpu.alloc_nbarrier 8
  gpu.return
}

// CHECK: gpu.func @init_nbarrier({{.*}}) {
gpu.func @init_nbarrier() {
  //CHECK: %[[c1:.*]] = arith.constant 1 : i8
  //CHECK: %[[c16:.*]] = arith.constant 16 : i8
  %nbarrier_id = arith.constant 1 : i8
  %threads_count = arith.constant 16 : i8
  //CHECK: xegpu.init_nbarrier %[[c1]], %[[c16]] : i8, i8 -> !xegpu.nbarrier
  %nbarrier = xegpu.init_nbarrier %nbarrier_id, %threads_count : i8, i8 -> !xegpu.nbarrier
  gpu.return
}

// CHECK: gpu.func @nbarrier_arrive(%[[arg0:.*]]: !xegpu.nbarrier) {
gpu.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
  //CHECK: xegpu.nbarrier_arrive %[[arg0]] : !xegpu.nbarrier
  xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
  gpu.return
}

// CHECK: gpu.func @nbarrier_wait(%[[arg0:.*]]: !xegpu.nbarrier) {
gpu.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
  //CHECK: xegpu.nbarrier_wait %[[arg0]] : !xegpu.nbarrier
  xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
  gpu.return
}

// CHECK-LABEL: gpu.func @fence({{.*}}) {
gpu.func @fence() {
  //CHECK: xegpu.fence memory_kind = global, fence_scope = workgroup
  xegpu.fence memory_kind = global, fence_scope = workgroup
  gpu.return
}

}