1// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \ 2// RUN: --test-vector-warp-distribute=rewrite-warp-ops-to-scf-if | FileCheck %s --check-prefix=CHECK-SCF-IF 3 4// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \ 5// RUN: --test-vector-warp-distribute="hoist-uniform" | FileCheck --check-prefixes=CHECK-HOIST %s 6 7// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \ 8// RUN: --test-vector-warp-distribute="hoist-uniform distribute-transfer-write max-transfer-write-elements=4" \ 9// RUN: | FileCheck --check-prefixes=CHECK-D %s 10 11// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \ 12// RUN: --test-vector-warp-distribute=propagate-distribution --canonicalize \ 13// RUN: | FileCheck --check-prefixes=CHECK-PROP %s 14 15// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \ 16// RUN: --test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \ 17// RUN: --canonicalize | FileCheck --check-prefixes=CHECK-DIST-AND-PROP %s 18 19// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)> 20// CHECK-SCF-IF-DAG: #[[$TIMES4:.*]] = affine_map<()[s0] -> (s0 * 4)> 21// CHECK-SCF-IF-DAG: #[[$TIMES8:.*]] = affine_map<()[s0] -> (s0 * 8)> 22// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3> 23// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3> 24// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3> 25// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3> 26 27// CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if( 28// CHECK-SCF-IF-SAME: %[[laneid:.*]]: index, 29// CHECK-SCF-IF-SAME: %[[v0:.*]]: vector<4xf32>, %[[v1:.*]]: vector<8xf32>) 30func.func @rewrite_warp_op_to_scf_if(%laneid: index, 31 %v0: vector<4xf32>, %v1: vector<8xf32>) { 32// CHECK-SCF-IF-DAG: %[[c0:.*]] = arith.constant 0 : index 33// CHECK-SCF-IF: %[[is_lane_0:.*]] = 
arith.cmpi eq, %[[laneid]], %[[c0]] 34 35// CHECK-SCF-IF: %[[buffer_v0:.*]] = memref.get_global @__shared_128xf32 36// CHECK-SCF-IF: %[[s0:.*]] = affine.apply #[[$TIMES4]]()[%[[laneid]]] 37// CHECK-SCF-IF: vector.transfer_write %[[v0]], %[[buffer_v0]][%[[s0]]] 38// CHECK-SCF-IF: %[[buffer_v1:.*]] = memref.get_global @__shared_256xf32 39// CHECK-SCF-IF: %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]] 40// CHECK-SCF-IF: vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]] 41 42// CHECK-SCF-IF-DAG: gpu.barrier 43// CHECK-SCF-IF-DAG: %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32 44// CHECK-SCF-IF-DAG: %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32 45 46// CHECK-SCF-IF: scf.if %[[is_lane_0]] { 47 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] 48 args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) { 49 ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>): 50// CHECK-SCF-IF: %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32> 51// CHECK-SCF-IF: %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, 3>, vector<128xf32> 52// CHECK-SCF-IF: %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32> 53// CHECK-SCF-IF: %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32> 54 %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32> 55 %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32> 56// CHECK-SCF-IF: vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]] 57// CHECK-SCF-IF: vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]] 58 gpu.yield %2, %3 : vector<32xf32>, vector<64xf32> 59 } 60// CHECK-SCF-IF: } 61// CHECK-SCF-IF: gpu.barrier 62// CHECK-SCF-IF: %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]] 63// CHECK-SCF-IF: %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : 
memref<64xf32, 3>, vector<2xf32> 64// CHECK-SCF-IF: %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32> 65// CHECK-SCF-IF: "some_use"(%[[r0]]) : (vector<1xf32>) -> () 66// CHECK-SCF-IF: "some_use"(%[[r1]]) : (vector<2xf32>) -> () 67 "some_use"(%r#0) : (vector<1xf32>) -> () 68 "some_use"(%r#1) : (vector<2xf32>) -> () 69 return 70} 71 72// ----- 73 74// CHECK-D-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 2 + 32)> 75 76// CHECK-DIST-AND-PROP-LABEL: func @warp( 77// CHECK-HOIST: memref.subview 78// CHECK-HOIST: memref.subview 79// CHECK-HOIST: memref.subview 80// CHECK-HOIST: gpu.warp_execute_on_lane_0 81 82// CHECK-D: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) { 83// CHECK-D: arith.addf {{.*}} : vector<32xf32> 84// CHECK-D: arith.addf {{.*}} : vector<64xf32> 85// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32> 86// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32 87// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}] 88// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32 89 90// CHECK-DIST-AND-PROP-NOT: gpu.warp_execute_on_lane_0 91// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> 92// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> 93// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32> 94// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32> 95// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<1xf32> 96// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<2xf32> 97// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<1xf32> 98// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<2xf32> 99 100func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>, 101 %arg3: memref<1024xf32>, %gid : index) { 
102 gpu.warp_execute_on_lane_0(%laneid)[32] { 103 %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> 104 %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> 105 %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> 106 %c0 = arith.constant 0 : index 107 %c32 = arith.constant 32 : index 108 %cst = arith.constant 0.000000e+00 : f32 109 %2 = vector.transfer_read %sa[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32> 110 %3 = vector.transfer_read %sa[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32> 111 %4 = vector.transfer_read %sb[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32> 112 %5 = vector.transfer_read %sb[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32> 113 %6 = arith.addf %2, %3 : vector<32xf32> 114 %7 = arith.addf %4, %5 : vector<64xf32> 115 vector.transfer_write %6, %sc[%c0] : vector<32xf32>, memref<128xf32, strided<[1], offset: ?>> 116 vector.transfer_write %7, %sc[%c32] : vector<64xf32>, memref<128xf32, strided<[1], offset: ?>> 117 } 118 return 119} 120 121// ----- 122 123// CHECK-D-LABEL: func @warp_extract( 124// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>) 125// CHECK-D: "test.dummy_op" 126// CHECK-D: "test.dummy_op" 127// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32> 128// CHECK-D: } 129// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 130// CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32> 131// CHECK-D: } 132// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 133// CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32> 134// CHECK-D: } 135 136func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { 137 
gpu.warp_execute_on_lane_0(%laneid)[32] { 138 %c0 = arith.constant 0 : index 139 %v = "test.dummy_op"() : () -> (vector<1xf32>) 140 %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>) 141 vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32> 142 vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32> 143 } 144 return 145} 146 147// ----- 148 149// Check that we can distribute writes of the maximum allowed number of elements. 150 151// CHECK-D-LABEL: func @warp_extract_4_elems( 152// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>) 153// CHECK-D: "test.dummy_op" 154// CHECK-D: "test.dummy_op" 155// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32> 156// CHECK-D: } 157// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 158// CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<4x1xf32> 159// CHECK-D: } 160// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 161// CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<4xf32> 162// CHECK-D: } 163 164func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { 165 gpu.warp_execute_on_lane_0(%laneid)[32] { 166 %c0 = arith.constant 0 : index 167 %v = "test.dummy_op"() : () -> (vector<4xf32>) 168 %v1 = "test.dummy_op"() : () -> (vector<4x1xf32>) 169 vector.transfer_write %v1, %arg1[%c0, %c0] : vector<4x1xf32>, memref<1024x1024xf32> 170 vector.transfer_write %v, %arg1[%c0, %c0] : vector<4xf32>, memref<1024x1024xf32> 171 } 172 return 173} 174 175// ----- 176 177// Check that we do not distribute writes larger than the maximum allowed 178// number of elements. 
179 180// CHECK-D-LABEL: func @warp_extract_5_elems( 181// CHECK-D: arith.constant 0 : index 182// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 183// CHECK-D: %[[V:.+]] = "test.dummy_op" 184// CHECK-D: %[[V1:.+]] = "test.dummy_op" 185// CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<5x1xf32> 186// CHECK-D: vector.transfer_write %[[V]], %{{.*}}[%{{.*}}] {{.*}} : vector<5xf32> 187// CHECK-D: } 188 189func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { 190 gpu.warp_execute_on_lane_0(%laneid)[32] { 191 %c0 = arith.constant 0 : index 192 %v = "test.dummy_op"() : () -> (vector<5xf32>) 193 %v1 = "test.dummy_op"() : () -> (vector<5x1xf32>) 194 vector.transfer_write %v1, %arg1[%c0, %c0] : vector<5x1xf32>, memref<1024x1024xf32> 195 vector.transfer_write %v, %arg1[%c0, %c0] : vector<5xf32>, memref<1024x1024xf32> 196 } 197 return 198} 199 200// ----- 201 202// Check that we do not distribute writes larger than the maximum allowed 203// number of elements, or multiples of the maximum number of elements. 
204 205// CHECK-D-LABEL: func @warp_extract_8_elems( 206// CHECK-D: arith.constant 0 : index 207// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 208// CHECK-D: %[[V:.+]] = "test.dummy_op" 209// CHECK-D: %[[V1:.+]] = "test.dummy_op" 210// CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<8x1xf32> 211// CHECK-D: vector.transfer_write %[[V]], %{{.*}}[%{{.*}}] {{.*}} : vector<8xf32> 212// CHECK-D: } 213 214func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { 215 gpu.warp_execute_on_lane_0(%laneid)[32] { 216 %c0 = arith.constant 0 : index 217 %v = "test.dummy_op"() : () -> (vector<8xf32>) 218 %v1 = "test.dummy_op"() : () -> (vector<8x1xf32>) 219 vector.transfer_write %v1, %arg1[%c0, %c0] : vector<8x1xf32>, memref<1024x1024xf32> 220 vector.transfer_write %v, %arg1[%c0, %c0] : vector<8xf32>, memref<1024x1024xf32> 221 } 222 return 223} 224 225// ----- 226 227// CHECK-PROP-LABEL: func @warp_dead_result( 228func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) { 229 // CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) 230 %r:3 = gpu.warp_execute_on_lane_0(%laneid)[32] -> 231 (vector<1xf32>, vector<1xf32>, vector<1xf32>) { 232 %2 = "some_def"() : () -> (vector<32xf32>) 233 %3 = "some_def"() : () -> (vector<32xf32>) 234 %4 = "some_def"() : () -> (vector<32xf32>) 235 // CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32> 236 gpu.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32> 237 } 238 // CHECK-PROP: return %[[R]] : vector<1xf32> 239 return %r#1 : vector<1xf32> 240} 241 242// ----- 243 244// CHECK-PROP-LABEL: func @warp_propagate_operand( 245// CHECK-PROP-SAME: %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>) 246func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>) 247 -> (vector<4xf32>) { 248 %r = gpu.warp_execute_on_lane_0(%laneid)[32] 249 args(%v0 : vector<4xf32>) -> (vector<4xf32>) { 250 ^bb0(%arg0 : vector<128xf32>) : 251 gpu.yield %arg0 : 
vector<128xf32> 252 } 253 // CHECK-PROP: return %[[V]] : vector<4xf32> 254 return %r : vector<4xf32> 255} 256 257// ----- 258 259#map0 = affine_map<()[s0] -> (s0 * 2)> 260 261// CHECK-PROP-LABEL: func @warp_propagate_elementwise( 262func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) { 263 %c0 = arith.constant 0 : index 264 %c32 = arith.constant 0 : index 265 %cst = arith.constant 0.000000e+00 : f32 266 // CHECK-PROP: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>) 267 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> 268 (vector<1xf32>, vector<2xf32>) { 269 // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32> 270 // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32> 271 // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32> 272 // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32> 273 // CHECK-PROP: gpu.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32> 274 %2 = "some_def"() : () -> (vector<32xf32>) 275 %3 = "some_def"() : () -> (vector<32xf32>) 276 %4 = "some_def"() : () -> (vector<64xf32>) 277 %5 = "some_def"() : () -> (vector<64xf32>) 278 %6 = arith.addf %2, %3 : vector<32xf32> 279 %7 = arith.addf %4, %5 : vector<64xf32> 280 gpu.yield %6, %7 : vector<32xf32>, vector<64xf32> 281 } 282 // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32> 283 // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32> 284 %id2 = affine.apply #map0()[%laneid] 285 // CHECK-PROP: vector.transfer_write %[[A1]], {{.*}} : vector<1xf32>, memref<1024xf32> 286 // CHECK-PROP: vector.transfer_write %[[A0]], {{.*}} : vector<2xf32>, memref<1024xf32> 287 vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32> 288 vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32> 289 return 290} 291 292// ----- 293 294// CHECK-PROP-LABEL: func 
@warp_propagate_scalar_arith( 295// CHECK-PROP: %[[r:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} { 296// CHECK-PROP: %[[some_def0:.*]] = "some_def" 297// CHECK-PROP: %[[some_def1:.*]] = "some_def" 298// CHECK-PROP: gpu.yield %[[some_def0]], %[[some_def1]] 299// CHECK-PROP: } 300// CHECK-PROP: arith.addf %[[r]]#0, %[[r]]#1 : f32 301func.func @warp_propagate_scalar_arith(%laneid: index) { 302 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { 303 %0 = "some_def"() : () -> (f32) 304 %1 = "some_def"() : () -> (f32) 305 %2 = arith.addf %0, %1 : f32 306 gpu.yield %2 : f32 307 } 308 vector.print %r : f32 309 return 310} 311 312// ----- 313 314// CHECK-PROP-LABEL: func @warp_propagate_cast( 315// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 316// CHECK-PROP: %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32 317// CHECK-PROP: return %[[result]] 318func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) { 319 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { 320 %casted = arith.sitofp %i : i32 to f32 321 gpu.yield %casted : f32 322 } 323 return %r : f32 324} 325 326// ----- 327 328#map0 = affine_map<()[s0] -> (s0 * 2)> 329 330// CHECK-PROP-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 2)> 331 332// CHECK-PROP: func @warp_propagate_read 333// CHECK-PROP-SAME: (%[[ID:.*]]: index 334func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: memref<1024xf32>) { 335// CHECK-PROP-NOT: warp_execute_on_lane_0 336// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[ID]]], %{{.*}} : memref<1024xf32>, vector<1xf32> 337// CHECK-PROP-DAG: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]] 338// CHECK-PROP-DAG: %[[R1:.*]] = vector.transfer_read %arg1[%[[ID2]]], %{{.*}} : memref<1024xf32>, vector<2xf32> 339// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1xf32>, memref<1024xf32> 340// CHECK-PROP: vector.transfer_write %[[R1]], {{.*}} : vector<2xf32>, memref<1024xf32> 341 %c0 = arith.constant 0 : index 342 %c32 = arith.constant 0 : 
index 343 %cst = arith.constant 0.000000e+00 : f32 344 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) { 345 %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32> 346 %3 = vector.transfer_read %src[%c32], %cst : memref<1024xf32>, vector<64xf32> 347 gpu.yield %2, %3 : vector<32xf32>, vector<64xf32> 348 } 349 %id2 = affine.apply #map0()[%laneid] 350 vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32> 351 vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32> 352 return 353} 354 355// ----- 356 357// CHECK-PROP-LABEL: func @fold_vector_broadcast( 358// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) 359// CHECK-PROP: %[[some_def:.*]] = "some_def" 360// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32> 361// CHECK-PROP: vector.print %[[r]] : vector<1xf32> 362func.func @fold_vector_broadcast(%laneid: index) { 363 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { 364 %0 = "some_def"() : () -> (vector<1xf32>) 365 %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32> 366 gpu.yield %1 : vector<32xf32> 367 } 368 vector.print %r : vector<1xf32> 369 return 370} 371 372// ----- 373 374// CHECK-PROP-LABEL: func @extract_vector_broadcast( 375// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) 376// CHECK-PROP: %[[some_def:.*]] = "some_def" 377// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32> 378// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32> 379// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> 380func.func @extract_vector_broadcast(%laneid: index) { 381 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { 382 %0 = "some_def"() : () -> (vector<1xf32>) 383 %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32> 384 gpu.yield %1 : vector<64xf32> 385 } 386 vector.print %r : vector<2xf32> 387 return 388} 389 390// ----- 391 392// 
CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast( 393// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (f32) 394// CHECK-PROP: %[[some_def:.*]] = "some_def" 395// CHECK-PROP: gpu.yield %[[some_def]] : f32 396// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32> 397// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> 398func.func @extract_scalar_vector_broadcast(%laneid: index) { 399 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { 400 %0 = "some_def"() : () -> (f32) 401 %1 = vector.broadcast %0 : f32 to vector<64xf32> 402 gpu.yield %1 : vector<64xf32> 403 } 404 vector.print %r : vector<2xf32> 405 return 406} 407 408// ----- 409 410// CHECK-PROP-LABEL: func @warp_scf_for( 411// CHECK-PROP: %[[INI:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) { 412// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> 413// CHECK-PROP: gpu.yield %[[INI1]] : vector<128xf32> 414// CHECK-PROP: } 415// CHECK-PROP: %[[F:.*]] = scf.for %[[IT:.+]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) { 416// CHECK-PROP: %[[A:.*]] = arith.addi %[[IT]], %{{.*}} : index 417// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) { 418// CHECK-PROP: ^bb0(%[[ARG:.*]]: vector<128xf32>): 419// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[A]], %[[ARG]]) : (index, vector<128xf32>) -> vector<128xf32> 420// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32> 421// CHECK-PROP: } 422// CHECK-PROP: scf.yield %[[W]] : vector<4xf32> 423// CHECK-PROP: } 424// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> () 425func.func @warp_scf_for(%arg0: index) { 426 %c128 = arith.constant 128 : index 427 %c1 = arith.constant 1 : index 428 %c0 = arith.constant 0 : index 429 %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { 430 %ini = "some_def"() : () -> (vector<128xf32>) 431 %3 = scf.for %arg3 = %c0 to %c128 
step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { 432 %add = arith.addi %arg3, %c1 : index 433 %acc = "some_def"(%add, %arg4) : (index, vector<128xf32>) -> (vector<128xf32>) 434 scf.yield %acc : vector<128xf32> 435 } 436 gpu.yield %3 : vector<128xf32> 437 } 438 "some_use"(%0) : (vector<4xf32>) -> () 439 return 440} 441 442// ----- 443 444// CHECK-PROP-LABEL: func @warp_scf_for_use_from_above( 445// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { 446// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> 447// CHECK-PROP: %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32> 448// CHECK-PROP: gpu.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32> 449// CHECK-PROP: } 450// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) { 451// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) { 452// CHECK-PROP: ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>): 453// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32> 454// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32> 455// CHECK-PROP: } 456// CHECK-PROP: scf.yield %[[W]] : vector<4xf32> 457// CHECK-PROP: } 458// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> () 459func.func @warp_scf_for_use_from_above(%arg0: index) { 460 %c128 = arith.constant 128 : index 461 %c1 = arith.constant 1 : index 462 %c0 = arith.constant 0 : index 463 %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { 464 %ini = "some_def"() : () -> (vector<128xf32>) 465 %use_from_above = "some_def_above"() : () -> (vector<128xf32>) 466 %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { 467 %acc = "some_def"(%arg4, %use_from_above) : (vector<128xf32>, vector<128xf32>) -> 
(vector<128xf32>) 468 scf.yield %acc : vector<128xf32> 469 } 470 gpu.yield %3 : vector<128xf32> 471 } 472 "some_use"(%0) : (vector<4xf32>) -> () 473 return 474} 475 476// ----- 477 478// CHECK-PROP-LABEL: func @warp_scf_for_swap( 479// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { 480// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> 481// CHECK-PROP: %[[INI2:.*]] = "some_def"() : () -> vector<128xf32> 482// CHECK-PROP: gpu.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32> 483// CHECK-PROP: } 484// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) { 485// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { 486// CHECK-PROP: ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>): 487// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32> 488// CHECK-PROP: %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32> 489// CHECK-PROP: gpu.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32> 490// CHECK-PROP: } 491// CHECK-PROP: scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32> 492// CHECK-PROP: } 493// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () 494// CHECK-PROP: "some_use"(%[[F]]#1) : (vector<4xf32>) -> () 495func.func @warp_scf_for_swap(%arg0: index) { 496 %c128 = arith.constant 128 : index 497 %c1 = arith.constant 1 : index 498 %c0 = arith.constant 0 : index 499 %0:2 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) { 500 %ini1 = "some_def"() : () -> (vector<128xf32>) 501 %ini2 = "some_def"() : () -> (vector<128xf32>) 502 %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, 
vector<128xf32>) { 503 %acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>) 504 %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>) 505 scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32> 506 } 507 gpu.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32> 508 } 509 "some_use"(%0#0) : (vector<4xf32>) -> () 510 "some_use"(%0#1) : (vector<4xf32>) -> () 511 return 512} 513 514// ----- 515 516// CHECK-PROP-LABEL: func @warp_scf_for_swap_no_yield( 517// CHECK-PROP: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { 518// CHECK-PROP-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] { 519// CHECK-PROP-NEXT: "some_op"() : () -> () 520// CHECK-PROP-NEXT: } 521// CHECK-PROP-NEXT: } 522func.func @warp_scf_for_swap_no_yield(%arg0: index) { 523 %c128 = arith.constant 128 : index 524 %c1 = arith.constant 1 : index 525 %c0 = arith.constant 0 : index 526 gpu.warp_execute_on_lane_0(%arg0)[32] { 527 scf.for %arg3 = %c0 to %c128 step %c1 { 528 "some_op"() : () -> () 529 } 530 } 531 return 532} 533 534// ----- 535 536#map = affine_map<()[s0] -> (s0 * 4)> 537#map1 = affine_map<()[s0] -> (s0 * 128 + 128)> 538#map2 = affine_map<()[s0] -> (s0 * 4 + 128)> 539 540// CHECK-PROP-LABEL: func @warp_scf_for_multiple_yield( 541// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { 542// CHECK-PROP-NEXT: "some_def"() : () -> vector<32xf32> 543// CHECK-PROP-NEXT: gpu.yield %{{.*}} : vector<32xf32> 544// CHECK-PROP-NEXT: } 545// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 546// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> 547// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> 548// CHECK-PROP: %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) { 549// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 550// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> 551// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> 552// CHECK-PROP: arith.addf {{.*}} : 
vector<4xf32> 553// CHECK-PROP: arith.addf {{.*}} : vector<4xf32> 554// CHECK-PROP: scf.yield {{.*}} : vector<4xf32>, vector<4xf32> 555// CHECK-PROP: } 556func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2: memref<?xf32>) { 557 %c256 = arith.constant 256 : index 558 %c128 = arith.constant 128 : index 559 %c1 = arith.constant 1 : index 560 %c0 = arith.constant 0 : index 561 %cst = arith.constant 0.000000e+00 : f32 562 %0:3 = gpu.warp_execute_on_lane_0(%arg0)[32] -> 563 (vector<1xf32>, vector<4xf32>, vector<4xf32>) { 564 %def = "some_def"() : () -> (vector<32xf32>) 565 %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32> 566 %r2 = vector.transfer_read %arg2[%c128], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32> 567 %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %r1, %arg5 = %r2) 568 -> (vector<128xf32>, vector<128xf32>) { 569 %o1 = affine.apply #map1()[%arg3] 570 %o2 = affine.apply #map2()[%arg3] 571 %4 = vector.transfer_read %arg1[%o1], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32> 572 %5 = vector.transfer_read %arg1[%o2], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32> 573 %6 = arith.addf %4, %arg4 : vector<128xf32> 574 %7 = arith.addf %5, %arg5 : vector<128xf32> 575 scf.yield %6, %7 : vector<128xf32>, vector<128xf32> 576 } 577 gpu.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32> 578 } 579 %1 = affine.apply #map()[%arg0] 580 vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32> 581 %2 = affine.apply #map2()[%arg0] 582 vector.transfer_write %0#2, %arg2[%2] {in_bounds = [true]} : vector<4xf32>, memref<?xf32> 583 "some_use"(%0#0) : (vector<1xf32>) -> () 584 return 585} 586 587// ----- 588 589// CHECK-PROP-LABEL: func @vector_reduction( 590// CHECK-PROP-SAME: %[[laneid:.*]]: index) 591// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32 592// CHECK-PROP-DAG: %[[c2:.*]] = 
arith.constant 2 : i32 593// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32 594// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 595// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 596// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 597// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) { 598// CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32> 599// CHECK-PROP: } 600// CHECK-PROP: %[[a:.*]] = vector.extract %[[warp_op]][0] : f32 from vector<1xf32> 601// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] 602// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]] 603// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]] 604// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]] 605// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]] 606// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]] 607// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]] 608// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]] 609// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]] 610// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] 611// CHECK-PROP: return %[[a4]] : f32 612func.func @vector_reduction(%laneid: index) -> (f32) { 613 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { 614 %0 = "some_def"() : () -> (vector<32xf32>) 615 %1 = vector.reduction <add>, %0 : vector<32xf32> into f32 616 gpu.yield %1 : f32 617 } 618 return %r : f32 619} 620 621// ----- 622 623// CHECK-PROP-LABEL: func @warp_distribute( 624// CHECK-PROP-SAME: %[[ID:[a-zA-Z0-9]+]] 625// CHECK-PROP-SAME: %[[SRC:[a-zA-Z0-9]+]] 626// CHECK-PROP-SAME: %[[DEST:[a-zA-Z0-9]+]] 627// CHECK-PROP: gpu.warp_execute_on_lane_0(%[[ID]])[32] 628// CHECK-PROP-NEXT: "some_def"() : () -> vector<4096xf32> 629// CHECK-PROP-NEXT: %{{.*}} = vector.reduction 630// CHECK-PROP: %[[DEF:.*]] = arith.divf %{{.*}}, %{{.*}} : vector<1xf32> 
// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP: scf.for
// CHECK-PROP: %{{.*}} = arith.subf %{{.*}}, %[[DEF]] : vector<1xf32>
// Input for @warp_distribute: one 32-lane warp op that holds a 4096-wide
// reduction, a broadcast + divf on vector<1xf32>, and an scf.for that reads,
// subtracts the divf result and writes one element per iteration. The checks
// expect no further warp op between the divf and the scf.for, and expect the
// subf inside the loop to consume the divf result directly.
func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<128xf32>){
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c128 = arith.constant 128 : index
  %f0 = arith.constant 0.000000e+00 : f32
  gpu.warp_execute_on_lane_0(%arg0)[32]{
    %cst_1 = arith.constant dense<2.621440e+05> : vector<1xf32>
    %0 = "some_def"() : () -> (vector<4096xf32>)
    %1 = vector.reduction <add>, %0, %cst : vector<4096xf32> into f32
    %2 = vector.broadcast %1 : f32 to vector<1xf32>
    %3 = arith.divf %2, %cst_1 : vector<1xf32>
    scf.for %arg1 = %c0 to %c128 step %c1 {
      %4 = vector.transfer_read %src[%arg1], %f0 {in_bounds = [true]} : memref<128xf32>, vector<1xf32>
      %5 = arith.subf %4, %3 : vector<1xf32>
      vector.transfer_write %5, %dest[%arg1] : vector<1xf32>, memref<128xf32>
    }
  }
  return
}

// -----

// Rank-0 accumulation: a 32-wide reduction is added to the value stored in a
// memref<f32> and written back as vector<f32>. Only the
// "hoist-uniform distribute-transfer-write" run has expectations here: a warp
// op producing vector<f32>, then a result-less warp op, then a transfer_write
// of the first warp op's result.
// NOTE(review): this function shares its name with an earlier test and has no
// label anchor for this run; the checks rely on ordering only.
func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) {
  %c0 = arith.constant 0: index
  %f0 = arith.constant 0.0: f32
  // CHECK-D: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
  // CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
  // CHECK-D: vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
  gpu.warp_execute_on_lane_0(%laneid)[32] {
    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
    %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
    %2 = vector.extractelement %1[] : vector<f32>
    %3 = vector.reduction <add>, %0 : vector<32xf32> into f32
    %4 = arith.addf %3, %2 : f32
    %5 = vector.broadcast %4 : f32 to vector<f32>
    vector.transfer_write %5, %m1[] : vector<f32>, memref<f32>
  }
  return
}

// -----

// Reduction of 64 elements on 32 lanes: the warp op is expected to yield
// vector<2xf32> per lane, which is reduced locally and then combined with
// xor shuffles at offsets 1, 2, 4, 8 and 16 (width 32).
// CHECK-PROP-LABEL: func @vector_reduction_large(
// CHECK-PROP-SAME: %[[laneid:.*]]: index)
// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32
// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32
// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32
// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32
// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32
// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32
// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
// CHECK-PROP: gpu.yield %{{.*}} : vector<64xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32
// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]]
// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]]
// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]]
// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]]
// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]]
// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
// CHECK-PROP: return %[[a4]] : f32
func.func @vector_reduction_large(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = vector.reduction <add>, %0 : vector<64xf32> into f32
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// Same reduction with an accumulator operand: the accumulator is expected to
// be yielded as a second (scalar) warp-op result and added once, after the
// shuffle sequence.
// CHECK-PROP-LABEL: func @vector_reduction_acc(
// CHECK-PROP-SAME: %[[laneid:.*]]: index)
// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32
// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32
// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32
// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32
// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32
// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32
// CHECK-PROP: %[[warp_op:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
// CHECK-PROP: }
// CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32
// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]]
// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]]
// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]]
// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]]
// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]]
// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
// CHECK-PROP: %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1
// CHECK-PROP: return %[[a5]] : f32
func.func @vector_reduction_acc(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = "some_def"() : () -> (f32)
    %2 = vector.reduction <add>, %0, %1 : vector<64xf32> into f32
    gpu.yield %2 : f32
  }
  return %r : f32
}

// -----

// Two different addf results are yielded from one warp op. The checks expect a
// two-result warp op with no arith.addf left between its header and its yield.
// CHECK-PROP-LABEL: func @warp_duplicate_yield(
func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) {
  // CHECK-PROP: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = arith.addf %2, %3 : vector<32xf32>
    %5 = arith.addf %2, %2 : vector<32xf32>
// CHECK-PROP-NOT: arith.addf
// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
    gpu.yield %4, %5 : vector<32xf32>, vector<32xf32>
  }
  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}

// -----

// A dense splat constant yielded from the warp op is expected to turn into a
// vector<1xf32> constant that is returned directly.
// CHECK-PROP-LABEL: func @warp_constant(
// CHECK-PROP: %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32>
// CHECK-PROP: return %[[C]] : vector<1xf32>
func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %cst = arith.constant dense<2.0> : vector<32xf32>
    gpu.yield %cst : vector<32xf32>
  }
  return %r : vector<1xf32>
}

// -----

// TODO: We could use warp shuffles instead of broadcasting the entire vector.

// Static extract of element 9 from vector<64xf32>: the expected output takes
// position 1 of the per-lane vector<2xf32> and shuffles (idx mode) with the
// constant 5.
// NOTE(review): if lanes own contiguous 2-element slices, element 9 would live
// in lane 4, not 5 -- confirm the expected lane against the pattern.
// CHECK-PROP-LABEL: func.func @vector_extract_1d(
// CHECK-PROP-DAG: %[[C5_I32:.*]] = arith.constant 5 : i32
// CHECK-PROP-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<64xf32>
// CHECK-PROP: gpu.yield %[[V]] : vector<64xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32>
// CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[E]], %[[C5_I32]]
// CHECK-PROP: return %[[SHUFFLED]] : f32
func.func @vector_extract_1d(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = vector.extract %0[9] : f32 from vector<64xf32>
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// Row extract from vector<5x96xf32> with a vector<3xf32> result: the warp op
// is expected to yield vector<5x3xf32>, with the extract applied outside.
// CHECK-PROP-LABEL: func.func @vector_extract_2d(
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<3xf32> from vector<5x3xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %0 = "some_def"() : () -> (vector<5x96xf32>)
    %1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32>
    gpu.yield %1 : vector<96xf32>
  }
  return %r : vector<3xf32>
}

// -----

// Scalar extract from a 2-D vector: the warp op result keeps the full
// vector<5x96xf32> type and the [1, 2] extract is expected outside of it.
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast_scalar(
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][1, 2] : f32 from vector<5x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<5x96xf32>)
    %1 = vector.extract %0[1, 2] : f32 from vector<5x96xf32>
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// The warp op result type equals the in-region type (vector<96xf32>), so the
// source is expected to be yielded with its full shape and extracted outside.
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast(
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<96xf32> from vector<5x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
    %0 = "some_def"() : () -> (vector<5x96xf32>)
    %1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32>
    gpu.yield %1 : vector<96xf32>
  }
  return %r : vector<96xf32>
}

// -----

// 3-D source: vector<8x128x96xf32> is expected to leave the warp op as
// vector<8x4x96xf32>, and the [2] extract then produces vector<4x96xf32>.
// CHECK-PROP-LABEL: func.func @vector_extract_3d(
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
// CHECK-PROP: gpu.yield %[[V]] : vector<8x128x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<4x96xf32> from vector<8x4x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %0 = "some_def"() : () -> (vector<8x128x96xf32>)
    %1 = vector.extract %0[2] : vector<128x96xf32> from vector<8x128x96xf32>
    gpu.yield %1 : vector<128x96xf32>
  }
  return %r : vector<4x96xf32>
}

// -----

// Rank-0 extractelement: the vector<f32> is expected to be yielded as-is and
// extracted outside the warp op with an empty position.
// CHECK-PROP-LABEL: func.func @vector_extractelement_0d(
// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<f32>
// CHECK-PROP: gpu.yield %[[V]] : vector<f32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][] : f32 from vector<f32>
// CHECK-PROP: return %[[E]] : f32
func.func @vector_extractelement_0d(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<f32>)
    %1 = vector.extractelement %0[] : vector<f32>
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// Single-element source: vector<1xf32> is expected to be yielded unchanged
// and extracted at position 0 outside the warp op, without any shuffle check.
// CHECK-PROP-LABEL: func.func @vector_extractelement_1element(
// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<1xf32>
// CHECK-PROP: gpu.yield %[[V]] : vector<1xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][0] : f32 from vector<1xf32>
// CHECK-PROP: return %[[E]] : f32
func.func @vector_extractelement_1element(%laneid: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %c0 = arith.constant 0 : index
    %1 = vector.extractelement %0[%c0 : index] : vector<1xf32>
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// Dynamic position into vector<96xf32> (3 elements per lane in the expected
// output): the source lane comes from the "ceildiv 3" map, the in-lane
// position from the "mod 3" map, and the value is fetched with an idx shuffle.
// NOTE(review): the lane map uses ceildiv; with contiguous 3-element slices a
// floordiv would be expected -- confirm against the pattern implementation.
// CHECK-PROP: #[[$map:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
// CHECK-PROP: #[[$map1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
// CHECK-PROP-LABEL: func.func @vector_extractelement_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
// CHECK-PROP-DAG: %[[C32:.*]] = arith.constant 32 : i32
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
// CHECK-PROP: gpu.yield %[[V]] : vector<96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]]
// CHECK-PROP: %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]]
// CHECK-PROP: %[[EXTRACTED:.*]] = vector.extract %[[W]][%[[DISTR_POS]]] : f32 from vector<3xf32>
// CHECK-PROP: %[[FROM_LANE_I32:.*]] = arith.index_cast %[[FROM_LANE]] : index to i32
// CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32
// CHECK-PROP: return %[[SHUFFLED]]
func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<96xf32>)
    %1 = vector.extractelement %0[%pos : index] : vector<96xf32>
    gpu.yield %1 : f32
  }
  return %r : f32
}

// -----

// Index-typed values cannot be shuffled at the moment.
// The extract of an index element is expected to stay inside the warp op,
// which yields a scalar index.
// CHECK-PROP-LABEL: func.func @vector_extractelement_1d_index(
// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (index) {
// CHECK-PROP: "some_def"
// CHECK-PROP: vector.extract
// CHECK-PROP: gpu.yield {{.*}} : index
// CHECK-PROP: }
func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (index) {
    %0 = "some_def"() : () -> (vector<96xindex>)
    %1 = vector.extractelement %0[%pos : index] : vector<96xindex>
    gpu.yield %1 : index
  }
  return %r : index
}

// -----

// A read whose result feeds a lane-indexed write: no warp op is expected to
// remain, and the read is expected to be indexed directly by the lane id.
// The source memref is captured from the signature instead of relying on the
// printer-generated name of the second argument.
// CHECK-PROP-LABEL: func @lane_dependent_warp_propagate_read
// CHECK-PROP-SAME: %[[ID:.*]]: index, %[[SRC:[a-zA-Z0-9]+]]: memref<1x1024xf32>
func.func @lane_dependent_warp_propagate_read(
    %laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) {
  // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
  // CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32>
  // CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
    %2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32>
    gpu.yield %2 : vector<1x32xf32>
  }
  vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32>
  return
}

// -----

// 3-D read on a 1024-lane warp op: each lane is expected to read a
// vector<1x1x4xf32> slice at indices computed from the lane id by the three
// affine maps checked below.
func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) -> vector<1x1x4xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) {
    %2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32>
    gpu.yield %2 : vector<32x4x32xf32>
  }
  return %r : vector<1x1x4xf32>
}

// CHECK-PROP-DAG: #[[$ID0MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>
// CHECK-PROP-DAG: #[[$ID1MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) mod 4)>
// CHECK-PROP-DAG: #[[$ID2MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) floordiv 32)>
// CHECK-PROP-LABEL: func.func @warp_propagate_read_3d
// CHECK-PROP-SAME: (%[[LANE:.+]]: index, %[[SRC:.+]]: memref<32x4x32xf32>)
// CHECK-PROP-DAG: %[[ID0:.+]] = affine.apply #[[$ID0MAP]]()[%[[LANE]]]
// CHECK-PROP-DAG: %[[ID1:.+]] = affine.apply #[[$ID1MAP]]()[%[[LANE]]]
// CHECK-PROP-DAG: %[[ID2:.+]] = affine.apply #[[$ID2MAP]]()[%[[LANE]]]
// CHECK-PROP: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[ID2]], %[[ID1]], %[[ID0]]], %{{.+}} : memref<32x4x32xf32>, vector<1x1x4xf32>
// CHECK-PROP: return %[[READ]] : vector<1x1x4xf32>

// -----

// Read with a broadcasting permutation map on a 512-lane warp op: only the
// non-broadcast dimension is expected to be indexed by the lane id
// (lane floordiv 16); the broadcast dimension keeps index 0.
func.func @warp_propagate_read_broadcast(%laneid: index, %src: memref<32x1xf32>) -> vector<1x4xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r = gpu.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) {
    %2 = vector.transfer_read %src[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0)>} : memref<32x1xf32>, vector<32x64xf32>
    gpu.yield %2 : vector<32x64xf32>
  }
  return %r : vector<1x4xf32>
}

// CHECK-PROP-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 16)>
// CHECK-PROP-DAG: #[[$READMAP:.+]] = affine_map<(d0, d1) -> (d0, 0)>
// CHECK-PROP-LABEL: func.func @warp_propagate_read_broadcast
// CHECK-PROP-SAME: (%[[LANE:.+]]: index, %[[SRC:.+]]: memref<32x1xf32>)
// CHECK-PROP: %[[C0:.+]] = arith.constant 0 : index
// CHECK-PROP: %[[ID:.+]] = affine.apply #[[$MAP]]()[%[[LANE]]]
// CHECK-PROP: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[ID]], %[[C0]]], %{{.+}} {in_bounds = [true, true], permutation_map = #[[$READMAP]]} : memref<32x1xf32>, vector<1x4xf32>
// CHECK-PROP: return %[[READ]] : vector<1x4xf32>

// -----

// The read has a second user inside the warp op ("blocking_use"), so it is
// expected to stay in place: read, blocking_use and yield on consecutive lines.
// CHECK-PROP-LABEL: func @dont_duplicate_read
func.func @dont_duplicate_read(
    %laneid: index, %src: memref<1024xf32>) -> vector<1xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
// CHECK-PROP-NEXT: vector.transfer_read
// CHECK-PROP-NEXT: "blocking_use"
// CHECK-PROP-NEXT: gpu.yield
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
    "blocking_use"(%2) : (vector<32xf32>) -> ()
    gpu.yield %2 : vector<32xf32>
  }
  return %r : vector<1xf32>
}

// -----

// The same value is yielded twice: the warp op is expected to shrink to a
// single result that is used for both function results.
// CHECK-PROP-LABEL: func @dedup
func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>)
    -> (vector<1xf32>, vector<1xf32>) {

  // CHECK-PROP: %[[SINGLE_RES:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) {
  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) {
    ^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>):

    // CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32>
    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>

    // CHECK-PROP: gpu.yield %[[SINGLE_VAL]] : vector<32xf32>
    gpu.yield %2, %2 : vector<32xf32>, vector<32xf32>
  }

  // CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32>
  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}

// -----

// Lowering to scf.if when argument and result types match inside and outside
// the region (scalar, rank-0, 1-element and 1x1 vectors): every value is
// expected to round-trip through an address-space-3 buffer at index 0.
// CHECK-SCF-IF-LABEL: func @warp_execute_has_broadcast_semantics
func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: vector<f32>, %v1: vector<1xf32>, %v2: vector<1x1xf32>)
    -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index

  // CHECK-SCF-IF: scf.if{{.*}}{
  %r:4 = gpu.warp_execute_on_lane_0(%laneid)[32]
      args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
    ^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>):

    // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
    // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
    // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
    // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
    // CHECK-SCF-IF: "some_def_0"(%{{.*}}) : (f32) -> f32
    // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<f32>) -> vector<f32>
    // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>
    // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1x1xf32>) -> vector<1x1xf32>
    // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, 3>
    // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector<f32>, memref<f32, 3>
    // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, 3>
    // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, 3>

    %rs0 = "some_def_0"(%bs0) : (f32) -> f32
    %rv0 = "some_def_1"(%bv0) : (vector<f32>) -> vector<f32>
    %rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32>
    %rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32>

    // CHECK-SCF-IF-NOT: gpu.yield
    gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
  }

  // CHECK-SCF-IF: gpu.barrier
  // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
  // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
  // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
  // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
  // CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
  return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
}

// -----

// Lowering to scf.if with n-D operands distributed along different
// dimensions: %v0 along dim 0 (indexed by the lane id) and %v1 along dim 1
// (indexed by lane id * 2). All index operands are matched through captures
// rather than printer-generated SSA names.
// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>

// CHECK-SCF-IF-LABEL: func @warp_execute_nd_distribute
// CHECK-SCF-IF-SAME: (%[[LANEID:.*]]: index
func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %v1: vector<1x2x128xf32>)
    -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index

  // CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
  // CHECK-SCF-IF: %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
  // CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
  // CHECK-SCF-IF: gpu.barrier

  // CHECK-SCF-IF: scf.if{{.*}}{
  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
    ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):

    // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32>
    // CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32>
    // CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
    // CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
    // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3>
    // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3>

    %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
    %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>

    // CHECK-SCF-IF-NOT: gpu.yield
    gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
  }

  // CHECK-SCF-IF: gpu.barrier
  // CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
  // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
  // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
  // CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32>
  return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32>
}

// -----

// CHECK-PROP: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
// CHECK-PROP: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
// CHECK-PROP-LABEL: func @vector_insertelement_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
// CHECK-PROP: %[[INSERTING_LANE:.*]] = affine.apply #[[$MAP]]()[%[[POS]]]
// CHECK-PROP: %[[INSERTING_POS:.*]] = affine.apply #[[$MAP1]]()[%[[POS]]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[INSERTING_LANE]] : index
// CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [%[[INSERTING_POS]]]
// CHECK-PROP: scf.yield %[[INSERT]]
// CHECK-PROP: } else {
// CHECK-PROP: scf.yield %[[W]]#0
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
// Dynamic insert into vector<96xf32> yielded as vector<3xf32>: the checks
// expect a lane comparison followed by an scf.if in which only the matching
// lane performs the insert; the other lanes forward their vector unchanged.
// NOTE(review): the inserting lane is derived with "ceildiv 3"; with
// contiguous 3-element slices a floordiv would be expected -- confirm.
func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %0 = "some_def"() : () -> (vector<96xf32>)
    %f = "another_def"() : () -> (f32)
    %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
    gpu.yield %1 : vector<96xf32>
  }
  return %r : vector<3xf32>
}

// -----

// The warp op result type equals the in-region type (vector<96xf32>): the
// insert is expected right after the warp op on its two results, at the
// original position and without a lane guard.
// CHECK-PROP-LABEL: func @vector_insertelement_1d_broadcast(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [%[[POS]]] : f32 into vector<96xf32>
func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (vector<96xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
    %0 = "some_def"() : () -> (vector<96xf32>)
    %f = "another_def"() : () -> (f32)
    %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
    gpu.yield %1 : vector<96xf32>
  }
  return %r : vector<96xf32>
}

// -----

// Rank-0 insertelement: vector<f32> and the scalar are expected to be yielded
// and the insert is expected after the warp op with an empty position.
// CHECK-PROP-LABEL: func @vector_insertelement_0d(
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [] : f32 into vector<f32>
func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) {
    %0 = "some_def"() : () -> (vector<f32>)
    %f = "another_def"() : () -> (f32)
    %1 = vector.insertelement %f, %0[] : vector<f32>
    gpu.yield %1 : vector<f32>
  }
  return %r : vector<f32>
}

// -----

// Static insert at element 76 of vector<96xf32> (3 elements per lane in the
// expected output): the checks expect the lane id to be compared with the
// constant 26 and the insert to happen at in-lane position 1.
// NOTE(review): with contiguous 3-element slices element 76 would belong to
// lane 25 (76 floordiv 3), not 26 -- confirm the expected lane.
// CHECK-PROP-LABEL: func @vector_insert_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP-DAG: %[[C26:.*]] = arith.constant 26 : index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C26]]
// CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [1]
// CHECK-PROP: scf.yield %[[INSERT]]
// CHECK-PROP: } else {
// CHECK-PROP: scf.yield %[[W]]#0
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %0 = "some_def"() : () -> (vector<96xf32>)
    %f = "another_def"() : () -> (f32)
    %1 = vector.insert %f, %0[76] : f32 into vector<96xf32>
    gpu.yield %1 : vector<96xf32>
  }
  return %r : vector<3xf32>
}

// -----

// 2-D insert where the inserted row itself is distributed (96 -> 3 elements):
// the same insert position [2] is expected after the warp op, on the
// distributed source and destination, without a lane guard.
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_src(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<3xf32> into vector<4x3xf32>
// CHECK-PROP: return %[[INSERT]]
func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) {
    %0 = "some_def"() : () -> (vector<4x96xf32>)
    %s = "another_def"() : () -> (vector<96xf32>)
    %1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32>
    gpu.yield %1 : vector<4x96xf32>
  }
  return %r : vector<4x3xf32>
}

// -----

// 2-D insert where the insert position lies in the distributed dimension
// (128 rows -> 4 per lane): row 79 is expected to be handled by lane 19 at
// local row 3, guarded by an scf.if on the lane id.
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_pos(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP: %[[C19:.*]] = arith.constant 19 : index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C19]]
// CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<4x96xf32>) {
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [3] : vector<96xf32> into vector<4x96xf32>
// CHECK-PROP: scf.yield %[[INSERT]]
// CHECK-PROP: } else {
// CHECK-PROP: scf.yield %[[W]]#1
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %0 = "some_def"() : () -> (vector<128x96xf32>)
    %s = "another_def"() : () -> (vector<96xf32>)
    %1 = vector.insert %s, %0[79] : vector<96xf32> into vector<128x96xf32>
    gpu.yield %1 : vector<128x96xf32>
  }
  return %r : vector<4x96xf32>
}

// -----

// 2-D insert whose result type equals the in-region type: source and
// destination are expected to be yielded with their full shapes and the
// insert at [2] to follow the warp op.
// CHECK-PROP-LABEL: func @vector_insert_2d_broadcast(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<96xf32> into vector<4x96xf32>
// CHECK-PROP: return %[[INSERT]]
func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %0 = "some_def"() : () -> (vector<4x96xf32>)
    %s = "another_def"() : () -> (vector<96xf32>)
    %1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32>
    gpu.yield %1 : vector<4x96xf32>
  }
  return %r : vector<4x96xf32>
}

// -----

// Make sure that all operands of the transfer_read op are properly propagated.
// The vector.extractelement op cannot be propagated because index-typed
// shuffles are not supported at the moment.

// Expected shape of the output: the gather / extract / index_cast / extract
// chain stays inside the warp op, which yields a single index; the
// transfer_read is emitted after the warp op and uses that index together
// with a lane-derived offset (thread id * 2).
// CHECK-PROP: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 2)>
// CHECK-PROP-LABEL: func @transfer_read_prop_operands(
// CHECK-PROP-SAME: %[[IN2:[^ :]*]]: vector<1x2xindex>,
// CHECK-PROP-SAME: %[[AR1:[^ :]*]]: memref<1x4x2xi32>,
// CHECK-PROP-SAME: %[[AR2:[^ :]*]]: memref<1x4x1024xf32>)
// CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-PROP-DAG: %[[THREADID:.*]] = gpu.thread_id x
// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
// CHECK-PROP: %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}]
// CHECK-PROP: %[[EXTRACT:.*]] = vector.extract %[[GATHER]][0] : vector<64xi32> from vector<1x64xi32>
// CHECK-PROP: %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex>
// CHECK-PROP: %[[EXTRACTELT:.*]] = vector.extract %[[CAST]][{{.*}}] : index from vector<64xindex>
// CHECK-PROP: gpu.yield %[[EXTRACTELT]] : index
// CHECK-PROP: %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[THREADID]]]
// CHECK-PROP: %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[W]], %[[APPLY]]],
// CHECK-PROP: return %[[TRANSFERREAD]]
func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1x4x2xi32>, %ar2 : memref<1x4x1024xf32>)-> vector<2xf32> {
  %0 = gpu.thread_id x
  // One index-typed zero serves as memref index and as extract position.
  %c0 = arith.constant 0 : index
  // Pass-through values and mask for the gather.
  %cst = arith.constant dense<0> : vector<1x64xi32>
  %cst_0 = arith.constant dense<true> : vector<1x64xi1>
  // Padding value for the transfer_read.
  %cst_6 = arith.constant 0.000000e+00 : f32

  %18 = gpu.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) {
    ^bb0(%arg4: vector<1x64xindex>):
    %28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32>
    %29 = vector.extract %28[0] : vector<64xi32> from vector<1x64xi32>
    %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
    // %36 is an index computed inside the region and used as a read index.
    %36 = vector.extractelement %30[%c0 : index] : vector<64xindex>
    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
    gpu.yield %37 : vector<64xf32>
  }
  return %18 : vector<2xf32>
}

// -----

// Check that we don't fold vector.broadcast when each thread doesn't get the
// same value.

// CHECK-PROP-LABEL: func @dont_fold_vector_broadcast(
// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
// CHECK-PROP: %[[some_def:.*]] = "some_def"
// CHECK-PROP: %[[broadcast:.*]] = vector.broadcast %[[some_def]] : vector<64xf32> to vector<1x64xf32>
// CHECK-PROP: gpu.yield %[[broadcast]] : vector<1x64xf32>
// CHECK-PROP: vector.print %[[r]] : vector<1x2xf32>
func.func @dont_fold_vector_broadcast(%laneid: index) {
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = vector.broadcast %0 : vector<64xf32> to vector<1x64xf32>
    gpu.yield %1 : vector<1x64xf32>
  }
  vector.print %r : vector<1x2xf32>
  return
}

// -----

func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) {
    %2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32>
    %3 = vector.shape_cast %2 : vector<32x4x32xf32> to vector<4096xf32>
    gpu.yield %3 : vector<4096xf32>
  }
  return %r : vector<4xf32>
}

// CHECK-PROP-LABEL: func.func @warp_propagate_shape_cast
// CHECK-PROP: %[[READ:.+]] = vector.transfer_read {{.+}} : memref<32x4x32xf32>,
vector<1x1x4xf32> 1380// CHECK-PROP: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<1x1x4xf32> to vector<4xf32> 1381// CHECK-PROP: return %[[CAST]] : vector<4xf32> 1382 1383// ----- 1384 1385func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index) -> vector<1xf32> { 1386 %f0 = arith.constant 0.000000e+00 : f32 1387 %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { 1388 %1 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> 1389 gpu.yield %1 : vector<1xf32> 1390 } 1391 return %r : vector<1xf32> 1392} 1393 1394// CHECK-PROP-LABEL: func.func @warp_propagate_uniform_transfer_read 1395// CHECK-PROP-SAME: (%{{.+}}: index, %[[SRC:.+]]: memref<4096xf32>, %[[INDEX:.+]]: index) 1396// CHECK-PROP: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[INDEX]]], %cst {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> 1397// CHECK-PROP: return %[[READ]] : vector<1xf32> 1398 1399// ----- 1400 1401func.func @warp_propagate_multi_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>, vector<1xf32>) { 1402 %f0 = arith.constant 0.000000e+00 : f32 1403 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) { 1404 %0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> 1405 "some_use"(%0) : (vector<1xf32>) -> () 1406 %1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> 1407 gpu.yield %0, %1 : vector<1xf32>, vector<1xf32> 1408 } 1409 return %r#0, %r#1 : vector<1xf32>, vector<1xf32> 1410} 1411 1412// CHECK-PROP-LABEL: func.func @warp_propagate_multi_transfer_read 1413// CHECK-PROP: gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) 1414// CHECK-PROP: %[[INNER_READ:.+]] = vector.transfer_read 1415// CHECK-PROP: "some_use"(%[[INNER_READ]]) 1416// CHECK-PROP: gpu.yield %[[INNER_READ]] : vector<1xf32> 1417// CHECK-PROP: 
vector.transfer_read 1418 1419// ----- 1420 1421func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>) { 1422 %f0 = arith.constant 0.000000e+00 : f32 1423 %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { 1424 %0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32> 1425 %1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32> 1426 %max = arith.maximumf %0, %1 : vector<64xf32> 1427 gpu.yield %max : vector<64xf32> 1428 } 1429 return %r : vector<1xf32> 1430} 1431 1432// CHECK-PROP-LABEL: func.func @warp_propagate_dead_user_multi_read 1433// CHECK-PROP-COUNT-2: vector.transfer_read {{.*}} vector<1xf32> 1434// CHECK-PROP: arith.maximumf {{.*}} : vector<1xf32> 1435 1436// ----- 1437 1438func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>) { 1439 %c0 = arith.constant 0 : index 1440 gpu.warp_execute_on_lane_0(%laneid)[32] -> () { 1441 %mask = "mask_def_0"() : () -> (vector<4096xi1>) 1442 %mask2 = "mask_def_1"() : () -> (vector<32xi1>) 1443 %0 = "some_def_0"() : () -> (vector<4096xf32>) 1444 %1 = "some_def_1"() : () -> (vector<32xf32>) 1445 vector.transfer_write %0, %dest[%c0], %mask : vector<4096xf32>, memref<4096xf32> 1446 vector.transfer_write %1, %dest[%c0], %mask2 : vector<32xf32>, memref<4096xf32> 1447 gpu.yield 1448 } 1449 return 1450} 1451 1452// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_masked_write( 1453// CHECK-DIST-AND-PROP: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) { 1454// CHECK-DIST-AND-PROP: %[[M0:.*]] = "mask_def_0" 1455// CHECK-DIST-AND-PROP: %[[M1:.*]] = "mask_def_1" 1456// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def_0" 1457// CHECK-DIST-AND-PROP: %[[V1:.*]] = "some_def_1" 1458// CHECK-DIST-AND-PROP: gpu.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]] 1459// 
CHECK-DIST-AND-PROP-SAME: vector<32xf32>, vector<32xi1>, vector<4096xf32>, vector<4096xi1> 1460// CHECK-DIST-AND-PROP: } 1461// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]]#2, {{.*}}, %[[W]]#3 {in_bounds = [true]} : vector<128xf32>, memref<4096xf32> 1462// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]]#0, {{.*}}, %[[W]]#1 {in_bounds = [true]} : vector<1xf32>, memref<4096xf32> 1463 1464// ----- 1465 1466func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> (vector<2xf32>, vector<2x2xf32>) { 1467 %f0 = arith.constant 0.000000e+00 : f32 1468 %c0 = arith.constant 0 : index 1469 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) { 1470 %mask = "mask_def_0"() : () -> (vector<128xi1>) 1471 %0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> 1472 %mask2 = "mask_def_1"() : () -> (vector<128x2xi1>) 1473 %1 = vector.transfer_read %src[%c0, %index], %f0, %mask2 {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<128x2xf32> 1474 gpu.yield %0, %1 : vector<128xf32>, vector<128x2xf32> 1475 } 1476 return %r#0, %r#1 : vector<2xf32>, vector<2x2xf32> 1477} 1478 1479// CHECK-PROP-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 * 2)> 1480// CHECK-PROP-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 2)> 1481// CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read 1482// CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index 1483// CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index 1484// CHECK-PROP: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) { 1485// CHECK-PROP: %[[M0:.*]] = "mask_def_0" 1486// CHECK-PROP: %[[M1:.*]] = "mask_def_1" 1487// CHECK-PROP: gpu.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1> 1488// CHECK-PROP: } 1489// CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]] 1490// CHECK-PROP: vector.transfer_read 
{{.*}}[%[[DIST_READ_IDX0]], %[[ARG2]]], {{.*}}, %[[R]]#1 {{.*}} vector<2x2xf32> 1491// CHECK-PROP: %[[DIST_READ_IDX1:.+]] = affine.apply #[[$MAP1]]()[%[[ARG2]], %[[ARG0]]] 1492// CHECK-PROP: vector.transfer_read {{.*}}[%[[C0]], %[[DIST_READ_IDX1]]], {{.*}}, %[[R]]#0 {{.*}} vector<2xf32> 1493 1494// ----- 1495 1496func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> vector<2xf32> { 1497 %f0 = arith.constant 0.000000e+00 : f32 1498 %c0 = arith.constant 0 : index 1499 %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) { 1500 %mask = "mask_def_0"() : () -> (vector<128xi1>) 1501 %0 = vector.transfer_read %src[%index, %c0], %f0, %mask {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<4096x4096xf32>, vector<128xf32> 1502 gpu.yield %0 : vector<128xf32> 1503 } 1504 return %r : vector<2xf32> 1505} 1506 1507// CHECK-PROP-DAG: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 2)> 1508// CHECK-PROP-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0)> 1509// CHECK-PROP-LABEL: func.func @warp_propagate_nontrivial_map_masked_transfer_read 1510// CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index 1511// CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index 1512// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) { 1513// CHECK-PROP: %[[M0:.*]] = "mask_def_0" 1514// CHECK-PROP: gpu.yield %[[M0]] : vector<128xi1> 1515// CHECK-PROP: } 1516// CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG0]]] 1517// CHECK-PROP: vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[C0]]], {{.*}}, %[[R]] 1518// CHECK-PROP-SAME: permutation_map = #[[$MAP1]]} {{.*}} vector<2xf32> 1519 1520// ----- 1521 1522func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: index, %src: memref<4096x4096xf32>, %index: index, %index2: index, %mask_ub: index) -> (vector<2xf32>, vector<2xf32>) { 1523 %f0 = arith.constant 
0.000000e+00 : f32 1524 %c0 = arith.constant 0 : index 1525 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) { 1526 %mask = vector.create_mask %mask_ub: vector<128xi1> 1527 %0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> 1528 %1 = vector.transfer_read %src[%c0, %index2], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> 1529 gpu.yield %0, %1 : vector<128xf32>, vector<128xf32> 1530 } 1531 return %r#0, %r#1 : vector<2xf32>, vector<2xf32> 1532} 1533 1534// CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read_shared_mask 1535// CHECK-PROP: vector.create_mask %{{.*}} : vector<2xi1> 1536// CHECK-PROP: vector.transfer_read %{{.*}} : memref<4096x4096xf32>, vector<2xf32> 1537// CHECK-PROP: vector.create_mask %{{.*}} : vector<2xi1> 1538// CHECK-PROP: vector.transfer_read %{{.*}} : memref<4096x4096xf32>, vector<2xf32> 1539 1540// ----- 1541 1542func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref<128xf32>, %f1: f32) -> (vector<2xf32>, vector<4xf32>) { 1543 %f0 = arith.constant 0.000000e+00 : f32 1544 %c0 = arith.constant 0 : index 1545 %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) { 1546 %cst = arith.constant dense<2.0> : vector<128xf32> 1547 %0 = vector.transfer_read %buffer[%c0], %f0 {in_bounds = [true]} : memref<128xf32>, vector<128xf32> 1548 vector.transfer_write %cst, %buffer[%c0] : vector<128xf32>, memref<128xf32> 1549 %1 = vector.broadcast %f1 : f32 to vector<64xf32> 1550 gpu.yield %1, %0 : vector<64xf32>, vector<128xf32> 1551 } 1552 return %r#0, %r#1 : vector<2xf32>, vector<4xf32> 1553} 1554 1555// Verify that the write comes after the read 1556// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_unconnected_read_write( 1557// CHECK-DIST-AND-PROP: %[[CST:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32> 1558// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} : 
memref<128xf32>, vector<4xf32> 1559// CHECK-DIST-AND-PROP: vector.transfer_write %[[CST]], {{.*}} : vector<4xf32>, memref<128xf32> 1560 1561// ----- 1562 1563func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1> { 1564 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) { 1565 %1 = vector.create_mask %m0 : vector<32xi1> 1566 gpu.yield %1 : vector<32xi1> 1567 } 1568 return %r : vector<1xi1> 1569} 1570 1571// CHECK-PROP-DAG: #[[$SUB:.*]] = affine_map<()[s0, s1] -> (-s0 + s1)> 1572// CHECK-PROP-LABEL: func @warp_propagate_create_mask 1573// CHECK-PROP-SAME: %[[LANEID:.+]]: index, %[[M0:.+]]: index 1574// CHECK-PROP: %[[MDIST:.+]] = affine.apply #[[$SUB]]()[%[[LANEID]], %[[M0]]] 1575// CHECK-PROP: vector.create_mask %[[MDIST]] : vector<1xi1> 1576 1577// ----- 1578 1579func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: index, %m2: index) -> vector<1x2x4xi1> { 1580 %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) { 1581 %1 = vector.create_mask %m0, %m1, %m2 : vector<16x4x4xi1> 1582 gpu.yield %1 : vector<16x4x4xi1> 1583 } 1584 return %r : vector<1x2x4xi1> 1585} 1586 1587// CHECK-PROP-DAG: #[[$SUBM0:.*]] = affine_map<()[s0, s1] -> (s0 - s1 floordiv 2)> 1588// CHECK-PROP-DAG: #[[$SUBM1:.*]] = affine_map<()[s0, s1] -> (s0 - s1 * 2 + (s1 floordiv 2) * 4)> 1589// CHECK-PROP-LABEL: func @warp_propagate_multi_dim_create_mask 1590// CHECK-PROP-SAME: %[[LANEID:.+]]: index, %[[M0:.+]]: index, %[[M1:.+]]: index, %[[M2:.+]]: index 1591// CHECK-PROP: %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]] 1592// CHECK-PROP: %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]] 1593// CHECK-PROP: vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1> 1594 1595// ----- 1596 1597func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) { 1598 %c0 = arith.constant 0 : index 1599 gpu.warp_execute_on_lane_0(%laneid)[32] -> () { 1600 %0 = 
"some_def"() : () -> (vector<4x1024xf32>) 1601 vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32> 1602 gpu.yield 1603 } 1604 return 1605} 1606 1607// CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)> 1608 1609// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write( 1610// CHECK-DIST-AND-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) { 1611// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def" 1612// CHECK-DIST-AND-PROP: gpu.yield %[[V0]] 1613// CHECK-DIST-AND-PROP-SAME: vector<4x1024xf32> 1614// CHECK-DIST-AND-PROP: } 1615 1616// CHECK-DIST-AND-PROP: %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (4, 8) : index, index 1617// CHECK-DIST-AND-PROP: %[[INNER_ID:.+]] = affine.apply #map()[%[[IDS]]#1] 1618// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32> 1619