// RUN: mlir-opt --gpu-kernel-outlining --convert-gpu-to-nvvm %s | FileCheck %s

// A single gpu.launch region performs two gpu.all_reduce ops (one add, one
// mul) on the same loaded value. The test verifies the globals that appear at
// the top of the kernel module after outlining and GPU-to-NVVM conversion.
func.func @main() {
  // Buffers allocated outside the launch region: the 2x6 input and one
  // 2-element result buffer per reduction.
  %input = memref.alloc() : memref<2x6xf32>
  %add_out = memref.alloc() : memref<2xf32>
  %mul_out = memref.alloc() : memref<2xf32>
  // Every grid and block dimension of the launch below uses this constant.
  %one = arith.constant 1 : index

  // Both reductions (add, then mul) live in the same kernel body.
  gpu.launch blocks(%bid_x, %bid_y, %bid_z) in (%grid_dim_x = %one, %grid_dim_y = %one, %grid_dim_z = %one)
             threads(%tid_x, %tid_y, %tid_z) in (%block_dim_x = %one, %block_dim_y = %one, %block_dim_z = %one) {
    // The element is indexed by block id x (row) and thread id x (column).
    %elem = memref.load %input[%bid_x, %tid_x] : memref<2x6xf32>
    // Sum reduction, stored at the block's slot of the add result buffer.
    %total = gpu.all_reduce add %elem uniform {} : (f32) -> (f32)
    memref.store %total, %add_out[%bid_x] : memref<2xf32>
    // Product reduction of the same element, stored in the mul result buffer.
    %product = gpu.all_reduce mul %elem uniform {} : (f32) -> (f32)
    memref.store %product, %mul_out[%bid_x] : memref<2xf32>
    gpu.terminator
  }

// Expected output of the pipeline: the module @main_kernel opens with two
// consecutive internal globals in address space 3, each an array of 32 f32
// (presumably one buffer per all_reduce; confirm against the lowering).
// CHECK: gpu.module @main_kernel {
// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm.array<32 x f32>
// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm.array<32 x f32>

  return
}