xref: /llvm-project/mlir/test/Dialect/GPU/multiple-all-reduce.mlir (revision 247d8d4f7ab19657bd798da2bda27450f3ff135a)
1// RUN: mlir-opt --gpu-kernel-outlining --convert-gpu-to-nvvm %s | FileCheck %s
2
// Test input: a kernel launched with every grid and block dimension set to 1
// that performs two gpu.all_reduce ops (add, then mul) on the same loaded
// value and stores each result into its own buffer. The FileCheck directives
// placed after the launch inspect the start of the outlined module
// @main_kernel.
3func.func @main() {
4  %data = memref.alloc() : memref<2x6xf32>
5  %sum = memref.alloc() : memref<2xf32>
6  %mul = memref.alloc() : memref<2xf32>
7  %c1 = arith.constant 1 : index
8
9  // Two reductions inside one launch: add first, then mul, both fed by %val.
10  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
11             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) {
12    %val = memref.load %data[%bx, %tx] : memref<2x6xf32>
13    %reduced0 = gpu.all_reduce add %val uniform {} : (f32) -> (f32)
14    memref.store %reduced0, %sum[%bx] : memref<2xf32>
15    %reduced1 = gpu.all_reduce mul %val uniform {} : (f32) -> (f32)
16    memref.store %reduced1, %mul[%bx] : memref<2xf32>
17    gpu.terminator
18  }
19
// Expected lowering: the kernel module opens with two consecutive internal
// globals, each a 32 x f32 array in address space 3. Presumably there is one
// scratch buffer per all_reduce above, so the two reductions do not share
// storage -- TODO confirm against the all_reduce lowering.
20// CHECK:      gpu.module @main_kernel {
21// CHECK-NEXT:   llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm.array<32 x f32>
22// CHECK-NEXT:   llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm.array<32 x f32>
23
24  return
25}
26