// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s

// 2-d parallel loop mapped to block.y and block.x

func.func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
                                   %arg3 : index, %arg4 : index,
                                   %buf : memref<?x?xf32>,
                                   %res : memref<?x?xf32>) {
  %step = arith.constant 2 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%arg4, %step) {
    %val = memref.load %buf[%i0, %i1] : memref<?x?xf32>
    memref.store %val, %res[%i1, %i0] : memref<?x?xf32>
  } { mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>] }
  return
}

// CHECK: #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
// CHECK: #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>

// CHECK: module {
// CHECK-LABEL: func @parallel_loop_bidy_bidx(
// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
// CHECK: [[VAL_7:%.*]] = arith.constant 2 : index
// CHECK: [[VAL_8:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_9:%.*]] = affine.apply #[[$MAP0]]([[VAL_2]]){{\[}}[[VAL_0]], [[VAL_4]]]
// CHECK: [[VAL_10:%.*]] = affine.apply #[[$MAP0]]([[VAL_3]]){{\[}}[[VAL_1]], [[VAL_7]]]
// CHECK: gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) {
// CHECK: [[VAL_23:%.*]] = affine.apply #[[$MAP1]]([[VAL_12]]){{\[}}[[VAL_4]], [[VAL_0]]]
// CHECK: [[VAL_24:%.*]] = affine.apply #[[$MAP1]]([[VAL_11]]){{\[}}[[VAL_7]], [[VAL_1]]]
// CHECK: [[VAL_25:%.*]] = memref.load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref<?x?xf32>
// CHECK: memref.store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref<?x?xf32>
// CHECK: gpu.terminator
// CHECK: }
// CHECK: return
// CHECK: }
// CHECK: }

// -----

// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
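// The outer loop's bounds become the grid size and the inner loop's bounds the
// block size; both use the same (ub - lb) ceildiv step computation checked above.
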
func.func @parallel_loop_tiled(%arg0 : index, %arg1 : index, %arg2 : index,
                               %arg3 : index,
                               %buf : memref<?x?xf32>,
                               %res : memref<?x?xf32>) {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %four = arith.constant 4 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%four, %four) {
    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
                                step (%one, %one) {
      %idx0 = arith.addi %i0, %si0 : index
      %idx1 = arith.addi %i1, %si1 : index
      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    } { mapping = [
        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
    ] }
  } { mapping = [
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
  ] }
  return
}

// CHECK: #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
// CHECK: #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>

// CHECK: module {
// CHECK-LABEL: func @parallel_loop_tiled(
// CHECK-SAME: [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref<?x?xf32>, [[VAL_31:%.*]]: memref<?x?xf32>) {
// CHECK: [[VAL_32:%.*]] = arith.constant 0 : index
// CHECK: [[VAL_33:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_34:%.*]] = arith.constant 4 : index
// CHECK: [[VAL_35:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_36:%.*]] = affine.apply #[[$MAP0]]([[VAL_28]]){{\[}}[[VAL_26]], [[VAL_34]]]
// CHECK: [[VAL_37:%.*]] = affine.apply #[[$MAP0]]([[VAL_29]]){{\[}}[[VAL_27]], [[VAL_34]]]
// CHECK: [[VAL_38:%.*]] = affine.apply #[[$MAP0]]([[VAL_34]]){{\[}}[[VAL_32]], [[VAL_33]]]
// CHECK: [[VAL_39:%.*]] = affine.apply #[[$MAP0]]([[VAL_34]]){{\[}}[[VAL_32]], [[VAL_33]]]
// CHECK: gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) {
// CHECK: [[VAL_52:%.*]] = affine.apply #[[$MAP1]]([[VAL_41]]){{\[}}[[VAL_34]], [[VAL_26]]]
// CHECK: [[VAL_53:%.*]] = affine.apply #[[$MAP1]]([[VAL_40]]){{\[}}[[VAL_34]], [[VAL_27]]]
// CHECK: [[VAL_54:%.*]] = affine.apply #[[$MAP1]]([[VAL_47]]){{\[}}[[VAL_33]], [[VAL_32]]]
// CHECK: [[VAL_55:%.*]] = affine.apply #[[$MAP1]]([[VAL_46]]){{\[}}[[VAL_33]], [[VAL_32]]]
// CHECK: [[VAL_56:%.*]] = arith.addi [[VAL_52]], [[VAL_54]] : index
// CHECK: [[VAL_57:%.*]] = arith.addi [[VAL_53]], [[VAL_55]] : index
// CHECK: [[VAL_58:%.*]] = memref.load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref<?x?xf32>
// CHECK: memref.store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref<?x?xf32>
// CHECK: gpu.terminator
// CHECK: }
// CHECK: return
// CHECK: }
// CHECK: }

// -----

// 2-d parallel loop mapped to block.y and sequential
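// The dimension mapped to `sequential` is not turned into a hardware id; it
// stays behind as an scf.for loop inside the launch body.
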
func.func @parallel_loop_bidy_seq(%arg0 : index, %arg1 : index, %arg2 : index,
                                  %arg3 : index, %arg4 : index,
                                  %buf : memref<?x?xf32>,
                                  %res : memref<?x?xf32>) {
  %step = arith.constant 2 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%arg4, %step) {
    %val = memref.load %buf[%i0, %i1] : memref<?x?xf32>
    memref.store %val, %res[%i1, %i0] : memref<?x?xf32>
  } { mapping = [
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
  ] }
  return
}

// CHECK: #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
// CHECK: #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>

// CHECK: module {
// CHECK-LABEL: func @parallel_loop_bidy_seq(
// CHECK-SAME: [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref<?x?xf32>, [[VAL_65:%.*]]: memref<?x?xf32>) {
// CHECK: [[VAL_66:%.*]] = arith.constant 2 : index
// CHECK: [[VAL_67:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_68:%.*]] = affine.apply #[[$MAP0]]([[VAL_61]]){{\[}}[[VAL_59]], [[VAL_63]]]
// CHECK: gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) {
// CHECK: [[VAL_81:%.*]] = affine.apply #[[$MAP1]]([[VAL_70]]){{\[}}[[VAL_63]], [[VAL_59]]]
// CHECK: scf.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] {
// CHECK: [[VAL_83:%.*]] = memref.load [[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref<?x?xf32>
// CHECK: memref.store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref<?x?xf32>
// CHECK: }
// CHECK: gpu.terminator
// CHECK: }
// CHECK: return
// CHECK: }
// CHECK: }

// -----

// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
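// The sequential dimensions of both loops lower to a pair of nested scf.for
// loops, placed between the block-id and thread-id computations.
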
func.func @parallel_loop_tiled_seq(%arg0 : index, %arg1 : index, %arg2 : index,
                                   %arg3 : index,
                                   %buf : memref<?x?xf32>,
                                   %res : memref<?x?xf32>) {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %four = arith.constant 4 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%four, %four) {
    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
                                step (%one, %one) {
      %idx0 = arith.addi %i0, %si0 : index
      %idx1 = arith.addi %i1, %si1 : index
      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    } { mapping = [
        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
        #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
    ] }
  } { mapping = [
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
  ] }
  return
}

// CHECK: #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
// CHECK: #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>

// CHECK: module {
// CHECK-LABEL: func @parallel_loop_tiled_seq(
// CHECK-SAME: [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref<?x?xf32>, [[VAL_89:%.*]]: memref<?x?xf32>) {
// CHECK: [[VAL_90:%.*]] = arith.constant 0 : index
// CHECK: [[VAL_91:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_92:%.*]] = arith.constant 4 : index
// CHECK: [[VAL_93:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_94:%.*]] = affine.apply #[[$MAP0]]([[VAL_86]]){{\[}}[[VAL_84]], [[VAL_92]]]
// CHECK: [[VAL_95:%.*]] = affine.apply #[[$MAP0]]([[VAL_92]]){{\[}}[[VAL_90]], [[VAL_91]]]
// CHECK: gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = [[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) {
// CHECK: [[VAL_108:%.*]] = affine.apply #[[$MAP1]]([[VAL_97]]){{\[}}[[VAL_92]], [[VAL_84]]]
// CHECK: scf.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] {
// CHECK: [[VAL_110:%.*]] = affine.apply #[[$MAP1]]([[VAL_103]]){{\[}}[[VAL_91]], [[VAL_90]]]
// CHECK: scf.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] {
// CHECK: [[VAL_112:%.*]] = arith.addi [[VAL_108]], [[VAL_110]] : index
// CHECK: [[VAL_113:%.*]] = arith.addi [[VAL_109]], [[VAL_111]] : index
// CHECK: [[VAL_114:%.*]] = memref.load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref<?x?xf32>
// CHECK: memref.store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref<?x?xf32>
// CHECK: }
// CHECK: }
// CHECK: gpu.terminator
// CHECK: }
// CHECK: return
// CHECK: }
// CHECK: }

// -----

#map1 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>

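// Tiled loop nest with dynamic tile sizes (affine.min / arith.minsi), mapped to
// blocks and threads. Because the inner trip counts vary per block, the lowered
// body is guarded by scf.if checks on the recovered thread indices.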
module {
  func.func @sum(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg2: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %c3 = arith.constant 3 : index
    %c2 = arith.constant 2 : index
    %0 = memref.dim %arg0, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
    %1 = memref.dim %arg0, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) {
      %2 = memref.dim %arg0, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %3 = affine.min #map1(%arg3)[%2]
      %squared_min = arith.muli %3, %3 : index
      %4 = memref.dim %arg0, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %d = arith.subi %4, %arg4 : index
      %5 = arith.minsi %c3, %d : index
      %6 = memref.subview %arg0[%arg3, %arg4][%squared_min, %5][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
      %7 = memref.dim %arg1, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %8 = affine.min #map1(%arg3)[%7]
      %9 = memref.dim %arg1, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %10 = affine.min #map2(%arg4)[%9]
      %11 = memref.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
      %12 = memref.dim %arg2, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %13 = affine.min #map1(%arg3)[%12]
      %14 = memref.dim %arg2, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
      %15 = affine.min #map2(%arg4)[%14]
      %16 = memref.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
      scf.parallel (%arg5, %arg6) = (%c0, %c0) to (%squared_min, %5) step (%c1, %c1) {
        %17 = memref.load %6[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
        %18 = memref.load %11[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
        %19 = memref.load %16[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
        %20 = arith.addf %17, %18 : f32
        memref.store %20, %16[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
        scf.reduce
      } {mapping = [#gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = thread_x>, #gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = thread_y>]}
      scf.reduce
    } {mapping = [#gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = block_x>, #gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = block_y>]}
    return
  }
}

// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>
// CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)>

// CHECK: module {
// CHECK-LABEL: func @sum(
// CHECK-SAME: [[VAL_0:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>, [[VAL_1:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>, [[VAL_2:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[C3:.*]] = arith.constant 3 : index
// CHECK: %[[C2:.*]] = arith.constant 2 : index
// CHECK: [[VAL_7:%.*]] = memref.dim [[VAL_0]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_8:%.*]] = memref.dim [[VAL_0]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_9:%.*]] = arith.constant 1 : index
// CHECK: [[VAL_10:%.*]] = affine.apply #[[$MAP1]]([[VAL_7]]){{\[}}%[[C0]], %[[C2]]]
// CHECK: [[VAL_11:%.*]] = affine.apply #[[$MAP1]]([[VAL_8]]){{\[}}%[[C0]], %[[C3]]]
// CHECK: [[VAL_12:%.*]] = arith.constant 4 : index
// CHECK: [[VAL_13:%.*]] = affine.apply #[[$MAP1]]([[VAL_12]]){{\[}}%[[C0]], %[[C1]]]
// CHECK: [[VAL_15:%.*]] = affine.apply #[[$MAP1]](%[[C3]]){{\[}}%[[C0]], %[[C1]]]
// CHECK: gpu.launch blocks([[VAL_16:%.*]], [[VAL_17:%.*]], [[VAL_18:%.*]]) in ([[VAL_19:%.*]] = [[VAL_10]], [[VAL_20:%.*]] = [[VAL_11]], [[VAL_21:%.*]] = [[VAL_9]]) threads([[VAL_22:%.*]], [[VAL_23:%.*]], [[VAL_24:%.*]]) in ([[VAL_25:%.*]] = [[VAL_13]], [[VAL_26:%.*]] = [[VAL_15]], [[VAL_27:%.*]] = [[VAL_9]]) {
// CHECK: [[VAL_28:%.*]] = affine.apply #[[$MAP2]]([[VAL_16]]){{\[}}%[[C2]], %[[C0]]]
// CHECK: [[VAL_29:%.*]] = affine.apply #[[$MAP2]]([[VAL_17]]){{\[}}%[[C3]], %[[C0]]]
// CHECK: [[VAL_30:%.*]] = memref.dim [[VAL_0]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_31:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_30]]]
// CHECK: [[VAL_31_SQUARED:%.*]] = arith.muli [[VAL_31]], [[VAL_31]] : index
// CHECK: [[VAL_32:%.*]] = memref.dim [[VAL_0]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_D:%.*]] = arith.subi [[VAL_32]], [[VAL_29]] : index
// CHECK: [[VAL_33:%.*]] = arith.minsi %[[C3]], [[VAL_D]] : index
// CHECK: [[VAL_34:%.*]] = memref.subview [[VAL_0]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_31_SQUARED]], [[VAL_33]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_35:%.*]] = memref.dim [[VAL_1]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_36:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_35]]]
// CHECK: [[VAL_37:%.*]] = memref.dim [[VAL_1]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_38:%.*]] = affine.min #[[$MAP4]]([[VAL_29]]){{\[}}[[VAL_37]]]
// CHECK: [[VAL_39:%.*]] = memref.subview [[VAL_1]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_36]], [[VAL_38]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_40:%.*]] = memref.dim [[VAL_2]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_41:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_40]]]
// CHECK: [[VAL_42:%.*]] = memref.dim [[VAL_2]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
// CHECK: [[VAL_43:%.*]] = affine.min #[[$MAP4]]([[VAL_29]]){{\[}}[[VAL_42]]]
// CHECK: [[VAL_44:%.*]] = memref.subview [[VAL_2]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_41]], [[VAL_43]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_45:%.*]] = affine.apply #[[$MAP2]]([[VAL_22]]){{\[}}%[[C1]], %[[C0]]]
// CHECK: [[VAL_46:%.*]] = arith.cmpi slt, [[VAL_45]], [[VAL_31_SQUARED]] : index
// CHECK: scf.if [[VAL_46]] {
// CHECK: [[VAL_47:%.*]] = affine.apply #[[$MAP2]]([[VAL_23]]){{\[}}%[[C1]], %[[C0]]]
// CHECK: [[VAL_48:%.*]] = arith.cmpi slt, [[VAL_47]], [[VAL_33]] : index
// CHECK: scf.if [[VAL_48]] {
// CHECK: [[VAL_49:%.*]] = memref.load [[VAL_34]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_50:%.*]] = memref.load [[VAL_39]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_51:%.*]] = memref.load [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: [[VAL_52:%.*]] = arith.addf [[VAL_49]], [[VAL_50]] : f32
// CHECK: memref.store [[VAL_52]], [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
// CHECK: }
// CHECK: }
// CHECK: gpu.terminator
// CHECK: }
// CHECK: return
// CHECK: }
// CHECK: }

// -----

// Optional attribute lowering test

func.func @parallel_loop_optional_attr() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  scf.parallel (%i0) = (%c0) to (%c1) step (%c1) {
  } { mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>], optional_attr = 1 }
  // CHECK: optional_attr = 1
  return
}

// -----

// Mapping to the same processor twice. Cannot be mapped.

func.func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
                               %arg3 : index,
                               %buf : memref<?x?xf32>,
                               %res : memref<?x?xf32>) {
  %four = arith.constant 4 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%four, %four) {
  } { mapping = [
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>
  ] }
  return
}

// CHECK-LABEL: @parallel_double_map
// CHECK: scf.parallel

// -----

// Loop with loop-variant upper bound. Cannot be mapped.

func.func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
                                            %arg3 : index,
                                            %buf : memref<?x?xf32>,
                                            %res : memref<?x?xf32>) {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %four = arith.constant 4 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%four, %four) {
    scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
                                step (%one, %one) {
      %idx0 = arith.addi %i0, %si0 : index
      %idx1 = arith.addi %i1, %si1 : index
      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    } { mapping = [
        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
        #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
    ] }
  } { mapping = [
      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
  ] }
  return
}

// CHECK-LABEL: @parallel_loop_loop_variant_bound
// CHECK: scf.parallel
// CHECK: scf.parallel

// -----

// Loop without annotations. Cannot be mapped.
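// The pass only rewrites loops that carry a mapping attribute; this one is
// left as an scf.parallel.
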
func.func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
                                   %arg3 : index,
                                   %buf : memref<?x?xf32>,
                                   %res : memref<?x?xf32>) {
  %four = arith.constant 4 : index
  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                            step (%four, %four) {
  }
  return
}

// CHECK-LABEL: @parallel_no_annotations
// CHECK: scf.parallel

// -----

// CHECK-LABEL: @step_invariant
func.func @step_invariant() {
  %alloc = memref.alloc() : memref<1x1xf64>
  %alloc_0 = memref.alloc() : memref<1x1xf64>
  %alloc_1 = memref.alloc() : memref<1x1xf64>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1_2 = arith.constant 1 : index
  scf.parallel (%arg0) = (%c0) to (%c1) step (%c1_2) {
    %c0_3 = arith.constant 0 : index
    %c1_4 = arith.constant 1 : index
    %c1_5 = arith.constant 1 : index
    scf.parallel (%arg1) = (%c0_3) to (%c1_4) step (%c1_5) {
      %0 = memref.load %alloc_1[%arg0, %arg1] : memref<1x1xf64>
      %1 = memref.load %alloc_0[%arg0, %arg1] : memref<1x1xf64>
      %2 = arith.addf %0, %1 : f64
      memref.store %2, %alloc[%arg0, %arg1] : memref<1x1xf64>
      scf.reduce
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc_1 : memref<1x1xf64>
  memref.dealloc %alloc_0 : memref<1x1xf64>
  memref.dealloc %alloc : memref<1x1xf64>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<1x1xf64>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<1x1xf64>
// CHECK: %[[alloc_2:.*]] = memref.alloc() : memref<1x1xf64>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK: %[[lhs:.*]] = memref.load %[[alloc_2]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
// CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
// CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
// CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>

// -----

// 1-d parallel reduction mapped to block.x and thread.x.
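// The scf.reduce region reappears as the body of a gpu.all_reduce.
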
// CHECK-LABEL: @parallel_reduction_1d
func.func @parallel_reduction_1d() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<64xf32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %2 = arith.addf %arg3, %arg4 : f32
        scf.reduce.return %2 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<64xf32>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>

// -----

// 2-d parallel reduction mapped to block.x and thread.x and thread.y.
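// Both thread ids index the source load, but a single gpu.all_reduce still
// combines all the partial values.
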
// CHECK-LABEL: @parallel_reduction_2d
func.func @parallel_reduction_2d() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<8x8xf32>
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg4: f32, %arg5: f32):
        %2 = arith.addf %arg4, %arg5 : f32
        scf.reduce.return %2 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<8x8xf32>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>

// -----

// tiled 1-d parallel reduction mapped to block.x and thread.x.
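// Each block selects its 128-element tile via memref.subview before the
// per-block gpu.all_reduce, and stores the result through a second subview.
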
// CHECK-LABEL: @parallel_reduction_1d_tiled
func.func @parallel_reduction_1d_tiled() {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %alloc_0 = memref.alloc() : memref<8192xf32>
  %alloc_1 = memref.alloc() : memref<64xf32>
  scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) {
    %subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref<f32, strided<[], offset: ?>>
    %0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
    %subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>>
    %1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 {
      %2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>>
      scf.reduce(%2 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %3 = arith.addf %arg3, %arg4 : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %1, %subview[] : memref<f32, strided<[], offset: ?>>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc_0 : memref<8192xf32>
  memref.dealloc %alloc_1 : memref<64xf32>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32>
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map2(%[[dim0]])
// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32>
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>

// -----

// 1-d parallel reduction, unsigned int. Cannot be mapped.
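// The reduction operates on ui32, which the lowering does not support, so the
// loop nest is left as nested scf.parallel ops.
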
// CHECK-LABEL: @parallel_reduction_1d_uint
func.func @parallel_reduction_1d_uint(%cst : ui32) {
  %alloc = memref.alloc() : memref<ui32>
  %alloc_0 = memref.alloc() : memref<64xui32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xui32>
      scf.reduce(%1 : ui32) {
      ^bb0(%arg3: ui32, %arg4: ui32):
        scf.reduce.return %arg3 : ui32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<ui32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<ui32>
  memref.dealloc %alloc_0 : memref<64xui32>
  return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce

// -----

// 1-d parallel reduction, not isolated from above. Cannot be mapped.

// CHECK-LABEL: @parallel_reduction_1d_outside
func.func @parallel_reduction_1d_outside() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<64xf32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %const = arith.constant 1.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %2 = arith.addf %arg3, %arg4 : f32
        %3 = arith.addf %2, %const : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<64xf32>
  return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce