// RUN: mlir-opt -allow-unregistered-dialect %s -affine-scalrep | FileCheck %s

// CHECK-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> (d1 + 1)>
// CHECK-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0, d1) -> (d0)>
// CHECK-DAG: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d1)>
// CHECK-DAG: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 - 1)>
// CHECK-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
// CHECK-DAG: [[$IDENT:#map[0-9]*]] = affine_map<(d0) -> (d0)>

// CHECK-LABEL: func @simple_store_load() {
func.func @simple_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
// CHECK: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT: }
// CHECK-NEXT: return
}

// CHECK-LABEL: func @multi_store_load() {
func.func @multi_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
    affine.store %cf8, %m[%i0] : memref<10xf32>
    affine.store %cf9, %m[%i0] : memref<10xf32>
    %v2 = affine.load %m[%i0] : memref<10xf32>
    %v3 = affine.load %m[%i0] : memref<10xf32>
    %v4 = arith.mulf %v2, %v3 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
// CHECK-NEXT: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: arith.constant 8.000000e+00 : f32
// CHECK-NEXT: %[[C9:.*]] = arith.constant 9.000000e+00 : f32
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT: arith.mulf %[[C9]], %[[C9]] : f32
// CHECK-NEXT: }
// CHECK-NEXT: return
}

// The store-load forwarding can see through affine.apply ops since it relies
// on dependence information.
// CHECK-LABEL: func @store_load_affine_apply
func.func @store_load_affine_apply() -> memref<10x10xf32> {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10x10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %t0 = affine.apply affine_map<(d0, d1) -> (d1 + 1)>(%i0, %i1)
      %t1 = affine.apply affine_map<(d0, d1) -> (d0)>(%i0, %i1)
      %idx0 = affine.apply affine_map<(d0, d1) -> (d1)> (%t0, %t1)
      %idx1 = affine.apply affine_map<(d0, d1) -> (d0 - 1)> (%t0, %t1)
      affine.store %cf7, %m[%idx0, %idx1] : memref<10x10xf32>
      // CHECK-NOT: affine.load %{{[0-9]+}}
      %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  // The memref and its stores won't be erased due to this memref return.
  return %m : memref<10x10xf32>
// CHECK: %{{.*}} = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: %{{.*}} = memref.alloc() : memref<10x10xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: %{{.*}} = affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return %{{.*}} : memref<10x10xf32>
}

// CHECK-LABEL: func @store_load_nested
func.func @store_load_nested(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
// CHECK: %{{.*}} = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
}

// No forwarding happens here since either of the two stores could be the last
// writer; store-load forwarding will, however, become possible here once
// loop-live-out SSA scalars are available.
// CHECK-LABEL: func @multi_store_load_nested_no_fwd
func.func @multi_store_load_nested_no_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      affine.store %cf8, %m[%i1] : memref<10xf32>
    }
    affine.for %i2 = 0 to %N {
      // CHECK: %{{[0-9]+}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
}

// No forwarding happens here since either store may be the one whose value
// reaches the load.
// CHECK-LABEL: func @store_load_store_nested_no_fwd
func.func @store_load_store_nested_no_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK: %{{[0-9]+}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      affine.store %cf9, %m[%i0] : memref<10xf32>
    }
  }
  return
}

// Forwarding happens here since the last store postdominates all other stores
// and the other forwarding criteria are satisfied.
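// (In addition to the last store postdominating the earlier ones, the store
// must dominate the load, and no other operation that may write to the same
// location can lie between the two.)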
// CHECK-LABEL: func @multi_store_load_nested_fwd
func.func @multi_store_load_nested_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %cf10 = arith.constant 10.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      affine.store %cf8, %m[%i1] : memref<10xf32>
    }
    affine.for %i2 = 0 to %N {
      affine.store %cf9, %m[%i2] : memref<10xf32>
    }
    affine.store %cf10, %m[%i0] : memref<10xf32>
    affine.for %i3 = 0 to %N {
      // CHECK-NOT: %{{[0-9]+}} = affine.load
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
}

// There is no unique load location for the store to forward to.
// CHECK-LABEL: func @store_load_no_fwd
func.func @store_load_no_fwd() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 10 {
      affine.for %i2 = 0 to 10 {
        // CHECK: affine.load
        %v0 = affine.load %m[%i2] : memref<10xf32>
        %v1 = arith.addf %v0, %v0 : f32
      }
    }
  }
  return
}

// Forwarding happens here as there is a one-to-one store-load correspondence.
// CHECK-LABEL: func @store_load_fwd
func.func @store_load_fwd() {
  %cf7 = arith.constant 7.0 : f32
  %c0 = arith.constant 0 : index
  %m = memref.alloc() : memref<10xf32>
  affine.store %cf7, %m[%c0] : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.for %i2 = 0 to 10 {
        // CHECK-NOT: affine.load %{{[0-9]+}}
        %v0 = affine.load %m[%c0] : memref<10xf32>
        %v1 = arith.addf %v0, %v0 : f32
      }
    }
  }
  return
}

// Although there is a dependence from the second store to the load, it is
// satisfied by the outer surrounding loop and so does not prevent the first
// store from being forwarded to the load.
// CHECK-LABEL: func @store_load_store_nested_fwd
func.func @store_load_store_nested_fwd(%N : index) -> f32 {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i0)
      affine.store %cf9, %m[%idx] : memref<10xf32>
    }
  }
  // Due to this load, the memref isn't optimized away.
  %v3 = affine.load %m[%c1] : memref<10xf32>
  return %v3 : f32
// CHECK: %{{.*}} = memref.alloc() : memref<10xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: %{{.*}} = affine.apply [[$MAP4]](%{{.*}})
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: return %{{.*}} : f32
}

// CHECK-LABEL: func @should_not_fwd
func.func @should_not_fwd(%A: memref<100xf32>, %M : index, %N : index) -> f32 {
  %cf = arith.constant 0.0 : f32
  affine.store %cf, %A[%M] : memref<100xf32>
  // CHECK: affine.load %{{.*}}[%{{.*}}]
  %v = affine.load %A[%N] : memref<100xf32>
  return %v : f32
}

// The store to %A[%j, %i] can be forwarded, but there is no forwarding to the
// loads on %A[%i, %j].
// CHECK-LABEL: func @refs_not_known_to_be_equal
func.func @refs_not_known_to_be_equal(%A : memref<100 x 100 x f32>, %M : index) {
  %N = affine.apply affine_map<(d0) -> (d0 + 1)> (%M)
  %cf1 = arith.constant 1.0 : f32
  affine.for %i = 0 to 100 {
  // CHECK: affine.for %[[I:.*]] =
    affine.for %j = 0 to 100 {
    // CHECK: affine.for %[[J:.*]] =
      // CHECK: affine.load %{{.*}}[%[[I]], %[[J]]]
      %u = affine.load %A[%i, %j] : memref<100x100xf32>
      // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[J]], %[[I]]]
      affine.store %cf1, %A[%j, %i] : memref<100x100xf32>
      // CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]]
      %v = affine.load %A[%i, %j] : memref<100x100xf32>
      // This load should disappear.
      %w = affine.load %A[%j, %i] : memref<100x100xf32>
      // CHECK-NEXT: "foo"
      "foo" (%u, %v, %w) : (f32, f32, f32) -> ()
    }
  }
  return
}

// CHECK-LABEL: func @elim_load_after_store
func.func @elim_load_after_store(%arg0: memref<100xf32>, %arg1: memref<100xf32>) {
  %alloc = memref.alloc() : memref<1xf32>
  %alloc_0 = memref.alloc() : memref<1xf32>
  // CHECK: affine.for
  affine.for %arg2 = 0 to 100 {
    // CHECK: affine.load
    %0 = affine.load %arg0[%arg2] : memref<100xf32>
    %1 = affine.load %arg0[%arg2] : memref<100xf32>
    // CHECK: arith.addf
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %alloc_0[0] : memref<1xf32>
    %3 = affine.load %arg0[%arg2] : memref<100xf32>
    %4 = affine.load %alloc_0[0] : memref<1xf32>
    // CHECK-NEXT: arith.addf
    %5 = arith.addf %3, %4 : f32
    affine.store %5, %alloc[0] : memref<1xf32>
    %6 = affine.load %arg0[%arg2] : memref<100xf32>
    %7 = affine.load %alloc[0] : memref<1xf32>
    %8 = arith.addf %6, %7 : f32
    affine.store %8, %arg1[%arg2] : memref<100xf32>
  }
  return
}

// The test checks value forwarding from vector stores to vector loads: the
// value loaded from %in can be stored directly to %out, eliminating the store
// to and load from %tmp.
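// Note that the vector types of the store and the load must match for this;
// @vector_no_forwarding below exercises the mismatched case.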
func.func @vector_forwarding(%in : memref<512xf32>, %out : memref<512xf32>) {
  %tmp = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    %ld1 = affine.vector_load %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @vector_forwarding
// CHECK: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: %[[LDVAL:.*]] = affine.vector_load
// CHECK-NEXT: affine.vector_store %[[LDVAL]],{{.*}}
// CHECK-NEXT: }

func.func @vector_no_forwarding(%in : memref<512xf32>, %out : memref<512xf32>) {
  %tmp = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    %ld1 = affine.vector_load %tmp[32*%i] : memref<512xf32>, vector<16xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<16xf32>
  }
  return
}

// CHECK-LABEL: func @vector_no_forwarding
// CHECK: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: %[[LDVAL:.*]] = affine.vector_load
// CHECK-NEXT: affine.vector_store %[[LDVAL]],{{.*}}
// CHECK-NEXT: %[[LDVAL1:.*]] = affine.vector_load
// CHECK-NEXT: affine.vector_store %[[LDVAL1]],{{.*}}
// CHECK-NEXT: }

// CHECK-LABEL: func @simple_three_loads
func.func @simple_three_loads(%in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    // CHECK-NOT: affine.load
    %v1 = affine.load %in[%i0] : memref<10xf32>
    %v2 = arith.addf %v0, %v1 : f32
    %v3 = affine.load %in[%i0] : memref<10xf32>
    %v4 = arith.addf %v2, %v3 : f32
  }
  return
}

// CHECK-LABEL: func @nested_loads_const_index
func.func @nested_loads_const_index(%in : memref<10xf32>) {
  %c0 = arith.constant 0 : index
  // CHECK: affine.load
  %v0 = affine.load %in[%c0] : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        // CHECK-NOT: affine.load
        %v1 = affine.load %in[%c0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
    }
  }
  return
}

// CHECK-LABEL: func @nested_loads
func.func @nested_loads(%N : index, %in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK-NOT: affine.load
      %v1 = affine.load %in[%i0] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @nested_loads_different_memref_accesses_no_cse
func.func @nested_loads_different_memref_accesses_no_cse(%in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 20 {
      // CHECK: affine.load
      %v1 = affine.load %in[%i1] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @load_load_store
func.func @load_load_store(%m : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    // CHECK-NOT: affine.load
    %v1 = affine.load %m[%i0] : memref<10xf32>
    %v2 = arith.addf %v0, %v1 : f32
    affine.store %v2, %m[%i0] : memref<10xf32>
  }
  return
}

// CHECK-LABEL: func @load_load_store_2_loops_no_cse
func.func @load_load_store_2_loops_no_cse(%N : index, %m : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK: affine.load
      %v1 = affine.load %m[%i0] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
      affine.store %v2, %m[%i0] : memref<10xf32>
    }
  }
  return
}

// CHECK-LABEL: func @load_load_store_3_loops_no_cse
func.func @load_load_store_3_loops_no_cse(%m : memref<10xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        // CHECK: affine.load
        %v1 = affine.load %m[%i0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
      affine.store %cf1, %m[%i0] : memref<10xf32>
    }
  }
  return
}

// CHECK-LABEL: func @load_load_store_3_loops
func.func @load_load_store_3_loops(%m : memref<10xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      // CHECK: affine.load
      %v0 = affine.load %m[%i0] : memref<10xf32>
      affine.for %i2 = 0 to 30 {
        // CHECK-NOT: affine.load
        %v1 = affine.load %m[%i0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
    }
    affine.store %cf1, %m[%i0] : memref<10xf32>
  }
  return
}

// CHECK-LABEL: func @loads_in_sibling_loops_const_index_no_cse
func.func @loads_in_sibling_loops_const_index_no_cse(%m : memref<10xf32>) {
  %c0 = arith.constant 0 : index
  affine.for %i0 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %m[%c0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    // CHECK: affine.load
    %v0 = affine.load %m[%c0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  return
}

// CHECK-LABEL: func @load_load_affine_apply
func.func @load_load_affine_apply(%in : memref<10x10xf32>) {
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %t0 = affine.apply affine_map<(d0, d1) -> (d1 + 1)>(%i0, %i1)
      %t1 = affine.apply affine_map<(d0, d1) -> (d0)>(%i0, %i1)
      %idx0 = affine.apply affine_map<(d0, d1) -> (d1)> (%t0, %t1)
      %idx1 = affine.apply affine_map<(d0, d1) -> (d0 - 1)> (%t0, %t1)
      // CHECK: affine.load
      %v0 = affine.load %in[%idx0, %idx1] : memref<10x10xf32>
      // CHECK-NOT: affine.load
      %v1 = affine.load %in[%i0, %i1] : memref<10x10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @vector_loads
func.func @vector_loads(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK: affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    // CHECK-NOT: affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}
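// The two loads below read different vector widths from the same address, so
// the second load cannot reuse the first one's value.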
// CHECK-LABEL: func @vector_loads_no_cse
func.func @vector_loads_no_cse(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK: affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    // CHECK: affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<16xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<16xf32>
  }
  return
}

// CHECK-LABEL: func @vector_load_store_load_no_cse
func.func @vector_load_store_load_no_cse(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK: affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %in[16*%i] : memref<512xf32>, vector<32xf32>
    // CHECK: affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @reduction_multi_store
func.func @reduction_multi_store() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32

  affine.store %cf0, %A[0] : memref<1xf32>
  affine.for %i = 0 to 100 step 2 {
    %l = affine.load %A[0] : memref<1xf32>
    %s = arith.addf %l, %cf5 : f32
    // Store-to-load forwarding from this store should happen.
    affine.store %s, %A[0] : memref<1xf32>
    %m = affine.load %A[0] : memref<1xf32>
    "test.foo"(%m) : (f32) -> ()
  }

// CHECK: affine.for
// CHECK: affine.load
// CHECK: affine.store %[[S:.*]],
// CHECK-NEXT: "test.foo"(%[[S]])

  return %A : memref<1xf32>
}

// CHECK-LABEL: func @vector_load_affine_apply_store_load
func.func @vector_load_affine_apply_store_load(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1 : index
  affine.for %i = 0 to 15 {
    // CHECK: affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i)
    affine.vector_store %ld0, %in[32*%idx] : memref<512xf32>, vector<32xf32>
    // CHECK-NOT: affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @external_no_forward_load

func.func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    %ld0 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld0, %out[32*%i] : memref<512xf32>
    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK: affine.load
// CHECK: affine.store
// CHECK: affine.load
// CHECK: affine.store

// CHECK-LABEL: func @external_no_forward_store

func.func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %in[32*%i] : memref<512xf32>
    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK: affine.store
// CHECK: affine.load
// CHECK: affine.store
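// %m2 below aliases %in through a memref.cast, so the store through %m2 may
// clobber the value written by the first store; the load can't be forwarded.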
// CHECK-LABEL: func @no_forward_cast

func.func @no_forward_cast(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
  affine.for %i = 0 to 16 {
    affine.store %cf1, %in[32*%i] : memref<512xf32>
    affine.store %cf2, %m2[32*%i] : memref<?xf32>
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK: affine.store
// CHECK-NEXT: affine.store
// CHECK-NEXT: affine.load
// CHECK-NEXT: affine.store

// The dependence from the second store to the load is not satisfied by the
// outer loop here: for %i0 = 1, the store to %m[%i0 + 1] writes the very
// location the load reads through %m[2 * %i0], so the first store cannot be
// forwarded to the load.

// CHECK-LABEL: func @overlap_no_fwd
func.func @overlap_no_fwd(%N : index) -> f32 {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 5 {
    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
    }
  }
  // Due to this load, the memref isn't optimized away.
  %v3 = affine.load %m[%c1] : memref<10xf32>
  return %v3 : f32

// CHECK: affine.for %{{.*}} = 0 to 5 {
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT: %{{.*}} = affine.load
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT: return %{{.*}} : f32
}

// CHECK-LABEL: func @redundant_store_elim

func.func @redundant_store_elim(%out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    affine.store %cf2, %out[32*%i] : memref<512xf32>
  }
  return
}

// CHECK: affine.for
// CHECK-NEXT: affine.store
// CHECK-NEXT: }

// CHECK-LABEL: func @redundant_store_elim_nonintervening

func.func @redundant_store_elim_nonintervening(%in : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %out = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    %0 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %0, %out[32*%i] : memref<512xf32>
  }
  return
}

// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: affine.store
// CHECK-NEXT: }

// CHECK-LABEL: func @redundant_store_elim_fail

func.func @redundant_store_elim_fail(%out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    "test.use"(%out) : (memref<512xf32>) -> ()
    affine.store %cf2, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK: affine.for
// CHECK-NEXT: affine.store
// CHECK-NEXT: "test.use"
// CHECK-NEXT: affine.store
// CHECK-NEXT: }
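// A load nested inside a non-affine region such as scf.if can still pick up a
// forwarded value when the store dominates it: below, the load in the else
// branch is replaced with the last value stored to the same location.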
// CHECK-LABEL: @with_inner_ops
func.func @with_inner_ops(%arg0: memref<?xf64>, %arg1: memref<?xf64>, %arg2: i1) {
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 3.140000e+00 : f64
  %cst_1 = arith.constant 1.000000e+00 : f64
  affine.for %arg3 = 0 to 28 {
    affine.store %cst, %arg1[%arg3] : memref<?xf64>
    affine.store %cst_0, %arg1[%arg3] : memref<?xf64>
    %0 = scf.if %arg2 -> (f64) {
      scf.yield %cst_1 : f64
    } else {
      %1 = affine.load %arg1[%arg3] : memref<?xf64>
      scf.yield %1 : f64
    }
    affine.store %0, %arg0[%arg3] : memref<?xf64>
  }
  return
}

// CHECK: %[[pi:.+]] = arith.constant 3.140000e+00 : f64
// CHECK: %{{.*}} = scf.if %arg2 -> (f64) {
// CHECK: scf.yield %{{.*}} : f64
// CHECK: } else {
// CHECK: scf.yield %[[pi]] : f64
// CHECK: }

// Check that scalar replacement works correctly when affine memory ops are in
// the body of an scf.for.

// CHECK-LABEL: func @affine_store_load_in_scope
func.func @affine_store_load_in_scope(%memref: memref<1x4094x510x1xf32>, %memref_2: memref<4x4x1x64xf32>, %memref_0: memref<1x2046x254x1x64xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c64 = arith.constant 64 : index
  %c768 = arith.constant 768 : index
  scf.for %i = %c0 to %c768 step %c1 {
    %9 = arith.remsi %i, %c64 : index
    %10 = arith.divsi %i, %c64 : index
    %11 = arith.remsi %10, %c2 : index
    %12 = arith.divsi %10, %c2 : index
    test.affine_scope {
      %14 = arith.muli %12, %c2 : index
      %15 = arith.addi %c2, %14 : index
      %16 = arith.addi %15, %c0 : index
      %18 = arith.muli %11, %c2 : index
      %19 = arith.addi %c2, %18 : index
      %20 = affine.load %memref[0, symbol(%16), symbol(%19), 0] : memref<1x4094x510x1xf32>
      %21 = affine.load %memref_2[0, 0, 0, symbol(%9)] : memref<4x4x1x64xf32>
      %24 = affine.load %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %25 = arith.mulf %20, %21 : f32
      %26 = arith.addf %24, %25 : f32
      // CHECK: %[[A:.*]] = arith.addf
      affine.store %26, %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %27 = arith.addi %19, %c1 : index
      %28 = affine.load %memref[0, symbol(%16), symbol(%27), 0] : memref<1x4094x510x1xf32>
      %29 = affine.load %memref_2[0, 1, 0, symbol(%9)] : memref<4x4x1x64xf32>
      %30 = affine.load %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %31 = arith.mulf %28, %29 : f32
      %32 = arith.addf %30, %31 : f32
      // The addf above gets the value forwarded from the store to %memref_0
      // above, i.e., the value that %30 loads.
      // CHECK: arith.addf %[[A]],
      "terminate"() : () -> ()
    }
  }
  return
}

// No scalar replacement is performed here, but we make sure the dependence
// check correctly fails across different affine scopes.

// CHECK-LABEL: func @affine_load_store_in_different_scopes
func.func @affine_load_store_in_different_scopes() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32

  affine.store %cf0, %A[0] : memref<1xf32>
  test.affine_scope {
    affine.store %cf5, %A[0] : memref<1xf32>
    "test.terminate"() : () -> ()
  }
  %v = affine.load %A[0] : memref<1xf32>
  // CHECK: affine.store
  // CHECK-NEXT: test.affine_scope
  // CHECK: affine.store
  // CHECK: affine.load
  return %A : memref<1xf32>
}

// No forwarding should again happen here.
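// The store below and the load inside test.affine_scope sit in different
// affine scopes, across which forwarding is not attempted.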

// CHECK-LABEL: func.func @no_forwarding_across_scopes
func.func @no_forwarding_across_scopes() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c1 = arith.constant 1 : index

  // Store shouldn't be forwarded to the load.
  affine.store %cf0, %A[0] : memref<1xf32>
  // CHECK: test.affine_scope
  // CHECK-NEXT: affine.load
  test.affine_scope {
    %l = affine.load %A[0] : memref<1xf32>
    %s = arith.addf %l, %cf5 : f32
    affine.store %s, %A[0] : memref<1xf32>
    "terminator"() : () -> ()
  }
  return %A : memref<1xf32>
}

// CHECK-LABEL: func @parallel_store_load() {
func.func @parallel_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.parallel (%i0) = (0) to (10) {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
// CHECK: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: affine.parallel (%{{.*}}) = (0) to (10) {
// CHECK-NEXT: arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT: }
// CHECK-NEXT: return
}

func.func @non_constant_parallel_store_load(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.parallel (%i0) = (0) to (%N) {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
}
// CHECK: func.func @non_constant_parallel_store_load(%[[ARG0:.*]]: index) {
// CHECK-NEXT: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: affine.parallel (%{{.*}}) = (0) to (%[[ARG0]]) {
// CHECK-NEXT: arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT: }
// CHECK-NEXT: return

// CHECK-LABEL: func @parallel_surrounding_for() {
func.func @parallel_surrounding_for() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10x10xf32>
  affine.parallel (%i0) = (0) to (10) {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %m[%i0,%i1] : memref<10x10xf32>
      %v0 = affine.load %m[%i0,%i1] : memref<10x10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  memref.dealloc %m : memref<10x10xf32>
  return
// CHECK: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT: affine.parallel (%{{.*}}) = (0) to (10) {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
}

// CHECK-LABEL: func.func @dead_affine_region_op
func.func @dead_affine_region_op() {
  %c1 = arith.constant 1 : index
  %alloc = memref.alloc() : memref<15xi1>
  %true = arith.constant true
  affine.store %true, %alloc[%c1] : memref<15xi1>
  // Dead store.
  affine.store %true, %alloc[%c1] : memref<15xi1>
  // This affine.if is dead.
  affine.if affine_set<(d0, d1, d2, d3) : ((d0 + 1) mod 8 >= 0, d0 * -8 >= 0)>(%c1, %c1, %c1, %c1) {
    // No forwarding will happen.
    affine.load %alloc[%c1] : memref<15xi1>
  }
  // CHECK-NEXT: arith.constant
  // CHECK-NEXT: memref.alloc
  // CHECK-NEXT: arith.constant
  // CHECK-NEXT: affine.store
  // CHECK-NEXT: affine.if
  // CHECK-NEXT: affine.load
  return
}

// No scalar replacement is performed here since the pass doesn't use dominance
// info, which would be needed when ops fall in different blocks of a CFG
// region.

// CHECK-LABEL: func @cross_block
func.func @cross_block() {
  %c10 = arith.constant 10 : index
  %alloc_83 = memref.alloc() : memref<1x13xf32>
  %alloc_99 = memref.alloc() : memref<13xi1>
  %true_110 = arith.constant true
  affine.store %true_110, %alloc_99[%c10] : memref<13xi1>
  %true = arith.constant true
  affine.store %true, %alloc_99[%c10] : memref<13xi1>
  cf.br ^bb1(%alloc_83 : memref<1x13xf32>)
^bb1(%35: memref<1x13xf32>):
  // CHECK: affine.load
  %69 = affine.load %alloc_99[%c10] : memref<13xi1>
  return
}

#map1 = affine_map<(d0) -> (d0)>

// CHECK-LABEL: func @consecutive_store
func.func @consecutive_store() {
  // CHECK: %[[CST:.*]] = arith.constant
  %tmp = arith.constant 1.1 : f16
  // CHECK: %[[ALLOC:.*]] = memref.alloc
  %alloc_66 = memref.alloc() : memref<f16, 1>
  affine.for %arg2 = 4 to 6 {
    affine.for %arg3 = #map1(%arg2) to #map1(%arg2) step 4 {
      // CHECK: affine.store %[[CST]], %[[ALLOC]][]
      affine.store %tmp, %alloc_66[] : memref<f16, 1>
      // CHECK-NOT: affine.store %[[CST]], %[[ALLOC]][]
      affine.store %tmp, %alloc_66[] : memref<f16, 1>
      %270 = affine.load %alloc_66[] : memref<f16, 1>
    }
  }
  return
}
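
// An extra minimal sketch of the basic criterion exercised by @store_load_fwd
// above: a store that dominates a load on the same location, with no
// intervening write, is forwarded and the load disappears.

// CHECK-LABEL: func @straight_line_fwd
func.func @straight_line_fwd() -> f32 {
  %c0 = arith.constant 0 : index
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<1xf32>
  affine.store %cf7, %m[%c0] : memref<1xf32>
  // CHECK-NOT: affine.load
  %v = affine.load %m[%c0] : memref<1xf32>
  return %v : f32
}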