// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-space=2 skip-non-unit-stride-loops" -verify-diagnostics | FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-capacity=16 fast-mem-space=2" | FileCheck %s --check-prefix FAST-MEM-16KB

// We run most test cases with -copy-skip-non-unit-stride-loops to make it
// easy to test DMA generation at inner levels: DMA generation would otherwise
// always generate DMAs at the outermost level (the default fast memory
// capacity is infinite). Using a specific capacity makes it harder to write a
// test case, as one would have to calculate total footprints. With
// -copy-skip-non-unit-stride-loops, non-unit-stride loops are always skipped,
// and their inner loops are traversed until a unit-stride loop is found (or
// the innermost block is reached).

// -----

// CHECK-LABEL: func @loop_nest_1d() {
func.func @loop_nest_1d() {
  %A = memref.alloc() : memref<256 x f32>
  %B = memref.alloc() : memref<512 x f32>
  %F = memref.alloc() : memref<256 x f32, 2>
  // First DMA buffer.
  // CHECK: memref.alloc() : memref<256xf32>
  // CHECK: memref.alloc() : memref<256xf32, 2>
  // Tag for first DMA.
  // CHECK: memref.alloc() : memref<1xi32>
  // First DMA transfer.
  // CHECK: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>
  // CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  // Second DMA buffer.
  // CHECK: memref.alloc() : memref<256xf32, 2>
  // Tag for second DMA.
  // CHECK: memref.alloc() : memref<1xi32>
  // Second DMA transfer.
  // CHECK: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32>
  // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  // CHECK: affine.for %[[IV:.*]] = 0 to 256 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2>
  // The buffer for %B in the faster memref space is of a smaller size: 256xf32.
  // The affine map for the load on %B is composed and becomes the identity.
  // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
  // %F is already in the faster memory space.
  // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
  // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
  // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
  // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
  // CHECK-NEXT: return
  affine.for %i = 0 to 256 {
    affine.load %A[%i] : memref<256 x f32>
    %idx = affine.apply affine_map<(d0) -> (d0 + 256)>(%i)
    affine.load %B[%idx] : memref<512 x f32>
    affine.load %F[%i] : memref<256 x f32, 2>
  }
  return
}

// -----

// CHECK-LABEL: func @loop_nest_high_d
// CHECK: %{{.*}} = arith.constant 16384 : index
// CHECK-DAG: [[BUFB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG: [[BUFA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG: [[BUFC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG: [[TAGB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG: [[TAGA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG: [[TAGC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG: [[TAGC_W:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// INCOMING DMA for B.
// CHECK-DAG: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFB]][%{{.*}}, %{{.*}}], [[TAGB]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG: affine.dma_wait [[TAGB]][%{{.*}}], %{{.*}} : memref<1xi32>
// INCOMING DMA for A.
// CHECK-DAG: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFA]][%{{.*}}, %{{.*}}], [[TAGA]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG: affine.dma_wait [[TAGA]][%{{.*}}], %{{.*}} : memref<1xi32>
// INCOMING DMA for C.
// CHECK-DAG: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFC]][%{{.*}}, %{{.*}}], [[TAGC]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG: affine.dma_wait [[TAGC]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK: affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: "abc_compute"() : () -> f32
// CHECK: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "addf32"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
// CHECK-NEXT: affine.store %{{.*}}, [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// OUTGOING DMA for C.
// CHECK-NEXT: affine.dma_start [[BUFC]][%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], [[TAGC_W]][%{{.*}}], %{{.*}} : memref<512x32xf32, 2>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait [[TAGC_W]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc [[TAGC_W]] : memref<1xi32>
// CHECK-NEXT: dealloc [[TAGC]] : memref<1xi32>
// CHECK-NEXT: dealloc [[BUFC]] : memref<512x32xf32, 2>
// CHECK-NEXT: dealloc [[TAGA]] : memref<1xi32>
// CHECK-NEXT: dealloc [[BUFA]] : memref<512x32xf32, 2>
// CHECK-NEXT: dealloc [[TAGB]] : memref<1xi32>
// CHECK-NEXT: dealloc [[BUFB]] : memref<512x32xf32, 2>
// CHECK-NEXT: return
// CHECK-NEXT:}
func.func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
                            %B: memref<512 x 32 x f32>, %C: memref<512 x 32 x f32>) {
  // DMAs will be performed at this level (%jT is the first unit-stride loop).
  // A and B are read, while C is both read and written. A total of three new
  // buffers are allocated, and the existing loads/stores are replaced by
  // accesses to those buffers.
  affine.for %jT = 0 to 32 {
    affine.for %kT = 0 to 32 {
      affine.for %iT = 0 to 32 {
        affine.for %kk = 0 to 16 { // k intratile
          %k = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)> (%kT, %kk)
          %v0 = affine.load %B[%k, %jT] : memref<512 x 32 x f32>
          "foo"(%v0) : (f32) -> ()
        }
        affine.for %ii = 0 to 16 { // i intratile.
          %i = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)>(%iT, %ii)
          %v1 = affine.load %A[%i, %kT] : memref<512 x 32 x f32>
          "bar"(%v1) : (f32) -> ()
        }
        affine.for %ii_ = 0 to 16 { // i intratile.
          %v2 = "abc_compute"() : () -> f32
          %i_ = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)>(%iT, %ii_)
          %v3 = affine.load %C[%i_, %jT] : memref<512 x 32 x f32>
          %v4 = "addf32"(%v2, %v3) : (f32, f32) -> (f32)
          affine.store %v4, %C[%i_, %jT] : memref<512 x 32 x f32>
        }
        "foobar"() : () -> ()
      }
    }
  }
  return
}

// -----

// A loop nest with a modulo 2 access. A strided DMA is not needed here: the
// accessed region is just a 1x2 window within a 256 x 8 memref.
//
// CHECK-LABEL: func @loop_nest_modulo() {
// CHECK: memref.alloc() : memref<256x8xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 {
// CHECK: memref.alloc() : memref<1x2xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// Composition of the affine map for '%idx' causes '%i' to be added as a symbol.
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, 0], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {
// ...
// ...
// CHECK: }
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1x2xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: return
func.func @loop_nest_modulo() {
  %A = memref.alloc() : memref<256 x 8 x f32>
  affine.for %i = 0 to 32 step 4 {
    // DMAs will be performed at this level (%j is the first unit-stride loop).
    affine.for %j = 0 to 8 {
      %idx = affine.apply affine_map<(d0) -> (d0 mod 2)> (%j)
      // A buffer of size 1 x 2 will be allocated (the original buffer was 256 x 8).
      %v = affine.load %A[%i, %idx] : memref<256 x 8 x f32>
    }
  }
  return
}

// -----

// DMA on a tiled loop nest. This also tests the case where the bounds are
// dependent on outer loop IVs.
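// A note on the copy below: a 32 x 32 tile of a 256 x 1024 memref is not
// contiguous, so the affine.dma_start checked for carries two extra trailing
// operands, the stride and the number of elements to transfer per stride.
// Here that works out to a stride of 1024 elements (one row of the source)
// with 32 contiguous elements transferred per stride.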
// CHECK-LABEL: func @loop_nest_tiled() -> memref<256x1024xf32> {
func.func @loop_nest_tiled() -> memref<256x1024xf32> {
  %0 = memref.alloc() : memref<256x1024xf32>
  affine.for %i0 = 0 to 256 step 32 {
    affine.for %i1 = 0 to 1024 step 32 {
// CHECK: memref.alloc() : memref<32x32xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// Strided DMA here: a 32 x 32 tile in a 256 x 1024 memref.
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<256x1024xf32>, memref<32x32xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait
// CHECK-NEXT: affine.for %{{.*}} = #map
// CHECK-NEXT: affine.for %{{.*}} = #map
      affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 32)>(%i0) {
        affine.for %i3 = affine_map<(d0) -> (d0)>(%i1) to affine_map<(d0) -> (d0 + 32)>(%i1) {
          // CHECK: affine.load %{{.*}}[-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<32x32xf32, 2>
          %1 = affine.load %0[%i2, %i3] : memref<256x1024xf32>
        } // CHECK-NEXT: }
      }
    }
  }
  return %0 : memref<256x1024xf32>
}

// -----

// CHECK-LABEL: func @dma_constant_dim_access
func.func @dma_constant_dim_access(%A : memref<100x100xf32>) {
  %one = arith.constant 1 : index
  %N = arith.constant 100 : index
  // CHECK: memref.alloc() : memref<1x100xf32, 2>
  // CHECK-NEXT: memref.alloc() : memref<1xi32>
  // No strided DMA is needed here.
  // CHECK: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<100x100xf32>, memref<1x100xf32, 2>,
  // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  affine.for %i = 0 to 100 {
    affine.for %j = 0 to affine_map<()[s0] -> (s0)> ()[%N] {
      // CHECK: affine.load %{{.*}}[0, %{{.*}}] : memref<1x100xf32, 2>
      affine.load %A[%one, %j] : memref<100 x 100 x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_with_symbolic_accesses
func.func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
  %N = arith.constant 9 : index
  affine.for %i = 0 to 100 {
    affine.for %j = 0 to 100 {
      %idy = affine.apply affine_map<(d0, d1) [s0, s1] -> (d1 + s0 + s1)>(%i, %j)[%M, %N]
      affine.load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
// CHECK: memref.alloc() : memref<100x100xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[0, symbol(%{{.*}}) + 9], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT: affine.for %[[IV0:.*]] = 0 to 100 {
// CHECK-NEXT: affine.for %[[IV1:.*]] = 0 to 100 {
// CHECK: affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK: return
}

// -----

// CHECK-LABEL: func @dma_with_symbolic_loop_bounds
func.func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: index) {
  %K = arith.constant 9 : index
// The buffer size can't be bounded by a constant smaller than the original
// memref size; the DMA buffer is therefore the entire 100x100.
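// Concretely: %j ranges over the symbolic interval [%M, %N) and the access is
// %A[%i, %j + 9], so the region [%M + 9, %N + 9) in the second dimension has
// no constant bound tighter than the memref extent of 100.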
// CHECK: memref.alloc() : memref<100x100xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<100x100xf32>, memref<100x100xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  affine.for %i = 0 to 100 {
    affine.for %j = %M to %N {
      %idy = affine.apply affine_map<(d1) [s0] -> (d1 + s0)>(%j)[%K]
      affine.load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_unknown_size
func.func @dma_unknown_size(%arg0: memref<?x?xf32>) {
  %c0 = arith.constant 0 : index
  %M = memref.dim %arg0, %c0 : memref<? x ? x f32>
  %N = memref.dim %arg0, %c0 : memref<? x ? x f32>
  affine.for %i = 0 to %M {
    affine.for %j = 0 to %N {
      // If this loop nest isn't tiled, the access requires a non-constant DMA
      // size -- not yet implemented.
      // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<?x?xf32>
      affine.load %arg0[%i, %j] : memref<? x ? x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_memref_3d
func.func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
  affine.for %i = 0 to 1024 {
    affine.for %j = 0 to 1024 {
      affine.for %k = 0 to 1024 {
        %idx = affine.apply affine_map<(d0) -> (d0 mod 128)>(%i)
        %idy = affine.apply affine_map<(d0) -> (d0 mod 128)>(%j)
        %idz = affine.apply affine_map<(d0) -> (d0 mod 128)>(%k)
        // DMA with nested striding (or emulating it with a loop around a
        // strided DMA) is not yet implemented.
        // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] : memref<1024x1024x1024xf32>
        %v = affine.load %arg0[%idx, %idy, %idz] : memref<1024 x 1024 x 1024 x f32>
      }
    }
  }
  return
}

// -----

// The first load accesses ([2,258), [128,384)).
// The second load accesses ([64,320), [2,258)).
// The first store writes to ([2,258), [192,448)).
// The second store writes to ([128,384), [2,258)).
// The union of all these regions is of size 382 x 446 and has its origin at
// (2, 2), i.e., the window ([2,384), [2,448)) in the original space.
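// A quick check of that union, taking the bounding interval per dimension:
//   rows: [2,258) u [64,320) u [128,384) = [2,384), i.e., 382 rows
//   cols: [2,258) u [128,384) u [192,448) = [2,448), i.e., 446 columns
// which matches the memref<382x446xf32, 2> buffer checked for below.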

// CHECK-LABEL: func @multi_load_store_union() {
func.func @multi_load_store_union() {
  %A = memref.alloc() : memref<512 x 512 x f32>
  affine.for %i = 0 to 256 {
    affine.for %j = 0 to 256 {
      %idx = affine.apply affine_map<(d0) -> (d0 + 64)>(%i)
      %idy = affine.apply affine_map<(d0) -> (d0 + 128)>(%j)
      %ishift = affine.apply affine_map<(d0) -> (d0 + 2)>(%i)
      %jshift = affine.apply affine_map<(d0) -> (d0 + 2)>(%j)

      %u = affine.load %A[%ishift, %idy] : memref<512 x 512 x f32>
      %v = affine.load %A[%idx, %jshift] : memref<512 x 512 x f32>

      %sidx = affine.apply affine_map<(d0) -> (d0 + 128)>(%i)
      %sidy = affine.apply affine_map<(d0) -> (d0 + 192)>(%j)

      affine.store %u, %A[%ishift, %sidy] : memref<512 x 512 x f32>
      affine.store %v, %A[%sidx, %jshift] : memref<512 x 512 x f32>
    }
  }
  return
}
// CHECK: memref.alloc() : memref<512x512xf32>
// CHECK-NEXT: memref.alloc() : memref<382x446xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<512x512xf32>, memref<382x446xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 62, %{{.*}}] : memref<382x446xf32, 2>
// CHECK: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} + 126, %{{.*}}] : memref<382x446xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<382x446xf32, 2>, memref<512x512xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<382x446xf32, 2>
// CHECK-NEXT: return
// CHECK-NEXT:}

// -----

// CHECK-LABEL: func @dma_loop_straightline_interspersed() {
func.func @dma_loop_straightline_interspersed() {
  %c0 = arith.constant 0 : index
  %c255 = arith.constant 255 : index
  %A = memref.alloc() : memref<256 x f32>
  %v = affine.load %A[%c0] : memref<256 x f32>
  affine.for %i = 1 to 255 {
    affine.load %A[%i] : memref<256 x f32>
  }
  %l = affine.load %A[%c255] : memref<256 x f32>
  affine.store %l, %A[%c0] : memref<256 x f32>
  return
}
// There are three regions here: the load preceding the loop, the loop
// itself, and the operations appearing after the loop.
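// Correspondingly, three separate copies are generated below (sizes read off
// the CHECK lines): a 1xf32 buffer for the single-element load before the
// loop, a 254xf32 buffer for the region [1,255) accessed in the loop, and a
// 256xf32 buffer for the trailing load of element 255 and store to element 0,
// whose conservatively computed region spans the whole memref and is written,
// hence the outgoing DMA at the end.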
// CHECK: memref.alloc() : memref<256xf32>
// CHECK-NEXT: memref.alloc() : memref<1xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<1xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32, 2>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<254xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<254xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 1 to 255 {
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} - 1] : memref<254xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<254xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<256xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.load %{{.*}}[255] : memref<256xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<256xf32, 2>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32, 2>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
// CHECK-NEXT: return

// -----

// CHECK-LABEL: func @dma_mixed_loop_blocks() {
func.func @dma_mixed_loop_blocks() {
  %c0 = arith.constant 0 : index
  %A = memref.alloc() : memref<256 x 256 x vector<8 x f32>>
  affine.for %i = 0 to 256 {
    %v = affine.load %A[%c0, %c0] : memref<256 x 256 x vector<8 x f32>>
    "foo"(%v) : (vector<8 x f32>) -> ()
    affine.for %j = 0 to 256 {
      %w = affine.load %A[%i, %j] : memref<256 x 256 x vector<8 x f32>>
      "bar"(%w) : (vector<8 x f32>) -> ()
    }
  }
  return
}
// CHECK-DAG: [[MEM:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<256x256xvector<8xf32>>
// CHECK-DAG: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<256x256xvector<8xf32>, 2>
// CHECK-DAG: [[TAG:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK: affine.dma_start [[MEM]][%{{.*}}, %{{.*}}], [[BUF]][%{{.*}}, %{{.*}}], [[TAG]][%{{.*}}], %{{.*}} : memref<256x256xvector<8xf32>>, memref<256x256xvector<8xf32>, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait [[TAG]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK: affine.load [[BUF]][0, 0] : memref<256x256xvector<8xf32>, 2>
// CHECK: affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT: affine.load [[BUF]][%{{.*}}, %{{.*}}] : memref<256x256xvector<8xf32>, 2>

// -----

// CHECK-LABEL: func @relative_loop_bounds
func.func @relative_loop_bounds(%arg0: memref<1027xf32>) {
  affine.for %i0 = 0 to 1024 {
    affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to
        affine_map<(d0) -> (d0 + 4)>(%i0) {
      %0 = arith.constant 0.0 : f32
      affine.store %0, %arg0[%i2] : memref<1027xf32>
    }
  }
  return
}
// CHECK: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1027xf32, 2>
// CHECK-NEXT: [[MEM:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
// CHECK-NEXT: affine.for %[[I2:.*]] = {{#map[0-9a-zA-Z_]*}}(%{{.*}}) to {{#map[0-9a-zA-Z_]*}}(%{{.*}}) {
// CHECK: affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start [[BUF]][%{{.*}}], %{{.*}}[%{{.*}}], [[MEM]][%{{.*}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait [[MEM]][%{{.*}}], %{{.*}} : memref<1xi32>

// -----

func.func @test_read_write_region_union() {
  %0 = memref.alloc() : memref<256xf32>
  affine.for %i0 = 0 to 10 {
    // memref dims:  [0, 256)
    // read region:  [100, 110)
    // write region: [25, 35)
    // union region: [25, 110)
    %a0 = affine.apply affine_map<(d0) -> (d0 + 100)>(%i0)
    %a1 = affine.apply affine_map<(d0) -> (d0 + 25)>(%i0)
    %1 = affine.load %0[%a0] : memref<256xf32>
    affine.store %1, %0[%a1] : memref<256xf32>
  }
  return
}

// CHECK: memref.alloc() : memref<256xf32>
// CHECK-NEXT: memref.alloc() : memref<85xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<85xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>

// -----

// This should create a buffer of size 2 for %arg2.
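// A quick check of that size: %i9 ranges over [%i8, %i8 + 3) and the access
// is %arg2[%i9 floordiv 8], so within one %i8 iteration the index takes at
// most two consecutive values (e.g., %i8 = 6 gives {6, 7, 8} floordiv 8 =
// {0, 1}), hence the memref<2xf32, 2> buffer checked for below.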

#map_lb = affine_map<(d0) -> (d0)>
#map_ub = affine_map<(d0) -> (d0 + 3)>
#map_acc = affine_map<(d0) -> (d0 floordiv 8)>
// CHECK-LABEL: func @test_analysis_util
func.func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, %arg2: memref<2xf32>) -> (memref<144x9xf32>, memref<2xf32>) {
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<64x1xf32>
  %1 = memref.alloc() : memref<144x4xf32>
  %2 = arith.constant 0.0 : f32
  affine.for %i8 = 0 to 9 step 3 {
    affine.for %i9 = #map_lb(%i8) to #map_ub(%i8) {
      affine.for %i17 = 0 to 64 {
        %23 = affine.apply #map_acc(%i9)
        %25 = affine.load %arg2[%23] : memref<2xf32>
        %26 = affine.apply #map_lb(%i17)
        %27 = affine.load %0[%26, %c0] : memref<64x1xf32>
        affine.store %27, %arg2[%23] : memref<2xf32>
      }
    }
  }
  return %arg1, %arg2 : memref<144x9xf32>, memref<2xf32>
}
// CHECK: affine.for %{{.*}} = 0 to 9 step 3 {
// CHECK: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2xf32, 2>
// CHECK: affine.dma_start %{{.*}}[%{{.*}} floordiv 8], [[BUF]]
// CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK: affine.for %{{.*}} =

// -----

#map3 = affine_map<(d0) -> (d0)>
#map12 = affine_map<(d0) -> (d0 + 3)>
#map14 = affine_map<(d0, d1) -> ((d0 + d1 * 72) floordiv 2304 + ((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3)>
#map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
#map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
// Test for test case in b/128303048 #4.
// CHECK-LABEL: func @test_memref_bounds
func.func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
  %c0 = arith.constant 0 : index
  affine.for %i8 = 0 to 9 step 3 {
    affine.for %i9 = #map3(%i8) to #map12(%i8) {
      affine.for %i10 = 0 to 64 {
        %10 = affine.apply #map14(%i9, %i10)
        %11 = affine.apply #map15(%i9, %i10)
        %12 = affine.apply #map16(%i9, %i10)
        %13 = affine.load %arg0[%10, %11, %12, %c0] : memref<4x4x16x1xvector<8x128xf32>>
      }
    }
  }
  return %arg1, %arg2 : memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>
}

// CHECK: memref.alloc() : memref<4x4x16x1xvector<8x128xf32>, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<4x4x16x1xvector<8x128xf32>>, memref<4x4x16x1xvector<8x128xf32>, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>

// -----

// Since the fast memory capacity is 16 KB, DMA generation will happen right
// under %i0.
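// Footprint check (assuming 4-byte f32): under %i0, each iteration touches a
// 4 x 1024 region of %arg0, i.e., 4 * 1024 * 4 bytes = 16 KB, which just fits
// the 16 KB capacity; the full 256 x 1024 memref (1 MB) would not.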

// FAST-MEM-16KB-LABEL: func @load_store_same_memref
func.func @load_store_same_memref(%arg0: memref<256x1024xf32>) {
  // FAST-MEM-16KB: affine.for %{{.*}} = 0 to 256 step 4
  affine.for %i0 = 0 to 256 step 4 {
    // FAST-MEM-16KB: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<4x1024xf32, 2>
    // FAST-MEM-16KB: affine.dma_start %{{.*}}
    // FAST-MEM-16KB-NEXT: affine.dma_wait
    // FAST-MEM-16KB: affine.for %{{.*}}
    affine.for %i1 = 0 to 1024 step 4 {
      // FAST-MEM-16KB: affine.for %{{.*}}
      affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 4)>(%i0) {
        // FAST-MEM-16KB: affine.for %{{.*}}
        affine.for %i3 = affine_map<(d0) -> (d0)>(%i1) to affine_map<(d0) -> (d0 + 4)>(%i1) {
          %3 = affine.load %arg0[%i2, %i3] : memref<256x1024xf32>
          %4 = arith.mulf %3, %3 : f32
          affine.store %4, %arg0[%i2, %i3] : memref<256x1024xf32>
        } // FAST-MEM-16KB: }
      } // FAST-MEM-16KB: }
    } // FAST-MEM-16KB: }
    // FAST-MEM-16KB: affine.dma_start [[BUF]]
    // FAST-MEM-16KB-NEXT: affine.dma_wait
  }
  return
}

// -----

// This is a 3-d loop nest tiled by 4 x 4 x 4. Under %i, %j, %k, the size of
// the tile of arg0, arg1, and arg2 accessed is 4 KB each, i.e., 12 KB in
// total (see the footprint arithmetic at the end of this file). With the fast
// memory capacity set to 16 KB, the DMAs, if placed under %k, will fit.
// However, the region of arg2 accessed is invariant w.r.t. the %k loop,
// unlike those of %arg0 and %arg1. Its DMA can therefore be hoisted one level
// up and placed under %j, while the DMAs for arg0 and arg1 appear right under
// the %k loop.

#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 4)>
// FAST-MEM-16KB-LABEL: func @simple_matmul
func.func @simple_matmul(%arg0: memref<8x8xvector<64xf32>>, %arg1: memref<8x8xvector<64xf32>>, %arg2: memref<8x8xvector<64xf32>>) -> memref<8x8xvector<64xf32>> {
  affine.for %i = 0 to 8 step 4 {
    affine.for %j = 0 to 8 step 4 {
      affine.for %k = 0 to 8 step 4 {
        affine.for %ii = #map0(%i) to #map1(%i) {
          affine.for %jj = #map0(%j) to #map1(%j) {
            affine.for %kk = #map0(%k) to #map1(%k) {
              %5 = affine.load %arg0[%ii, %kk] : memref<8x8xvector<64xf32>>
              %6 = affine.load %arg1[%kk, %jj] : memref<8x8xvector<64xf32>>
              %7 = affine.load %arg2[%ii, %jj] : memref<8x8xvector<64xf32>>
              %8 = arith.mulf %5, %6 : vector<64xf32>
              %9 = arith.addf %7, %8 : vector<64xf32>
              affine.store %9, %arg2[%ii, %jj] : memref<8x8xvector<64xf32>>
            }
          }
        }
      }
    }
  }
  return %arg2 : memref<8x8xvector<64xf32>>
}
// FAST-MEM-16KB: affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB: affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB: affine.dma_start %{{.*}}
// FAST-MEM-16KB: affine.dma_wait
// FAST-MEM-16KB: affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB: affine.dma_start %{{.*}}
// FAST-MEM-16KB: affine.dma_wait
// FAST-MEM-16KB: affine.dma_start %{{.*}}
// FAST-MEM-16KB: affine.dma_wait
// FAST-MEM-16KB: affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB-NEXT: affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB-NEXT: affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB: }
// FAST-MEM-16KB: }
// FAST-MEM-16KB: }
// FAST-MEM-16KB: }
// FAST-MEM-16KB: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}
// FAST-MEM-16KB: affine.dma_wait
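// Footprint arithmetic for @simple_matmul above (assuming 4-byte f32): each
// 4 x 4 tile holds 16 vector<64xf32> elements of 64 * 4 = 256 bytes each,
// i.e., 16 * 256 bytes = 4 KB per memref, and 3 * 4 KB = 12 KB in total,
// which fits the 16 KB fast memory capacity.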