// RUN: mlir-opt -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect %s | FileCheck %s


!tt = tensor<8xf16>

// CHECK-LABEL: func @copy_1d_8xf16
func.func @copy_1d_8xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads, needs predication, while keeping most
  /// minor transfer size -> 1 thread.
  // CHECK: scf.forall {{.*}} in (1) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<8xf16>
!tin = tensor<?xf16>

// CHECK-LABEL: func @pad_1d_8xf16
func.func @pad_1d_8xf16(%t0: !tin, %sz: index) -> !tt {
  %cst = arith.constant 0.0 : f16
  /// Too little data for all threads, needs predication, while keeping most
  /// minor transfer size -> 1 thread.
  // CHECK: scf.forall {{.*}} in (1) {{.*}}
  // CHECK: %[[padded:.*]] = tensor.pad {{.*}}
  // CHECK: tensor.cast %[[padded]] : tensor<?xf16> to tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = tensor.pad %t0 low[0] high[%sz] {
  ^bb0(%arg0: index):
    tensor.yield %cst : f16
  } : !tin to !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"tensor.pad">)
    transform.yield
  }
}

// -----

!tt = tensor<16xf16>

// CHECK-LABEL: func @copy_1d_16xf16
func.func @copy_1d_16xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads, needs predication, while keeping most
  /// minor transfer size -> 2 threads.
  // CHECK: scf.forall {{.*}} in (2) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<20xf16>

// CHECK-LABEL: func @copy_1d_20xf16
func.func @copy_1d_20xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads, needs predication, while keeping most
  /// minor transfer size -> 5 threads.
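  /// (20 f16 = 320 bits; an 8xf16 transfer does not divide 20 evenly, so the
  /// per-thread transfer drops to 4xf16 = 64 bits and only 5 of the 32 threads
  /// are active.)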
  // CHECK: scf.forall {{.*}} in (5) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<4xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<128xf16>

// CHECK-LABEL: func @copy_1d_128xf16
func.func @copy_1d_128xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Enough data for all threads and no need for predication, but we must
  /// reduce the transfer size to 4xf16.
  // CHECK: scf.forall {{.*}} in (32) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<4xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<256xf16>

// CHECK-LABEL: func @copy_1d_256xf16
func.func @copy_1d_256xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Enough data for all threads and no need for predication.
  // CHECK: scf.forall {{.*}} in (32) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// CHECK-LABEL: func @copy_3d_16x32x64xi8
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
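  /// 128-bit alignment -> 16 contiguous i8 per thread on the innermost dim,
  /// i.e. 4 threads there; the remaining 32 / 4 = 8 threads go to the middle
  /// dim and the outermost dim stays sequential (1 x 8 x 4 = 32 threads).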
  // CHECK: scf.forall {{.*}} in (1, 8, 4) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<16x4x16xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// CHECK-LABEL: func @copy_3d_16x32x64xi8
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
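  /// Same shape as above, but the 64-bit alignment halves the innermost
  /// transfer to 8 i8, so 8 threads map to the innermost dim and 4 to the
  /// middle one (1 x 4 x 8 = 32 threads).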
  // CHECK: scf.forall {{.*}} in (1, 4, 8) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<16x8x8xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 64
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<4x8x16xi8>

// CHECK-LABEL: func @copy_3d_4x8x16xi8
func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
  // CHECK: scf.forall {{.*}} in (4, 8, 1) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<1x1x16xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<4x8x16xi8>

// CHECK-LABEL: func @copy_3d_4x8x16xi8
func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
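  /// With the alignment forced down to 8 bits, each thread copies a single i8
  /// along the innermost dim: 16 threads there and 2 on the middle dim
  /// (1 x 2 x 16 = 32 threads), for a 4x4x1 tile per thread.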
  // CHECK: scf.forall {{.*}} in (1, 2, 16) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<4x4x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<3x5x7xi8>

// CHECK-LABEL: func @copy_3d_3x5x7xi8
func.func @copy_3d_3x5x7xi8(%t0: !tt, %out: !tt) -> !tt {
  // Best effort greedy mapping: first 7, then skip 5 (as 7*5 overflows 32), then
  // take 3.
  // DP mapping: 7 mandated most minor, then skip 5 (as 7*5 overflows 32), then
  // take 3.
  // CHECK: scf.forall {{.*}} in (3, 1, 7) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<1x5x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x15x5xi8>

// CHECK-LABEL: func @copy_3d_16x15x5xi8
func.func @copy_3d_16x15x5xi8(%t0: !tt, %out: !tt) -> !tt {
  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<2x5x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 128 desired_bit_alignment = 8
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x15x40xi8>

// CHECK-LABEL: func @copy_3d_16x15x40xi8
func.func @copy_3d_16x15x40xi8(%t0: !tt, %out: !tt) -> !tt {
  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
  // CHECK: linalg.copy {{.*}} -> tensor<2x5x8xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 128 desired_bit_alignment = 64
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}


////////////////////////////////////////////////////////////////////////////////
// Tests below are expected to fail.
////////////////////////////////////////////////////////////////////////////////

// -----

!tt = tensor<1024xf16>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_1024xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads; we do not try to recover here. It is the
  /// job of higher-level transformations to select better tile sizes and
  /// numbers of threads.

  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<257xf16>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_257xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads; we do not try to recover here. It is the
  /// job of higher-level transformations to select better tile sizes and
  /// numbers of threads.

  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<512xi8>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_512xi8(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads given the forced alignment to 8b; we do not
  /// try to recover here. It is the job of higher-level transformations to
  /// select better tile sizes and numbers of threads.
  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads given the forced alignment to 8b; we do not
  /// try to recover here. It is the job of higher-level transformations to
  /// select better tile sizes and numbers of threads.
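  /// (The innermost dimension alone holds 64 i8; at the forced 8-bit transfer
  /// size that would require 64 threads on the most minor dimension, more than
  /// the 32 available.)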
  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
      : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}