; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck -check-prefix=O0 %s

; Every test below keeps an x86_amx tile value live across a call to @foo.
; The expected assembly is recorded for three configurations, one per run line
; above: the default pipeline, the default pipeline with -enable-ipra, and the
; unoptimized pipeline.

@buf = dso_local global [3072 x i8] zeroinitializer, align 64

; Callee with an empty body; it only returns.
define internal void @foo() {
; CHECK-LABEL: foo:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
;
; IPRA-LABEL: foo:
; IPRA:       # %bb.0: # %entry
; IPRA-NEXT:    retq
;
; O0-LABEL: foo:
; O0:       # %bb.0: # %entry
; O0-NEXT:    retq
entry:
  ret void
}

; Straight-line case: two tiles are loaded before the call to @foo and are
; consumed by tdpbssd after it. The default expectations store both tiles to
; the stack before the call and reload them after a second ldtilecfg; the
; -enable-ipra expectations contain no tile spill, reload, or second ldtilecfg.
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test_api:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $2120, %rsp # imm = 0x848
; CHECK-NEXT:    movl %esi, %ebx
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, (%rsp)
; CHECK-NEXT:    movb $1, (%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %r14d
; CHECK-NEXT:    movw $8, %r15w
; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm1
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm1, 1088(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm2, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT:    movabsq $64, %rcx
; CHECK-NEXT:    tileloadd 1088(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    tileloadd 64(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rax,%r14)
; CHECK-NEXT:    addq $2120, %rsp # imm = 0x848
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
;
; IPRA-LABEL: test_api:
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; IPRA-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movl $buf, %eax
; IPRA-NEXT:    movl $32, %ecx
; IPRA-NEXT:    movw $8, %dx
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT:    movl $buf+1024, %eax
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm1
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    movl $buf+2048, %eax
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm2
; IPRA-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
; IPRA-NEXT:    tilestored %tmm2, (%rax,%rcx)
; IPRA-NEXT:    addq $72, %rsp
; IPRA-NEXT:    tilerelease
; IPRA-NEXT:    vzeroupper
; IPRA-NEXT:    retq
;
; O0-LABEL: test_api:
; O0:       # %bb.0:
; O0-NEXT:    pushq %rbp
; O0-NEXT:    movq %rsp, %rbp
; O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; O0-NEXT:    subq $8192, %rsp # imm = 0x2000
; O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; O0-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %si, %cx
; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT:    movw %di, %ax
; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT:    movl $buf, %esi
; O0-NEXT:    movl $32, %edi
; O0-NEXT:    movw $8, %dx
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rsi,%rdi), %tmm0
; O0-NEXT:    movl $64, %edi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; O0-NEXT:    movw $8, %dx
; O0-NEXT:    tilestored %tmm0, (%rsi,%rdi)
; O0-NEXT:    movl $32, %esi
; O0-NEXT:    movl $buf+1024, %edx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; O0-NEXT:    movl $64, %esi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; O0-NEXT:    vzeroupper
; O0-NEXT:    callq foo
; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    movl $32, %esi
; O0-NEXT:    movl $buf+2048, %edx
; O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; O0-NEXT:    movl $64, %esi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; O0-NEXT:    movl $64, %edi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    movw $8, %si
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rdx,%rdi), %tmm0
; O0-NEXT:    movabsq $64, %rdx
; O0-NEXT:    tilestored %tmm0, 1024(%rsp,%rdx) # 1024-byte Folded Spill
; O0-NEXT:    movl $64, %r8d
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; O0-NEXT:    movw $8, %dx
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    # implicit-def: $dl
; O0-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rdi,%r8), %tmm2
; O0-NEXT:    movl $64, %edi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    tileloadd (%rdx,%rdi), %tmm0
; O0-NEXT:    movw $8, %dx
; O0-NEXT:    movabsq $64, %rdi
; O0-NEXT:    tileloadd 1024(%rsp,%rdi), %tmm1 # 1024-byte Folded Reload
; O0-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; O0-NEXT:    movl $64, %esi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; O0-NEXT:    movl $64, %esi
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; O0-NEXT:    movl $32, %esi
; O0-NEXT:    movl $buf+2048, %edx
; O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; O0-NEXT:    movq %rbp, %rsp
; O0-NEXT:    popq %rbp
; O0-NEXT:    tilerelease
; O0-NEXT:    retq
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  call void @foo()
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  ret void
}

; Loop case: @foo is called before the loop, between the tile load and the
; tile store inside the loop body, once more at the end of the loop body, and
; on the path through block 11 ahead of that block's own tile load and store.
define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-LABEL: test_loop:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $1096, %rsp # imm = 0x448
; CHECK-NEXT:    movl %edi, %ebx
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, (%rsp)
; CHECK-NEXT:    movb $1, (%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jg .LBB2_4
; CHECK-NEXT:  # %bb.1: # %.preheader
; CHECK-NEXT:    movl $7, %ebp
; CHECK-NEXT:    movl $buf, %r14d
; CHECK-NEXT:    movl $32, %r15d
; CHECK-NEXT:    movw $8, %r12w
; CHECK-NEXT:    movl $buf+2048, %r13d
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB2_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT:    tilestored %tmm0, (%r13,%r15)
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    decl %ebp
; CHECK-NEXT:    cmpl $7, %ebp
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.3:
; CHECK-NEXT:    cmpl $3, %ebx
; CHECK-NEXT:    jne .LBB2_4
; CHECK-NEXT:  # %bb.6:
; CHECK-NEXT:    testl %ebp, %ebp
; CHECK-NEXT:    jne .LBB2_5
; CHECK-NEXT:  # %bb.7:
; CHECK-NEXT:    incl %ebx
; CHECK-NEXT:    jmp .LBB2_8
; CHECK-NEXT:  .LBB2_4:
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:  .LBB2_5:
; CHECK-NEXT:    decl %ebx
; CHECK-NEXT:  .LBB2_8:
; CHECK-NEXT:    movl %ebx, %eax
; CHECK-NEXT:    addq $1096, %rsp # imm = 0x448
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
;
; IPRA-LABEL: test_loop:
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    movl %edi, %eax
; IPRA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; IPRA-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jg .LBB2_4
; IPRA-NEXT:  # %bb.1: # %.preheader
; IPRA-NEXT:    movl $7, %ecx
; IPRA-NEXT:    movl $buf, %edx
; IPRA-NEXT:    movl $32, %esi
; IPRA-NEXT:    movw $8, %di
; IPRA-NEXT:    movl $buf+2048, %r8d
; IPRA-NEXT:    .p2align 4
; IPRA-NEXT:  .LBB2_2: # =>This Inner Loop Header: Depth=1
; IPRA-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    tilestored %tmm0, (%r8,%rsi)
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    decl %ecx
; IPRA-NEXT:    cmpl $7, %ecx
; IPRA-NEXT:    jne .LBB2_2
; IPRA-NEXT:  # %bb.3:
; IPRA-NEXT:    cmpl $3, %eax
; IPRA-NEXT:    jne .LBB2_4
; IPRA-NEXT:  # %bb.6:
; IPRA-NEXT:    testl %ecx, %ecx
; IPRA-NEXT:    jne .LBB2_5
; IPRA-NEXT:  # %bb.7:
; IPRA-NEXT:    incl %eax
; IPRA-NEXT:    jmp .LBB2_8
; IPRA-NEXT:  .LBB2_4:
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    movl $32, %ecx
; IPRA-NEXT:    movl $buf+1024, %edx
; IPRA-NEXT:    movw $8, %si
; IPRA-NEXT:    tileloadd (%rdx,%rcx), %tmm0
; IPRA-NEXT:    tilestored %tmm0, (%rdx,%rcx)
; IPRA-NEXT:  .LBB2_5:
; IPRA-NEXT:    decl %eax
; IPRA-NEXT:  .LBB2_8:
; IPRA-NEXT:    addq $72, %rsp
; IPRA-NEXT:    tilerelease
; IPRA-NEXT:    vzeroupper
; IPRA-NEXT:    retq
;
; O0-LABEL: test_loop:
; O0:       # %bb.0:
; O0-NEXT:    pushq %rbp
; O0-NEXT:    movq %rsp, %rbp
; O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; O0-NEXT:    subq $4096, %rsp # imm = 0x1000
; O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; O0-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; O0-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    vzeroupper
; O0-NEXT:    callq foo
; O0-NEXT:  # %bb.1:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; O0-NEXT:    xorl %eax, %eax
; O0-NEXT:    cmpl $0, %ecx
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    jg .LBB2_4
; O0-NEXT:    jmp .LBB2_3
; O0-NEXT:  .LBB2_2:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    cmpl $3, %eax
; O0-NEXT:    je .LBB2_5
; O0-NEXT:    jmp .LBB2_4
; O0-NEXT:  .LBB2_3: # =>This Inner Loop Header: Depth=1
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    movl $buf, %ecx
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    callq foo
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movl $buf+2048, %ecx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    callq foo
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    addl $1, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    cmpl $0, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    je .LBB2_2
; O0-NEXT:    jmp .LBB2_3
; O0-NEXT:  .LBB2_4:
; O0-NEXT:    callq foo
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movl $buf+1024, %ecx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movl $buf+1024, %ecx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    jmp .LBB2_7
; O0-NEXT:  .LBB2_5:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    cmpl $7, %eax
; O0-NEXT:    jne .LBB2_7
; O0-NEXT:  # %bb.6:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    addl $1, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    jmp .LBB2_8
; O0-NEXT:  .LBB2_7:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    subl $1, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:  .LBB2_8:
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    movq %rbp, %rsp
; O0-NEXT:    popq %rbp
; O0-NEXT:    tilerelease
; O0-NEXT:    retq
  call void @foo()
  br label %2
2:
  %3 = icmp sgt i32 %0, 0
  br i1 %3, label %11, label %6
4:
  %5 = icmp eq i32 %0, 3
  br i1 %5, label %13, label %11
6:
  %7 = phi i32 [ %9, %6 ], [ 0, %2 ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32)
  call void @foo()
  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %8)
  call void @foo()
  %9 = add i32 %7, 1
  %10 = icmp eq i32 %9, 0
  br i1 %10, label %4, label %6
11:
  call void @foo()
  %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32, x86_amx %12)
  br label %17
13:
  %14 = icmp eq i32 %9, 7
  br i1 %14, label %15, label %17
15:
  %16 = add i32 %0, 1
  br label %19
17:
  %18 = sub i32 %0, 1
  br label %19
19:
  %20 = phi i32 [ %16, %15 ], [ %18, %17 ]
  ret i32 %20
}

; Loop whose header block calls @foo on every iteration. When the argument is
; positive the body loads a tile, calls @foo, stores the tile, calls @foo
; again, and branches back to the header.
define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-LABEL: test_loop2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $1088, %rsp # imm = 0x440
; CHECK-NEXT:    movl %edi, %ebx
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, (%rsp)
; CHECK-NEXT:    movb $1, (%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %r14d
; CHECK-NEXT:    movl $32, %r15d
; CHECK-NEXT:    movw $8, %bp
; CHECK-NEXT:    movl $buf+2048, %r12d
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jle .LBB3_3
; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg (%rsp)
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT:    tilestored %tmm0, (%r12,%r15)
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    jmp .LBB3_1
; CHECK-NEXT:  .LBB3_3:
; CHECK-NEXT:    addq $1088, %rsp # imm = 0x440
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
;
; IPRA-LABEL: test_loop2:
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; IPRA-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movl $buf, %eax
; IPRA-NEXT:    movl $32, %ecx
; IPRA-NEXT:    movw $8, %dx
; IPRA-NEXT:    movl $buf+2048, %esi
; IPRA-NEXT:    .p2align 4
; IPRA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jle .LBB3_3
; IPRA-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    tilestored %tmm0, (%rsi,%rcx)
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    jmp .LBB3_1
; IPRA-NEXT:  .LBB3_3:
; IPRA-NEXT:    addq $72, %rsp
; IPRA-NEXT:    tilerelease
; IPRA-NEXT:    vzeroupper
; IPRA-NEXT:    retq
;
; O0-LABEL: test_loop2:
; O0:       # %bb.0:
; O0-NEXT:    pushq %rbp
; O0-NEXT:    movq %rsp, %rbp
; O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; O0-NEXT:    subq $3072, %rsp # imm = 0xC00
; O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; O0-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; O0-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    xorl %eax, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    vzeroupper
; O0-NEXT:    callq foo
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    cmpl $0, %eax
; O0-NEXT:    jle .LBB3_3
; O0-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; O0-NEXT:    movl $buf, %ecx
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    callq foo
; O0-NEXT:    movl $64, %edx
; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    # implicit-def: $al
; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; O0-NEXT:    tileloadd (%rcx,%rdx), %tmm0
; O0-NEXT:    movl $32, %edx
; O0-NEXT:    movl $buf+2048, %ecx
; O0-NEXT:    movw $8, %ax
; O0-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; O0-NEXT:    callq foo
; O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; O0-NEXT:    addl $1, %eax
; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT:    jmp .LBB3_1
; O0-NEXT:  .LBB3_3:
; O0-NEXT:    movq %rbp, %rsp
; O0-NEXT:    popq %rbp
; O0-NEXT:    tilerelease
; O0-NEXT:    retq
  br label %2
2:
  %3 = phi i32 [ 0, %1 ], [ %7, %5 ]
  call void @foo()
  %4 = icmp sgt i32 %0, 0
  br i1 %4, label %5, label %8
5:
  %6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32)
  call void @foo()
  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  call void @foo()
  %7 = add i32 %3, 1
  br label %2
8:
  ret void
}

; AMX intrinsics used by the tests above.
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)