; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
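; Exercises the -O0 lowering of the __tile1024i helper API: tiles live in
; 1088-byte __tile1024i_str stack objects that are initialized and copied as
; plain memory (memset/memcpy), and each tileloadd/tdpbssd/tilestored is
; preceded by an ldtilecfg that rewrites the stack tile-configuration area
; (palette byte plus per-tile row/col fields).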

%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
; AVX512-LABEL: test_api:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT:    subq $25600, %rsp # imm = 0x6400
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, %ax
; AVX512-NEXT:    movw %si, %cx
; AVX512-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    cmpl $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    je .LBB0_2
; AVX512-NEXT:  # %bb.1: # %if.then
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    jmp .LBB0_3
; AVX512-NEXT:  .LBB0_2: # %if.else
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:  .LBB0_3: # %if.end
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 128(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 192(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 256(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 320(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 384(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 448(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 512(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 576(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 640(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 704(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 768(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 832(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 896(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 960(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 1024(%rax), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $1024, %edx # imm = 0x400
; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    # kill: def $r8 killed $rax
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %r8d
; AVX512-NEXT:    movw %r8w, %di
; AVX512-NEXT:    shrl $2, %r8d
; AVX512-NEXT:    movw %r8w, %r9w
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $r9b
; AVX512-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movl $64, %r8d
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm0
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm1
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm2
; AVX512-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    addq $64, %rdi
; AVX512-NEXT:    tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT:    # kill: def $rdi killed $rax
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $64, %r8d
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    tileloadd (%rdi,%r8), %tmm0
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    tilerelease
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %m.addr.i85 = alloca i16, align 2
  %n.addr.i86 = alloca i16, align 2
  %base.addr.i87 = alloca ptr, align 8
  %stride.addr.i88 = alloca i64, align 8
  %tile.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
  %m.addr.i81 = alloca i16, align 2
  %n.addr.i82 = alloca i16, align 2
  %k.addr.i = alloca i16, align 2
  %dst.addr.i83 = alloca <256 x i32>, align 64
  %src1.addr.i = alloca <256 x i32>, align 64
  %src2.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
  %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
  %m.addr.i74 = alloca i16, align 2
  %n.addr.i75 = alloca i16, align 2
  %base.addr.i76 = alloca ptr, align 8
  %stride.addr.i77 = alloca i64, align 8
  %m.addr.i70 = alloca i16, align 2
  %n.addr.i71 = alloca i16, align 2
  %base.addr.i72 = alloca ptr, align 8
  %stride.addr.i73 = alloca i64, align 8
  %m.addr.i66 = alloca i16, align 2
  %n.addr.i67 = alloca i16, align 2
  %base.addr.i68 = alloca ptr, align 8
  %stride.addr.i69 = alloca i64, align 8
  %m.addr.i62 = alloca i16, align 2
  %n.addr.i63 = alloca i16, align 2
  %base.addr.i64 = alloca ptr, align 8
  %stride.addr.i65 = alloca i64, align 8
  %m.addr.i58 = alloca i16, align 2
  %n.addr.i59 = alloca i16, align 2
  %base.addr.i60 = alloca ptr, align 8
  %stride.addr.i61 = alloca i64, align 8
  %m.addr.i = alloca i16, align 2
  %n.addr.i = alloca i16, align 2
  %base.addr.i56 = alloca ptr, align 8
  %stride.addr.i57 = alloca i64, align 8
  %base.addr.i50 = alloca ptr, align 8
  %stride.addr.i51 = alloca i64, align 8
  %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
  %c49 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i44 = alloca ptr, align 8
  %indirect-arg-temp.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
  %b43 = alloca %struct.__tile1024i_str, align 64
  %a42 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i35 = alloca ptr, align 8
  %base.addr.i36 = alloca ptr, align 8
  %stride.addr.i37 = alloca i64, align 8
  %dst.addr.i28 = alloca ptr, align 8
  %base.addr.i29 = alloca ptr, align 8
  %stride.addr.i30 = alloca i64, align 8
  %dst.addr.i21 = alloca ptr, align 8
  %base.addr.i22 = alloca ptr, align 8
  %stride.addr.i23 = alloca i64, align 8
  %dst.addr.i14 = alloca ptr, align 8
  %base.addr.i15 = alloca ptr, align 8
  %stride.addr.i16 = alloca i64, align 8
  %dst.addr.i7 = alloca ptr, align 8
  %base.addr.i8 = alloca ptr, align 8
  %stride.addr.i9 = alloca i64, align 8
  %dst.addr.i = alloca ptr, align 8
  %base.addr.i = alloca ptr, align 8
  %stride.addr.i = alloca i64, align 8
  %cond.addr = alloca i32, align 4
  %row.addr = alloca i16, align 2
  %col.addr = alloca i16, align 2
  %a = alloca %struct.__tile1024i_str, align 64
  %b = alloca %struct.__tile1024i_str, align 64
  %c = alloca %struct.__tile1024i_str, align 64
  store i32 %cond, ptr %cond.addr, align 4
  store i16 %row, ptr %row.addr, align 2
  store i16 %col, ptr %col.addr, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %a, i8 0, i64 1088, i1 false)
  %0 = load i16, ptr %row.addr, align 2
  store i16 %0, ptr %a, align 64
  %col2 = getelementptr inbounds %struct.__tile1024i_str, ptr %a, i32 0, i32 1
  store i16 8, ptr %col2, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %b, i8 0, i64 1088, i1 false)
  store i16 8, ptr %b, align 64
  %col4 = getelementptr inbounds %struct.__tile1024i_str, ptr %b, i32 0, i32 1
  %1 = load i16, ptr %col.addr, align 2
  store i16 %1, ptr %col4, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %c, i8 0, i64 1088, i1 false)
  %2 = load i16, ptr %row.addr, align 2
  store i16 %2, ptr %c, align 64
  %col6 = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i32 0, i32 1
  %3 = load i16, ptr %col.addr, align 2
  store i16 %3, ptr %col6, align 2
  %4 = load i32, ptr %cond.addr, align 4
  %tobool = icmp ne i32 %4, 0
  br i1 %tobool, label %if.then, label %if.else
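; Load tiles a, b and c from @buf with a stride of 32 bytes.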
if.then:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i35, align 8
  store ptr @buf, ptr %base.addr.i36, align 8
  store i64 32, ptr %stride.addr.i37, align 8
  %5 = load ptr, ptr %dst.addr.i35, align 8
  %6 = load i16, ptr %5, align 64
  %7 = load ptr, ptr %dst.addr.i35, align 8
  %col.i39 = getelementptr inbounds %struct.__tile1024i_str, ptr %7, i32 0, i32 1
  %8 = load i16, ptr %col.i39, align 2
  %9 = load ptr, ptr %base.addr.i36, align 8
  %10 = load i64, ptr %stride.addr.i37, align 8
  store i16 %6, ptr %m.addr.i, align 2
  store i16 %8, ptr %n.addr.i, align 2
  store ptr %9, ptr %base.addr.i56, align 8
  store i64 %10, ptr %stride.addr.i57, align 8
  %11 = load i16, ptr %m.addr.i, align 2
  %12 = load i16, ptr %n.addr.i, align 2
  %13 = load ptr, ptr %base.addr.i56, align 8
  %14 = load i64, ptr %stride.addr.i57, align 8
  %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
  %16 = bitcast x86_amx %15 to <256 x i32>
  %17 = load ptr, ptr %dst.addr.i35, align 8
  %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, ptr %17, i32 0, i32 3
  store <256 x i32> %16, ptr %tile.i41, align 64
  store ptr %b, ptr %dst.addr.i28, align 8
  store ptr @buf, ptr %base.addr.i29, align 8
  store i64 32, ptr %stride.addr.i30, align 8
  %18 = load ptr, ptr %dst.addr.i28, align 8
  %19 = load i16, ptr %18, align 64
  %20 = load ptr, ptr %dst.addr.i28, align 8
  %col.i32 = getelementptr inbounds %struct.__tile1024i_str, ptr %20, i32 0, i32 1
  %21 = load i16, ptr %col.i32, align 2
  %22 = load ptr, ptr %base.addr.i29, align 8
  %23 = load i64, ptr %stride.addr.i30, align 8
  store i16 %19, ptr %m.addr.i58, align 2
  store i16 %21, ptr %n.addr.i59, align 2
  store ptr %22, ptr %base.addr.i60, align 8
  store i64 %23, ptr %stride.addr.i61, align 8
  %24 = load i16, ptr %m.addr.i58, align 2
  %25 = load i16, ptr %n.addr.i59, align 2
  %26 = load ptr, ptr %base.addr.i60, align 8
  %27 = load i64, ptr %stride.addr.i61, align 8
  %28 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
  %29 = bitcast x86_amx %28 to <256 x i32>
  %30 = load ptr, ptr %dst.addr.i28, align 8
  %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, ptr %30, i32 0, i32 3
  store <256 x i32> %29, ptr %tile.i34, align 64
  store ptr %c, ptr %dst.addr.i21, align 8
  store ptr @buf, ptr %base.addr.i22, align 8
  store i64 32, ptr %stride.addr.i23, align 8
  %31 = load ptr, ptr %dst.addr.i21, align 8
  %32 = load i16, ptr %31, align 64
  %33 = load ptr, ptr %dst.addr.i21, align 8
  %col.i25 = getelementptr inbounds %struct.__tile1024i_str, ptr %33, i32 0, i32 1
  %34 = load i16, ptr %col.i25, align 2
  %35 = load ptr, ptr %base.addr.i22, align 8
  %36 = load i64, ptr %stride.addr.i23, align 8
  store i16 %32, ptr %m.addr.i62, align 2
  store i16 %34, ptr %n.addr.i63, align 2
  store ptr %35, ptr %base.addr.i64, align 8
  store i64 %36, ptr %stride.addr.i65, align 8
  %37 = load i16, ptr %m.addr.i62, align 2
  %38 = load i16, ptr %n.addr.i63, align 2
  %39 = load ptr, ptr %base.addr.i64, align 8
  %40 = load i64, ptr %stride.addr.i65, align 8
  %41 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
  %42 = bitcast x86_amx %41 to <256 x i32>
  %43 = load ptr, ptr %dst.addr.i21, align 8
  %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, ptr %43, i32 0, i32 3
  store <256 x i32> %42, ptr %tile.i27, align 64
  br label %if.end
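; Otherwise load tiles a, b and c from @buf2 with the same 32-byte stride.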
if.else:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i14, align 8
  store ptr @buf2, ptr %base.addr.i15, align 8
  store i64 32, ptr %stride.addr.i16, align 8
  %44 = load ptr, ptr %dst.addr.i14, align 8
  %45 = load i16, ptr %44, align 64
  %46 = load ptr, ptr %dst.addr.i14, align 8
  %col.i18 = getelementptr inbounds %struct.__tile1024i_str, ptr %46, i32 0, i32 1
  %47 = load i16, ptr %col.i18, align 2
  %48 = load ptr, ptr %base.addr.i15, align 8
  %49 = load i64, ptr %stride.addr.i16, align 8
  store i16 %45, ptr %m.addr.i66, align 2
  store i16 %47, ptr %n.addr.i67, align 2
  store ptr %48, ptr %base.addr.i68, align 8
  store i64 %49, ptr %stride.addr.i69, align 8
  %50 = load i16, ptr %m.addr.i66, align 2
  %51 = load i16, ptr %n.addr.i67, align 2
  %52 = load ptr, ptr %base.addr.i68, align 8
  %53 = load i64, ptr %stride.addr.i69, align 8
  %54 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
  %55 = bitcast x86_amx %54 to <256 x i32>
  %56 = load ptr, ptr %dst.addr.i14, align 8
  %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, ptr %56, i32 0, i32 3
  store <256 x i32> %55, ptr %tile.i20, align 64
  store ptr %b, ptr %dst.addr.i7, align 8
  store ptr @buf2, ptr %base.addr.i8, align 8
  store i64 32, ptr %stride.addr.i9, align 8
  %57 = load ptr, ptr %dst.addr.i7, align 8
  %58 = load i16, ptr %57, align 64
  %59 = load ptr, ptr %dst.addr.i7, align 8
  %col.i11 = getelementptr inbounds %struct.__tile1024i_str, ptr %59, i32 0, i32 1
  %60 = load i16, ptr %col.i11, align 2
  %61 = load ptr, ptr %base.addr.i8, align 8
  %62 = load i64, ptr %stride.addr.i9, align 8
  store i16 %58, ptr %m.addr.i70, align 2
  store i16 %60, ptr %n.addr.i71, align 2
  store ptr %61, ptr %base.addr.i72, align 8
  store i64 %62, ptr %stride.addr.i73, align 8
  %63 = load i16, ptr %m.addr.i70, align 2
  %64 = load i16, ptr %n.addr.i71, align 2
  %65 = load ptr, ptr %base.addr.i72, align 8
  %66 = load i64, ptr %stride.addr.i73, align 8
  %67 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
  %68 = bitcast x86_amx %67 to <256 x i32>
  %69 = load ptr, ptr %dst.addr.i7, align 8
  %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, ptr %69, i32 0, i32 3
  store <256 x i32> %68, ptr %tile.i13, align 64
  store ptr %c, ptr %dst.addr.i, align 8
  store ptr @buf2, ptr %base.addr.i, align 8
  store i64 32, ptr %stride.addr.i, align 8
  %70 = load ptr, ptr %dst.addr.i, align 8
  %71 = load i16, ptr %70, align 64
  %72 = load ptr, ptr %dst.addr.i, align 8
  %col.i = getelementptr inbounds %struct.__tile1024i_str, ptr %72, i32 0, i32 1
  %73 = load i16, ptr %col.i, align 2
  %74 = load ptr, ptr %base.addr.i, align 8
  %75 = load i64, ptr %stride.addr.i, align 8
  store i16 %71, ptr %m.addr.i74, align 2
  store i16 %73, ptr %n.addr.i75, align 2
  store ptr %74, ptr %base.addr.i76, align 8
  store i64 %75, ptr %stride.addr.i77, align 8
  %76 = load i16, ptr %m.addr.i74, align 2
  %77 = load i16, ptr %n.addr.i75, align 2
  %78 = load ptr, ptr %base.addr.i76, align 8
  %79 = load i64, ptr %stride.addr.i77, align 8
  %80 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
  %81 = bitcast x86_amx %80 to <256 x i32>
  %82 = load ptr, ptr %dst.addr.i, align 8
  %tile.i = getelementptr inbounds %struct.__tile1024i_str, ptr %82, i32 0, i32 3
  store <256 x i32> %81, ptr %tile.i, align 64
  br label %if.end
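; c = tdpbssd(c, a, b): the three <256 x i32> tile values are copied into
; 1024-byte indirect-argument temporaries before the dot-product, and the
; result tile of c is finally stored back to @buf via tilestored.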
if.end:                                           ; preds = %if.else, %if.then
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %b43, ptr align 1 %b, i64 1088, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %a42, ptr align 1 %a, i64 1088, i1 false) #2
  store ptr %c, ptr %dst.addr.i44, align 8
  %83 = load i16, ptr %a42, align 64
  %col.i46 = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 1
  %84 = load i16, ptr %col.i46, align 2
  %col1.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 1
  %85 = load i16, ptr %col1.i, align 2
  %86 = load ptr, ptr %dst.addr.i44, align 8
  %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, ptr %86, i32 0, i32 3
  %87 = load <256 x i32>, ptr %tile.i47, align 64
  %tile2.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 3
  %88 = load <256 x i32>, ptr %tile2.i, align 64
  %tile3.i = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 3
  %89 = load <256 x i32>, ptr %tile3.i, align 64
  store <256 x i32> %87, ptr %indirect-arg-temp.i, align 1024
  store <256 x i32> %88, ptr %indirect-arg-temp4.i, align 1024
  store <256 x i32> %89, ptr %indirect-arg-temp5.i, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp5.i80, ptr align 1 %indirect-arg-temp5.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp4.i79, ptr align 1 %indirect-arg-temp4.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i78, ptr align 1 %indirect-arg-temp.i, i64 1024, i1 false) #2
  %dst.i = load <256 x i32>, ptr %indirect-arg-temp.i78, align 1024
  %src1.i = load <256 x i32>, ptr %indirect-arg-temp4.i79, align 1024
  %src2.i = load <256 x i32>, ptr %indirect-arg-temp5.i80, align 1024
  store i16 %83, ptr %m.addr.i81, align 2
  store i16 %84, ptr %n.addr.i82, align 2
  store i16 %85, ptr %k.addr.i, align 2
  store <256 x i32> %dst.i, ptr %dst.addr.i83, align 64
  store <256 x i32> %src1.i, ptr %src1.addr.i, align 64
  store <256 x i32> %src2.i, ptr %src2.addr.i, align 64
  %90 = load i16, ptr %m.addr.i81, align 2
  %91 = load i16, ptr %n.addr.i82, align 2
  %92 = load i16, ptr %k.addr.i, align 2
  %93 = load <256 x i32>, ptr %dst.addr.i83, align 64
  %94 = bitcast <256 x i32> %93 to x86_amx
  %95 = load <256 x i32>, ptr %src1.addr.i, align 64
  %96 = bitcast <256 x i32> %95 to x86_amx
  %97 = load <256 x i32>, ptr %src2.addr.i, align 64
  %98 = bitcast <256 x i32> %97 to x86_amx
  %99 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, x86_amx %94, x86_amx %96, x86_amx %98) #2
  %100 = bitcast x86_amx %99 to <256 x i32>
  %101 = load ptr, ptr %dst.addr.i44, align 8
  %tile6.i = getelementptr inbounds %struct.__tile1024i_str, ptr %101, i32 0, i32 3
  store <256 x i32> %100, ptr %tile6.i, align 64
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %c49, ptr align 1 %c, i64 1088, i1 false) #2
  store ptr @buf, ptr %base.addr.i50, align 8
  store i64 32, ptr %stride.addr.i51, align 8
  %102 = load i16, ptr %c49, align 64
  %col.i54 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 1
  %103 = load i16, ptr %col.i54, align 2
  %104 = load ptr, ptr %base.addr.i50, align 8
  %105 = load i64, ptr %stride.addr.i51, align 8
  %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 3
  %106 = load <256 x i32>, ptr %tile.i55, align 64
  store <256 x i32> %106, ptr %indirect-arg-temp.i52, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i5284, ptr align 1 %indirect-arg-temp.i52, i64 1024, i1 false) #2
  %tile.i89 = load <256 x i32>, ptr %indirect-arg-temp.i5284, align 1024
  store i16 %102, ptr %m.addr.i85, align 2
  store i16 %103, ptr %n.addr.i86, align 2
  store ptr %104, ptr %base.addr.i87, align 8
  store i64 %105, ptr %stride.addr.i88, align 8
  store <256 x i32> %tile.i89, ptr %tile.addr.i, align 64
  %107 = load i16, ptr %m.addr.i85, align 2
  %108 = load i16, ptr %n.addr.i86, align 2
  %109 = load ptr, ptr %base.addr.i87, align 8
  %110 = load i64, ptr %stride.addr.i88, align 8
  %111 = load <256 x i32>, ptr %tile.addr.i, align 64
  %112 = bitcast <256 x i32> %111 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, x86_amx %112) #2
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3
attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
attributes #2 = { nounwind }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }