; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
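; This test checks AMX tile configuration at -O0: each tileloadd/tilestored/
; tdpbssd group in the checks below is expected to be preceded by an
; ldtilecfg that loads the palette and tile shapes from a stack slot.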

%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
; AVX512-LABEL: test_api:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT:    subq $25600, %rsp # imm = 0x6400
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, %ax
; AVX512-NEXT:    movw %si, %cx
; AVX512-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    xorl %esi, %esi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    callq memset@PLT
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    cmpl $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    je .LBB0_2
; AVX512-NEXT:  # %bb.1: # %if.then
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    jmp .LBB0_3
; AVX512-NEXT:  .LBB0_2: # %if.else
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movabsq $buf2, %rax
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw (%rax), %si
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw 2(%rax), %dx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    addq $64, %rdx
; AVX512-NEXT:    movl $64, %esi
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:  .LBB0_3: # %if.end
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $1088, %edx # imm = 0x440
; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 128(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 192(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 256(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 320(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 384(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 448(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 512(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 576(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 640(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 704(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 768(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 832(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 896(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 960(%rax), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 1024(%rax), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $1024, %edx # imm = 0x400
; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    # kill: def $r8 killed $rax
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %r8d
; AVX512-NEXT:    movw %r8w, %di
; AVX512-NEXT:    shrl $2, %r8d
; AVX512-NEXT:    movw %r8w, %r9w
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $r9b
; AVX512-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movl $64, %r8d
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm0
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm1
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    tileloadd (%r10,%r8), %tmm2
; AVX512-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    addq $64, %rdi
; AVX512-NEXT:    tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq memcpy@PLT
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT:    # kill: def $rdi killed $rax
; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT:    movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT:    # implicit-def: $al
; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT:    movl $64, %r8d
; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT:    tileloadd (%rdi,%r8), %tmm0
; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    tilerelease
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %m.addr.i85 = alloca i16, align 2
  %n.addr.i86 = alloca i16, align 2
  %base.addr.i87 = alloca ptr, align 8
  %stride.addr.i88 = alloca i64, align 8
  %tile.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
  %m.addr.i81 = alloca i16, align 2
  %n.addr.i82 = alloca i16, align 2
  %k.addr.i = alloca i16, align 2
  %dst.addr.i83 = alloca <256 x i32>, align 64
  %src1.addr.i = alloca <256 x i32>, align 64
  %src2.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
  %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
  %m.addr.i74 = alloca i16, align 2
  %n.addr.i75 = alloca i16, align 2
  %base.addr.i76 = alloca ptr, align 8
  %stride.addr.i77 = alloca i64, align 8
  %m.addr.i70 = alloca i16, align 2
  %n.addr.i71 = alloca i16, align 2
  %base.addr.i72 = alloca ptr, align 8
  %stride.addr.i73 = alloca i64, align 8
  %m.addr.i66 = alloca i16, align 2
  %n.addr.i67 = alloca i16, align 2
  %base.addr.i68 = alloca ptr, align 8
  %stride.addr.i69 = alloca i64, align 8
  %m.addr.i62 = alloca i16, align 2
  %n.addr.i63 = alloca i16, align 2
  %base.addr.i64 = alloca ptr, align 8
  %stride.addr.i65 = alloca i64, align 8
  %m.addr.i58 = alloca i16, align 2
  %n.addr.i59 = alloca i16, align 2
  %base.addr.i60 = alloca ptr, align 8
  %stride.addr.i61 = alloca i64, align 8
  %m.addr.i = alloca i16, align 2
  %n.addr.i = alloca i16, align 2
  %base.addr.i56 = alloca ptr, align 8
  %stride.addr.i57 = alloca i64, align 8
  %base.addr.i50 = alloca ptr, align 8
  %stride.addr.i51 = alloca i64, align 8
  %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
  %c49 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i44 = alloca ptr, align 8
  %indirect-arg-temp.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
  %b43 = alloca %struct.__tile1024i_str, align 64
  %a42 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i35 = alloca ptr, align 8
  %base.addr.i36 = alloca ptr, align 8
  %stride.addr.i37 = alloca i64, align 8
  %dst.addr.i28 = alloca ptr, align 8
  %base.addr.i29 = alloca ptr, align 8
  %stride.addr.i30 = alloca i64, align 8
  %dst.addr.i21 = alloca ptr, align 8
  %base.addr.i22 = alloca ptr, align 8
  %stride.addr.i23 = alloca i64, align 8
  %dst.addr.i14 = alloca ptr, align 8
  %base.addr.i15 = alloca ptr, align 8
  %stride.addr.i16 = alloca i64, align 8
  %dst.addr.i7 = alloca ptr, align 8
  %base.addr.i8 = alloca ptr, align 8
  %stride.addr.i9 = alloca i64, align 8
  %dst.addr.i = alloca ptr, align 8
  %base.addr.i = alloca ptr, align 8
  %stride.addr.i = alloca i64, align 8
  %cond.addr = alloca i32, align 4
  %row.addr = alloca i16, align 2
  %col.addr = alloca i16, align 2
  %a = alloca %struct.__tile1024i_str, align 64
  %b = alloca %struct.__tile1024i_str, align 64
  %c = alloca %struct.__tile1024i_str, align 64
  store i32 %cond, ptr %cond.addr, align 4
  store i16 %row, ptr %row.addr, align 2
  store i16 %col, ptr %col.addr, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %a, i8 0, i64 1088, i1 false)
  %0 = load i16, ptr %row.addr, align 2
  store i16 %0, ptr %a, align 64
  %col2 = getelementptr inbounds %struct.__tile1024i_str, ptr %a, i32 0, i32 1
  store i16 8, ptr %col2, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %b, i8 0, i64 1088, i1 false)
  store i16 8, ptr %b, align 64
  %col4 = getelementptr inbounds %struct.__tile1024i_str, ptr %b, i32 0, i32 1
  %1 = load i16, ptr %col.addr, align 2
  store i16 %1, ptr %col4, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %c, i8 0, i64 1088, i1 false)
  %2 = load i16, ptr %row.addr, align 2
  store i16 %2, ptr %c, align 64
  %col6 = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i32 0, i32 1
  %3 = load i16, ptr %col.addr, align 2
  store i16 %3, ptr %col6, align 2
  %4 = load i32, ptr %cond.addr, align 4
  %tobool = icmp ne i32 %4, 0
  br i1 %tobool, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i35, align 8
  store ptr @buf, ptr %base.addr.i36, align 8
  store i64 32, ptr %stride.addr.i37, align 8
  %5 = load ptr, ptr %dst.addr.i35, align 8
  %6 = load i16, ptr %5, align 64
  %7 = load ptr, ptr %dst.addr.i35, align 8
  %col.i39 = getelementptr inbounds %struct.__tile1024i_str, ptr %7, i32 0, i32 1
  %8 = load i16, ptr %col.i39, align 2
  %9 = load ptr, ptr %base.addr.i36, align 8
  %10 = load i64, ptr %stride.addr.i37, align 8
  store i16 %6, ptr %m.addr.i, align 2
  store i16 %8, ptr %n.addr.i, align 2
  store ptr %9, ptr %base.addr.i56, align 8
  store i64 %10, ptr %stride.addr.i57, align 8
  %11 = load i16, ptr %m.addr.i, align 2
  %12 = load i16, ptr %n.addr.i, align 2
  %13 = load ptr, ptr %base.addr.i56, align 8
  %14 = load i64, ptr %stride.addr.i57, align 8
  %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
  %16 = bitcast x86_amx %15 to <256 x i32>
  %17 = load ptr, ptr %dst.addr.i35, align 8
  %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, ptr %17, i32 0, i32 3
  store <256 x i32> %16, ptr %tile.i41, align 64
  store ptr %b, ptr %dst.addr.i28, align 8
  store ptr @buf, ptr %base.addr.i29, align 8
  store i64 32, ptr %stride.addr.i30, align 8
  %18 = load ptr, ptr %dst.addr.i28, align 8
  %19 = load i16, ptr %18, align 64
  %20 = load ptr, ptr %dst.addr.i28, align 8
  %col.i32 = getelementptr inbounds %struct.__tile1024i_str, ptr %20, i32 0, i32 1
  %21 = load i16, ptr %col.i32, align 2
  %22 = load ptr, ptr %base.addr.i29, align 8
  %23 = load i64, ptr %stride.addr.i30, align 8
  store i16 %19, ptr %m.addr.i58, align 2
  store i16 %21, ptr %n.addr.i59, align 2
  store ptr %22, ptr %base.addr.i60, align 8
  store i64 %23, ptr %stride.addr.i61, align 8
  %24 = load i16, ptr %m.addr.i58, align 2
  %25 = load i16, ptr %n.addr.i59, align 2
  %26 = load ptr, ptr %base.addr.i60, align 8
  %27 = load i64, ptr %stride.addr.i61, align 8
  %28 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
  %29 = bitcast x86_amx %28 to <256 x i32>
  %30 = load ptr, ptr %dst.addr.i28, align 8
  %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, ptr %30, i32 0, i32 3
  store <256 x i32> %29, ptr %tile.i34, align 64
  store ptr %c, ptr %dst.addr.i21, align 8
  store ptr @buf, ptr %base.addr.i22, align 8
  store i64 32, ptr %stride.addr.i23, align 8
  %31 = load ptr, ptr %dst.addr.i21, align 8
  %32 = load i16, ptr %31, align 64
  %33 = load ptr, ptr %dst.addr.i21, align 8
  %col.i25 = getelementptr inbounds %struct.__tile1024i_str, ptr %33, i32 0, i32 1
  %34 = load i16, ptr %col.i25, align 2
  %35 = load ptr, ptr %base.addr.i22, align 8
  %36 = load i64, ptr %stride.addr.i23, align 8
  store i16 %32, ptr %m.addr.i62, align 2
  store i16 %34, ptr %n.addr.i63, align 2
  store ptr %35, ptr %base.addr.i64, align 8
  store i64 %36, ptr %stride.addr.i65, align 8
  %37 = load i16, ptr %m.addr.i62, align 2
  %38 = load i16, ptr %n.addr.i63, align 2
  %39 = load ptr, ptr %base.addr.i64, align 8
  %40 = load i64, ptr %stride.addr.i65, align 8
  %41 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
  %42 = bitcast x86_amx %41 to <256 x i32>
  %43 = load ptr, ptr %dst.addr.i21, align 8
  %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, ptr %43, i32 0, i32 3
  store <256 x i32> %42, ptr %tile.i27, align 64
  br label %if.end

if.else:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i14, align 8
  store ptr @buf2, ptr %base.addr.i15, align 8
  store i64 32, ptr %stride.addr.i16, align 8
  %44 = load ptr, ptr %dst.addr.i14, align 8
  %45 = load i16, ptr %44, align 64
  %46 = load ptr, ptr %dst.addr.i14, align 8
  %col.i18 = getelementptr inbounds %struct.__tile1024i_str, ptr %46, i32 0, i32 1
  %47 = load i16, ptr %col.i18, align 2
  %48 = load ptr, ptr %base.addr.i15, align 8
  %49 = load i64, ptr %stride.addr.i16, align 8
  store i16 %45, ptr %m.addr.i66, align 2
  store i16 %47, ptr %n.addr.i67, align 2
  store ptr %48, ptr %base.addr.i68, align 8
  store i64 %49, ptr %stride.addr.i69, align 8
  %50 = load i16, ptr %m.addr.i66, align 2
  %51 = load i16, ptr %n.addr.i67, align 2
  %52 = load ptr, ptr %base.addr.i68, align 8
  %53 = load i64, ptr %stride.addr.i69, align 8
  %54 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
  %55 = bitcast x86_amx %54 to <256 x i32>
  %56 = load ptr, ptr %dst.addr.i14, align 8
  %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, ptr %56, i32 0, i32 3
  store <256 x i32> %55, ptr %tile.i20, align 64
  store ptr %b, ptr %dst.addr.i7, align 8
  store ptr @buf2, ptr %base.addr.i8, align 8
  store i64 32, ptr %stride.addr.i9, align 8
  %57 = load ptr, ptr %dst.addr.i7, align 8
  %58 = load i16, ptr %57, align 64
  %59 = load ptr, ptr %dst.addr.i7, align 8
  %col.i11 = getelementptr inbounds %struct.__tile1024i_str, ptr %59, i32 0, i32 1
  %60 = load i16, ptr %col.i11, align 2
  %61 = load ptr, ptr %base.addr.i8, align 8
  %62 = load i64, ptr %stride.addr.i9, align 8
  store i16 %58, ptr %m.addr.i70, align 2
  store i16 %60, ptr %n.addr.i71, align 2
  store ptr %61, ptr %base.addr.i72, align 8
  store i64 %62, ptr %stride.addr.i73, align 8
  %63 = load i16, ptr %m.addr.i70, align 2
  %64 = load i16, ptr %n.addr.i71, align 2
  %65 = load ptr, ptr %base.addr.i72, align 8
  %66 = load i64, ptr %stride.addr.i73, align 8
  %67 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
  %68 = bitcast x86_amx %67 to <256 x i32>
  %69 = load ptr, ptr %dst.addr.i7, align 8
  %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, ptr %69, i32 0, i32 3
  store <256 x i32> %68, ptr %tile.i13, align 64
  store ptr %c, ptr %dst.addr.i, align 8
  store ptr @buf2, ptr %base.addr.i, align 8
  store i64 32, ptr %stride.addr.i, align 8
  %70 = load ptr, ptr %dst.addr.i, align 8
  %71 = load i16, ptr %70, align 64
  %72 = load ptr, ptr %dst.addr.i, align 8
  %col.i = getelementptr inbounds %struct.__tile1024i_str, ptr %72, i32 0, i32 1
  %73 = load i16, ptr %col.i, align 2
  %74 = load ptr, ptr %base.addr.i, align 8
  %75 = load i64, ptr %stride.addr.i, align 8
  store i16 %71, ptr %m.addr.i74, align 2
  store i16 %73, ptr %n.addr.i75, align 2
  store ptr %74, ptr %base.addr.i76, align 8
  store i64 %75, ptr %stride.addr.i77, align 8
  %76 = load i16, ptr %m.addr.i74, align 2
  %77 = load i16, ptr %n.addr.i75, align 2
  %78 = load ptr, ptr %base.addr.i76, align 8
  %79 = load i64, ptr %stride.addr.i77, align 8
  %80 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
  %81 = bitcast x86_amx %80 to <256 x i32>
  %82 = load ptr, ptr %dst.addr.i, align 8
  %tile.i = getelementptr inbounds %struct.__tile1024i_str, ptr %82, i32 0, i32 3
  store <256 x i32> %81, ptr %tile.i, align 64
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %b43, ptr align 1 %b, i64 1088, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %a42, ptr align 1 %a, i64 1088, i1 false) #2
  store ptr %c, ptr %dst.addr.i44, align 8
  %83 = load i16, ptr %a42, align 64
  %col.i46 = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 1
  %84 = load i16, ptr %col.i46, align 2
  %col1.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 1
  %85 = load i16, ptr %col1.i, align 2
  %86 = load ptr, ptr %dst.addr.i44, align 8
  %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, ptr %86, i32 0, i32 3
  %87 = load <256 x i32>, ptr %tile.i47, align 64
  %tile2.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 3
  %88 = load <256 x i32>, ptr %tile2.i, align 64
  %tile3.i = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 3
  %89 = load <256 x i32>, ptr %tile3.i, align 64
  store <256 x i32> %87, ptr %indirect-arg-temp.i, align 1024
  store <256 x i32> %88, ptr %indirect-arg-temp4.i, align 1024
  store <256 x i32> %89, ptr %indirect-arg-temp5.i, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp5.i80, ptr align 1 %indirect-arg-temp5.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp4.i79, ptr align 1 %indirect-arg-temp4.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i78, ptr align 1 %indirect-arg-temp.i, i64 1024, i1 false) #2
  %dst.i = load <256 x i32>, ptr %indirect-arg-temp.i78, align 1024
  %src1.i = load <256 x i32>, ptr %indirect-arg-temp4.i79, align 1024
  %src2.i = load <256 x i32>, ptr %indirect-arg-temp5.i80, align 1024
  store i16 %83, ptr %m.addr.i81, align 2
  store i16 %84, ptr %n.addr.i82, align 2
  store i16 %85, ptr %k.addr.i, align 2
  store <256 x i32> %dst.i, ptr %dst.addr.i83, align 64
  store <256 x i32> %src1.i, ptr %src1.addr.i, align 64
  store <256 x i32> %src2.i, ptr %src2.addr.i, align 64
  %90 = load i16, ptr %m.addr.i81, align 2
  %91 = load i16, ptr %n.addr.i82, align 2
  %92 = load i16, ptr %k.addr.i, align 2
  %93 = load <256 x i32>, ptr %dst.addr.i83, align 64
  %94 = bitcast <256 x i32> %93 to x86_amx
  %95 = load <256 x i32>, ptr %src1.addr.i, align 64
  %96 = bitcast <256 x i32> %95 to x86_amx
  %97 = load <256 x i32>, ptr %src2.addr.i, align 64
  %98 = bitcast <256 x i32> %97 to x86_amx
  %99 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, x86_amx %94, x86_amx %96, x86_amx %98) #2
  %100 = bitcast x86_amx %99 to <256 x i32>
  %101 = load ptr, ptr %dst.addr.i44, align 8
  %tile6.i = getelementptr inbounds %struct.__tile1024i_str, ptr %101, i32 0, i32 3
  store <256 x i32> %100, ptr %tile6.i, align 64
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %c49, ptr align 1 %c, i64 1088, i1 false) #2
  store ptr @buf, ptr %base.addr.i50, align 8
  store i64 32, ptr %stride.addr.i51, align 8
  %102 = load i16, ptr %c49, align 64
  %col.i54 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 1
  %103 = load i16, ptr %col.i54, align 2
  %104 = load ptr, ptr %base.addr.i50, align 8
  %105 = load i64, ptr %stride.addr.i51, align 8
  %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 3
  %106 = load <256 x i32>, ptr %tile.i55, align 64
  store <256 x i32> %106, ptr %indirect-arg-temp.i52, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i5284, ptr align 1 %indirect-arg-temp.i52, i64 1024, i1 false) #2
  %tile.i89 = load <256 x i32>, ptr %indirect-arg-temp.i5284, align 1024
  store i16 %102, ptr %m.addr.i85, align 2
  store i16 %103, ptr %n.addr.i86, align 2
  store ptr %104, ptr %base.addr.i87, align 8
  store i64 %105, ptr %stride.addr.i88, align 8
  store <256 x i32> %tile.i89, ptr %tile.addr.i, align 64
  %107 = load i16, ptr %m.addr.i85, align 2
  %108 = load i16, ptr %n.addr.i86, align 2
  %109 = load ptr, ptr %base.addr.i87, align 8
  %110 = load i64, ptr %stride.addr.i88, align 8
  %111 = load <256 x i32>, ptr %tile.addr.i, align 64
  %112 = bitcast <256 x i32> %111 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, x86_amx %112) #2
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3

attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
attributes #2 = { nounwind }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }