xref: /llvm-project/llvm/test/CodeGen/X86/frem.ll (revision 2e4d2762b5f8c6b0ae02c2a9d517e009f470b8a6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-linux-gnu < %s -mattr=fma | FileCheck %s
3
4; Basic test coverage for FREM
5
; Scalar half frem, result stored through %p3.
; Per the assertions below, each half operand is widened with __extendhfsf2,
; the remainder is computed by fmodf, and the float result is narrowed with
; __truncsfhf2 before a 16-bit vpextrw store to the pointer kept in rbx.
; %a0 is spilled first because the first widening call is made on %a1.
6define void @frem_f16(half %a0, half %a1, ptr%p3) nounwind {
7; CHECK-LABEL: frem_f16:
8; CHECK:       # %bb.0:
9; CHECK-NEXT:    pushq %rbx
10; CHECK-NEXT:    subq $16, %rsp
11; CHECK-NEXT:    movq %rdi, %rbx
12; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
13; CHECK-NEXT:    vmovaps %xmm1, %xmm0
14; CHECK-NEXT:    callq __extendhfsf2@PLT
15; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
16; CHECK-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
17; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
18; CHECK-NEXT:    callq __extendhfsf2@PLT
19; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
20; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
21; CHECK-NEXT:    callq fmodf@PLT
22; CHECK-NEXT:    callq __truncsfhf2@PLT
23; CHECK-NEXT:    vpextrw $0, %xmm0, (%rbx)
24; CHECK-NEXT:    addq $16, %rsp
25; CHECK-NEXT:    popq %rbx
26; CHECK-NEXT:    retq
27  %frem = frem half %a0, %a1
28  store half %frem, ptr%p3
29  ret void
30}
31
; Scalar float frem, result stored through %p3.
; The assertions expect a single fmodf call with no operand shuffling, followed
; by a vmovss store to the pointer that was saved in rbx across the call.
32define void @frem_f32(float %a0, float %a1, ptr%p3) nounwind {
33; CHECK-LABEL: frem_f32:
34; CHECK:       # %bb.0:
35; CHECK-NEXT:    pushq %rbx
36; CHECK-NEXT:    movq %rdi, %rbx
37; CHECK-NEXT:    callq fmodf@PLT
38; CHECK-NEXT:    vmovss %xmm0, (%rbx)
39; CHECK-NEXT:    popq %rbx
40; CHECK-NEXT:    retq
41  %frem = frem float %a0, %a1
42  store float %frem, ptr%p3
43  ret void
44}
45
; Scalar double frem, result stored through %p3.
; The assertions expect a single fmod call with no operand shuffling, followed
; by a vmovsd store to the pointer that was saved in rbx across the call.
46define void @frem_f64(double %a0, double %a1, ptr%p3) nounwind {
47; CHECK-LABEL: frem_f64:
48; CHECK:       # %bb.0:
49; CHECK-NEXT:    pushq %rbx
50; CHECK-NEXT:    movq %rdi, %rbx
51; CHECK-NEXT:    callq fmod@PLT
52; CHECK-NEXT:    vmovsd %xmm0, (%rbx)
53; CHECK-NEXT:    popq %rbx
54; CHECK-NEXT:    retq
55  %frem = frem double %a0, %a1
56  store double %frem, ptr%p3
57  ret void
58}
59
; Scalar x86_fp80 frem, result stored through %p3.
; Per the assertions, both operands are loaded from the stack with fldt and
; written back with fstpt into the 32 bytes reserved by this function before
; the fmodl call; the x87 result is then stored with fstpt to (%rbx).
60define void @frem_f80(x86_fp80 %a0, x86_fp80 %a1, ptr%p3) nounwind {
61; CHECK-LABEL: frem_f80:
62; CHECK:       # %bb.0:
63; CHECK-NEXT:    pushq %rbx
64; CHECK-NEXT:    subq $32, %rsp
65; CHECK-NEXT:    movq %rdi, %rbx
66; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
67; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
68; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
69; CHECK-NEXT:    fstpt (%rsp)
70; CHECK-NEXT:    callq fmodl@PLT
71; CHECK-NEXT:    fstpt (%rbx)
72; CHECK-NEXT:    addq $32, %rsp
73; CHECK-NEXT:    popq %rbx
74; CHECK-NEXT:    retq
75  %frem = frem x86_fp80 %a0, %a1
76  store x86_fp80 %frem, ptr%p3
77  ret void
78}
79
; Scalar fp128 frem, result stored through %p3.
; The assertions expect a single call to fmodf128 and a 16-byte vmovaps store
; to (%rbx). Note that the expected call here carries no @PLT suffix, unlike
; the fmodf/fmod/fmodl calls asserted for the other scalar types in this file.
80define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind {
81; CHECK-LABEL: frem_f128:
82; CHECK:       # %bb.0:
83; CHECK-NEXT:    pushq %rbx
84; CHECK-NEXT:    movq %rdi, %rbx
85; CHECK-NEXT:    callq fmodf128
86; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
87; CHECK-NEXT:    popq %rbx
88; CHECK-NEXT:    retq
89  %frem = frem fp128 %a0, %a1
90  store fp128 %frem, ptr%p3
91  ret void
92}
93
; <16 x float> frem, result stored through %p3.
; The assertions show the operation fully scalarized into sixteen fmodf calls.
; The four incoming ymm registers are spilled up front; each lane pair is then
; pulled out of the spill slots (vmovshdup / vpermilpd / vpermilps for lanes
; 1-3, vextractf128 for the upper 128-bit halves), and each group of four
; results is rebuilt with vinsertps. A vzeroupper is expected before every call
; that directly follows a ymm access. The result is written as four 16-byte
; stores at offsets 0, 16, 32 and 48 from the pointer held in rbx.
94define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind {
95; CHECK-LABEL: frem_v16f32:
96; CHECK:       # %bb.0:
97; CHECK-NEXT:    pushq %rbx
98; CHECK-NEXT:    subq $160, %rsp
99; CHECK-NEXT:    movq %rdi, %rbx
100; CHECK-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
101; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
102; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
103; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
104; CHECK-NEXT:    vmovaps %xmm1, %xmm0
105; CHECK-NEXT:    vmovaps %xmm3, %xmm1
106; CHECK-NEXT:    vzeroupper
107; CHECK-NEXT:    callq fmodf@PLT
108; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
109; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
110; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
111; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
112; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
113; CHECK-NEXT:    callq fmodf@PLT
114; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
115; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
116; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
117; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
118; CHECK-NEXT:    # xmm0 = mem[1,0]
119; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
120; CHECK-NEXT:    # xmm1 = mem[1,0]
121; CHECK-NEXT:    callq fmodf@PLT
122; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
123; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
124; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
125; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
126; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
127; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
128; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
129; CHECK-NEXT:    callq fmodf@PLT
130; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
131; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
132; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
133; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
134; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
135; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
136; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
137; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
138; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
139; CHECK-NEXT:    vmovaps %xmm2, %xmm0
140; CHECK-NEXT:    vzeroupper
141; CHECK-NEXT:    callq fmodf@PLT
142; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
143; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
144; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
145; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
146; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
147; CHECK-NEXT:    callq fmodf@PLT
148; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
149; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
150; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
151; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
152; CHECK-NEXT:    # xmm0 = mem[1,0]
153; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
154; CHECK-NEXT:    # xmm1 = mem[1,0]
155; CHECK-NEXT:    callq fmodf@PLT
156; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
157; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
158; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
159; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
160; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
161; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
162; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
163; CHECK-NEXT:    callq fmodf@PLT
164; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
165; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
166; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
167; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
168; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
169; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
170; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
171; CHECK-NEXT:    vzeroupper
172; CHECK-NEXT:    callq fmodf@PLT
173; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
174; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
175; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
176; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
177; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
178; CHECK-NEXT:    callq fmodf@PLT
179; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
180; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
181; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
182; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
183; CHECK-NEXT:    # xmm0 = mem[1,0]
184; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
185; CHECK-NEXT:    # xmm1 = mem[1,0]
186; CHECK-NEXT:    callq fmodf@PLT
187; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
188; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
189; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
190; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
191; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
192; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
193; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
194; CHECK-NEXT:    callq fmodf@PLT
195; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
196; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
197; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
198; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
199; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
200; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
201; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
202; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
203; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
204; CHECK-NEXT:    vmovaps %xmm2, %xmm0
205; CHECK-NEXT:    vzeroupper
206; CHECK-NEXT:    callq fmodf@PLT
207; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
208; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
209; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
210; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
211; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
212; CHECK-NEXT:    callq fmodf@PLT
213; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
214; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
215; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
216; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
217; CHECK-NEXT:    # xmm0 = mem[1,0]
218; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
219; CHECK-NEXT:    # xmm1 = mem[1,0]
220; CHECK-NEXT:    callq fmodf@PLT
221; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
222; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
223; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
224; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
225; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
226; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
227; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
228; CHECK-NEXT:    callq fmodf@PLT
229; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
230; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
231; CHECK-NEXT:    vmovaps %xmm0, 16(%rbx)
232; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
233; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
234; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
235; CHECK-NEXT:    vmovaps %xmm0, 48(%rbx)
236; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
237; CHECK-NEXT:    vmovaps %xmm0, 32(%rbx)
238; CHECK-NEXT:    addq $160, %rsp
239; CHECK-NEXT:    popq %rbx
240; CHECK-NEXT:    retq
241  %frem = frem <16 x float> %a0, %a1
242  store <16 x float> %frem, ptr%p3
243  ret void
244}
245
; <8 x float> frem, result stored through %p3.
; The assertions show the operation scalarized into eight fmodf calls: four on
; the low 128-bit halves of the spilled ymm operands and four on the upper
; halves obtained with vextractf128. Lanes 1-3 are read from the spill slots
; with vmovshdup / vpermilpd / vpermilps, and each group of four results is
; rebuilt with vinsertps. The result is written as two 16-byte stores at
; offsets 0 and 16 from the pointer held in rbx.
246define void @frem_v8f32(<8 x float> %a0, <8 x float> %a1, ptr%p3) nounwind {
247; CHECK-LABEL: frem_v8f32:
248; CHECK:       # %bb.0:
249; CHECK-NEXT:    pushq %rbx
250; CHECK-NEXT:    subq $96, %rsp
251; CHECK-NEXT:    movq %rdi, %rbx
252; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
253; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
254; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
255; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
256; CHECK-NEXT:    vzeroupper
257; CHECK-NEXT:    callq fmodf@PLT
258; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
259; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
260; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
261; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
262; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
263; CHECK-NEXT:    callq fmodf@PLT
264; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
265; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
266; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
267; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
268; CHECK-NEXT:    # xmm0 = mem[1,0]
269; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
270; CHECK-NEXT:    # xmm1 = mem[1,0]
271; CHECK-NEXT:    callq fmodf@PLT
272; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
273; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
274; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
275; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
276; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
277; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
278; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
279; CHECK-NEXT:    callq fmodf@PLT
280; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
281; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
282; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
283; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
284; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
285; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
286; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
287; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
288; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
289; CHECK-NEXT:    vmovaps %xmm2, %xmm0
290; CHECK-NEXT:    vzeroupper
291; CHECK-NEXT:    callq fmodf@PLT
292; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
293; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
294; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
295; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
296; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
297; CHECK-NEXT:    callq fmodf@PLT
298; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
299; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
300; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
301; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
302; CHECK-NEXT:    # xmm0 = mem[1,0]
303; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
304; CHECK-NEXT:    # xmm1 = mem[1,0]
305; CHECK-NEXT:    callq fmodf@PLT
306; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
307; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
308; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
309; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
310; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
311; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
312; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
313; CHECK-NEXT:    callq fmodf@PLT
314; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
315; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
316; CHECK-NEXT:    vmovaps %xmm0, 16(%rbx)
317; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
318; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
319; CHECK-NEXT:    addq $96, %rsp
320; CHECK-NEXT:    popq %rbx
321; CHECK-NEXT:    retq
322  %frem = frem <8 x float> %a0, %a1
323  store <8 x float> %frem, ptr%p3
324  ret void
325}
326
; <4 x float> frem, result stored through %p3.
; The assertions show the operation scalarized into four fmodf calls. Lane 0 is
; passed straight from xmm0/xmm1; lanes 1-3 are read from the 16-byte spill
; slots with vmovshdup / vpermilpd / vpermilps. The four results are rebuilt
; with vinsertps and written with one 16-byte store to (%rbx).
327define void @frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr%p3) nounwind {
328; CHECK-LABEL: frem_v4f32:
329; CHECK:       # %bb.0:
330; CHECK-NEXT:    pushq %rbx
331; CHECK-NEXT:    subq $48, %rsp
332; CHECK-NEXT:    movq %rdi, %rbx
333; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
334; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
335; CHECK-NEXT:    callq fmodf@PLT
336; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
337; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
338; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
339; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
340; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
341; CHECK-NEXT:    callq fmodf@PLT
342; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
343; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
344; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
345; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
346; CHECK-NEXT:    # xmm0 = mem[1,0]
347; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
348; CHECK-NEXT:    # xmm1 = mem[1,0]
349; CHECK-NEXT:    callq fmodf@PLT
350; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
351; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
352; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
353; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
354; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
355; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
356; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
357; CHECK-NEXT:    callq fmodf@PLT
358; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
359; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
360; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
361; CHECK-NEXT:    addq $48, %rsp
362; CHECK-NEXT:    popq %rbx
363; CHECK-NEXT:    retq
364  %frem = frem <4 x float> %a0, %a1
365  store <4 x float> %frem, ptr%p3
366  ret void
367}
368
; <8 x double> frem, result stored through %p3.
; The assertions show the operation scalarized into eight fmod calls. The four
; incoming ymm registers are spilled up front; the high element of each
; 128-bit half is read from the spill slots with vpermilpd, the upper halves
; are obtained with vextractf128, and each pair of results is joined with
; vunpcklpd. A vzeroupper is expected before every call that directly follows
; a ymm access. The result is written as four 16-byte stores at offsets 0, 16,
; 32 and 48 from the pointer held in rbx.
369define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind {
370; CHECK-LABEL: frem_v8f64:
371; CHECK:       # %bb.0:
372; CHECK-NEXT:    pushq %rbx
373; CHECK-NEXT:    subq $160, %rsp
374; CHECK-NEXT:    movq %rdi, %rbx
375; CHECK-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
376; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
377; CHECK-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
378; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
379; CHECK-NEXT:    vmovaps %xmm1, %xmm0
380; CHECK-NEXT:    vmovaps %xmm3, %xmm1
381; CHECK-NEXT:    vzeroupper
382; CHECK-NEXT:    callq fmod@PLT
383; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
384; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
385; CHECK-NEXT:    # xmm0 = mem[1,0]
386; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
387; CHECK-NEXT:    # xmm1 = mem[1,0]
388; CHECK-NEXT:    callq fmod@PLT
389; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
390; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
391; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
392; CHECK-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
393; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
394; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
395; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
396; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
397; CHECK-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
398; CHECK-NEXT:    vmovaps %xmm2, %xmm0
399; CHECK-NEXT:    vzeroupper
400; CHECK-NEXT:    callq fmod@PLT
401; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
402; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
403; CHECK-NEXT:    # xmm0 = mem[1,0]
404; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
405; CHECK-NEXT:    # xmm1 = mem[1,0]
406; CHECK-NEXT:    callq fmod@PLT
407; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
408; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
409; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
410; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
411; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
412; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
413; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
414; CHECK-NEXT:    vzeroupper
415; CHECK-NEXT:    callq fmod@PLT
416; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
417; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
418; CHECK-NEXT:    # xmm0 = mem[1,0]
419; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
420; CHECK-NEXT:    # xmm1 = mem[1,0]
421; CHECK-NEXT:    callq fmod@PLT
422; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
423; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
424; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
425; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
426; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
427; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
428; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
429; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
430; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
431; CHECK-NEXT:    vmovaps %xmm2, %xmm0
432; CHECK-NEXT:    vzeroupper
433; CHECK-NEXT:    callq fmod@PLT
434; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
436; CHECK-NEXT:    # xmm0 = mem[1,0]
437; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
438; CHECK-NEXT:    # xmm1 = mem[1,0]
439; CHECK-NEXT:    callq fmod@PLT
440; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
441; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
442; CHECK-NEXT:    vmovapd %xmm0, 16(%rbx)
443; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
444; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
445; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
446; CHECK-NEXT:    vmovaps %xmm0, 48(%rbx)
447; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
448; CHECK-NEXT:    vmovaps %xmm0, 32(%rbx)
449; CHECK-NEXT:    addq $160, %rsp
450; CHECK-NEXT:    popq %rbx
451; CHECK-NEXT:    retq
452  %frem = frem <8 x double> %a0, %a1
453  store <8 x double> %frem, ptr%p3
454  ret void
455}
456
; <4 x double> frem, result stored through %p3.
; The assertions show the operation scalarized into four fmod calls: two on the
; low 128-bit halves of the spilled ymm operands and two on the upper halves
; obtained with vextractf128. The high element of each half is read with
; vpermilpd, and each pair of results is joined with vunpcklpd. The result is
; written as two 16-byte stores at offsets 0 and 16 from the pointer in rbx.
457define void @frem_v4f64(<4 x double> %a0, <4 x double> %a1, ptr%p3) nounwind {
458; CHECK-LABEL: frem_v4f64:
459; CHECK:       # %bb.0:
460; CHECK-NEXT:    pushq %rbx
461; CHECK-NEXT:    subq $96, %rsp
462; CHECK-NEXT:    movq %rdi, %rbx
463; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
464; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
465; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
466; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
467; CHECK-NEXT:    vzeroupper
468; CHECK-NEXT:    callq fmod@PLT
469; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
470; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
471; CHECK-NEXT:    # xmm0 = mem[1,0]
472; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
473; CHECK-NEXT:    # xmm1 = mem[1,0]
474; CHECK-NEXT:    callq fmod@PLT
475; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
476; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
477; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
478; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
479; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
480; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
481; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
482; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
483; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
484; CHECK-NEXT:    vmovaps %xmm2, %xmm0
485; CHECK-NEXT:    vzeroupper
486; CHECK-NEXT:    callq fmod@PLT
487; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
488; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
489; CHECK-NEXT:    # xmm0 = mem[1,0]
490; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
491; CHECK-NEXT:    # xmm1 = mem[1,0]
492; CHECK-NEXT:    callq fmod@PLT
493; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
494; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
495; CHECK-NEXT:    vmovapd %xmm0, 16(%rbx)
496; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
497; CHECK-NEXT:    vmovaps %xmm0, (%rbx)
498; CHECK-NEXT:    addq $96, %rsp
499; CHECK-NEXT:    popq %rbx
500; CHECK-NEXT:    retq
501  %frem = frem <4 x double> %a0, %a1
502  store <4 x double> %frem, ptr%p3
503  ret void
504}
505
; <2 x double> frem, result stored through %p3.
; The assertions show the operation scalarized into two fmod calls. Element 0
; is passed straight from xmm0/xmm1; element 1 is read from the 16-byte spill
; slots with vpermilpd. The two results are joined with vunpcklpd and written
; with one 16-byte store to (%rbx).
506define void @frem_v2f64(<2 x double> %a0, <2 x double> %a1, ptr%p3) nounwind {
507; CHECK-LABEL: frem_v2f64:
508; CHECK:       # %bb.0:
509; CHECK-NEXT:    pushq %rbx
510; CHECK-NEXT:    subq $48, %rsp
511; CHECK-NEXT:    movq %rdi, %rbx
512; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
513; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
514; CHECK-NEXT:    callq fmod@PLT
515; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
516; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
517; CHECK-NEXT:    # xmm0 = mem[1,0]
518; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
519; CHECK-NEXT:    # xmm1 = mem[1,0]
520; CHECK-NEXT:    callq fmod@PLT
521; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
522; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
523; CHECK-NEXT:    vmovapd %xmm0, (%rbx)
524; CHECK-NEXT:    addq $48, %rsp
525; CHECK-NEXT:    popq %rbx
526; CHECK-NEXT:    retq
527  %frem = frem <2 x double> %a0, %a1
528  store <2 x double> %frem, ptr%p3
529  ret void
530}
531
532define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, ptr%p3) nounwind {
533; CHECK-LABEL: frem_v32f16:
534; CHECK:       # %bb.0:
535; CHECK-NEXT:    pushq %rbx
536; CHECK-NEXT:    subq $224, %rsp
537; CHECK-NEXT:    movq %rdi, %rbx
538; CHECK-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
539; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
540; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
541; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
542; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm0
543; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
544; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
545; CHECK-NEXT:    vzeroupper
546; CHECK-NEXT:    callq __extendhfsf2@PLT
547; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
548; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
549; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
550; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
551; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
552; CHECK-NEXT:    vzeroupper
553; CHECK-NEXT:    callq __extendhfsf2@PLT
554; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
555; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
556; CHECK-NEXT:    callq fmodf@PLT
557; CHECK-NEXT:    callq __truncsfhf2@PLT
558; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
559; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
560; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
561; CHECK-NEXT:    callq __extendhfsf2@PLT
562; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
563; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
564; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
565; CHECK-NEXT:    callq __extendhfsf2@PLT
566; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
567; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
568; CHECK-NEXT:    callq fmodf@PLT
569; CHECK-NEXT:    callq __truncsfhf2@PLT
570; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
571; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
572; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
573; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
574; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
575; CHECK-NEXT:    callq __extendhfsf2@PLT
576; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
577; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
578; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
579; CHECK-NEXT:    callq __extendhfsf2@PLT
580; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
581; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
582; CHECK-NEXT:    callq fmodf@PLT
583; CHECK-NEXT:    callq __truncsfhf2@PLT
584; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
585; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
586; CHECK-NEXT:    # xmm0 = mem[1,0]
587; CHECK-NEXT:    callq __extendhfsf2@PLT
588; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
589; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
590; CHECK-NEXT:    # xmm0 = mem[1,0]
591; CHECK-NEXT:    callq __extendhfsf2@PLT
592; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
593; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
594; CHECK-NEXT:    callq fmodf@PLT
595; CHECK-NEXT:    callq __truncsfhf2@PLT
596; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
597; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
598; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
599; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
600; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
601; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
602; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
603; CHECK-NEXT:    callq __extendhfsf2@PLT
604; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
605; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
606; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
607; CHECK-NEXT:    callq __extendhfsf2@PLT
608; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
609; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
610; CHECK-NEXT:    callq fmodf@PLT
611; CHECK-NEXT:    callq __truncsfhf2@PLT
612; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
613; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
614; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
615; CHECK-NEXT:    callq __extendhfsf2@PLT
616; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
617; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
618; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
619; CHECK-NEXT:    callq __extendhfsf2@PLT
620; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
621; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
622; CHECK-NEXT:    callq fmodf@PLT
623; CHECK-NEXT:    callq __truncsfhf2@PLT
624; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
625; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
626; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
627; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
628; CHECK-NEXT:    callq __extendhfsf2@PLT
629; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
630; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
631; CHECK-NEXT:    callq __extendhfsf2@PLT
632; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
633; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
634; CHECK-NEXT:    callq fmodf@PLT
635; CHECK-NEXT:    callq __truncsfhf2@PLT
636; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
637; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
638; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
639; CHECK-NEXT:    callq __extendhfsf2@PLT
640; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
641; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
642; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
643; CHECK-NEXT:    callq __extendhfsf2@PLT
644; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
645; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
646; CHECK-NEXT:    callq fmodf@PLT
647; CHECK-NEXT:    callq __truncsfhf2@PLT
648; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
649; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
650; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
651; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
652; CHECK-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
653; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
654; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
655; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
656; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
657; CHECK-NEXT:    vzeroupper
658; CHECK-NEXT:    callq __extendhfsf2@PLT
659; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
660; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
661; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
662; CHECK-NEXT:    vzeroupper
663; CHECK-NEXT:    callq __extendhfsf2@PLT
664; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
665; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
666; CHECK-NEXT:    callq fmodf@PLT
667; CHECK-NEXT:    callq __truncsfhf2@PLT
668; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
669; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
670; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
671; CHECK-NEXT:    callq __extendhfsf2@PLT
672; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
673; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
674; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
675; CHECK-NEXT:    callq __extendhfsf2@PLT
676; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
677; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
678; CHECK-NEXT:    callq fmodf@PLT
679; CHECK-NEXT:    callq __truncsfhf2@PLT
680; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
681; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
682; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
683; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
684; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
685; CHECK-NEXT:    vzeroupper
686; CHECK-NEXT:    callq __extendhfsf2@PLT
687; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
688; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
689; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
690; CHECK-NEXT:    vzeroupper
691; CHECK-NEXT:    callq __extendhfsf2@PLT
692; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
693; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
694; CHECK-NEXT:    callq fmodf@PLT
695; CHECK-NEXT:    callq __truncsfhf2@PLT
696; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
697; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
698; CHECK-NEXT:    # xmm0 = mem[1,0]
699; CHECK-NEXT:    callq __extendhfsf2@PLT
700; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
701; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
702; CHECK-NEXT:    # xmm0 = mem[1,0]
703; CHECK-NEXT:    callq __extendhfsf2@PLT
704; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
705; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
706; CHECK-NEXT:    callq fmodf@PLT
707; CHECK-NEXT:    callq __truncsfhf2@PLT
708; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
709; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
710; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
711; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
712; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
713; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
714; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
715; CHECK-NEXT:    vzeroupper
716; CHECK-NEXT:    callq __extendhfsf2@PLT
717; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
718; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
719; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
720; CHECK-NEXT:    vzeroupper
721; CHECK-NEXT:    callq __extendhfsf2@PLT
722; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
723; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
724; CHECK-NEXT:    callq fmodf@PLT
725; CHECK-NEXT:    callq __truncsfhf2@PLT
726; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
727; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
728; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
729; CHECK-NEXT:    callq __extendhfsf2@PLT
730; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
731; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
732; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
733; CHECK-NEXT:    callq __extendhfsf2@PLT
734; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
735; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
736; CHECK-NEXT:    callq fmodf@PLT
737; CHECK-NEXT:    callq __truncsfhf2@PLT
738; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
739; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
740; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
741; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
742; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
743; CHECK-NEXT:    vzeroupper
744; CHECK-NEXT:    callq __extendhfsf2@PLT
745; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
746; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
747; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
748; CHECK-NEXT:    vzeroupper
749; CHECK-NEXT:    callq __extendhfsf2@PLT
750; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
751; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
752; CHECK-NEXT:    callq fmodf@PLT
753; CHECK-NEXT:    callq __truncsfhf2@PLT
754; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
755; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
756; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
757; CHECK-NEXT:    vzeroupper
758; CHECK-NEXT:    callq __extendhfsf2@PLT
759; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
760; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
761; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
762; CHECK-NEXT:    vzeroupper
763; CHECK-NEXT:    callq __extendhfsf2@PLT
764; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
765; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
766; CHECK-NEXT:    callq fmodf@PLT
767; CHECK-NEXT:    callq __truncsfhf2@PLT
768; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
769; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
770; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
771; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
772; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
773; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
774; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
775; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
776; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
777; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
778; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
779; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
780; CHECK-NEXT:    vzeroupper
781; CHECK-NEXT:    callq __extendhfsf2@PLT
782; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
783; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
784; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
785; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
786; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
787; CHECK-NEXT:    vzeroupper
788; CHECK-NEXT:    callq __extendhfsf2@PLT
789; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
790; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
791; CHECK-NEXT:    callq fmodf@PLT
792; CHECK-NEXT:    callq __truncsfhf2@PLT
793; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
794; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
795; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
796; CHECK-NEXT:    callq __extendhfsf2@PLT
797; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
798; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
799; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
800; CHECK-NEXT:    callq __extendhfsf2@PLT
801; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
802; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
803; CHECK-NEXT:    callq fmodf@PLT
804; CHECK-NEXT:    callq __truncsfhf2@PLT
805; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
806; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
807; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
808; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
809; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
810; CHECK-NEXT:    callq __extendhfsf2@PLT
811; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
812; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
813; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
814; CHECK-NEXT:    callq __extendhfsf2@PLT
815; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
816; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
817; CHECK-NEXT:    callq fmodf@PLT
818; CHECK-NEXT:    callq __truncsfhf2@PLT
819; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
820; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
821; CHECK-NEXT:    # xmm0 = mem[1,0]
822; CHECK-NEXT:    callq __extendhfsf2@PLT
823; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
824; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
825; CHECK-NEXT:    # xmm0 = mem[1,0]
826; CHECK-NEXT:    callq __extendhfsf2@PLT
827; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
828; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
829; CHECK-NEXT:    callq fmodf@PLT
830; CHECK-NEXT:    callq __truncsfhf2@PLT
831; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
832; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
833; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
834; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
835; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
836; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
837; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
838; CHECK-NEXT:    callq __extendhfsf2@PLT
839; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
840; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
841; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
842; CHECK-NEXT:    callq __extendhfsf2@PLT
843; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
844; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
845; CHECK-NEXT:    callq fmodf@PLT
846; CHECK-NEXT:    callq __truncsfhf2@PLT
847; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
848; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
849; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
850; CHECK-NEXT:    callq __extendhfsf2@PLT
851; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
852; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
853; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
854; CHECK-NEXT:    callq __extendhfsf2@PLT
855; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
856; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
857; CHECK-NEXT:    callq fmodf@PLT
858; CHECK-NEXT:    callq __truncsfhf2@PLT
859; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
860; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
861; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
862; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
863; CHECK-NEXT:    callq __extendhfsf2@PLT
864; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
865; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
866; CHECK-NEXT:    callq __extendhfsf2@PLT
867; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
868; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
869; CHECK-NEXT:    callq fmodf@PLT
870; CHECK-NEXT:    callq __truncsfhf2@PLT
871; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
872; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
873; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
874; CHECK-NEXT:    callq __extendhfsf2@PLT
875; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
876; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
877; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
878; CHECK-NEXT:    callq __extendhfsf2@PLT
879; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
880; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
881; CHECK-NEXT:    callq fmodf@PLT
882; CHECK-NEXT:    callq __truncsfhf2@PLT
883; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
884; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
885; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
886; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
887; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
888; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
889; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
890; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
891; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
892; CHECK-NEXT:    vzeroupper
893; CHECK-NEXT:    callq __extendhfsf2@PLT
894; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
895; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
896; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
897; CHECK-NEXT:    vzeroupper
898; CHECK-NEXT:    callq __extendhfsf2@PLT
899; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
900; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
901; CHECK-NEXT:    callq fmodf@PLT
902; CHECK-NEXT:    callq __truncsfhf2@PLT
903; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
904; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
905; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
906; CHECK-NEXT:    callq __extendhfsf2@PLT
907; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
908; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
909; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
910; CHECK-NEXT:    callq __extendhfsf2@PLT
911; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
912; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
913; CHECK-NEXT:    callq fmodf@PLT
914; CHECK-NEXT:    callq __truncsfhf2@PLT
915; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
916; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
917; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
918; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
919; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
920; CHECK-NEXT:    vzeroupper
921; CHECK-NEXT:    callq __extendhfsf2@PLT
922; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
923; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
924; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
925; CHECK-NEXT:    vzeroupper
926; CHECK-NEXT:    callq __extendhfsf2@PLT
927; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
928; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
929; CHECK-NEXT:    callq fmodf@PLT
930; CHECK-NEXT:    callq __truncsfhf2@PLT
931; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
932; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
933; CHECK-NEXT:    # xmm0 = mem[1,0]
934; CHECK-NEXT:    callq __extendhfsf2@PLT
935; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
936; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
937; CHECK-NEXT:    # xmm0 = mem[1,0]
938; CHECK-NEXT:    callq __extendhfsf2@PLT
939; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
940; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
941; CHECK-NEXT:    callq fmodf@PLT
942; CHECK-NEXT:    callq __truncsfhf2@PLT
943; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
944; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
945; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
946; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
947; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
948; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
949; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
950; CHECK-NEXT:    vzeroupper
951; CHECK-NEXT:    callq __extendhfsf2@PLT
952; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
953; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
954; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
955; CHECK-NEXT:    vzeroupper
956; CHECK-NEXT:    callq __extendhfsf2@PLT
957; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
958; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
959; CHECK-NEXT:    callq fmodf@PLT
960; CHECK-NEXT:    callq __truncsfhf2@PLT
961; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
962; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
963; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
964; CHECK-NEXT:    callq __extendhfsf2@PLT
965; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
966; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
967; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
968; CHECK-NEXT:    callq __extendhfsf2@PLT
969; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
970; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
971; CHECK-NEXT:    callq fmodf@PLT
972; CHECK-NEXT:    callq __truncsfhf2@PLT
973; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
974; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
975; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
976; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
977; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
978; CHECK-NEXT:    vzeroupper
979; CHECK-NEXT:    callq __extendhfsf2@PLT
980; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
981; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
982; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
983; CHECK-NEXT:    vzeroupper
984; CHECK-NEXT:    callq __extendhfsf2@PLT
985; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
986; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
987; CHECK-NEXT:    callq fmodf@PLT
988; CHECK-NEXT:    callq __truncsfhf2@PLT
989; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
990; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
991; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
992; CHECK-NEXT:    vzeroupper
993; CHECK-NEXT:    callq __extendhfsf2@PLT
994; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
995; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
996; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
997; CHECK-NEXT:    vzeroupper
998; CHECK-NEXT:    callq __extendhfsf2@PLT
999; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1000; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1001; CHECK-NEXT:    callq fmodf@PLT
1002; CHECK-NEXT:    callq __truncsfhf2@PLT
1003; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1004; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1005; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1006; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1007; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1008; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
1009; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1010; CHECK-NEXT:    vmovaps %ymm0, 32(%rbx)
1011; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1012; CHECK-NEXT:    vmovaps %ymm0, (%rbx)
1013; CHECK-NEXT:    addq $224, %rsp
1014; CHECK-NEXT:    popq %rbx
1015; CHECK-NEXT:    vzeroupper
1016; CHECK-NEXT:    retq
1017  %frem = frem <32 x half> %a0, %a1
1018  store <32 x half> %frem, ptr%p3
1019  ret void
1020}
1021
1022define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, ptr%p3) nounwind {
1023; CHECK-LABEL: frem_v16f16:
1024; CHECK:       # %bb.0:
1025; CHECK-NEXT:    pushq %rbx
1026; CHECK-NEXT:    subq $144, %rsp
1027; CHECK-NEXT:    movq %rdi, %rbx
1028; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1029; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1030; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm0
1031; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1032; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1033; CHECK-NEXT:    vzeroupper
1034; CHECK-NEXT:    callq __extendhfsf2@PLT
1035; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1036; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1037; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
1038; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1039; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1040; CHECK-NEXT:    vzeroupper
1041; CHECK-NEXT:    callq __extendhfsf2@PLT
1042; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1043; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1044; CHECK-NEXT:    callq fmodf@PLT
1045; CHECK-NEXT:    callq __truncsfhf2@PLT
1046; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1047; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1048; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1049; CHECK-NEXT:    callq __extendhfsf2@PLT
1050; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
1051; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1052; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1053; CHECK-NEXT:    callq __extendhfsf2@PLT
1054; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1055; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1056; CHECK-NEXT:    callq fmodf@PLT
1057; CHECK-NEXT:    callq __truncsfhf2@PLT
1058; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1059; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1060; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1061; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1062; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1063; CHECK-NEXT:    callq __extendhfsf2@PLT
1064; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
1065; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1066; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1067; CHECK-NEXT:    callq __extendhfsf2@PLT
1068; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1069; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1070; CHECK-NEXT:    callq fmodf@PLT
1071; CHECK-NEXT:    callq __truncsfhf2@PLT
1072; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1073; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1074; CHECK-NEXT:    # xmm0 = mem[1,0]
1075; CHECK-NEXT:    callq __extendhfsf2@PLT
1076; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1077; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1078; CHECK-NEXT:    # xmm0 = mem[1,0]
1079; CHECK-NEXT:    callq __extendhfsf2@PLT
1080; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1081; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1082; CHECK-NEXT:    callq fmodf@PLT
1083; CHECK-NEXT:    callq __truncsfhf2@PLT
1084; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1085; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1086; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1087; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1088; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1089; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1090; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1091; CHECK-NEXT:    callq __extendhfsf2@PLT
1092; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
1093; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1094; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1095; CHECK-NEXT:    callq __extendhfsf2@PLT
1096; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1097; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1098; CHECK-NEXT:    callq fmodf@PLT
1099; CHECK-NEXT:    callq __truncsfhf2@PLT
1100; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1101; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1102; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1103; CHECK-NEXT:    callq __extendhfsf2@PLT
1104; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1105; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1106; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1107; CHECK-NEXT:    callq __extendhfsf2@PLT
1108; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1109; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1110; CHECK-NEXT:    callq fmodf@PLT
1111; CHECK-NEXT:    callq __truncsfhf2@PLT
1112; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1113; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1114; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1115; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1116; CHECK-NEXT:    callq __extendhfsf2@PLT
1117; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1118; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1119; CHECK-NEXT:    callq __extendhfsf2@PLT
1120; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1121; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1122; CHECK-NEXT:    callq fmodf@PLT
1123; CHECK-NEXT:    callq __truncsfhf2@PLT
1124; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1125; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1126; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1127; CHECK-NEXT:    callq __extendhfsf2@PLT
1128; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1129; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1130; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1131; CHECK-NEXT:    callq __extendhfsf2@PLT
1132; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1133; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1134; CHECK-NEXT:    callq fmodf@PLT
1135; CHECK-NEXT:    callq __truncsfhf2@PLT
1136; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1137; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1138; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1139; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1140; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1141; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
1142; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1143; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1144; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1145; CHECK-NEXT:    vzeroupper
1146; CHECK-NEXT:    callq __extendhfsf2@PLT
1147; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1148; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1149; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1150; CHECK-NEXT:    vzeroupper
1151; CHECK-NEXT:    callq __extendhfsf2@PLT
1152; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1153; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1154; CHECK-NEXT:    callq fmodf@PLT
1155; CHECK-NEXT:    callq __truncsfhf2@PLT
1156; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1157; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1158; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1159; CHECK-NEXT:    callq __extendhfsf2@PLT
1160; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1161; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1162; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1163; CHECK-NEXT:    callq __extendhfsf2@PLT
1164; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1165; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1166; CHECK-NEXT:    callq fmodf@PLT
1167; CHECK-NEXT:    callq __truncsfhf2@PLT
1168; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1169; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1170; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1171; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1172; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1173; CHECK-NEXT:    vzeroupper
1174; CHECK-NEXT:    callq __extendhfsf2@PLT
1175; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1176; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1177; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1178; CHECK-NEXT:    vzeroupper
1179; CHECK-NEXT:    callq __extendhfsf2@PLT
1180; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1181; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1182; CHECK-NEXT:    callq fmodf@PLT
1183; CHECK-NEXT:    callq __truncsfhf2@PLT
1184; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1185; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1186; CHECK-NEXT:    # xmm0 = mem[1,0]
1187; CHECK-NEXT:    callq __extendhfsf2@PLT
1188; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
1189; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1190; CHECK-NEXT:    # xmm0 = mem[1,0]
1191; CHECK-NEXT:    callq __extendhfsf2@PLT
1192; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1193; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1194; CHECK-NEXT:    callq fmodf@PLT
1195; CHECK-NEXT:    callq __truncsfhf2@PLT
1196; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1197; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1198; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1199; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1200; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1201; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1202; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1203; CHECK-NEXT:    vzeroupper
1204; CHECK-NEXT:    callq __extendhfsf2@PLT
1205; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1206; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1207; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1208; CHECK-NEXT:    vzeroupper
1209; CHECK-NEXT:    callq __extendhfsf2@PLT
1210; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1211; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1212; CHECK-NEXT:    callq fmodf@PLT
1213; CHECK-NEXT:    callq __truncsfhf2@PLT
1214; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1215; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1216; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1217; CHECK-NEXT:    callq __extendhfsf2@PLT
1218; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
1219; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1220; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1221; CHECK-NEXT:    callq __extendhfsf2@PLT
1222; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1223; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1224; CHECK-NEXT:    callq fmodf@PLT
1225; CHECK-NEXT:    callq __truncsfhf2@PLT
1226; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1227; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1228; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1229; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1230; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1231; CHECK-NEXT:    vzeroupper
1232; CHECK-NEXT:    callq __extendhfsf2@PLT
1233; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
1234; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1235; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1236; CHECK-NEXT:    vzeroupper
1237; CHECK-NEXT:    callq __extendhfsf2@PLT
1238; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1239; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1240; CHECK-NEXT:    callq fmodf@PLT
1241; CHECK-NEXT:    callq __truncsfhf2@PLT
1242; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1243; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1244; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1245; CHECK-NEXT:    vzeroupper
1246; CHECK-NEXT:    callq __extendhfsf2@PLT
1247; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1248; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1249; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1250; CHECK-NEXT:    vzeroupper
1251; CHECK-NEXT:    callq __extendhfsf2@PLT
1252; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1253; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1254; CHECK-NEXT:    callq fmodf@PLT
1255; CHECK-NEXT:    callq __truncsfhf2@PLT
1256; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1257; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1258; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1259; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1260; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1261; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
1262; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1263; CHECK-NEXT:    vmovaps %ymm0, (%rbx)
1264; CHECK-NEXT:    addq $144, %rsp
1265; CHECK-NEXT:    popq %rbx
1266; CHECK-NEXT:    vzeroupper
1267; CHECK-NEXT:    retq
1268  %frem = frem <16 x half> %a0, %a1
1269  store <16 x half> %frem, ptr%p3
1270  ret void
1271}
1272
; frem of two <8 x half> vectors, result stored through %p3.
; The expected output handles the vector one lane at a time: for each of the
; 8 lanes both operand halves are widened with __extendhfsf2, passed to fmodf,
; and the result is narrowed again with __truncsfhf2 (8 fmodf calls in total).
; The per-lane results are merged with vpunpcklwd / vpunpckldq / vpunpcklqdq
; and written with a single vmovdqa to (%rbx), which holds the incoming %rdi.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1273define void @frem_v8f16(<8 x half> %a0, <8 x half> %a1, ptr%p3) nounwind {
1274; CHECK-LABEL: frem_v8f16:
1275; CHECK:       # %bb.0:
1276; CHECK-NEXT:    pushq %rbx
1277; CHECK-NEXT:    subq $80, %rsp
1278; CHECK-NEXT:    movq %rdi, %rbx
1279; CHECK-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1280; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1281; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1282; CHECK-NEXT:    callq __extendhfsf2@PLT
1283; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1284; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1285; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1286; CHECK-NEXT:    callq __extendhfsf2@PLT
1287; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1288; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1289; CHECK-NEXT:    callq fmodf@PLT
1290; CHECK-NEXT:    callq __truncsfhf2@PLT
1291; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1292; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1293; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1294; CHECK-NEXT:    callq __extendhfsf2@PLT
1295; CHECK-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
1296; CHECK-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1297; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1298; CHECK-NEXT:    callq __extendhfsf2@PLT
1299; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1300; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1301; CHECK-NEXT:    callq fmodf@PLT
1302; CHECK-NEXT:    callq __truncsfhf2@PLT
1303; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1304; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1305; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1306; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1307; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1308; CHECK-NEXT:    callq __extendhfsf2@PLT
1309; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
1310; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1311; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1312; CHECK-NEXT:    callq __extendhfsf2@PLT
1313; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1314; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1315; CHECK-NEXT:    callq fmodf@PLT
1316; CHECK-NEXT:    callq __truncsfhf2@PLT
1317; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1318; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1319; CHECK-NEXT:    # xmm0 = mem[1,0]
1320; CHECK-NEXT:    callq __extendhfsf2@PLT
1321; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1322; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1323; CHECK-NEXT:    # xmm0 = mem[1,0]
1324; CHECK-NEXT:    callq __extendhfsf2@PLT
1325; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1326; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1327; CHECK-NEXT:    callq fmodf@PLT
1328; CHECK-NEXT:    callq __truncsfhf2@PLT
1329; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1330; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1331; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1332; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1333; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1334; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1335; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1336; CHECK-NEXT:    callq __extendhfsf2@PLT
1337; CHECK-NEXT:    vmovd %xmm0, (%rsp) # 4-byte Folded Spill
1338; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1339; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1340; CHECK-NEXT:    callq __extendhfsf2@PLT
1341; CHECK-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
1342; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1343; CHECK-NEXT:    callq fmodf@PLT
1344; CHECK-NEXT:    callq __truncsfhf2@PLT
1345; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1346; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1347; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1348; CHECK-NEXT:    callq __extendhfsf2@PLT
1349; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1350; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1351; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1352; CHECK-NEXT:    callq __extendhfsf2@PLT
1353; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1354; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1355; CHECK-NEXT:    callq fmodf@PLT
1356; CHECK-NEXT:    callq __truncsfhf2@PLT
1357; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1358; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1359; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1360; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1361; CHECK-NEXT:    callq __extendhfsf2@PLT
1362; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1363; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1364; CHECK-NEXT:    callq __extendhfsf2@PLT
1365; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1366; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1367; CHECK-NEXT:    callq fmodf@PLT
1368; CHECK-NEXT:    callq __truncsfhf2@PLT
1369; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1370; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1371; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1372; CHECK-NEXT:    callq __extendhfsf2@PLT
1373; CHECK-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1374; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1375; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1376; CHECK-NEXT:    callq __extendhfsf2@PLT
1377; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1378; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
1379; CHECK-NEXT:    callq fmodf@PLT
1380; CHECK-NEXT:    callq __truncsfhf2@PLT
1381; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1382; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1383; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1384; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
1385; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1386; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
1387; CHECK-NEXT:    vmovdqa %xmm0, (%rbx)
1388; CHECK-NEXT:    addq $80, %rsp
1389; CHECK-NEXT:    popq %rbx
1390; CHECK-NEXT:    retq
; IR under test: a single vector frem whose result is stored to %p3.
1391  %frem = frem <8 x half> %a0, %a1
1392  store <8 x half> %frem, ptr%p3
1393  ret void
1394}
1395
; frem of two <4 x x86_fp80> vectors, result stored through %p3.
; The expected output makes one fmodl call per element (4 calls in total).
; Elements are moved with x87 fldt/fstpt; immediately before each call two
; values are stored to (%rsp) and to a positive %rsp offset (presumably the
; outgoing stack arguments of fmodl -- confirm against the x86-64 ABI).
; The four results are written through %rbx (the incoming %rdi) at byte
; offsets 30, 20, 10 and 0.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1396define void @frem_v4f80(<4 x x86_fp80> %a0, <4 x x86_fp80> %a1, ptr%p3) nounwind {
1397; CHECK-LABEL: frem_v4f80:
1398; CHECK:       # %bb.0:
1399; CHECK-NEXT:    pushq %rbx
1400; CHECK-NEXT:    subq $128, %rsp
1401; CHECK-NEXT:    movq %rdi, %rbx
1402; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1403; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1404; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1405; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1406; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1407; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1408; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1409; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1410; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1411; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1412; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1413; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1414; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1415; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
1416; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
1417; CHECK-NEXT:    fstpt (%rsp)
1418; CHECK-NEXT:    callq fmodl@PLT
1419; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1420; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1421; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
1422; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1423; CHECK-NEXT:    fstpt (%rsp)
1424; CHECK-NEXT:    callq fmodl@PLT
1425; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1426; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1427; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
1428; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1429; CHECK-NEXT:    fstpt (%rsp)
1430; CHECK-NEXT:    callq fmodl@PLT
1431; CHECK-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
1432; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1433; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
1434; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1435; CHECK-NEXT:    fstpt (%rsp)
1436; CHECK-NEXT:    callq fmodl@PLT
1437; CHECK-NEXT:    fstpt 30(%rbx)
1438; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1439; CHECK-NEXT:    fstpt 20(%rbx)
1440; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1441; CHECK-NEXT:    fstpt 10(%rbx)
1442; CHECK-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
1443; CHECK-NEXT:    fstpt (%rbx)
1444; CHECK-NEXT:    addq $128, %rsp
1445; CHECK-NEXT:    popq %rbx
1446; CHECK-NEXT:    retq
; IR under test: a single vector frem whose result is stored to %p3.
1447  %frem = frem <4 x x86_fp80> %a0, %a1
1448  store <4 x x86_fp80> %frem, ptr%p3
1449  ret void
1450}
1451