xref: /llvm-project/llvm/test/CodeGen/X86/avx512fp16-frem.ll (revision 2773098ee3187d5f9daca8938d57657dd89dd36f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 | FileCheck %s
3
; Scalar half-precision remainder: returns `frem half %x, %y`.
; The assertions below expect both operands to be extended to float
; (vcvtsh2ss), the remainder to be computed by a call to fmodf, and the
; result to be narrowed back to half (vcvtss2sh).
; NOTE(review): the pushq/popq %rax pair presumably only keeps the stack
; aligned across the call -- confirm.
4define half @frem(half %x, half %y) nounwind {
5; CHECK-LABEL: frem:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    pushq %rax
8; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
9; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
10; CHECK-NEXT:    callq fmodf@PLT
11; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
12; CHECK-NEXT:    popq %rax
13; CHECK-NEXT:    retq
14  %r = frem half %x, %y
15  ret half %r
16}
17
; <2 x half> remainder: returns `frem <2 x half> %x, %y`.
; The expected code spills both xmm operands to the stack (88-byte frame) and
; then, lane by lane, extracts a 16-bit element from each spilled operand,
; extends it to float (vcvtsh2ss), calls fmodf and narrows the result
; (vcvtss2sh). There are eight fmodf calls in total -- one per 16-bit lane of
; the xmm register, not one per IR element. The partial results are merged
; back into xmm0 with vpunpcklwd / vpunpckldq / vpunpcklqdq.
18define <2 x half> @frem_vec2(<2 x half> %x, <2 x half> %y) nounwind {
19; CHECK-LABEL: frem_vec2:
20; CHECK:       # %bb.0:
21; CHECK-NEXT:    subq $88, %rsp
22; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
23; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
24; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
25; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm0
26; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
27; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm1
28; CHECK-NEXT:    callq fmodf@PLT
29; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
30; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
31; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
32; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
33; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
34; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
35; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
36; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
37; CHECK-NEXT:    callq fmodf@PLT
38; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
39; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
40; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
41; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
42; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
43; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
44; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
45; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
46; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
47; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
48; CHECK-NEXT:    callq fmodf@PLT
49; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
50; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
51; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
52; CHECK-NEXT:    # xmm0 = mem[1,0]
53; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
54; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
55; CHECK-NEXT:    # xmm1 = mem[1,0]
56; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
57; CHECK-NEXT:    callq fmodf@PLT
58; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
59; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
60; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
61; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
62; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
63; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
64; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
65; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
66; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
67; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
68; CHECK-NEXT:    callq fmodf@PLT
69; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
70; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
71; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
72; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
73; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
74; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
75; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
76; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
77; CHECK-NEXT:    callq fmodf@PLT
78; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
79; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
80; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
81; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
82; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
83; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
84; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
85; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
86; CHECK-NEXT:    callq fmodf@PLT
87; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
88; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
89; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
90; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
91; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
92; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
93; CHECK-NEXT:    callq fmodf@PLT
94; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
95; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
96; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
97; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
98; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
99; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
100; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
101; CHECK-NEXT:    addq $88, %rsp
102; CHECK-NEXT:    retq
103  %r = frem <2 x half> %x, %y
104  ret <2 x half> %r
105}
106
; <4 x half> remainder: returns `frem <4 x half> %x, %y`.
; The expected code spills both xmm operands (88-byte frame) and handles the
; register one 16-bit lane at a time: extract the lane from each spilled
; operand, extend to float (vcvtsh2ss), call fmodf, narrow the result
; (vcvtss2sh). Eight fmodf calls are expected -- one per xmm lane, even though
; the IR type has four elements. The results are merged back into xmm0 with
; vpunpcklwd / vpunpckldq / vpunpcklqdq.
107define <4 x half> @frem_vec4(<4 x half> %x, <4 x half> %y) nounwind {
108; CHECK-LABEL: frem_vec4:
109; CHECK:       # %bb.0:
110; CHECK-NEXT:    subq $88, %rsp
111; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
112; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
113; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
114; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm0
115; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
116; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm1
117; CHECK-NEXT:    callq fmodf@PLT
118; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
119; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
120; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
121; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
122; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
123; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
124; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
125; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
126; CHECK-NEXT:    callq fmodf@PLT
127; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
128; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
129; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
130; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
131; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
132; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
133; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
134; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
135; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
136; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
137; CHECK-NEXT:    callq fmodf@PLT
138; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
139; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
140; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
141; CHECK-NEXT:    # xmm0 = mem[1,0]
142; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
143; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
144; CHECK-NEXT:    # xmm1 = mem[1,0]
145; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
146; CHECK-NEXT:    callq fmodf@PLT
147; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
148; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
149; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
150; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
151; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
152; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
153; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
154; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
155; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
156; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
157; CHECK-NEXT:    callq fmodf@PLT
158; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
159; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
160; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
161; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
162; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
163; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
164; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
165; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
166; CHECK-NEXT:    callq fmodf@PLT
167; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
168; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
169; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
170; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
171; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
172; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
173; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
174; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
175; CHECK-NEXT:    callq fmodf@PLT
176; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
177; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
178; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
179; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
180; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
181; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
182; CHECK-NEXT:    callq fmodf@PLT
183; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
184; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
185; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
186; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
187; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
188; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
189; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
190; CHECK-NEXT:    addq $88, %rsp
191; CHECK-NEXT:    retq
192  %r = frem <4 x half> %x, %y
193  ret <4 x half> %r
194}
195
; <8 x half> remainder: returns `frem <8 x half> %x, %y`.
; The expected code spills both xmm operands (88-byte frame) and scalarizes
; the operation: for each of the eight elements it extracts the lane from each
; spilled operand, extends to float (vcvtsh2ss), calls fmodf and narrows the
; result (vcvtss2sh) -- eight fmodf calls in total. The results are merged
; back into xmm0 with vpunpcklwd / vpunpckldq / vpunpcklqdq.
196define <8 x half> @frem_vec8(<8 x half> %x, <8 x half> %y) nounwind {
197; CHECK-LABEL: frem_vec8:
198; CHECK:       # %bb.0:
199; CHECK-NEXT:    subq $88, %rsp
200; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
201; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
202; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
203; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm0
204; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
205; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm1
206; CHECK-NEXT:    callq fmodf@PLT
207; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
208; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
209; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
210; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
211; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
212; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
213; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
214; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
215; CHECK-NEXT:    callq fmodf@PLT
216; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
217; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
218; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
219; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
220; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
221; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
222; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
223; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
224; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
225; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
226; CHECK-NEXT:    callq fmodf@PLT
227; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
228; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
229; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
230; CHECK-NEXT:    # xmm0 = mem[1,0]
231; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
232; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
233; CHECK-NEXT:    # xmm1 = mem[1,0]
234; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
235; CHECK-NEXT:    callq fmodf@PLT
236; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
237; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
238; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
239; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
240; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
241; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
242; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
243; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
244; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
245; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
246; CHECK-NEXT:    callq fmodf@PLT
247; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
248; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
249; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
250; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
251; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
252; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
253; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
254; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
255; CHECK-NEXT:    callq fmodf@PLT
256; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
257; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
258; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
259; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
260; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
261; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
262; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
263; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
264; CHECK-NEXT:    callq fmodf@PLT
265; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
266; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
267; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
268; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
269; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
270; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
271; CHECK-NEXT:    callq fmodf@PLT
272; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
273; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
274; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
275; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
276; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
277; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
278; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
279; CHECK-NEXT:    addq $88, %rsp
280; CHECK-NEXT:    retq
281  %r = frem <8 x half> %x, %y
282  ret <8 x half> %r
283}
284
; <16 x half> remainder: returns `frem <16 x half> %x, %y`.
; The expected code spills both ymm operands (184-byte frame), also spills
; their upper 128-bit halves obtained with vextractf128, and scalarizes the
; operation: each element is extracted, extended to float (vcvtsh2ss), passed
; to fmodf and narrowed back (vcvtss2sh) -- sixteen fmodf calls in total.
; Partial results are combined with vpunpcklwd, joined into ymm values with
; vinserti128, and merged with the 256-bit vpunpckldq / vpunpcklqdq.
; A vzeroupper is expected before each fmodf call that follows an instruction
; using a ymm register.
285define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
286; CHECK-LABEL: frem_vec16:
287; CHECK:       # %bb.0:
288; CHECK-NEXT:    subq $184, %rsp
289; CHECK-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
290; CHECK-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
291; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
292; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
293; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
294; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
295; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
296; CHECK-NEXT:    vmovapd %xmm1, (%rsp) # 16-byte Spill
297; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
298; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
299; CHECK-NEXT:    vzeroupper
300; CHECK-NEXT:    callq fmodf@PLT
301; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
302; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
303; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
304; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
305; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
306; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload
307; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
308; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
309; CHECK-NEXT:    callq fmodf@PLT
310; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
311; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
312; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
313; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
314; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
315; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
316; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
317; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
318; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
319; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
320; CHECK-NEXT:    callq fmodf@PLT
321; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
322; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
323; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
324; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
325; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
326; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
327; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
328; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
329; CHECK-NEXT:    callq fmodf@PLT
330; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
331; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
332; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
333; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
334; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
335; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
336; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
337; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
338; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
339; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
340; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
341; CHECK-NEXT:    vzeroupper
342; CHECK-NEXT:    callq fmodf@PLT
343; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
344; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
345; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
346; CHECK-NEXT:    # xmm0 = mem[1,0]
347; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
348; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
349; CHECK-NEXT:    # xmm1 = mem[1,0]
350; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
351; CHECK-NEXT:    callq fmodf@PLT
352; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
353; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
354; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
355; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
356; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
357; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
358; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
359; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
360; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
361; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
362; CHECK-NEXT:    callq fmodf@PLT
363; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
364; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
365; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
366; CHECK-NEXT:    # xmm0 = mem[1,0]
367; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
368; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
369; CHECK-NEXT:    # xmm1 = mem[1,0]
370; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
371; CHECK-NEXT:    callq fmodf@PLT
372; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
373; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
374; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
375; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
376; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
377; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
378; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
379; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
380; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
381; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
382; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
383; CHECK-NEXT:    vzeroupper
384; CHECK-NEXT:    callq fmodf@PLT
385; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
386; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
387; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
388; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
389; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
390; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
391; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
392; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
393; CHECK-NEXT:    callq fmodf@PLT
394; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
395; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
396; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
397; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
398; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
399; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
400; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
401; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
402; CHECK-NEXT:    callq fmodf@PLT
403; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
404; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
405; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
406; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
407; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
408; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
409; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
410; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
411; CHECK-NEXT:    callq fmodf@PLT
412; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
413; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
414; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
415; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
416; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
417; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
418; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
419; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
420; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
421; CHECK-NEXT:    vzeroupper
422; CHECK-NEXT:    callq fmodf@PLT
423; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
424; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
425; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
426; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
427; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
428; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
429; CHECK-NEXT:    callq fmodf@PLT
430; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
431; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
432; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
433; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
434; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
435; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
436; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
437; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
438; CHECK-NEXT:    vzeroupper
439; CHECK-NEXT:    callq fmodf@PLT
440; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
441; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
442; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
443; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
444; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
445; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
446; CHECK-NEXT:    callq fmodf@PLT
447; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
448; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
449; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
450; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
451; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
452; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
453; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
454; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
455; CHECK-NEXT:    addq $184, %rsp
456; CHECK-NEXT:    retq
457  %r = frem <16 x half> %x, %y
458  ret <16 x half> %r
459}
460
461define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
462; CHECK-LABEL: frem_vec32:
463; CHECK:       # %bb.0:
464; CHECK-NEXT:    subq $408, %rsp # imm = 0x198
465; CHECK-NEXT:    vmovupd %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
466; CHECK-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
467; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
468; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
469; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
470; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
471; CHECK-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
472; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
473; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
474; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
475; CHECK-NEXT:    vzeroupper
476; CHECK-NEXT:    callq fmodf@PLT
477; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
478; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
479; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
480; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
481; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
482; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
483; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
484; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
485; CHECK-NEXT:    callq fmodf@PLT
486; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
487; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
488; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
489; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
490; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
491; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
492; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
493; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
494; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
495; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
496; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
497; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
498; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
499; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
500; CHECK-NEXT:    vzeroupper
501; CHECK-NEXT:    callq fmodf@PLT
502; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
503; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
504; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
505; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
506; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
507; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
508; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
509; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
510; CHECK-NEXT:    callq fmodf@PLT
511; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
512; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
513; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
514; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
515; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
516; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
517; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
518; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
519; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
520; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
521; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
522; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
523; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
524; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
525; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
526; CHECK-NEXT:    vzeroupper
527; CHECK-NEXT:    callq fmodf@PLT
528; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
529; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
530; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
531; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
532; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
533; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
534; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
535; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
536; CHECK-NEXT:    callq fmodf@PLT
537; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
538; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
539; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
540; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
541; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
542; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
543; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
544; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
545; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
546; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
547; CHECK-NEXT:    callq fmodf@PLT
548; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
549; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
550; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
551; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
552; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
553; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
554; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
555; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
556; CHECK-NEXT:    callq fmodf@PLT
557; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
558; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
559; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
560; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
561; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
562; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
563; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
564; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
565; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
566; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
567; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
568; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
569; CHECK-NEXT:    vzeroupper
570; CHECK-NEXT:    callq fmodf@PLT
571; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
572; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
573; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
574; CHECK-NEXT:    # xmm0 = mem[1,0]
575; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
576; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
577; CHECK-NEXT:    # xmm1 = mem[1,0]
578; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
579; CHECK-NEXT:    callq fmodf@PLT
580; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
581; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
582; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
583; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
584; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
585; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
586; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
587; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
588; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
589; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
590; CHECK-NEXT:    callq fmodf@PLT
591; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
592; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
593; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
594; CHECK-NEXT:    # xmm0 = mem[1,0]
595; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
596; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
597; CHECK-NEXT:    # xmm1 = mem[1,0]
598; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
599; CHECK-NEXT:    callq fmodf@PLT
600; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
601; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
602; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
603; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
604; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
605; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
606; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
607; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
608; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
609; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
610; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
611; CHECK-NEXT:    vzeroupper
612; CHECK-NEXT:    callq fmodf@PLT
613; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
614; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
615; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
616; CHECK-NEXT:    # xmm0 = mem[1,0]
617; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
618; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
619; CHECK-NEXT:    # xmm1 = mem[1,0]
620; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
621; CHECK-NEXT:    callq fmodf@PLT
622; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
623; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
624; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
625; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
626; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
627; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
628; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
629; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
630; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
631; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
632; CHECK-NEXT:    callq fmodf@PLT
633; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
634; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
635; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
636; CHECK-NEXT:    # xmm0 = mem[1,0]
637; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
638; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
639; CHECK-NEXT:    # xmm1 = mem[1,0]
640; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
641; CHECK-NEXT:    callq fmodf@PLT
642; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
643; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
644; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
645; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
646; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
647; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
648; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
649; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
650; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
651; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
652; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
653; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
654; CHECK-NEXT:    vzeroupper
655; CHECK-NEXT:    callq fmodf@PLT
656; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
657; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
658; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
659; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
660; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
661; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
662; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
663; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
664; CHECK-NEXT:    callq fmodf@PLT
665; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
666; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
667; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
668; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
669; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
670; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
671; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
672; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
673; CHECK-NEXT:    callq fmodf@PLT
674; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
675; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
676; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
677; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
678; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
679; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
680; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
681; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
682; CHECK-NEXT:    callq fmodf@PLT
683; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
684; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
685; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
686; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
687; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
688; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
689; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
690; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
691; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
692; CHECK-NEXT:    vzeroupper
693; CHECK-NEXT:    callq fmodf@PLT
694; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
695; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
696; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
697; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
698; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
699; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
700; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
701; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
702; CHECK-NEXT:    callq fmodf@PLT
703; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
704; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
705; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
706; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
707; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
708; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
709; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
710; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
711; CHECK-NEXT:    callq fmodf@PLT
712; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
713; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
714; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
715; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
716; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
717; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
718; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
719; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
720; CHECK-NEXT:    callq fmodf@PLT
721; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
722; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
723; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
724; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
725; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
726; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
727; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
728; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
729; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
730; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
731; CHECK-NEXT:    vzeroupper
732; CHECK-NEXT:    callq fmodf@PLT
733; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
734; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
735; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
736; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
737; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
738; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
739; CHECK-NEXT:    callq fmodf@PLT
740; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
741; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
742; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
743; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
744; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
745; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
746; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
747; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
748; CHECK-NEXT:    callq fmodf@PLT
749; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
750; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
751; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
752; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
753; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
754; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
755; CHECK-NEXT:    callq fmodf@PLT
756; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
757; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
758; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
759; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
760; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
761; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
762; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
763; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
764; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
765; CHECK-NEXT:    vzeroupper
766; CHECK-NEXT:    callq fmodf@PLT
767; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
768; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
769; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
770; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
771; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
772; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
773; CHECK-NEXT:    callq fmodf@PLT
774; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
775; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
776; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
777; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
778; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
779; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
780; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
781; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
782; CHECK-NEXT:    vzeroupper
783; CHECK-NEXT:    callq fmodf@PLT
784; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
785; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
786; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
787; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
788; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
789; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
790; CHECK-NEXT:    callq fmodf@PLT
791; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
792; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
793; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
794; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
795; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
796; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
797; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
798; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
799; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
800; CHECK-NEXT:    addq $408, %rsp # imm = 0x198
801; CHECK-NEXT:    retq
802  %r = frem <32 x half> %x, %y
803  ret <32 x half> %r
804}
805
; Scalar strict-FP case: lowering of the constrained frem intrinsic on a single
; half value. The expected output widens both operands with vcvtsh2ss, makes one
; call to fmodf, and narrows the result back with vcvtss2sh.
; Here %xmm1 is converted before %xmm0; the non-strict @frem test at the top of
; this file converts %xmm0 first.
; The assertions are autogenerated (utils/update_llc_test_checks.py); regenerate
; them with that script instead of editing them by hand.
806define half @frem_strict(half %x, half %y) nounwind #0 {
807; CHECK-LABEL: frem_strict:
808; CHECK:       # %bb.0:
809; CHECK-NEXT:    pushq %rax
810; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
811; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
812; CHECK-NEXT:    callq fmodf@PLT
813; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
814; CHECK-NEXT:    popq %rax
815; CHECK-NEXT:    retq
; Constrained frem with dynamic rounding mode and strict exception behaviour.
816  %result = call half @llvm.experimental.constrained.frem.f16(half %x, half %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
817  ret half %result
818}
819
; Two-element strict-FP case: the constrained frem on <2 x half> is expected to
; be scalarised into two fmodf calls. Both inputs are spilled to the stack up
; front; element 0 is converted straight from the incoming registers, and
; element 1 is reached by a 16-bit right shift (vpsrld $16) of the reloaded
; vectors. The two narrowed results are interleaved with vpunpcklwd.
; The assertions are autogenerated (utils/update_llc_test_checks.py); regenerate
; them with that script instead of editing them by hand.
820define <2 x half> @frem_strict_vec2(<2 x half> %x, <2 x half> %y) nounwind #0 {
821; CHECK-LABEL: frem_strict_vec2:
822; CHECK:       # %bb.0:
823; CHECK-NEXT:    subq $56, %rsp
824; CHECK-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
825; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
826; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
827; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
828; CHECK-NEXT:    callq fmodf@PLT
829; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
830; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
831; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
832; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
833; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
834; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
835; CHECK-NEXT:    callq fmodf@PLT
836; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
837; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
838; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
839; CHECK-NEXT:    addq $56, %rsp
840; CHECK-NEXT:    retq
; Constrained frem with dynamic rounding mode and strict exception behaviour.
841  %result = call <2 x half> @llvm.experimental.constrained.frem.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
842  ret <2 x half> %result
843}
844
; Four-element strict-FP case: the constrained frem on <4 x half> is expected to
; be scalarised into four fmodf calls, one per element. Both inputs are spilled
; once; each element is then brought to lane 0 with a shift or shuffle
; (vpsrlq $48, vmovshdup, a plain reload, vpsrld $16) before the vcvtsh2ss
; widening. Result pairs are interleaved with vpunpcklwd and the two halves are
; merged with vinsertps.
; The assertions are autogenerated (utils/update_llc_test_checks.py); regenerate
; them with that script instead of editing them by hand.
845define <4 x half> @frem_strict_vec4(<4 x half> %x, <4 x half> %y) nounwind #0 {
846; CHECK-LABEL: frem_strict_vec4:
847; CHECK:       # %bb.0:
848; CHECK-NEXT:    subq $72, %rsp
849; CHECK-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
850; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
851; CHECK-NEXT:    vpsrlq $48, %xmm1, %xmm2
852; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm1
853; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm2
854; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm0
855; CHECK-NEXT:    callq fmodf@PLT
856; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
857; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
858; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
859; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
860; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
861; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
862; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
863; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
864; CHECK-NEXT:    callq fmodf@PLT
865; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
866; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
867; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
868; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
869; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
870; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
871; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
872; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
873; CHECK-NEXT:    callq fmodf@PLT
874; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
875; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
876; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
877; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
878; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
879; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
880; CHECK-NEXT:    callq fmodf@PLT
881; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
882; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
883; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
884; CHECK-NEXT:    vinsertps $28, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
885; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
886; CHECK-NEXT:    addq $72, %rsp
887; CHECK-NEXT:    retq
; Constrained frem with dynamic rounding mode and strict exception behaviour.
888  %result = call <4 x half> @llvm.experimental.constrained.frem.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
889  ret <4 x half> %result
890}
891
; Eight-element strict-FP case: the constrained frem on <8 x half> is expected to
; be scalarised into eight fmodf calls, one per element. %y is spilled to (%rsp)
; and %x to another stack slot; every element is extracted from those spills
; with a folded-reload shift or shuffle (vpsrldq, vpermilps, vpermilpd, vpsrlq,
; vmovshdup, vpsrld) and widened with vcvtsh2ss. The narrowed results are
; rebuilt bottom-up: vpunpcklwd pairs, then vpunpckldq, then vpunpcklqdq.
; The assertions are autogenerated (utils/update_llc_test_checks.py); regenerate
; them with that script instead of editing them by hand.
892define <8 x half> @frem_strict_vec8(<8 x half> %x, <8 x half> %y) nounwind #0 {
893; CHECK-LABEL: frem_strict_vec8:
894; CHECK:       # %bb.0:
895; CHECK-NEXT:    subq $88, %rsp
896; CHECK-NEXT:    vmovapd %xmm1, (%rsp) # 16-byte Spill
897; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
898; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
899; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm1
900; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
901; CHECK-NEXT:    vcvtsh2ss %xmm2, %xmm2, %xmm0
902; CHECK-NEXT:    callq fmodf@PLT
903; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
904; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
905; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
906; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
907; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
908; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
909; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
910; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
911; CHECK-NEXT:    callq fmodf@PLT
912; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
913; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
914; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
915; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
916; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
917; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
918; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
919; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
920; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
921; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
922; CHECK-NEXT:    callq fmodf@PLT
923; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
924; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
925; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
926; CHECK-NEXT:    # xmm0 = mem[1,0]
927; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
928; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
929; CHECK-NEXT:    # xmm0 = mem[1,0]
930; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
931; CHECK-NEXT:    callq fmodf@PLT
932; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
933; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
934; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
935; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
936; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
937; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
938; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
939; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
940; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
941; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
942; CHECK-NEXT:    callq fmodf@PLT
943; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
944; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
945; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
946; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
947; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
948; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
949; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
950; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
951; CHECK-NEXT:    callq fmodf@PLT
952; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
953; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
954; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
955; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
956; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
957; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
958; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
959; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
960; CHECK-NEXT:    callq fmodf@PLT
961; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
962; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
963; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
964; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
965; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
966; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
967; CHECK-NEXT:    callq fmodf@PLT
968; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
969; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
970; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
971; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
972; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
973; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
974; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
975; CHECK-NEXT:    addq $88, %rsp
976; CHECK-NEXT:    retq
; Constrained frem with dynamic rounding mode and strict exception behaviour.
977  %result = call <8 x half> @llvm.experimental.constrained.frem.v8f16(<8 x half> %x, <8 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
978  ret <8 x half> %result
979}
980
981define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind #0 {
982; CHECK-LABEL: frem_strict_vec16:
983; CHECK:       # %bb.0:
984; CHECK-NEXT:    subq $184, %rsp
985; CHECK-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
986; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
987; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
988; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
989; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
990; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
991; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
992; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
993; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
994; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
995; CHECK-NEXT:    vzeroupper
996; CHECK-NEXT:    callq fmodf@PLT
997; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
998; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
999; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1000; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1001; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1002; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
1003; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1004; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1005; CHECK-NEXT:    callq fmodf@PLT
1006; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1007; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1008; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1009; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1010; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1011; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1012; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1013; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1014; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1015; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1016; CHECK-NEXT:    callq fmodf@PLT
1017; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1018; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1019; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1020; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1021; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1022; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1023; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1024; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1025; CHECK-NEXT:    callq fmodf@PLT
1026; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1027; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1028; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1029; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1030; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1031; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1032; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1033; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1034; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
1035; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1036; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1037; CHECK-NEXT:    vzeroupper
1038; CHECK-NEXT:    callq fmodf@PLT
1039; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1040; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1041; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1042; CHECK-NEXT:    # xmm0 = mem[1,0]
1043; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1044; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
1045; CHECK-NEXT:    # xmm0 = mem[1,0]
1046; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1047; CHECK-NEXT:    callq fmodf@PLT
1048; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1049; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1050; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1051; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1052; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1053; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1054; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1055; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1056; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1057; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1058; CHECK-NEXT:    callq fmodf@PLT
1059; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1060; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1061; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1062; CHECK-NEXT:    # xmm0 = mem[1,0]
1063; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1064; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1065; CHECK-NEXT:    # xmm0 = mem[1,0]
1066; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1067; CHECK-NEXT:    callq fmodf@PLT
1068; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1069; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1070; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1071; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1072; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1073; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
1074; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1075; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1076; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1077; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
1078; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1079; CHECK-NEXT:    vzeroupper
1080; CHECK-NEXT:    callq fmodf@PLT
1081; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1082; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1083; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1084; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1085; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1086; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
1087; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1088; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1089; CHECK-NEXT:    callq fmodf@PLT
1090; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1091; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1092; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1093; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1094; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1095; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1096; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1097; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1098; CHECK-NEXT:    callq fmodf@PLT
1099; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1100; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1101; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1102; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1103; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1104; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1105; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1106; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1107; CHECK-NEXT:    callq fmodf@PLT
1108; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1109; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1110; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1111; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1112; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1113; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1114; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1115; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1116; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1117; CHECK-NEXT:    vzeroupper
1118; CHECK-NEXT:    callq fmodf@PLT
1119; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1120; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1121; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1122; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1123; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
1124; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1125; CHECK-NEXT:    callq fmodf@PLT
1126; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1127; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1128; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1129; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1130; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1131; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1132; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1133; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1134; CHECK-NEXT:    vzeroupper
1135; CHECK-NEXT:    callq fmodf@PLT
1136; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1137; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1138; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1139; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1140; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1141; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1142; CHECK-NEXT:    callq fmodf@PLT
1143; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1144; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1145; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1146; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1147; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1148; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
1149; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1150; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
1151; CHECK-NEXT:    addq $184, %rsp
1152; CHECK-NEXT:    retq
1153  %result = call <16 x half> @llvm.experimental.constrained.frem.v16f16(<16 x half> %x, <16 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
1154  ret <16 x half> %result
1155}
1156
; frem_strict_vec32: strict-FP (constrained) frem on <32 x half>.
;
; The IR body is a single call to @llvm.experimental.constrained.frem.v32f16
; with "round.dynamic" / "fpexcept.strict". The expected code contains no
; vector remainder instruction: the CHECK lines reserve 408 bytes of stack,
; spill both 64-byte operands, and then, once per half element (32 fmodf calls
; in total), extract the lane, widen both inputs with vcvtsh2ss into
; %xmm0/%xmm1, call fmodf, and narrow the result with vcvtss2sh. Partial
; results are merged back with vpunpcklwd, vinserti128, vinserti64x4,
; vpunpckldq and finally vpunpcklqdq. In these CHECK lines vzeroupper appears
; only ahead of calls that directly follow a ymm/zmm access.
;
; Most spill-slot offsets are matched through the {{[-0-9]+}}(%r{{[sb]}}p)
; pattern, so exact frame offsets are not pinned. The assertions are
; autogenerated (see the NOTE at the top of the file); regenerate them with
; the update script rather than editing them by hand.
1157define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #0 {
1158; CHECK-LABEL: frem_strict_vec32:
1159; CHECK:       # %bb.0:
1160; CHECK-NEXT:    subq $408, %rsp # imm = 0x198
1161; CHECK-NEXT:    vmovupd %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1162; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1163; CHECK-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
1164; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1165; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1166; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
1167; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
1168; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1169; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1170; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1171; CHECK-NEXT:    vzeroupper
1172; CHECK-NEXT:    callq fmodf@PLT
1173; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1174; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1175; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1176; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1177; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1178; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1179; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1180; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1181; CHECK-NEXT:    callq fmodf@PLT
1182; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1183; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1184; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1185; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1186; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1187; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
1188; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1189; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1190; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1191; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1192; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
1193; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1194; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1195; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1196; CHECK-NEXT:    vzeroupper
1197; CHECK-NEXT:    callq fmodf@PLT
1198; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1199; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1200; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1201; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1202; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1203; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1204; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1205; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1206; CHECK-NEXT:    callq fmodf@PLT
1207; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1208; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1209; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1210; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1211; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1212; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1213; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
1214; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1215; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1216; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1217; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1218; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
1219; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1220; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1221; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1222; CHECK-NEXT:    vzeroupper
1223; CHECK-NEXT:    callq fmodf@PLT
1224; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1225; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1226; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1227; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1228; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1229; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1230; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1231; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1232; CHECK-NEXT:    callq fmodf@PLT
1233; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1234; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1235; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1236; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1238; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1239; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1240; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1241; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1242; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1243; CHECK-NEXT:    callq fmodf@PLT
1244; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1245; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1246; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1247; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1248; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1249; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1250; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
1251; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1252; CHECK-NEXT:    callq fmodf@PLT
1253; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1254; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1255; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1256; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1257; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
1258; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1260; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1261; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1262; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1263; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1264; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1265; CHECK-NEXT:    vzeroupper
1266; CHECK-NEXT:    callq fmodf@PLT
1267; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1268; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1269; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1270; CHECK-NEXT:    # xmm0 = mem[1,0]
1271; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1272; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1273; CHECK-NEXT:    # xmm0 = mem[1,0]
1274; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1275; CHECK-NEXT:    callq fmodf@PLT
1276; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1277; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1278; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1279; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1280; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1281; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1282; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1283; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1284; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1285; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1286; CHECK-NEXT:    callq fmodf@PLT
1287; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1288; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1289; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1290; CHECK-NEXT:    # xmm0 = mem[1,0]
1291; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1292; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1293; CHECK-NEXT:    # xmm0 = mem[1,0]
1294; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1295; CHECK-NEXT:    callq fmodf@PLT
1296; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1297; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1298; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1299; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1300; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1301; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1302; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1303; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1304; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1305; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1306; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1307; CHECK-NEXT:    vzeroupper
1308; CHECK-NEXT:    callq fmodf@PLT
1309; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1310; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1311; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1312; CHECK-NEXT:    # xmm0 = mem[1,0]
1313; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1314; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1315; CHECK-NEXT:    # xmm0 = mem[1,0]
1316; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1317; CHECK-NEXT:    callq fmodf@PLT
1318; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1319; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1320; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1321; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1322; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1323; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1324; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1325; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1326; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1327; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1328; CHECK-NEXT:    callq fmodf@PLT
1329; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1330; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1331; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1332; CHECK-NEXT:    # xmm0 = mem[1,0]
1333; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1334; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1335; CHECK-NEXT:    # xmm0 = mem[1,0]
1336; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1337; CHECK-NEXT:    callq fmodf@PLT
1338; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1339; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1340; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1341; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1342; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
1343; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1344; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
1345; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1346; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1347; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1348; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1349; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1350; CHECK-NEXT:    vzeroupper
1351; CHECK-NEXT:    callq fmodf@PLT
1352; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1353; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1354; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1355; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1356; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1357; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1358; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1359; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1360; CHECK-NEXT:    callq fmodf@PLT
1361; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1362; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1363; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1364; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1365; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1366; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1367; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1368; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1369; CHECK-NEXT:    callq fmodf@PLT
1370; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1371; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1372; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1373; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1374; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1375; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1376; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1377; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1378; CHECK-NEXT:    callq fmodf@PLT
1379; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1380; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1381; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1382; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1383; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1384; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1385; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1386; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1387; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1388; CHECK-NEXT:    vzeroupper
1389; CHECK-NEXT:    callq fmodf@PLT
1390; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1391; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1392; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1393; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1394; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1395; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1396; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1397; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1398; CHECK-NEXT:    callq fmodf@PLT
1399; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1400; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1401; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1402; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1403; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1404; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1405; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1406; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1407; CHECK-NEXT:    callq fmodf@PLT
1408; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1409; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1410; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1411; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1412; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1413; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1414; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
1415; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1416; CHECK-NEXT:    callq fmodf@PLT
1417; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1418; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1419; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1420; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1421; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
1422; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1423; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1424; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1425; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1426; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1427; CHECK-NEXT:    vzeroupper
1428; CHECK-NEXT:    callq fmodf@PLT
1429; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1430; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1431; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1432; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1433; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1434; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1435; CHECK-NEXT:    callq fmodf@PLT
1436; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1437; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1438; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1439; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1440; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1441; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1442; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1443; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1444; CHECK-NEXT:    callq fmodf@PLT
1445; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1446; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1447; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1448; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1449; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1450; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1451; CHECK-NEXT:    callq fmodf@PLT
1452; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1453; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1454; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1455; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1456; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1457; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1458; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1459; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1460; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1461; CHECK-NEXT:    vzeroupper
1462; CHECK-NEXT:    callq fmodf@PLT
1463; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1464; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1465; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1466; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1467; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1468; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1469; CHECK-NEXT:    callq fmodf@PLT
1470; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1471; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1472; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1473; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1474; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1475; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1476; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1477; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1478; CHECK-NEXT:    vzeroupper
1479; CHECK-NEXT:    callq fmodf@PLT
1480; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1481; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1482; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1483; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
1484; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1485; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
1486; CHECK-NEXT:    callq fmodf@PLT
1487; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
1488; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1489; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1490; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1491; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
1492; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1493; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
1494; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1495; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
1496; CHECK-NEXT:    addq $408, %rsp # imm = 0x198
1497; CHECK-NEXT:    retq
; The only IR in the function: a constrained frem on the full 32-lane vector,
; using dynamic rounding and strict exception semantics.
1498  %result = call <32 x half> @llvm.experimental.constrained.frem.v32f16(<32 x half> %x, <32 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
1499  ret <32 x half> %result
1500}
1501
; Attribute group #0 carries strictfp; in this file it is attached both to the
; frem_strict_* function definitions and to the constrained intrinsic calls
; inside them.
;
; Declarations of the constrained frem intrinsic for scalar half and for the
; 2/4/8/16/32-element half vectors. Each takes the two operands followed by
; two metadata arguments; the visible calls pass !"round.dynamic" and
; !"fpexcept.strict" in those positions.
1502attributes #0 = { strictfp }
1503declare half @llvm.experimental.constrained.frem.f16(half, half, metadata, metadata)
1504declare <2 x half> @llvm.experimental.constrained.frem.v2f16(<2 x half>, <2 x half>, metadata, metadata)
1505declare <4 x half> @llvm.experimental.constrained.frem.v4f16(<4 x half>, <4 x half>, metadata, metadata)
1506declare <8 x half> @llvm.experimental.constrained.frem.v8f16(<8 x half>, <8 x half>, metadata, metadata)
1507declare <16 x half> @llvm.experimental.constrained.frem.v16f16(<16 x half>, <16 x half>, metadata, metadata)
1508declare <32 x half> @llvm.experimental.constrained.frem.v32f16(<32 x half>, <32 x half>, metadata, metadata)
1509