; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-linux-generic -mattr=avx < %s | FileCheck %s

; Bug 45563:
; The SplitVecRes_MLOAD method should split an extended value type
; according to the halving of the enveloping type, to avoid all sorts
; of inconsistencies downstream. For example, for an extended value type
; with VL=14 and an enveloping type with VL=16 that is split 8/8, the
; extended type should be split 8/6 and not 7/7. This also accounts for
; hi masked loads that get zero storage size (and are unused).

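; VL=9, enveloping VL=16 split 8/8: the extended type splits 8/1, so the
; lowering below does one <8 x float> masked load plus a single masked
; element that is blended and stored at offset 32.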
define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) {
; CHECK-LABEL: mload_split9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrb $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $4, %r9d, %xmm1, %xmm2
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; CHECK-NEXT:    vmaskmovps (%rcx), %ymm1, %ymm4
; CHECK-NEXT:    vblendvps %ymm1, %ymm4, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vmaskmovps 32(%rcx), %ymm1, %ymm2
; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
; CHECK-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm0
; CHECK-NEXT:    vmovss %xmm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <9 x float> @llvm.masked.load.v9f32.p0(ptr %addr, i32 4, <9 x i1>%mask, <9 x float> %dst)
  ret <9 x float> %res
}

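; VL=13, enveloping VL=16 split 8/8: the extended type splits 8/5; the hi
; part's five live lanes are stored below as a <4 x float> at offset 32 plus
; a scalar at offset 48.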
define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst) {
; CHECK-LABEL: mload_split13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrb $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $4, %r9d, %xmm1, %xmm2
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm3
; CHECK-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm6, %xmm6
; CHECK-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
; CHECK-NEXT:    vmaskmovps (%rcx), %ymm1, %ymm6
; CHECK-NEXT:    vblendvps %ymm1, %ymm6, %ymm0, %ymm0
; CHECK-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm3
; CHECK-NEXT:    vmaskmovps 32(%rcx), %ymm3, %ymm3
; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
; CHECK-NEXT:    vblendvps %xmm1, %xmm3, %xmm4, %xmm0
; CHECK-NEXT:    vmovaps %xmm0, 32(%rdi)
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm0
; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm5, %xmm0
; CHECK-NEXT:    vmovss %xmm0, 48(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <13 x float> @llvm.masked.load.v13f32.p0(ptr %addr, i32 4, <13 x i1>%mask, <13 x float> %dst)
  ret <13 x float> %res
}

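; VL=14: the exact case from the comment above; the extended type must split
; 8/6 (not 7/7), visible below as stores of 8 elements, then 4 at offset 32
; and 2 at offset 48.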
define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst) {
; CHECK-LABEL: mload_split14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrb $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $4, %r9d, %xmm1, %xmm2
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm5, %xmm5
; CHECK-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; CHECK-NEXT:    vmaskmovps (%rcx), %ymm1, %ymm5
; CHECK-NEXT:    vblendvps %ymm1, %ymm5, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm2[u],zero,xmm2[u]
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm5
; CHECK-NEXT:    vmaskmovps 32(%rcx), %ymm5, %ymm5
; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
; CHECK-NEXT:    vextractf128 $1, %ymm5, %xmm0
; CHECK-NEXT:    vblendvps %xmm1, %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vmovlps %xmm0, 48(%rdi)
; CHECK-NEXT:    vblendvps %xmm2, %xmm5, %xmm3, %xmm0
; CHECK-NEXT:    vmovaps %xmm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <14 x float> @llvm.masked.load.v14f32.p0(ptr %addr, i32 4, <14 x i1>%mask, <14 x float> %dst)
  ret <14 x float> %res
}

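; VL=17: split into three masked loads covering 8/8/1 elements; the final
; element is loaded from and stored to offset 64.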
define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst) {
; CHECK-LABEL: mload_split17:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm2
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT:    vmovd %esi, %xmm3
; CHECK-NEXT:    vpinsrb $1, %edx, %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $3, %r8d, %xmm3, %xmm3
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $4, %r9d, %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; CHECK-NEXT:    vmaskmovps (%rdi), %ymm3, %ymm4
; CHECK-NEXT:    vblendvps %ymm3, %ymm4, %ymm2, %ymm2
; CHECK-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; CHECK-NEXT:    vmaskmovps 32(%rdi), %ymm3, %ymm4
; CHECK-NEXT:    vblendvps %ymm3, %ymm4, %ymm1, %ymm1
; CHECK-NEXT:    vmovd %r10d, %xmm3
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vmaskmovps 64(%rdi), %ymm3, %ymm4
; CHECK-NEXT:    vblendvps %xmm3, %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vmovss %xmm0, 64(%rax)
; CHECK-NEXT:    vmovaps %ymm1, 32(%rax)
; CHECK-NEXT:    vmovaps %ymm2, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <17 x float> @llvm.masked.load.v17f32.p0(ptr %addr, i32 4, <17 x i1>%mask, <17 x float> %dst)
  ret <17 x float> %res
}

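; VL=23: split into masked loads covering 8/8/7 elements; the 7-element tail
; is stored as 4 elements at offset 64, 2 at offset 80, and 1 at offset 88.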
define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst) {
; CHECK-LABEL: mload_split23:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm3
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT:    vmovd %esi, %xmm4
; CHECK-NEXT:    vpinsrb $1, %edx, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $3, %r8d, %xmm4, %xmm4
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm5, %xmm5
; CHECK-NEXT:    vpinsrb $4, %r9d, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
; CHECK-NEXT:    vmaskmovps (%rdi), %ymm4, %ymm5
; CHECK-NEXT:    vblendvps %ymm4, %ymm5, %ymm3, %ymm3
; CHECK-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; CHECK-NEXT:    vpslld $31, %xmm5, %xmm5
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
; CHECK-NEXT:    vmaskmovps 32(%rdi), %ymm4, %ymm5
; CHECK-NEXT:    vblendvps %ymm4, %ymm5, %ymm2, %ymm2
; CHECK-NEXT:    vmovd %r10d, %xmm4
; CHECK-NEXT:    vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm5, %xmm5
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm6
; CHECK-NEXT:    vmaskmovps 64(%rdi), %ymm6, %ymm6
; CHECK-NEXT:    vmovaps %ymm2, 32(%rax)
; CHECK-NEXT:    vextractf128 $1, %ymm6, %xmm2
; CHECK-NEXT:    vblendvps %xmm4, %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vextractps $2, %xmm1, 88(%rax)
; CHECK-NEXT:    vmovlps %xmm1, 80(%rax)
; CHECK-NEXT:    vblendvps %xmm5, %xmm6, %xmm0, %xmm0
; CHECK-NEXT:    vmovaps %xmm0, 64(%rax)
; CHECK-NEXT:    vmovaps %ymm3, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <23 x float> @llvm.masked.load.v23f32.p0(ptr %addr, i32 4, <23 x i1>%mask, <23 x float> %dst)
  ret <23 x float> %res
}

declare <9 x float> @llvm.masked.load.v9f32.p0(ptr %addr, i32 %align, <9 x i1> %mask, <9 x float> %dst)
declare <13 x float> @llvm.masked.load.v13f32.p0(ptr %addr, i32 %align, <13 x i1> %mask, <13 x float> %dst)
declare <14 x float> @llvm.masked.load.v14f32.p0(ptr %addr, i32 %align, <14 x i1> %mask, <14 x float> %dst)
declare <17 x float> @llvm.masked.load.v17f32.p0(ptr %addr, i32 %align, <17 x i1> %mask, <17 x float> %dst)
declare <23 x float> @llvm.masked.load.v23f32.p0(ptr %addr, i32 %align, <23 x i1> %mask, <23 x float> %dst)