1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2    | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2  | FileCheck %s --check-prefixes=SSE,SSE42
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX1OR2,AVX1
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX1OR2,AVX2
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VLDQ
8; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VLBW
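
; llvm.masked.expandload reads consecutive elements starting at the base
; pointer and places them into the result lanes whose mask bit is set; lanes
; with a clear bit keep the passthru value. A minimal C-style sketch of the
; semantics (illustration only, not part of the test; dst, N, i, j are
; placeholder names):
;
;   // j advances only for enabled lanes, so the memory accesses stay packed.
;   for (int i = 0, j = 0; i < N; ++i)
;     dst[i] = mask[i] ? base[j++] : src0[i];
;
; Pre-AVX-512 targets have no expand instruction, so llc lowers each call to a
; compare/movmsk of the trigger followed by per-lane branchy loads, while the
; AVX-512 RUN lines collapse to vexpandpd/vexpandps under a k-register mask,
; as the CHECK lines below show.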
9
10;
11; vXf64
12;
13
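; v2f64 with an i64 zero-compare trigger. SSE2 has no 64-bit pcmpeq, so it
; emulates one with pcmpeqd plus a pshufd/pand of the swapped halves before
; movmskpd; SSE4.2/AVX use pcmpeqq directly. Enabled lanes are filled with
; movlps/movhps, bumping the pointer by 8 in between. AVX512F widens to zmm
; and trims the mask with kshiftlw/kshiftrw $14; AVX512VL uses vptestnmq plus
; vexpandpd on the bare xmm.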
14define <2 x double> @expandload_v2f64_v2i64(ptr %base, <2 x double> %src0, <2 x i64> %trigger) {
15; SSE2-LABEL: expandload_v2f64_v2i64:
16; SSE2:       ## %bb.0:
17; SSE2-NEXT:    pxor %xmm2, %xmm2
18; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
19; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
20; SSE2-NEXT:    pand %xmm2, %xmm1
21; SSE2-NEXT:    movmskpd %xmm1, %eax
22; SSE2-NEXT:    testb $1, %al
23; SSE2-NEXT:    jne LBB0_1
24; SSE2-NEXT:  ## %bb.2: ## %else
25; SSE2-NEXT:    testb $2, %al
26; SSE2-NEXT:    jne LBB0_3
27; SSE2-NEXT:  LBB0_4: ## %else2
28; SSE2-NEXT:    retq
29; SSE2-NEXT:  LBB0_1: ## %cond.load
30; SSE2-NEXT:    movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
31; SSE2-NEXT:    addq $8, %rdi
32; SSE2-NEXT:    testb $2, %al
33; SSE2-NEXT:    je LBB0_4
34; SSE2-NEXT:  LBB0_3: ## %cond.load1
35; SSE2-NEXT:    movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
36; SSE2-NEXT:    retq
37;
38; SSE42-LABEL: expandload_v2f64_v2i64:
39; SSE42:       ## %bb.0:
40; SSE42-NEXT:    pxor %xmm2, %xmm2
41; SSE42-NEXT:    pcmpeqq %xmm1, %xmm2
42; SSE42-NEXT:    movmskpd %xmm2, %eax
43; SSE42-NEXT:    testb $1, %al
44; SSE42-NEXT:    jne LBB0_1
45; SSE42-NEXT:  ## %bb.2: ## %else
46; SSE42-NEXT:    testb $2, %al
47; SSE42-NEXT:    jne LBB0_3
48; SSE42-NEXT:  LBB0_4: ## %else2
49; SSE42-NEXT:    retq
50; SSE42-NEXT:  LBB0_1: ## %cond.load
51; SSE42-NEXT:    movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
52; SSE42-NEXT:    addq $8, %rdi
53; SSE42-NEXT:    testb $2, %al
54; SSE42-NEXT:    je LBB0_4
55; SSE42-NEXT:  LBB0_3: ## %cond.load1
56; SSE42-NEXT:    movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
57; SSE42-NEXT:    retq
58;
59; AVX1OR2-LABEL: expandload_v2f64_v2i64:
60; AVX1OR2:       ## %bb.0:
61; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
62; AVX1OR2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
63; AVX1OR2-NEXT:    vmovmskpd %xmm1, %eax
64; AVX1OR2-NEXT:    testb $1, %al
65; AVX1OR2-NEXT:    jne LBB0_1
66; AVX1OR2-NEXT:  ## %bb.2: ## %else
67; AVX1OR2-NEXT:    testb $2, %al
68; AVX1OR2-NEXT:    jne LBB0_3
69; AVX1OR2-NEXT:  LBB0_4: ## %else2
70; AVX1OR2-NEXT:    retq
71; AVX1OR2-NEXT:  LBB0_1: ## %cond.load
72; AVX1OR2-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
73; AVX1OR2-NEXT:    addq $8, %rdi
74; AVX1OR2-NEXT:    testb $2, %al
75; AVX1OR2-NEXT:    je LBB0_4
76; AVX1OR2-NEXT:  LBB0_3: ## %cond.load1
77; AVX1OR2-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
78; AVX1OR2-NEXT:    retq
79;
80; AVX512F-LABEL: expandload_v2f64_v2i64:
81; AVX512F:       ## %bb.0:
82; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
83; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
84; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
85; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
86; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
87; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
88; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
89; AVX512F-NEXT:    vzeroupper
90; AVX512F-NEXT:    retq
91;
92; AVX512VL-LABEL: expandload_v2f64_v2i64:
93; AVX512VL:       ## %bb.0:
94; AVX512VL-NEXT:    vptestnmq %xmm1, %xmm1, %k1
95; AVX512VL-NEXT:    vexpandpd (%rdi), %xmm0 {%k1}
96; AVX512VL-NEXT:    retq
97  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
98  %res = call <2 x double> @llvm.masked.expandload.v2f64(ptr %base, <2 x i1> %mask, <2 x double> %src0)
99  ret <2 x double>%res
100}
101
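; v4f64 variant. SSE folds the two 128-bit compares into a single movmskps
; mask, AVX1 compares each half and reassembles a ymm for vmovmskpd, and AVX2
; compares the whole ymm at once. On SSE the expansion stays movlps/movhps
; across the two xmm halves; on AVX1/AVX2 the upper lanes are filled with
; vbroadcastsd + vblendpd instead. AVX512F widens to zmm and keeps only the
; low four mask bits (kshift by 12); AVX512VL emits a single ymm vexpandpd.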
102define <4 x double> @expandload_v4f64_v4i64(ptr %base, <4 x double> %src0, <4 x i64> %trigger) {
103; SSE2-LABEL: expandload_v4f64_v4i64:
104; SSE2:       ## %bb.0:
105; SSE2-NEXT:    pxor %xmm4, %xmm4
106; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
107; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
108; SSE2-NEXT:    movdqa %xmm2, %xmm4
109; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3]
110; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
111; SSE2-NEXT:    andps %xmm4, %xmm2
112; SSE2-NEXT:    movmskps %xmm2, %eax
113; SSE2-NEXT:    testb $1, %al
114; SSE2-NEXT:    jne LBB1_1
115; SSE2-NEXT:  ## %bb.2: ## %else
116; SSE2-NEXT:    testb $2, %al
117; SSE2-NEXT:    jne LBB1_3
118; SSE2-NEXT:  LBB1_4: ## %else2
119; SSE2-NEXT:    testb $4, %al
120; SSE2-NEXT:    jne LBB1_5
121; SSE2-NEXT:  LBB1_6: ## %else6
122; SSE2-NEXT:    testb $8, %al
123; SSE2-NEXT:    jne LBB1_7
124; SSE2-NEXT:  LBB1_8: ## %else10
125; SSE2-NEXT:    retq
126; SSE2-NEXT:  LBB1_1: ## %cond.load
127; SSE2-NEXT:    movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
128; SSE2-NEXT:    addq $8, %rdi
129; SSE2-NEXT:    testb $2, %al
130; SSE2-NEXT:    je LBB1_4
131; SSE2-NEXT:  LBB1_3: ## %cond.load1
132; SSE2-NEXT:    movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
133; SSE2-NEXT:    addq $8, %rdi
134; SSE2-NEXT:    testb $4, %al
135; SSE2-NEXT:    je LBB1_6
136; SSE2-NEXT:  LBB1_5: ## %cond.load5
137; SSE2-NEXT:    movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
138; SSE2-NEXT:    addq $8, %rdi
139; SSE2-NEXT:    testb $8, %al
140; SSE2-NEXT:    je LBB1_8
141; SSE2-NEXT:  LBB1_7: ## %cond.load9
142; SSE2-NEXT:    movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
143; SSE2-NEXT:    retq
144;
145; SSE42-LABEL: expandload_v4f64_v4i64:
146; SSE42:       ## %bb.0:
147; SSE42-NEXT:    pxor %xmm4, %xmm4
148; SSE42-NEXT:    pcmpeqq %xmm4, %xmm3
149; SSE42-NEXT:    pcmpeqq %xmm4, %xmm2
150; SSE42-NEXT:    packssdw %xmm3, %xmm2
151; SSE42-NEXT:    movmskps %xmm2, %eax
152; SSE42-NEXT:    testb $1, %al
153; SSE42-NEXT:    jne LBB1_1
154; SSE42-NEXT:  ## %bb.2: ## %else
155; SSE42-NEXT:    testb $2, %al
156; SSE42-NEXT:    jne LBB1_3
157; SSE42-NEXT:  LBB1_4: ## %else2
158; SSE42-NEXT:    testb $4, %al
159; SSE42-NEXT:    jne LBB1_5
160; SSE42-NEXT:  LBB1_6: ## %else6
161; SSE42-NEXT:    testb $8, %al
162; SSE42-NEXT:    jne LBB1_7
163; SSE42-NEXT:  LBB1_8: ## %else10
164; SSE42-NEXT:    retq
165; SSE42-NEXT:  LBB1_1: ## %cond.load
166; SSE42-NEXT:    movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
167; SSE42-NEXT:    addq $8, %rdi
168; SSE42-NEXT:    testb $2, %al
169; SSE42-NEXT:    je LBB1_4
170; SSE42-NEXT:  LBB1_3: ## %cond.load1
171; SSE42-NEXT:    movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
172; SSE42-NEXT:    addq $8, %rdi
173; SSE42-NEXT:    testb $4, %al
174; SSE42-NEXT:    je LBB1_6
175; SSE42-NEXT:  LBB1_5: ## %cond.load5
176; SSE42-NEXT:    movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
177; SSE42-NEXT:    addq $8, %rdi
178; SSE42-NEXT:    testb $8, %al
179; SSE42-NEXT:    je LBB1_8
180; SSE42-NEXT:  LBB1_7: ## %cond.load9
181; SSE42-NEXT:    movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
182; SSE42-NEXT:    retq
183;
184; AVX1-LABEL: expandload_v4f64_v4i64:
185; AVX1:       ## %bb.0:
186; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
187; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
188; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
189; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm1, %xmm1
190; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
191; AVX1-NEXT:    vmovmskpd %ymm1, %eax
192; AVX1-NEXT:    testb $1, %al
193; AVX1-NEXT:    jne LBB1_1
194; AVX1-NEXT:  ## %bb.2: ## %else
195; AVX1-NEXT:    testb $2, %al
196; AVX1-NEXT:    jne LBB1_3
197; AVX1-NEXT:  LBB1_4: ## %else2
198; AVX1-NEXT:    testb $4, %al
199; AVX1-NEXT:    jne LBB1_5
200; AVX1-NEXT:  LBB1_6: ## %else6
201; AVX1-NEXT:    testb $8, %al
202; AVX1-NEXT:    jne LBB1_7
203; AVX1-NEXT:  LBB1_8: ## %else10
204; AVX1-NEXT:    retq
205; AVX1-NEXT:  LBB1_1: ## %cond.load
206; AVX1-NEXT:    vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
207; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
208; AVX1-NEXT:    addq $8, %rdi
209; AVX1-NEXT:    testb $2, %al
210; AVX1-NEXT:    je LBB1_4
211; AVX1-NEXT:  LBB1_3: ## %cond.load1
212; AVX1-NEXT:    vmovhpd (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0],mem[0]
213; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
214; AVX1-NEXT:    addq $8, %rdi
215; AVX1-NEXT:    testb $4, %al
216; AVX1-NEXT:    je LBB1_6
217; AVX1-NEXT:  LBB1_5: ## %cond.load5
218; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
219; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
220; AVX1-NEXT:    addq $8, %rdi
221; AVX1-NEXT:    testb $8, %al
222; AVX1-NEXT:    je LBB1_8
223; AVX1-NEXT:  LBB1_7: ## %cond.load9
224; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
225; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
226; AVX1-NEXT:    retq
227;
228; AVX2-LABEL: expandload_v4f64_v4i64:
229; AVX2:       ## %bb.0:
230; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
231; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm1, %ymm1
232; AVX2-NEXT:    vmovmskpd %ymm1, %eax
233; AVX2-NEXT:    testb $1, %al
234; AVX2-NEXT:    jne LBB1_1
235; AVX2-NEXT:  ## %bb.2: ## %else
236; AVX2-NEXT:    testb $2, %al
237; AVX2-NEXT:    jne LBB1_3
238; AVX2-NEXT:  LBB1_4: ## %else2
239; AVX2-NEXT:    testb $4, %al
240; AVX2-NEXT:    jne LBB1_5
241; AVX2-NEXT:  LBB1_6: ## %else6
242; AVX2-NEXT:    testb $8, %al
243; AVX2-NEXT:    jne LBB1_7
244; AVX2-NEXT:  LBB1_8: ## %else10
245; AVX2-NEXT:    retq
246; AVX2-NEXT:  LBB1_1: ## %cond.load
247; AVX2-NEXT:    vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
248; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
249; AVX2-NEXT:    addq $8, %rdi
250; AVX2-NEXT:    testb $2, %al
251; AVX2-NEXT:    je LBB1_4
252; AVX2-NEXT:  LBB1_3: ## %cond.load1
253; AVX2-NEXT:    vmovhpd (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0],mem[0]
254; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
255; AVX2-NEXT:    addq $8, %rdi
256; AVX2-NEXT:    testb $4, %al
257; AVX2-NEXT:    je LBB1_6
258; AVX2-NEXT:  LBB1_5: ## %cond.load5
259; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
260; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
261; AVX2-NEXT:    addq $8, %rdi
262; AVX2-NEXT:    testb $8, %al
263; AVX2-NEXT:    je LBB1_8
264; AVX2-NEXT:  LBB1_7: ## %cond.load9
265; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
266; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
267; AVX2-NEXT:    retq
268;
269; AVX512F-LABEL: expandload_v4f64_v4i64:
270; AVX512F:       ## %bb.0:
271; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
272; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
273; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
274; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
275; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
276; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
277; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
278; AVX512F-NEXT:    retq
279;
280; AVX512VL-LABEL: expandload_v4f64_v4i64:
281; AVX512VL:       ## %bb.0:
282; AVX512VL-NEXT:    vptestnmq %ymm1, %ymm1, %k1
283; AVX512VL-NEXT:    vexpandpd (%rdi), %ymm0 {%k1}
284; AVX512VL-NEXT:    retq
285  %mask = icmp eq <4 x i64> %trigger, zeroinitializer
286  %res = call <4 x double> @llvm.masked.expandload.v4f64(ptr %base, <4 x i1> %mask, <4 x double> %src0)
287  ret <4 x double>%res
288}
289
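; Here the mask is passed directly as <8 x i1> (in the low words of an xmm),
; so there is no trigger compare: SSE/AVX shift the bits into the sign
; positions (psllw $15), pack, and pmovmskb to get an 8-bit mask. AVX512F
; sign-extends the i1 vector to qwords and uses vptestmq to form the k-mask,
; while the VLDQ/VLBW runs use vpmovd2m/vpmovw2m; all AVX-512 runs then issue
; one vexpandpd of the full zmm.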
290define <8 x double> @expandload_v8f64_v8i1(ptr %base, <8 x double> %src0, <8 x i1> %mask) {
291; SSE-LABEL: expandload_v8f64_v8i1:
292; SSE:       ## %bb.0:
293; SSE-NEXT:    psllw $15, %xmm4
294; SSE-NEXT:    packsswb %xmm4, %xmm4
295; SSE-NEXT:    pmovmskb %xmm4, %eax
296; SSE-NEXT:    testb $1, %al
297; SSE-NEXT:    jne LBB2_1
298; SSE-NEXT:  ## %bb.2: ## %else
299; SSE-NEXT:    testb $2, %al
300; SSE-NEXT:    jne LBB2_3
301; SSE-NEXT:  LBB2_4: ## %else2
302; SSE-NEXT:    testb $4, %al
303; SSE-NEXT:    jne LBB2_5
304; SSE-NEXT:  LBB2_6: ## %else6
305; SSE-NEXT:    testb $8, %al
306; SSE-NEXT:    jne LBB2_7
307; SSE-NEXT:  LBB2_8: ## %else10
308; SSE-NEXT:    testb $16, %al
309; SSE-NEXT:    jne LBB2_9
310; SSE-NEXT:  LBB2_10: ## %else14
311; SSE-NEXT:    testb $32, %al
312; SSE-NEXT:    jne LBB2_11
313; SSE-NEXT:  LBB2_12: ## %else18
314; SSE-NEXT:    testb $64, %al
315; SSE-NEXT:    jne LBB2_13
316; SSE-NEXT:  LBB2_14: ## %else22
317; SSE-NEXT:    testb $-128, %al
318; SSE-NEXT:    jne LBB2_15
319; SSE-NEXT:  LBB2_16: ## %else26
320; SSE-NEXT:    retq
321; SSE-NEXT:  LBB2_1: ## %cond.load
322; SSE-NEXT:    movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
323; SSE-NEXT:    addq $8, %rdi
324; SSE-NEXT:    testb $2, %al
325; SSE-NEXT:    je LBB2_4
326; SSE-NEXT:  LBB2_3: ## %cond.load1
327; SSE-NEXT:    movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
328; SSE-NEXT:    addq $8, %rdi
329; SSE-NEXT:    testb $4, %al
330; SSE-NEXT:    je LBB2_6
331; SSE-NEXT:  LBB2_5: ## %cond.load5
332; SSE-NEXT:    movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
333; SSE-NEXT:    addq $8, %rdi
334; SSE-NEXT:    testb $8, %al
335; SSE-NEXT:    je LBB2_8
336; SSE-NEXT:  LBB2_7: ## %cond.load9
337; SSE-NEXT:    movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
338; SSE-NEXT:    addq $8, %rdi
339; SSE-NEXT:    testb $16, %al
340; SSE-NEXT:    je LBB2_10
341; SSE-NEXT:  LBB2_9: ## %cond.load13
342; SSE-NEXT:    movlps (%rdi), %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
343; SSE-NEXT:    addq $8, %rdi
344; SSE-NEXT:    testb $32, %al
345; SSE-NEXT:    je LBB2_12
346; SSE-NEXT:  LBB2_11: ## %cond.load17
347; SSE-NEXT:    movhps (%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
348; SSE-NEXT:    addq $8, %rdi
349; SSE-NEXT:    testb $64, %al
350; SSE-NEXT:    je LBB2_14
351; SSE-NEXT:  LBB2_13: ## %cond.load21
352; SSE-NEXT:    movlps (%rdi), %xmm3 ## xmm3 = mem[0,1],xmm3[2,3]
353; SSE-NEXT:    addq $8, %rdi
354; SSE-NEXT:    testb $-128, %al
355; SSE-NEXT:    je LBB2_16
356; SSE-NEXT:  LBB2_15: ## %cond.load25
357; SSE-NEXT:    movhps (%rdi), %xmm3 ## xmm3 = xmm3[0,1],mem[0,1]
358; SSE-NEXT:    retq
359;
360; AVX1-LABEL: expandload_v8f64_v8i1:
361; AVX1:       ## %bb.0:
362; AVX1-NEXT:    vpsllw $15, %xmm2, %xmm2
363; AVX1-NEXT:    vpacksswb %xmm2, %xmm2, %xmm2
364; AVX1-NEXT:    vpmovmskb %xmm2, %eax
365; AVX1-NEXT:    testb $1, %al
366; AVX1-NEXT:    jne LBB2_1
367; AVX1-NEXT:  ## %bb.2: ## %else
368; AVX1-NEXT:    testb $2, %al
369; AVX1-NEXT:    jne LBB2_3
370; AVX1-NEXT:  LBB2_4: ## %else2
371; AVX1-NEXT:    testb $4, %al
372; AVX1-NEXT:    jne LBB2_5
373; AVX1-NEXT:  LBB2_6: ## %else6
374; AVX1-NEXT:    testb $8, %al
375; AVX1-NEXT:    jne LBB2_7
376; AVX1-NEXT:  LBB2_8: ## %else10
377; AVX1-NEXT:    testb $16, %al
378; AVX1-NEXT:    jne LBB2_9
379; AVX1-NEXT:  LBB2_10: ## %else14
380; AVX1-NEXT:    testb $32, %al
381; AVX1-NEXT:    jne LBB2_11
382; AVX1-NEXT:  LBB2_12: ## %else18
383; AVX1-NEXT:    testb $64, %al
384; AVX1-NEXT:    jne LBB2_13
385; AVX1-NEXT:  LBB2_14: ## %else22
386; AVX1-NEXT:    testb $-128, %al
387; AVX1-NEXT:    jne LBB2_15
388; AVX1-NEXT:  LBB2_16: ## %else26
389; AVX1-NEXT:    retq
390; AVX1-NEXT:  LBB2_1: ## %cond.load
391; AVX1-NEXT:    vmovsd (%rdi), %xmm2 ## xmm2 = mem[0],zero
392; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
393; AVX1-NEXT:    addq $8, %rdi
394; AVX1-NEXT:    testb $2, %al
395; AVX1-NEXT:    je LBB2_4
396; AVX1-NEXT:  LBB2_3: ## %cond.load1
397; AVX1-NEXT:    vmovhps (%rdi), %xmm0, %xmm2 ## xmm2 = xmm0[0,1],mem[0,1]
398; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
399; AVX1-NEXT:    addq $8, %rdi
400; AVX1-NEXT:    testb $4, %al
401; AVX1-NEXT:    je LBB2_6
402; AVX1-NEXT:  LBB2_5: ## %cond.load5
403; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
404; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
405; AVX1-NEXT:    addq $8, %rdi
406; AVX1-NEXT:    testb $8, %al
407; AVX1-NEXT:    je LBB2_8
408; AVX1-NEXT:  LBB2_7: ## %cond.load9
409; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
410; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
411; AVX1-NEXT:    addq $8, %rdi
412; AVX1-NEXT:    testb $16, %al
413; AVX1-NEXT:    je LBB2_10
414; AVX1-NEXT:  LBB2_9: ## %cond.load13
415; AVX1-NEXT:    vmovsd (%rdi), %xmm2 ## xmm2 = mem[0],zero
416; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
417; AVX1-NEXT:    addq $8, %rdi
418; AVX1-NEXT:    testb $32, %al
419; AVX1-NEXT:    je LBB2_12
420; AVX1-NEXT:  LBB2_11: ## %cond.load17
421; AVX1-NEXT:    vmovhps (%rdi), %xmm1, %xmm2 ## xmm2 = xmm1[0,1],mem[0,1]
422; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
423; AVX1-NEXT:    addq $8, %rdi
424; AVX1-NEXT:    testb $64, %al
425; AVX1-NEXT:    je LBB2_14
426; AVX1-NEXT:  LBB2_13: ## %cond.load21
427; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
428; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
429; AVX1-NEXT:    addq $8, %rdi
430; AVX1-NEXT:    testb $-128, %al
431; AVX1-NEXT:    je LBB2_16
432; AVX1-NEXT:  LBB2_15: ## %cond.load25
433; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
434; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
435; AVX1-NEXT:    retq
436;
437; AVX2-LABEL: expandload_v8f64_v8i1:
438; AVX2:       ## %bb.0:
439; AVX2-NEXT:    vpsllw $15, %xmm2, %xmm2
440; AVX2-NEXT:    vpacksswb %xmm2, %xmm2, %xmm2
441; AVX2-NEXT:    vpmovmskb %xmm2, %eax
442; AVX2-NEXT:    testb $1, %al
443; AVX2-NEXT:    jne LBB2_1
444; AVX2-NEXT:  ## %bb.2: ## %else
445; AVX2-NEXT:    testb $2, %al
446; AVX2-NEXT:    jne LBB2_3
447; AVX2-NEXT:  LBB2_4: ## %else2
448; AVX2-NEXT:    testb $4, %al
449; AVX2-NEXT:    jne LBB2_5
450; AVX2-NEXT:  LBB2_6: ## %else6
451; AVX2-NEXT:    testb $8, %al
452; AVX2-NEXT:    jne LBB2_7
453; AVX2-NEXT:  LBB2_8: ## %else10
454; AVX2-NEXT:    testb $16, %al
455; AVX2-NEXT:    jne LBB2_9
456; AVX2-NEXT:  LBB2_10: ## %else14
457; AVX2-NEXT:    testb $32, %al
458; AVX2-NEXT:    jne LBB2_11
459; AVX2-NEXT:  LBB2_12: ## %else18
460; AVX2-NEXT:    testb $64, %al
461; AVX2-NEXT:    jne LBB2_13
462; AVX2-NEXT:  LBB2_14: ## %else22
463; AVX2-NEXT:    testb $-128, %al
464; AVX2-NEXT:    jne LBB2_15
465; AVX2-NEXT:  LBB2_16: ## %else26
466; AVX2-NEXT:    retq
467; AVX2-NEXT:  LBB2_1: ## %cond.load
468; AVX2-NEXT:    vmovq (%rdi), %xmm2 ## xmm2 = mem[0],zero
469; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
470; AVX2-NEXT:    addq $8, %rdi
471; AVX2-NEXT:    testb $2, %al
472; AVX2-NEXT:    je LBB2_4
473; AVX2-NEXT:  LBB2_3: ## %cond.load1
474; AVX2-NEXT:    vmovhps (%rdi), %xmm0, %xmm2 ## xmm2 = xmm0[0,1],mem[0,1]
475; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
476; AVX2-NEXT:    addq $8, %rdi
477; AVX2-NEXT:    testb $4, %al
478; AVX2-NEXT:    je LBB2_6
479; AVX2-NEXT:  LBB2_5: ## %cond.load5
480; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
481; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
482; AVX2-NEXT:    addq $8, %rdi
483; AVX2-NEXT:    testb $8, %al
484; AVX2-NEXT:    je LBB2_8
485; AVX2-NEXT:  LBB2_7: ## %cond.load9
486; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
487; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
488; AVX2-NEXT:    addq $8, %rdi
489; AVX2-NEXT:    testb $16, %al
490; AVX2-NEXT:    je LBB2_10
491; AVX2-NEXT:  LBB2_9: ## %cond.load13
492; AVX2-NEXT:    vmovq (%rdi), %xmm2 ## xmm2 = mem[0],zero
493; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
494; AVX2-NEXT:    addq $8, %rdi
495; AVX2-NEXT:    testb $32, %al
496; AVX2-NEXT:    je LBB2_12
497; AVX2-NEXT:  LBB2_11: ## %cond.load17
498; AVX2-NEXT:    vmovhps (%rdi), %xmm1, %xmm2 ## xmm2 = xmm1[0,1],mem[0,1]
499; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
500; AVX2-NEXT:    addq $8, %rdi
501; AVX2-NEXT:    testb $64, %al
502; AVX2-NEXT:    je LBB2_14
503; AVX2-NEXT:  LBB2_13: ## %cond.load21
504; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
505; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
506; AVX2-NEXT:    addq $8, %rdi
507; AVX2-NEXT:    testb $-128, %al
508; AVX2-NEXT:    je LBB2_16
509; AVX2-NEXT:  LBB2_15: ## %cond.load25
510; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
511; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
512; AVX2-NEXT:    retq
513;
514; AVX512F-LABEL: expandload_v8f64_v8i1:
515; AVX512F:       ## %bb.0:
516; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
517; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
518; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
519; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
520; AVX512F-NEXT:    retq
521;
522; AVX512VLDQ-LABEL: expandload_v8f64_v8i1:
523; AVX512VLDQ:       ## %bb.0:
524; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
525; AVX512VLDQ-NEXT:    vpslld $31, %ymm1, %ymm1
526; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k1
527; AVX512VLDQ-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
528; AVX512VLDQ-NEXT:    retq
529;
530; AVX512VLBW-LABEL: expandload_v8f64_v8i1:
531; AVX512VLBW:       ## %bb.0:
532; AVX512VLBW-NEXT:    vpsllw $15, %xmm1, %xmm1
533; AVX512VLBW-NEXT:    vpmovw2m %xmm1, %k1
534; AVX512VLBW-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
535; AVX512VLBW-NEXT:    retq
536  %res = call <8 x double> @llvm.masked.expandload.v8f64(ptr %base, <8 x i1> %mask, <8 x double> %src0)
537  ret <8 x double>%res
538}
539
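; 16 x f64 no longer fits in the return registers, so the SSE run returns
; through a hidden sret pointer (result address in %rdi, saved in %rax; data
; pointer in %rsi) and stores the eight xmm values at the end. The AVX-512
; runs split the 16-bit mask into two k registers; the base address of the
; second vexpandpd must be advanced by the number of elements consumed by the
; first half, computed as a popcount of the low 8 mask bits via the
; multiply/shift trick (imull $0x8040201, shrl $3, andl $0x11111111,
; imull $0x11111111, shrl $28). For example, a low-half mask of 0b00001111
; has popcount 4, so the second expand reads from (%rdi,%rax,8) = %rdi + 32.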
540define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <16 x i32> %trigger) {
541; SSE-LABEL: expandload_v16f64_v16i32:
542; SSE:       ## %bb.0:
543; SSE-NEXT:    movq %rdi, %rax
544; SSE-NEXT:    pxor %xmm8, %xmm8
545; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
546; SSE-NEXT:    pcmpeqd %xmm8, %xmm9
547; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
548; SSE-NEXT:    pcmpeqd %xmm8, %xmm10
549; SSE-NEXT:    packssdw %xmm9, %xmm10
550; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
551; SSE-NEXT:    pcmpeqd %xmm8, %xmm9
552; SSE-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm8
553; SSE-NEXT:    packssdw %xmm9, %xmm8
554; SSE-NEXT:    packsswb %xmm10, %xmm8
555; SSE-NEXT:    pmovmskb %xmm8, %ecx
556; SSE-NEXT:    testb $1, %cl
557; SSE-NEXT:    jne LBB3_1
558; SSE-NEXT:  ## %bb.2: ## %else
559; SSE-NEXT:    testb $2, %cl
560; SSE-NEXT:    jne LBB3_3
561; SSE-NEXT:  LBB3_4: ## %else2
562; SSE-NEXT:    testb $4, %cl
563; SSE-NEXT:    jne LBB3_5
564; SSE-NEXT:  LBB3_6: ## %else6
565; SSE-NEXT:    testb $8, %cl
566; SSE-NEXT:    jne LBB3_7
567; SSE-NEXT:  LBB3_8: ## %else10
568; SSE-NEXT:    testb $16, %cl
569; SSE-NEXT:    jne LBB3_9
570; SSE-NEXT:  LBB3_10: ## %else14
571; SSE-NEXT:    testb $32, %cl
572; SSE-NEXT:    jne LBB3_11
573; SSE-NEXT:  LBB3_12: ## %else18
574; SSE-NEXT:    testb $64, %cl
575; SSE-NEXT:    jne LBB3_13
576; SSE-NEXT:  LBB3_14: ## %else22
577; SSE-NEXT:    testb %cl, %cl
578; SSE-NEXT:    js LBB3_15
579; SSE-NEXT:  LBB3_16: ## %else26
580; SSE-NEXT:    testl $256, %ecx ## imm = 0x100
581; SSE-NEXT:    jne LBB3_17
582; SSE-NEXT:  LBB3_18: ## %else30
583; SSE-NEXT:    testl $512, %ecx ## imm = 0x200
584; SSE-NEXT:    jne LBB3_19
585; SSE-NEXT:  LBB3_20: ## %else34
586; SSE-NEXT:    testl $1024, %ecx ## imm = 0x400
587; SSE-NEXT:    jne LBB3_21
588; SSE-NEXT:  LBB3_22: ## %else38
589; SSE-NEXT:    testl $2048, %ecx ## imm = 0x800
590; SSE-NEXT:    jne LBB3_23
591; SSE-NEXT:  LBB3_24: ## %else42
592; SSE-NEXT:    testl $4096, %ecx ## imm = 0x1000
593; SSE-NEXT:    jne LBB3_25
594; SSE-NEXT:  LBB3_26: ## %else46
595; SSE-NEXT:    testl $8192, %ecx ## imm = 0x2000
596; SSE-NEXT:    jne LBB3_27
597; SSE-NEXT:  LBB3_28: ## %else50
598; SSE-NEXT:    testl $16384, %ecx ## imm = 0x4000
599; SSE-NEXT:    jne LBB3_29
600; SSE-NEXT:  LBB3_30: ## %else54
601; SSE-NEXT:    testl $32768, %ecx ## imm = 0x8000
602; SSE-NEXT:    je LBB3_32
603; SSE-NEXT:  LBB3_31: ## %cond.load57
604; SSE-NEXT:    movhps (%rsi), %xmm7 ## xmm7 = xmm7[0,1],mem[0,1]
605; SSE-NEXT:  LBB3_32: ## %else58
606; SSE-NEXT:    movaps %xmm0, (%rax)
607; SSE-NEXT:    movaps %xmm1, 16(%rax)
608; SSE-NEXT:    movaps %xmm2, 32(%rax)
609; SSE-NEXT:    movaps %xmm3, 48(%rax)
610; SSE-NEXT:    movaps %xmm4, 64(%rax)
611; SSE-NEXT:    movaps %xmm5, 80(%rax)
612; SSE-NEXT:    movaps %xmm6, 96(%rax)
613; SSE-NEXT:    movaps %xmm7, 112(%rax)
614; SSE-NEXT:    retq
615; SSE-NEXT:  LBB3_1: ## %cond.load
616; SSE-NEXT:    movlps (%rsi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
617; SSE-NEXT:    addq $8, %rsi
618; SSE-NEXT:    testb $2, %cl
619; SSE-NEXT:    je LBB3_4
620; SSE-NEXT:  LBB3_3: ## %cond.load1
621; SSE-NEXT:    movhps (%rsi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
622; SSE-NEXT:    addq $8, %rsi
623; SSE-NEXT:    testb $4, %cl
624; SSE-NEXT:    je LBB3_6
625; SSE-NEXT:  LBB3_5: ## %cond.load5
626; SSE-NEXT:    movlps (%rsi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
627; SSE-NEXT:    addq $8, %rsi
628; SSE-NEXT:    testb $8, %cl
629; SSE-NEXT:    je LBB3_8
630; SSE-NEXT:  LBB3_7: ## %cond.load9
631; SSE-NEXT:    movhps (%rsi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
632; SSE-NEXT:    addq $8, %rsi
633; SSE-NEXT:    testb $16, %cl
634; SSE-NEXT:    je LBB3_10
635; SSE-NEXT:  LBB3_9: ## %cond.load13
636; SSE-NEXT:    movlps (%rsi), %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
637; SSE-NEXT:    addq $8, %rsi
638; SSE-NEXT:    testb $32, %cl
639; SSE-NEXT:    je LBB3_12
640; SSE-NEXT:  LBB3_11: ## %cond.load17
641; SSE-NEXT:    movhps (%rsi), %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
642; SSE-NEXT:    addq $8, %rsi
643; SSE-NEXT:    testb $64, %cl
644; SSE-NEXT:    je LBB3_14
645; SSE-NEXT:  LBB3_13: ## %cond.load21
646; SSE-NEXT:    movlps (%rsi), %xmm3 ## xmm3 = mem[0,1],xmm3[2,3]
647; SSE-NEXT:    addq $8, %rsi
648; SSE-NEXT:    testb %cl, %cl
649; SSE-NEXT:    jns LBB3_16
650; SSE-NEXT:  LBB3_15: ## %cond.load25
651; SSE-NEXT:    movhps (%rsi), %xmm3 ## xmm3 = xmm3[0,1],mem[0,1]
652; SSE-NEXT:    addq $8, %rsi
653; SSE-NEXT:    testl $256, %ecx ## imm = 0x100
654; SSE-NEXT:    je LBB3_18
655; SSE-NEXT:  LBB3_17: ## %cond.load29
656; SSE-NEXT:    movlps (%rsi), %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
657; SSE-NEXT:    addq $8, %rsi
658; SSE-NEXT:    testl $512, %ecx ## imm = 0x200
659; SSE-NEXT:    je LBB3_20
660; SSE-NEXT:  LBB3_19: ## %cond.load33
661; SSE-NEXT:    movhps (%rsi), %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
662; SSE-NEXT:    addq $8, %rsi
663; SSE-NEXT:    testl $1024, %ecx ## imm = 0x400
664; SSE-NEXT:    je LBB3_22
665; SSE-NEXT:  LBB3_21: ## %cond.load37
666; SSE-NEXT:    movlps (%rsi), %xmm5 ## xmm5 = mem[0,1],xmm5[2,3]
667; SSE-NEXT:    addq $8, %rsi
668; SSE-NEXT:    testl $2048, %ecx ## imm = 0x800
669; SSE-NEXT:    je LBB3_24
670; SSE-NEXT:  LBB3_23: ## %cond.load41
671; SSE-NEXT:    movhps (%rsi), %xmm5 ## xmm5 = xmm5[0,1],mem[0,1]
672; SSE-NEXT:    addq $8, %rsi
673; SSE-NEXT:    testl $4096, %ecx ## imm = 0x1000
674; SSE-NEXT:    je LBB3_26
675; SSE-NEXT:  LBB3_25: ## %cond.load45
676; SSE-NEXT:    movlps (%rsi), %xmm6 ## xmm6 = mem[0,1],xmm6[2,3]
677; SSE-NEXT:    addq $8, %rsi
678; SSE-NEXT:    testl $8192, %ecx ## imm = 0x2000
679; SSE-NEXT:    je LBB3_28
680; SSE-NEXT:  LBB3_27: ## %cond.load49
681; SSE-NEXT:    movhps (%rsi), %xmm6 ## xmm6 = xmm6[0,1],mem[0,1]
682; SSE-NEXT:    addq $8, %rsi
683; SSE-NEXT:    testl $16384, %ecx ## imm = 0x4000
684; SSE-NEXT:    je LBB3_30
685; SSE-NEXT:  LBB3_29: ## %cond.load53
686; SSE-NEXT:    movlps (%rsi), %xmm7 ## xmm7 = mem[0,1],xmm7[2,3]
687; SSE-NEXT:    addq $8, %rsi
688; SSE-NEXT:    testl $32768, %ecx ## imm = 0x8000
689; SSE-NEXT:    jne LBB3_31
690; SSE-NEXT:    jmp LBB3_32
691;
692; AVX1-LABEL: expandload_v16f64_v16i32:
693; AVX1:       ## %bb.0:
694; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
695; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
696; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm6
697; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm5, %xmm5
698; AVX1-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
699; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm6
700; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm6
701; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm4, %xmm4
702; AVX1-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
703; AVX1-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
704; AVX1-NEXT:    vpmovmskb %xmm4, %eax
705; AVX1-NEXT:    testb $1, %al
706; AVX1-NEXT:    jne LBB3_1
707; AVX1-NEXT:  ## %bb.2: ## %else
708; AVX1-NEXT:    testb $2, %al
709; AVX1-NEXT:    jne LBB3_3
710; AVX1-NEXT:  LBB3_4: ## %else2
711; AVX1-NEXT:    testb $4, %al
712; AVX1-NEXT:    jne LBB3_5
713; AVX1-NEXT:  LBB3_6: ## %else6
714; AVX1-NEXT:    testb $8, %al
715; AVX1-NEXT:    jne LBB3_7
716; AVX1-NEXT:  LBB3_8: ## %else10
717; AVX1-NEXT:    testb $16, %al
718; AVX1-NEXT:    jne LBB3_9
719; AVX1-NEXT:  LBB3_10: ## %else14
720; AVX1-NEXT:    testb $32, %al
721; AVX1-NEXT:    jne LBB3_11
722; AVX1-NEXT:  LBB3_12: ## %else18
723; AVX1-NEXT:    testb $64, %al
724; AVX1-NEXT:    jne LBB3_13
725; AVX1-NEXT:  LBB3_14: ## %else22
726; AVX1-NEXT:    testb %al, %al
727; AVX1-NEXT:    js LBB3_15
728; AVX1-NEXT:  LBB3_16: ## %else26
729; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
730; AVX1-NEXT:    jne LBB3_17
731; AVX1-NEXT:  LBB3_18: ## %else30
732; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
733; AVX1-NEXT:    jne LBB3_19
734; AVX1-NEXT:  LBB3_20: ## %else34
735; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
736; AVX1-NEXT:    jne LBB3_21
737; AVX1-NEXT:  LBB3_22: ## %else38
738; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
739; AVX1-NEXT:    jne LBB3_23
740; AVX1-NEXT:  LBB3_24: ## %else42
741; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
742; AVX1-NEXT:    jne LBB3_25
743; AVX1-NEXT:  LBB3_26: ## %else46
744; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
745; AVX1-NEXT:    jne LBB3_27
746; AVX1-NEXT:  LBB3_28: ## %else50
747; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
748; AVX1-NEXT:    jne LBB3_29
749; AVX1-NEXT:  LBB3_30: ## %else54
750; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
751; AVX1-NEXT:    jne LBB3_31
752; AVX1-NEXT:  LBB3_32: ## %else58
753; AVX1-NEXT:    retq
754; AVX1-NEXT:  LBB3_1: ## %cond.load
755; AVX1-NEXT:    vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
756; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
757; AVX1-NEXT:    addq $8, %rdi
758; AVX1-NEXT:    testb $2, %al
759; AVX1-NEXT:    je LBB3_4
760; AVX1-NEXT:  LBB3_3: ## %cond.load1
761; AVX1-NEXT:    vmovhps (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0,1]
762; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
763; AVX1-NEXT:    addq $8, %rdi
764; AVX1-NEXT:    testb $4, %al
765; AVX1-NEXT:    je LBB3_6
766; AVX1-NEXT:  LBB3_5: ## %cond.load5
767; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
768; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
769; AVX1-NEXT:    addq $8, %rdi
770; AVX1-NEXT:    testb $8, %al
771; AVX1-NEXT:    je LBB3_8
772; AVX1-NEXT:  LBB3_7: ## %cond.load9
773; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
774; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
775; AVX1-NEXT:    addq $8, %rdi
776; AVX1-NEXT:    testb $16, %al
777; AVX1-NEXT:    je LBB3_10
778; AVX1-NEXT:  LBB3_9: ## %cond.load13
779; AVX1-NEXT:    vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
780; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
781; AVX1-NEXT:    addq $8, %rdi
782; AVX1-NEXT:    testb $32, %al
783; AVX1-NEXT:    je LBB3_12
784; AVX1-NEXT:  LBB3_11: ## %cond.load17
785; AVX1-NEXT:    vmovhps (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0,1]
786; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
787; AVX1-NEXT:    addq $8, %rdi
788; AVX1-NEXT:    testb $64, %al
789; AVX1-NEXT:    je LBB3_14
790; AVX1-NEXT:  LBB3_13: ## %cond.load21
791; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
792; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
793; AVX1-NEXT:    addq $8, %rdi
794; AVX1-NEXT:    testb %al, %al
795; AVX1-NEXT:    jns LBB3_16
796; AVX1-NEXT:  LBB3_15: ## %cond.load25
797; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
798; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
799; AVX1-NEXT:    addq $8, %rdi
800; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
801; AVX1-NEXT:    je LBB3_18
802; AVX1-NEXT:  LBB3_17: ## %cond.load29
803; AVX1-NEXT:    vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
804; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
805; AVX1-NEXT:    addq $8, %rdi
806; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
807; AVX1-NEXT:    je LBB3_20
808; AVX1-NEXT:  LBB3_19: ## %cond.load33
809; AVX1-NEXT:    vmovhps (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0,1]
810; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
811; AVX1-NEXT:    addq $8, %rdi
812; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
813; AVX1-NEXT:    je LBB3_22
814; AVX1-NEXT:  LBB3_21: ## %cond.load37
815; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
816; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
817; AVX1-NEXT:    addq $8, %rdi
818; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
819; AVX1-NEXT:    je LBB3_24
820; AVX1-NEXT:  LBB3_23: ## %cond.load41
821; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
822; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
823; AVX1-NEXT:    addq $8, %rdi
824; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
825; AVX1-NEXT:    je LBB3_26
826; AVX1-NEXT:  LBB3_25: ## %cond.load45
827; AVX1-NEXT:    vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
828; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
829; AVX1-NEXT:    addq $8, %rdi
830; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
831; AVX1-NEXT:    je LBB3_28
832; AVX1-NEXT:  LBB3_27: ## %cond.load49
833; AVX1-NEXT:    vmovhps (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0,1]
834; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
835; AVX1-NEXT:    addq $8, %rdi
836; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
837; AVX1-NEXT:    je LBB3_30
838; AVX1-NEXT:  LBB3_29: ## %cond.load53
839; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
840; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
841; AVX1-NEXT:    addq $8, %rdi
842; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
843; AVX1-NEXT:    je LBB3_32
844; AVX1-NEXT:  LBB3_31: ## %cond.load57
845; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
846; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
847; AVX1-NEXT:    retq
848;
849; AVX2-LABEL: expandload_v16f64_v16i32:
850; AVX2:       ## %bb.0:
851; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
852; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm5, %ymm5
853; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm4, %ymm4
854; AVX2-NEXT:    vpackssdw %ymm5, %ymm4, %ymm4
855; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
856; AVX2-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
857; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
858; AVX2-NEXT:    vpmovmskb %xmm4, %eax
859; AVX2-NEXT:    testb $1, %al
860; AVX2-NEXT:    jne LBB3_1
861; AVX2-NEXT:  ## %bb.2: ## %else
862; AVX2-NEXT:    testb $2, %al
863; AVX2-NEXT:    jne LBB3_3
864; AVX2-NEXT:  LBB3_4: ## %else2
865; AVX2-NEXT:    testb $4, %al
866; AVX2-NEXT:    jne LBB3_5
867; AVX2-NEXT:  LBB3_6: ## %else6
868; AVX2-NEXT:    testb $8, %al
869; AVX2-NEXT:    jne LBB3_7
870; AVX2-NEXT:  LBB3_8: ## %else10
871; AVX2-NEXT:    testb $16, %al
872; AVX2-NEXT:    jne LBB3_9
873; AVX2-NEXT:  LBB3_10: ## %else14
874; AVX2-NEXT:    testb $32, %al
875; AVX2-NEXT:    jne LBB3_11
876; AVX2-NEXT:  LBB3_12: ## %else18
877; AVX2-NEXT:    testb $64, %al
878; AVX2-NEXT:    jne LBB3_13
879; AVX2-NEXT:  LBB3_14: ## %else22
880; AVX2-NEXT:    testb %al, %al
881; AVX2-NEXT:    js LBB3_15
882; AVX2-NEXT:  LBB3_16: ## %else26
883; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
884; AVX2-NEXT:    jne LBB3_17
885; AVX2-NEXT:  LBB3_18: ## %else30
886; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
887; AVX2-NEXT:    jne LBB3_19
888; AVX2-NEXT:  LBB3_20: ## %else34
889; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
890; AVX2-NEXT:    jne LBB3_21
891; AVX2-NEXT:  LBB3_22: ## %else38
892; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
893; AVX2-NEXT:    jne LBB3_23
894; AVX2-NEXT:  LBB3_24: ## %else42
895; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
896; AVX2-NEXT:    jne LBB3_25
897; AVX2-NEXT:  LBB3_26: ## %else46
898; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
899; AVX2-NEXT:    jne LBB3_27
900; AVX2-NEXT:  LBB3_28: ## %else50
901; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
902; AVX2-NEXT:    jne LBB3_29
903; AVX2-NEXT:  LBB3_30: ## %else54
904; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
905; AVX2-NEXT:    jne LBB3_31
906; AVX2-NEXT:  LBB3_32: ## %else58
907; AVX2-NEXT:    retq
908; AVX2-NEXT:  LBB3_1: ## %cond.load
909; AVX2-NEXT:    vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
910; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
911; AVX2-NEXT:    addq $8, %rdi
912; AVX2-NEXT:    testb $2, %al
913; AVX2-NEXT:    je LBB3_4
914; AVX2-NEXT:  LBB3_3: ## %cond.load1
915; AVX2-NEXT:    vmovhps (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0,1]
916; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
917; AVX2-NEXT:    addq $8, %rdi
918; AVX2-NEXT:    testb $4, %al
919; AVX2-NEXT:    je LBB3_6
920; AVX2-NEXT:  LBB3_5: ## %cond.load5
921; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
922; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
923; AVX2-NEXT:    addq $8, %rdi
924; AVX2-NEXT:    testb $8, %al
925; AVX2-NEXT:    je LBB3_8
926; AVX2-NEXT:  LBB3_7: ## %cond.load9
927; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
928; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
929; AVX2-NEXT:    addq $8, %rdi
930; AVX2-NEXT:    testb $16, %al
931; AVX2-NEXT:    je LBB3_10
932; AVX2-NEXT:  LBB3_9: ## %cond.load13
933; AVX2-NEXT:    vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
934; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
935; AVX2-NEXT:    addq $8, %rdi
936; AVX2-NEXT:    testb $32, %al
937; AVX2-NEXT:    je LBB3_12
938; AVX2-NEXT:  LBB3_11: ## %cond.load17
939; AVX2-NEXT:    vmovhps (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0,1]
940; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
941; AVX2-NEXT:    addq $8, %rdi
942; AVX2-NEXT:    testb $64, %al
943; AVX2-NEXT:    je LBB3_14
944; AVX2-NEXT:  LBB3_13: ## %cond.load21
945; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
946; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
947; AVX2-NEXT:    addq $8, %rdi
948; AVX2-NEXT:    testb %al, %al
949; AVX2-NEXT:    jns LBB3_16
950; AVX2-NEXT:  LBB3_15: ## %cond.load25
951; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
952; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
953; AVX2-NEXT:    addq $8, %rdi
954; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
955; AVX2-NEXT:    je LBB3_18
956; AVX2-NEXT:  LBB3_17: ## %cond.load29
957; AVX2-NEXT:    vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
958; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
959; AVX2-NEXT:    addq $8, %rdi
960; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
961; AVX2-NEXT:    je LBB3_20
962; AVX2-NEXT:  LBB3_19: ## %cond.load33
963; AVX2-NEXT:    vmovhps (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0,1]
964; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
965; AVX2-NEXT:    addq $8, %rdi
966; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
967; AVX2-NEXT:    je LBB3_22
968; AVX2-NEXT:  LBB3_21: ## %cond.load37
969; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
970; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
971; AVX2-NEXT:    addq $8, %rdi
972; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
973; AVX2-NEXT:    je LBB3_24
974; AVX2-NEXT:  LBB3_23: ## %cond.load41
975; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
976; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
977; AVX2-NEXT:    addq $8, %rdi
978; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
979; AVX2-NEXT:    je LBB3_26
980; AVX2-NEXT:  LBB3_25: ## %cond.load45
981; AVX2-NEXT:    vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
982; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
983; AVX2-NEXT:    addq $8, %rdi
984; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
985; AVX2-NEXT:    je LBB3_28
986; AVX2-NEXT:  LBB3_27: ## %cond.load49
987; AVX2-NEXT:    vmovhps (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0,1]
988; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
989; AVX2-NEXT:    addq $8, %rdi
990; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
991; AVX2-NEXT:    je LBB3_30
992; AVX2-NEXT:  LBB3_29: ## %cond.load53
993; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
994; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
995; AVX2-NEXT:    addq $8, %rdi
996; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
997; AVX2-NEXT:    je LBB3_32
998; AVX2-NEXT:  LBB3_31: ## %cond.load57
999; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
1000; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
1001; AVX2-NEXT:    retq
1002;
1003; AVX512F-LABEL: expandload_v16f64_v16i32:
1004; AVX512F:       ## %bb.0:
1005; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
1006; AVX512F-NEXT:    vptestnmd %zmm3, %zmm3, %k1
1007; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k2
1008; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
1009; AVX512F-NEXT:    kmovw %k2, %eax
1010; AVX512F-NEXT:    movzbl %al, %eax
1011; AVX512F-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
1012; AVX512F-NEXT:    shrl $3, %eax
1013; AVX512F-NEXT:    andl $286331153, %eax ## imm = 0x11111111
1014; AVX512F-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
1015; AVX512F-NEXT:    shrl $28, %eax
1016; AVX512F-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
1017; AVX512F-NEXT:    retq
1018;
1019; AVX512VLDQ-LABEL: expandload_v16f64_v16i32:
1020; AVX512VLDQ:       ## %bb.0:
1021; AVX512VLDQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
1022; AVX512VLDQ-NEXT:    vptestnmd %ymm3, %ymm3, %k1
1023; AVX512VLDQ-NEXT:    vptestnmd %ymm2, %ymm2, %k2
1024; AVX512VLDQ-NEXT:    kmovb %k2, %eax
1025; AVX512VLDQ-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
1026; AVX512VLDQ-NEXT:    shrl $3, %eax
1027; AVX512VLDQ-NEXT:    andl $286331153, %eax ## imm = 0x11111111
1028; AVX512VLDQ-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
1029; AVX512VLDQ-NEXT:    shrl $28, %eax
1030; AVX512VLDQ-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
1031; AVX512VLDQ-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
1032; AVX512VLDQ-NEXT:    retq
1033;
1034; AVX512VLBW-LABEL: expandload_v16f64_v16i32:
1035; AVX512VLBW:       ## %bb.0:
1036; AVX512VLBW-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
1037; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm3, %k1
1038; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k2
1039; AVX512VLBW-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
1040; AVX512VLBW-NEXT:    kmovd %k2, %eax
1041; AVX512VLBW-NEXT:    movzbl %al, %eax
1042; AVX512VLBW-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
1043; AVX512VLBW-NEXT:    shrl $3, %eax
1044; AVX512VLBW-NEXT:    andl $286331153, %eax ## imm = 0x11111111
1045; AVX512VLBW-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
1046; AVX512VLBW-NEXT:    shrl $28, %eax
1047; AVX512VLBW-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
1048; AVX512VLBW-NEXT:    retq
1049  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
1050  %res = call <16 x double> @llvm.masked.expandload.v16f64(ptr %base, <16 x i1> %mask, <16 x double> %src0)
1051  ret <16 x double> %res
1052}
1053
1054;
1055; vXf32
1056;
1057
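; v2f32 with a <2 x i32> trigger. The two mask bits come from movmskpd, so the
; dword compare result has to be widened first: SSE2 duplicates the dwords
; with pshufd, SSE4.2/AVX sign-extend with pmovsxdq. Enabled lanes are loaded
; with movss and merged back with blends/shuffles (insertps on SSE4.2/AVX).
; AVX512F/VLBW trim a 16-bit k register with kshiftlw/kshiftrw $14;
; AVX512VLDQ trims an 8-bit one with kshiftlb/kshiftrb $6.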
1058define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32> %trigger) {
1059; SSE2-LABEL: expandload_v2f32_v2i1:
1060; SSE2:       ## %bb.0:
1061; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1062; SSE2-NEXT:    pxor %xmm2, %xmm2
1063; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
1064; SSE2-NEXT:    movmskpd %xmm2, %eax
1065; SSE2-NEXT:    testb $1, %al
1066; SSE2-NEXT:    jne LBB4_1
1067; SSE2-NEXT:  ## %bb.2: ## %else
1068; SSE2-NEXT:    testb $2, %al
1069; SSE2-NEXT:    jne LBB4_3
1070; SSE2-NEXT:  LBB4_4: ## %else2
1071; SSE2-NEXT:    retq
1072; SSE2-NEXT:  LBB4_1: ## %cond.load
1073; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
1074; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1075; SSE2-NEXT:    addq $4, %rdi
1076; SSE2-NEXT:    testb $2, %al
1077; SSE2-NEXT:    je LBB4_4
1078; SSE2-NEXT:  LBB4_3: ## %cond.load1
1079; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
1080; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1081; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1082; SSE2-NEXT:    movaps %xmm1, %xmm0
1083; SSE2-NEXT:    retq
1084;
1085; SSE42-LABEL: expandload_v2f32_v2i1:
1086; SSE42:       ## %bb.0:
1087; SSE42-NEXT:    pxor %xmm2, %xmm2
1088; SSE42-NEXT:    pcmpeqd %xmm1, %xmm2
1089; SSE42-NEXT:    pmovsxdq %xmm2, %xmm1
1090; SSE42-NEXT:    movmskpd %xmm1, %eax
1091; SSE42-NEXT:    testb $1, %al
1092; SSE42-NEXT:    jne LBB4_1
1093; SSE42-NEXT:  ## %bb.2: ## %else
1094; SSE42-NEXT:    testb $2, %al
1095; SSE42-NEXT:    jne LBB4_3
1096; SSE42-NEXT:  LBB4_4: ## %else2
1097; SSE42-NEXT:    retq
1098; SSE42-NEXT:  LBB4_1: ## %cond.load
1099; SSE42-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
1100; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1101; SSE42-NEXT:    addq $4, %rdi
1102; SSE42-NEXT:    testb $2, %al
1103; SSE42-NEXT:    je LBB4_4
1104; SSE42-NEXT:  LBB4_3: ## %cond.load1
1105; SSE42-NEXT:    insertps $16, (%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
1106; SSE42-NEXT:    retq
1107;
1108; AVX1OR2-LABEL: expandload_v2f32_v2i1:
1109; AVX1OR2:       ## %bb.0:
1110; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1111; AVX1OR2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
1112; AVX1OR2-NEXT:    vpmovsxdq %xmm1, %xmm1
1113; AVX1OR2-NEXT:    vmovmskpd %xmm1, %eax
1114; AVX1OR2-NEXT:    testb $1, %al
1115; AVX1OR2-NEXT:    jne LBB4_1
1116; AVX1OR2-NEXT:  ## %bb.2: ## %else
1117; AVX1OR2-NEXT:    testb $2, %al
1118; AVX1OR2-NEXT:    jne LBB4_3
1119; AVX1OR2-NEXT:  LBB4_4: ## %else2
1120; AVX1OR2-NEXT:    retq
1121; AVX1OR2-NEXT:  LBB4_1: ## %cond.load
1122; AVX1OR2-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
1123; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1124; AVX1OR2-NEXT:    addq $4, %rdi
1125; AVX1OR2-NEXT:    testb $2, %al
1126; AVX1OR2-NEXT:    je LBB4_4
1127; AVX1OR2-NEXT:  LBB4_3: ## %cond.load1
1128; AVX1OR2-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
1129; AVX1OR2-NEXT:    retq
1130;
1131; AVX512F-LABEL: expandload_v2f32_v2i1:
1132; AVX512F:       ## %bb.0:
1133; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
1134; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1135; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k0
1136; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
1137; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
1138; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
1139; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
1140; AVX512F-NEXT:    vzeroupper
1141; AVX512F-NEXT:    retq
1142;
1143; AVX512VLDQ-LABEL: expandload_v2f32_v2i1:
1144; AVX512VLDQ:       ## %bb.0:
1145; AVX512VLDQ-NEXT:    vptestnmd %xmm1, %xmm1, %k0
1146; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
1147; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
1148; AVX512VLDQ-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
1149; AVX512VLDQ-NEXT:    retq
1150;
1151; AVX512VLBW-LABEL: expandload_v2f32_v2i1:
1152; AVX512VLBW:       ## %bb.0:
1153; AVX512VLBW-NEXT:    vptestnmd %xmm1, %xmm1, %k0
1154; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
1155; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k1
1156; AVX512VLBW-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
1157; AVX512VLBW-NEXT:    retq
1158  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
1159  %res = call <2 x float> @llvm.masked.expandload.v2f32(ptr %base, <2 x i1> %mask, <2 x float> %src0)
1160  ret <2 x float> %res
1161}
1162
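; Constant mask <1,1,1,0>: no compare or branching is needed. SSE/AVX fold the
; expand into an 8-byte movsd, an insert of the element at offset 8, and a
; blend/shuffle that keeps lane 3 from %src0. The AVX-512 runs materialize the
; immediate mask 7 in a k register and still go through vexpandps.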
1163define <4 x float> @expandload_v4f32_const(ptr %base, <4 x float> %src0) {
1164; SSE2-LABEL: expandload_v4f32_const:
1165; SSE2:       ## %bb.0:
1166; SSE2-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
1167; SSE2-NEXT:    movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
1168; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3]
1169; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1170; SSE2-NEXT:    movaps %xmm1, %xmm0
1171; SSE2-NEXT:    retq
1172;
1173; SSE42-LABEL: expandload_v4f32_const:
1174; SSE42:       ## %bb.0:
1175; SSE42-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
1176; SSE42-NEXT:    insertps $32, 8(%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
1177; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1178; SSE42-NEXT:    retq
1179;
1180; AVX1OR2-LABEL: expandload_v4f32_const:
1181; AVX1OR2:       ## %bb.0:
1182; AVX1OR2-NEXT:    vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
1183; AVX1OR2-NEXT:    vinsertps $32, 8(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
1184; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1185; AVX1OR2-NEXT:    retq
1186;
1187; AVX512F-LABEL: expandload_v4f32_const:
1188; AVX512F:       ## %bb.0:
1189; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1190; AVX512F-NEXT:    movw $7, %ax
1191; AVX512F-NEXT:    kmovw %eax, %k1
1192; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
1193; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
1194; AVX512F-NEXT:    vzeroupper
1195; AVX512F-NEXT:    retq
1196;
1197; AVX512VLDQ-LABEL: expandload_v4f32_const:
1198; AVX512VLDQ:       ## %bb.0:
1199; AVX512VLDQ-NEXT:    movb $7, %al
1200; AVX512VLDQ-NEXT:    kmovw %eax, %k1
1201; AVX512VLDQ-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
1202; AVX512VLDQ-NEXT:    retq
1203;
1204; AVX512VLBW-LABEL: expandload_v4f32_const:
1205; AVX512VLBW:       ## %bb.0:
1206; AVX512VLBW-NEXT:    movb $7, %al
1207; AVX512VLBW-NEXT:    kmovd %eax, %k1
1208; AVX512VLBW-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
1209; AVX512VLBW-NEXT:    retq
1210  %res = call <4 x float> @llvm.masked.expandload.v4f32(ptr %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
1211  ret <4 x float>%res
1212}
1213
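; Constant 16-lane mask 0x77FF (bits 11 and 15 clear). Lanes 0-7 come from
; plain unaligned loads, lanes 8-10 and 12-14 are rebuilt from offsets 32/40
; and 44/52 with movsd/insertps (shufps on SSE2), and blends keep the passthru
; in lanes 11 and 15. AVX-512 just loads the immediate 0x77FF into a k
; register and issues one vexpandps.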
1214define <16 x float> @expandload_v16f32_const(ptr %base, <16 x float> %src0) {
1215; SSE2-LABEL: expandload_v16f32_const:
1216; SSE2:       ## %bb.0:
1217; SSE2-NEXT:    movups (%rdi), %xmm0
1218; SSE2-NEXT:    movups 16(%rdi), %xmm1
1219; SSE2-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
1220; SSE2-NEXT:    movss 52(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero
1221; SSE2-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
1222; SSE2-NEXT:    movss 40(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero
1223; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[0,3]
1224; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0]
1225; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm6[0,3]
1226; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
1227; SSE2-NEXT:    movaps %xmm5, %xmm2
1228; SSE2-NEXT:    movaps %xmm4, %xmm3
1229; SSE2-NEXT:    retq
1230;
1231; SSE42-LABEL: expandload_v16f32_const:
1232; SSE42:       ## %bb.0:
1233; SSE42-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
1234; SSE42-NEXT:    insertps $32, 52(%rdi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
1235; SSE42-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
1236; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3]
1237; SSE42-NEXT:    movups (%rdi), %xmm0
1238; SSE42-NEXT:    movups 16(%rdi), %xmm1
1239; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
1240; SSE42-NEXT:    blendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
1241; SSE42-NEXT:    retq
1242;
1243; AVX1OR2-LABEL: expandload_v16f32_const:
1244; AVX1OR2:       ## %bb.0:
1245; AVX1OR2-NEXT:    vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero
1246; AVX1OR2-NEXT:    vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
1247; AVX1OR2-NEXT:    vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
1248; AVX1OR2-NEXT:    vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
1249; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
1250; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
1251; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
1252; AVX1OR2-NEXT:    retq
1253;
1254; AVX512F-LABEL: expandload_v16f32_const:
1255; AVX512F:       ## %bb.0:
1256; AVX512F-NEXT:    movw $30719, %ax ## imm = 0x77FF
1257; AVX512F-NEXT:    kmovw %eax, %k1
1258; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
1259; AVX512F-NEXT:    retq
1260;
1261; AVX512VLDQ-LABEL: expandload_v16f32_const:
1262; AVX512VLDQ:       ## %bb.0:
1263; AVX512VLDQ-NEXT:    movw $30719, %ax ## imm = 0x77FF
1264; AVX512VLDQ-NEXT:    kmovw %eax, %k1
1265; AVX512VLDQ-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
1266; AVX512VLDQ-NEXT:    retq
1267;
1268; AVX512VLBW-LABEL: expandload_v16f32_const:
1269; AVX512VLBW:       ## %bb.0:
1270; AVX512VLBW-NEXT:    movw $30719, %ax ## imm = 0x77FF
1271; AVX512VLBW-NEXT:    kmovd %eax, %k1
1272; AVX512VLBW-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
1273; AVX512VLBW-NEXT:    retq
1274  %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
1275  ret <16 x float>%res
1276}
1277
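; Same mask shape with bit 11 clear, bit 15 set, and an undef passthru
; (0xF7FF), so nothing from the passthru has to be preserved: SSE/AVX use
; straight unaligned loads for lanes 0-7 and 12-15 (the tail reloads from
; offset 44) and only lanes 8-10 need the movsd/insertps patch-up. The
; AVX-512 runs emit a zero-masked vexpandps ({z}) with the immediate mask.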
1278define <16 x float> @expandload_v16f32_const_undef(ptr %base) {
1279; SSE2-LABEL: expandload_v16f32_const_undef:
1280; SSE2:       ## %bb.0:
1281; SSE2-NEXT:    movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
1282; SSE2-NEXT:    movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
1283; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1284; SSE2-NEXT:    movups (%rdi), %xmm0
1285; SSE2-NEXT:    movups 16(%rdi), %xmm1
1286; SSE2-NEXT:    movups 44(%rdi), %xmm3
1287; SSE2-NEXT:    retq
1288;
1289; SSE42-LABEL: expandload_v16f32_const_undef:
1290; SSE42:       ## %bb.0:
1291; SSE42-NEXT:    movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
1292; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
1293; SSE42-NEXT:    movups (%rdi), %xmm0
1294; SSE42-NEXT:    movups 16(%rdi), %xmm1
1295; SSE42-NEXT:    movups 44(%rdi), %xmm3
1296; SSE42-NEXT:    retq
1297;
1298; AVX1OR2-LABEL: expandload_v16f32_const_undef:
1299; AVX1OR2:       ## %bb.0:
1300; AVX1OR2-NEXT:    vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero
1301; AVX1OR2-NEXT:    vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
1302; AVX1OR2-NEXT:    vinsertf128 $1, 44(%rdi), %ymm0, %ymm1
1303; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
1304; AVX1OR2-NEXT:    retq
1305;
1306; AVX512F-LABEL: expandload_v16f32_const_undef:
1307; AVX512F:       ## %bb.0:
1308; AVX512F-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
1309; AVX512F-NEXT:    kmovw %eax, %k1
1310; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
1311; AVX512F-NEXT:    retq
1312;
1313; AVX512VLDQ-LABEL: expandload_v16f32_const_undef:
1314; AVX512VLDQ:       ## %bb.0:
1315; AVX512VLDQ-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
1316; AVX512VLDQ-NEXT:    kmovw %eax, %k1
1317; AVX512VLDQ-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
1318; AVX512VLDQ-NEXT:    retq
1319;
1320; AVX512VLBW-LABEL: expandload_v16f32_const_undef:
1321; AVX512VLBW:       ## %bb.0:
1322; AVX512VLBW-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
1323; AVX512VLBW-NEXT:    kmovd %eax, %k1
1324; AVX512VLBW-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
1325; AVX512VLBW-NEXT:    retq
1326  %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
1327  ret <16 x float>%res
1328}
1329
1330
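; 32 x f32 with a <32 x i32> trigger. The 512-bit result is again returned
; through a hidden sret pointer (SSE2 copies %rdi to %rax up front), the
; trigger vectors are read from the stack, and the full 32-bit lane mask is
; assembled in %ecx from two pmovmskb results joined with shll $16 / orl
; before the per-lane branch chain.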
1331define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32 x i32> %trigger) {
1332; SSE2-LABEL: expandload_v32f32_v32i32:
1333; SSE2:       ## %bb.0:
1334; SSE2-NEXT:    movq %rdi, %rax
1335; SSE2-NEXT:    pxor %xmm8, %xmm8
1336; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1337; SSE2-NEXT:    pcmpeqd %xmm8, %xmm9
1338; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
1339; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
1340; SSE2-NEXT:    packssdw %xmm9, %xmm10
1341; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1342; SSE2-NEXT:    pcmpeqd %xmm8, %xmm9
1343; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
1344; SSE2-NEXT:    pcmpeqd %xmm8, %xmm11
1345; SSE2-NEXT:    packssdw %xmm9, %xmm11
1346; SSE2-NEXT:    packsswb %xmm10, %xmm11
1347; SSE2-NEXT:    pmovmskb %xmm11, %edx
1348; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1349; SSE2-NEXT:    pcmpeqd %xmm8, %xmm9
1350; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
1351; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
1352; SSE2-NEXT:    packssdw %xmm9, %xmm10
1353; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1354; SSE2-NEXT:    pcmpeqd %xmm8, %xmm9
1355; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm8
1356; SSE2-NEXT:    packssdw %xmm9, %xmm8
1357; SSE2-NEXT:    packsswb %xmm10, %xmm8
1358; SSE2-NEXT:    pmovmskb %xmm8, %ecx
1359; SSE2-NEXT:    shll $16, %ecx
1360; SSE2-NEXT:    orl %edx, %ecx
1361; SSE2-NEXT:    testb $1, %cl
1362; SSE2-NEXT:    jne LBB8_1
1363; SSE2-NEXT:  ## %bb.2: ## %else
1364; SSE2-NEXT:    testb $2, %cl
1365; SSE2-NEXT:    jne LBB8_3
1366; SSE2-NEXT:  LBB8_4: ## %else2
1367; SSE2-NEXT:    testb $4, %cl
1368; SSE2-NEXT:    jne LBB8_5
1369; SSE2-NEXT:  LBB8_6: ## %else6
1370; SSE2-NEXT:    testb $8, %cl
1371; SSE2-NEXT:    jne LBB8_7
1372; SSE2-NEXT:  LBB8_8: ## %else10
1373; SSE2-NEXT:    testb $16, %cl
1374; SSE2-NEXT:    jne LBB8_9
1375; SSE2-NEXT:  LBB8_10: ## %else14
1376; SSE2-NEXT:    testb $32, %cl
1377; SSE2-NEXT:    jne LBB8_11
1378; SSE2-NEXT:  LBB8_12: ## %else18
1379; SSE2-NEXT:    testb $64, %cl
1380; SSE2-NEXT:    jne LBB8_13
1381; SSE2-NEXT:  LBB8_14: ## %else22
1382; SSE2-NEXT:    testb %cl, %cl
1383; SSE2-NEXT:    js LBB8_15
1384; SSE2-NEXT:  LBB8_16: ## %else26
1385; SSE2-NEXT:    testl $256, %ecx ## imm = 0x100
1386; SSE2-NEXT:    jne LBB8_17
1387; SSE2-NEXT:  LBB8_18: ## %else30
1388; SSE2-NEXT:    testl $512, %ecx ## imm = 0x200
1389; SSE2-NEXT:    jne LBB8_19
1390; SSE2-NEXT:  LBB8_20: ## %else34
1391; SSE2-NEXT:    testl $1024, %ecx ## imm = 0x400
1392; SSE2-NEXT:    jne LBB8_21
1393; SSE2-NEXT:  LBB8_22: ## %else38
1394; SSE2-NEXT:    testl $2048, %ecx ## imm = 0x800
1395; SSE2-NEXT:    jne LBB8_23
1396; SSE2-NEXT:  LBB8_24: ## %else42
1397; SSE2-NEXT:    testl $4096, %ecx ## imm = 0x1000
1398; SSE2-NEXT:    jne LBB8_25
1399; SSE2-NEXT:  LBB8_26: ## %else46
1400; SSE2-NEXT:    testl $8192, %ecx ## imm = 0x2000
1401; SSE2-NEXT:    jne LBB8_27
1402; SSE2-NEXT:  LBB8_28: ## %else50
1403; SSE2-NEXT:    testl $16384, %ecx ## imm = 0x4000
1404; SSE2-NEXT:    jne LBB8_29
1405; SSE2-NEXT:  LBB8_30: ## %else54
1406; SSE2-NEXT:    testw %cx, %cx
1407; SSE2-NEXT:    js LBB8_31
1408; SSE2-NEXT:  LBB8_32: ## %else58
1409; SSE2-NEXT:    testl $65536, %ecx ## imm = 0x10000
1410; SSE2-NEXT:    jne LBB8_33
1411; SSE2-NEXT:  LBB8_34: ## %else62
1412; SSE2-NEXT:    testl $131072, %ecx ## imm = 0x20000
1413; SSE2-NEXT:    jne LBB8_35
1414; SSE2-NEXT:  LBB8_36: ## %else66
1415; SSE2-NEXT:    testl $262144, %ecx ## imm = 0x40000
1416; SSE2-NEXT:    jne LBB8_37
1417; SSE2-NEXT:  LBB8_38: ## %else70
1418; SSE2-NEXT:    testl $524288, %ecx ## imm = 0x80000
1419; SSE2-NEXT:    jne LBB8_39
1420; SSE2-NEXT:  LBB8_40: ## %else74
1421; SSE2-NEXT:    testl $1048576, %ecx ## imm = 0x100000
1422; SSE2-NEXT:    jne LBB8_41
1423; SSE2-NEXT:  LBB8_42: ## %else78
1424; SSE2-NEXT:    testl $2097152, %ecx ## imm = 0x200000
1425; SSE2-NEXT:    jne LBB8_43
1426; SSE2-NEXT:  LBB8_44: ## %else82
1427; SSE2-NEXT:    testl $4194304, %ecx ## imm = 0x400000
1428; SSE2-NEXT:    jne LBB8_45
1429; SSE2-NEXT:  LBB8_46: ## %else86
1430; SSE2-NEXT:    testl $8388608, %ecx ## imm = 0x800000
1431; SSE2-NEXT:    jne LBB8_47
1432; SSE2-NEXT:  LBB8_48: ## %else90
1433; SSE2-NEXT:    testl $16777216, %ecx ## imm = 0x1000000
1434; SSE2-NEXT:    jne LBB8_49
1435; SSE2-NEXT:  LBB8_50: ## %else94
1436; SSE2-NEXT:    testl $33554432, %ecx ## imm = 0x2000000
1437; SSE2-NEXT:    jne LBB8_51
1438; SSE2-NEXT:  LBB8_52: ## %else98
1439; SSE2-NEXT:    testl $67108864, %ecx ## imm = 0x4000000
1440; SSE2-NEXT:    jne LBB8_53
1441; SSE2-NEXT:  LBB8_54: ## %else102
1442; SSE2-NEXT:    testl $134217728, %ecx ## imm = 0x8000000
1443; SSE2-NEXT:    jne LBB8_55
1444; SSE2-NEXT:  LBB8_56: ## %else106
1445; SSE2-NEXT:    testl $268435456, %ecx ## imm = 0x10000000
1446; SSE2-NEXT:    jne LBB8_57
1447; SSE2-NEXT:  LBB8_58: ## %else110
1448; SSE2-NEXT:    testl $536870912, %ecx ## imm = 0x20000000
1449; SSE2-NEXT:    jne LBB8_59
1450; SSE2-NEXT:  LBB8_60: ## %else114
1451; SSE2-NEXT:    testl $1073741824, %ecx ## imm = 0x40000000
1452; SSE2-NEXT:    jne LBB8_61
1453; SSE2-NEXT:  LBB8_62: ## %else118
1454; SSE2-NEXT:    testl $-2147483648, %ecx ## imm = 0x80000000
1455; SSE2-NEXT:    je LBB8_64
1456; SSE2-NEXT:  LBB8_63: ## %cond.load121
1457; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1458; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,3]
1459; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0]
1460; SSE2-NEXT:  LBB8_64: ## %else122
1461; SSE2-NEXT:    movaps %xmm0, (%rax)
1462; SSE2-NEXT:    movaps %xmm1, 16(%rax)
1463; SSE2-NEXT:    movaps %xmm2, 32(%rax)
1464; SSE2-NEXT:    movaps %xmm3, 48(%rax)
1465; SSE2-NEXT:    movaps %xmm4, 64(%rax)
1466; SSE2-NEXT:    movaps %xmm5, 80(%rax)
1467; SSE2-NEXT:    movaps %xmm6, 96(%rax)
1468; SSE2-NEXT:    movaps %xmm7, 112(%rax)
1469; SSE2-NEXT:    retq
1470; SSE2-NEXT:  LBB8_1: ## %cond.load
1471; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1472; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3]
1473; SSE2-NEXT:    addq $4, %rsi
1474; SSE2-NEXT:    testb $2, %cl
1475; SSE2-NEXT:    je LBB8_4
1476; SSE2-NEXT:  LBB8_3: ## %cond.load1
1477; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1478; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
1479; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3]
1480; SSE2-NEXT:    addq $4, %rsi
1481; SSE2-NEXT:    movaps %xmm8, %xmm0
1482; SSE2-NEXT:    testb $4, %cl
1483; SSE2-NEXT:    je LBB8_6
1484; SSE2-NEXT:  LBB8_5: ## %cond.load5
1485; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1486; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[3,0]
1487; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,2]
1488; SSE2-NEXT:    addq $4, %rsi
1489; SSE2-NEXT:    testb $8, %cl
1490; SSE2-NEXT:    je LBB8_8
1491; SSE2-NEXT:  LBB8_7: ## %cond.load9
1492; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1493; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3]
1494; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0]
1495; SSE2-NEXT:    addq $4, %rsi
1496; SSE2-NEXT:    testb $16, %cl
1497; SSE2-NEXT:    je LBB8_10
1498; SSE2-NEXT:  LBB8_9: ## %cond.load13
1499; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1500; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm8[0],xmm1[1,2,3]
1501; SSE2-NEXT:    addq $4, %rsi
1502; SSE2-NEXT:    testb $32, %cl
1503; SSE2-NEXT:    je LBB8_12
1504; SSE2-NEXT:  LBB8_11: ## %cond.load17
1505; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1506; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
1507; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[2,3]
1508; SSE2-NEXT:    addq $4, %rsi
1509; SSE2-NEXT:    movaps %xmm8, %xmm1
1510; SSE2-NEXT:    testb $64, %cl
1511; SSE2-NEXT:    je LBB8_14
1512; SSE2-NEXT:  LBB8_13: ## %cond.load21
1513; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1514; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[3,0]
1515; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2]
1516; SSE2-NEXT:    addq $4, %rsi
1517; SSE2-NEXT:    testb %cl, %cl
1518; SSE2-NEXT:    jns LBB8_16
1519; SSE2-NEXT:  LBB8_15: ## %cond.load25
1520; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1521; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
1522; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0]
1523; SSE2-NEXT:    addq $4, %rsi
1524; SSE2-NEXT:    testl $256, %ecx ## imm = 0x100
1525; SSE2-NEXT:    je LBB8_18
1526; SSE2-NEXT:  LBB8_17: ## %cond.load29
1527; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1528; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3]
1529; SSE2-NEXT:    addq $4, %rsi
1530; SSE2-NEXT:    testl $512, %ecx ## imm = 0x200
1531; SSE2-NEXT:    je LBB8_20
1532; SSE2-NEXT:  LBB8_19: ## %cond.load33
1533; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1534; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
1535; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3]
1536; SSE2-NEXT:    addq $4, %rsi
1537; SSE2-NEXT:    movaps %xmm8, %xmm2
1538; SSE2-NEXT:    testl $1024, %ecx ## imm = 0x400
1539; SSE2-NEXT:    je LBB8_22
1540; SSE2-NEXT:  LBB8_21: ## %cond.load37
1541; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1542; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[3,0]
1543; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,2]
1544; SSE2-NEXT:    addq $4, %rsi
1545; SSE2-NEXT:    testl $2048, %ecx ## imm = 0x800
1546; SSE2-NEXT:    je LBB8_24
1547; SSE2-NEXT:  LBB8_23: ## %cond.load41
1548; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1549; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
1550; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0]
1551; SSE2-NEXT:    addq $4, %rsi
1552; SSE2-NEXT:    testl $4096, %ecx ## imm = 0x1000
1553; SSE2-NEXT:    je LBB8_26
1554; SSE2-NEXT:  LBB8_25: ## %cond.load45
1555; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1556; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3]
1557; SSE2-NEXT:    addq $4, %rsi
1558; SSE2-NEXT:    testl $8192, %ecx ## imm = 0x2000
1559; SSE2-NEXT:    je LBB8_28
1560; SSE2-NEXT:  LBB8_27: ## %cond.load49
1561; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1562; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0]
1563; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,3]
1564; SSE2-NEXT:    addq $4, %rsi
1565; SSE2-NEXT:    movaps %xmm8, %xmm3
1566; SSE2-NEXT:    testl $16384, %ecx ## imm = 0x4000
1567; SSE2-NEXT:    je LBB8_30
1568; SSE2-NEXT:  LBB8_29: ## %cond.load53
1569; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1570; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[3,0]
1571; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2]
1572; SSE2-NEXT:    addq $4, %rsi
1573; SSE2-NEXT:    testw %cx, %cx
1574; SSE2-NEXT:    jns LBB8_32
1575; SSE2-NEXT:  LBB8_31: ## %cond.load57
1576; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1577; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3]
1578; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0]
1579; SSE2-NEXT:    addq $4, %rsi
1580; SSE2-NEXT:    testl $65536, %ecx ## imm = 0x10000
1581; SSE2-NEXT:    je LBB8_34
1582; SSE2-NEXT:  LBB8_33: ## %cond.load61
1583; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1584; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm8[0],xmm4[1,2,3]
1585; SSE2-NEXT:    addq $4, %rsi
1586; SSE2-NEXT:    testl $131072, %ecx ## imm = 0x20000
1587; SSE2-NEXT:    je LBB8_36
1588; SSE2-NEXT:  LBB8_35: ## %cond.load65
1589; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1590; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0]
1591; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
1592; SSE2-NEXT:    addq $4, %rsi
1593; SSE2-NEXT:    movaps %xmm8, %xmm4
1594; SSE2-NEXT:    testl $262144, %ecx ## imm = 0x40000
1595; SSE2-NEXT:    je LBB8_38
1596; SSE2-NEXT:  LBB8_37: ## %cond.load69
1597; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1598; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[3,0]
1599; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2]
1600; SSE2-NEXT:    addq $4, %rsi
1601; SSE2-NEXT:    testl $524288, %ecx ## imm = 0x80000
1602; SSE2-NEXT:    je LBB8_40
1603; SSE2-NEXT:  LBB8_39: ## %cond.load73
1604; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1605; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3]
1606; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
1607; SSE2-NEXT:    addq $4, %rsi
1608; SSE2-NEXT:    testl $1048576, %ecx ## imm = 0x100000
1609; SSE2-NEXT:    je LBB8_42
1610; SSE2-NEXT:  LBB8_41: ## %cond.load77
1611; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1612; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3]
1613; SSE2-NEXT:    addq $4, %rsi
1614; SSE2-NEXT:    testl $2097152, %ecx ## imm = 0x200000
1615; SSE2-NEXT:    je LBB8_44
1616; SSE2-NEXT:  LBB8_43: ## %cond.load81
1617; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1618; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0]
1619; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3]
1620; SSE2-NEXT:    addq $4, %rsi
1621; SSE2-NEXT:    movaps %xmm8, %xmm5
1622; SSE2-NEXT:    testl $4194304, %ecx ## imm = 0x400000
1623; SSE2-NEXT:    je LBB8_46
1624; SSE2-NEXT:  LBB8_45: ## %cond.load85
1625; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1626; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[3,0]
1627; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,2]
1628; SSE2-NEXT:    addq $4, %rsi
1629; SSE2-NEXT:    testl $8388608, %ecx ## imm = 0x800000
1630; SSE2-NEXT:    je LBB8_48
1631; SSE2-NEXT:  LBB8_47: ## %cond.load89
1632; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1633; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3]
1634; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
1635; SSE2-NEXT:    addq $4, %rsi
1636; SSE2-NEXT:    testl $16777216, %ecx ## imm = 0x1000000
1637; SSE2-NEXT:    je LBB8_50
1638; SSE2-NEXT:  LBB8_49: ## %cond.load93
1639; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1640; SSE2-NEXT:    movss {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3]
1641; SSE2-NEXT:    addq $4, %rsi
1642; SSE2-NEXT:    testl $33554432, %ecx ## imm = 0x2000000
1643; SSE2-NEXT:    je LBB8_52
1644; SSE2-NEXT:  LBB8_51: ## %cond.load97
1645; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1646; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
1647; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,3]
1648; SSE2-NEXT:    addq $4, %rsi
1649; SSE2-NEXT:    movaps %xmm8, %xmm6
1650; SSE2-NEXT:    testl $67108864, %ecx ## imm = 0x4000000
1651; SSE2-NEXT:    je LBB8_54
1652; SSE2-NEXT:  LBB8_53: ## %cond.load101
1653; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1654; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[3,0]
1655; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,2]
1656; SSE2-NEXT:    addq $4, %rsi
1657; SSE2-NEXT:    testl $134217728, %ecx ## imm = 0x8000000
1658; SSE2-NEXT:    je LBB8_56
1659; SSE2-NEXT:  LBB8_55: ## %cond.load105
1660; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1661; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
1662; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0]
1663; SSE2-NEXT:    addq $4, %rsi
1664; SSE2-NEXT:    testl $268435456, %ecx ## imm = 0x10000000
1665; SSE2-NEXT:    je LBB8_58
1666; SSE2-NEXT:  LBB8_57: ## %cond.load109
1667; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1668; SSE2-NEXT:    movss {{.*#+}} xmm7 = xmm8[0],xmm7[1,2,3]
1669; SSE2-NEXT:    addq $4, %rsi
1670; SSE2-NEXT:    testl $536870912, %ecx ## imm = 0x20000000
1671; SSE2-NEXT:    je LBB8_60
1672; SSE2-NEXT:  LBB8_59: ## %cond.load113
1673; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1674; SSE2-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0]
1675; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3]
1676; SSE2-NEXT:    addq $4, %rsi
1677; SSE2-NEXT:    movaps %xmm8, %xmm7
1678; SSE2-NEXT:    testl $1073741824, %ecx ## imm = 0x40000000
1679; SSE2-NEXT:    je LBB8_62
1680; SSE2-NEXT:  LBB8_61: ## %cond.load117
1681; SSE2-NEXT:    movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1682; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[3,0]
1683; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
1684; SSE2-NEXT:    addq $4, %rsi
1685; SSE2-NEXT:    testl $-2147483648, %ecx ## imm = 0x80000000
1686; SSE2-NEXT:    jne LBB8_63
1687; SSE2-NEXT:    jmp LBB8_64
1688;
1689; SSE42-LABEL: expandload_v32f32_v32i32:
1690; SSE42:       ## %bb.0:
1691; SSE42-NEXT:    movq %rdi, %rax
1692; SSE42-NEXT:    pxor %xmm8, %xmm8
1693; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1694; SSE42-NEXT:    pcmpeqd %xmm8, %xmm9
1695; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
1696; SSE42-NEXT:    pcmpeqd %xmm8, %xmm10
1697; SSE42-NEXT:    packssdw %xmm9, %xmm10
1698; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1699; SSE42-NEXT:    pcmpeqd %xmm8, %xmm9
1700; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
1701; SSE42-NEXT:    pcmpeqd %xmm8, %xmm11
1702; SSE42-NEXT:    packssdw %xmm9, %xmm11
1703; SSE42-NEXT:    packsswb %xmm10, %xmm11
1704; SSE42-NEXT:    pmovmskb %xmm11, %edx
1705; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1706; SSE42-NEXT:    pcmpeqd %xmm8, %xmm9
1707; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
1708; SSE42-NEXT:    pcmpeqd %xmm8, %xmm10
1709; SSE42-NEXT:    packssdw %xmm9, %xmm10
1710; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1711; SSE42-NEXT:    pcmpeqd %xmm8, %xmm9
1712; SSE42-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm8
1713; SSE42-NEXT:    packssdw %xmm9, %xmm8
1714; SSE42-NEXT:    packsswb %xmm10, %xmm8
1715; SSE42-NEXT:    pmovmskb %xmm8, %ecx
1716; SSE42-NEXT:    shll $16, %ecx
1717; SSE42-NEXT:    orl %edx, %ecx
1718; SSE42-NEXT:    testb $1, %cl
1719; SSE42-NEXT:    jne LBB8_1
1720; SSE42-NEXT:  ## %bb.2: ## %else
1721; SSE42-NEXT:    testb $2, %cl
1722; SSE42-NEXT:    jne LBB8_3
1723; SSE42-NEXT:  LBB8_4: ## %else2
1724; SSE42-NEXT:    testb $4, %cl
1725; SSE42-NEXT:    jne LBB8_5
1726; SSE42-NEXT:  LBB8_6: ## %else6
1727; SSE42-NEXT:    testb $8, %cl
1728; SSE42-NEXT:    jne LBB8_7
1729; SSE42-NEXT:  LBB8_8: ## %else10
1730; SSE42-NEXT:    testb $16, %cl
1731; SSE42-NEXT:    jne LBB8_9
1732; SSE42-NEXT:  LBB8_10: ## %else14
1733; SSE42-NEXT:    testb $32, %cl
1734; SSE42-NEXT:    jne LBB8_11
1735; SSE42-NEXT:  LBB8_12: ## %else18
1736; SSE42-NEXT:    testb $64, %cl
1737; SSE42-NEXT:    jne LBB8_13
1738; SSE42-NEXT:  LBB8_14: ## %else22
1739; SSE42-NEXT:    testb %cl, %cl
1740; SSE42-NEXT:    js LBB8_15
1741; SSE42-NEXT:  LBB8_16: ## %else26
1742; SSE42-NEXT:    testl $256, %ecx ## imm = 0x100
1743; SSE42-NEXT:    jne LBB8_17
1744; SSE42-NEXT:  LBB8_18: ## %else30
1745; SSE42-NEXT:    testl $512, %ecx ## imm = 0x200
1746; SSE42-NEXT:    jne LBB8_19
1747; SSE42-NEXT:  LBB8_20: ## %else34
1748; SSE42-NEXT:    testl $1024, %ecx ## imm = 0x400
1749; SSE42-NEXT:    jne LBB8_21
1750; SSE42-NEXT:  LBB8_22: ## %else38
1751; SSE42-NEXT:    testl $2048, %ecx ## imm = 0x800
1752; SSE42-NEXT:    jne LBB8_23
1753; SSE42-NEXT:  LBB8_24: ## %else42
1754; SSE42-NEXT:    testl $4096, %ecx ## imm = 0x1000
1755; SSE42-NEXT:    jne LBB8_25
1756; SSE42-NEXT:  LBB8_26: ## %else46
1757; SSE42-NEXT:    testl $8192, %ecx ## imm = 0x2000
1758; SSE42-NEXT:    jne LBB8_27
1759; SSE42-NEXT:  LBB8_28: ## %else50
1760; SSE42-NEXT:    testl $16384, %ecx ## imm = 0x4000
1761; SSE42-NEXT:    jne LBB8_29
1762; SSE42-NEXT:  LBB8_30: ## %else54
1763; SSE42-NEXT:    testw %cx, %cx
1764; SSE42-NEXT:    js LBB8_31
1765; SSE42-NEXT:  LBB8_32: ## %else58
1766; SSE42-NEXT:    testl $65536, %ecx ## imm = 0x10000
1767; SSE42-NEXT:    jne LBB8_33
1768; SSE42-NEXT:  LBB8_34: ## %else62
1769; SSE42-NEXT:    testl $131072, %ecx ## imm = 0x20000
1770; SSE42-NEXT:    jne LBB8_35
1771; SSE42-NEXT:  LBB8_36: ## %else66
1772; SSE42-NEXT:    testl $262144, %ecx ## imm = 0x40000
1773; SSE42-NEXT:    jne LBB8_37
1774; SSE42-NEXT:  LBB8_38: ## %else70
1775; SSE42-NEXT:    testl $524288, %ecx ## imm = 0x80000
1776; SSE42-NEXT:    jne LBB8_39
1777; SSE42-NEXT:  LBB8_40: ## %else74
1778; SSE42-NEXT:    testl $1048576, %ecx ## imm = 0x100000
1779; SSE42-NEXT:    jne LBB8_41
1780; SSE42-NEXT:  LBB8_42: ## %else78
1781; SSE42-NEXT:    testl $2097152, %ecx ## imm = 0x200000
1782; SSE42-NEXT:    jne LBB8_43
1783; SSE42-NEXT:  LBB8_44: ## %else82
1784; SSE42-NEXT:    testl $4194304, %ecx ## imm = 0x400000
1785; SSE42-NEXT:    jne LBB8_45
1786; SSE42-NEXT:  LBB8_46: ## %else86
1787; SSE42-NEXT:    testl $8388608, %ecx ## imm = 0x800000
1788; SSE42-NEXT:    jne LBB8_47
1789; SSE42-NEXT:  LBB8_48: ## %else90
1790; SSE42-NEXT:    testl $16777216, %ecx ## imm = 0x1000000
1791; SSE42-NEXT:    jne LBB8_49
1792; SSE42-NEXT:  LBB8_50: ## %else94
1793; SSE42-NEXT:    testl $33554432, %ecx ## imm = 0x2000000
1794; SSE42-NEXT:    jne LBB8_51
1795; SSE42-NEXT:  LBB8_52: ## %else98
1796; SSE42-NEXT:    testl $67108864, %ecx ## imm = 0x4000000
1797; SSE42-NEXT:    jne LBB8_53
1798; SSE42-NEXT:  LBB8_54: ## %else102
1799; SSE42-NEXT:    testl $134217728, %ecx ## imm = 0x8000000
1800; SSE42-NEXT:    jne LBB8_55
1801; SSE42-NEXT:  LBB8_56: ## %else106
1802; SSE42-NEXT:    testl $268435456, %ecx ## imm = 0x10000000
1803; SSE42-NEXT:    jne LBB8_57
1804; SSE42-NEXT:  LBB8_58: ## %else110
1805; SSE42-NEXT:    testl $536870912, %ecx ## imm = 0x20000000
1806; SSE42-NEXT:    jne LBB8_59
1807; SSE42-NEXT:  LBB8_60: ## %else114
1808; SSE42-NEXT:    testl $1073741824, %ecx ## imm = 0x40000000
1809; SSE42-NEXT:    jne LBB8_61
1810; SSE42-NEXT:  LBB8_62: ## %else118
1811; SSE42-NEXT:    testl $-2147483648, %ecx ## imm = 0x80000000
1812; SSE42-NEXT:    je LBB8_64
1813; SSE42-NEXT:  LBB8_63: ## %cond.load121
1814; SSE42-NEXT:    insertps $48, (%rsi), %xmm7 ## xmm7 = xmm7[0,1,2],mem[0]
1815; SSE42-NEXT:  LBB8_64: ## %else122
1816; SSE42-NEXT:    movaps %xmm0, (%rax)
1817; SSE42-NEXT:    movaps %xmm1, 16(%rax)
1818; SSE42-NEXT:    movaps %xmm2, 32(%rax)
1819; SSE42-NEXT:    movaps %xmm3, 48(%rax)
1820; SSE42-NEXT:    movaps %xmm4, 64(%rax)
1821; SSE42-NEXT:    movaps %xmm5, 80(%rax)
1822; SSE42-NEXT:    movaps %xmm6, 96(%rax)
1823; SSE42-NEXT:    movaps %xmm7, 112(%rax)
1824; SSE42-NEXT:    retq
1825; SSE42-NEXT:  LBB8_1: ## %cond.load
1826; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1827; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3,4,5,6,7]
1828; SSE42-NEXT:    addq $4, %rsi
1829; SSE42-NEXT:    testb $2, %cl
1830; SSE42-NEXT:    je LBB8_4
1831; SSE42-NEXT:  LBB8_3: ## %cond.load1
1832; SSE42-NEXT:    insertps $16, (%rsi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
1833; SSE42-NEXT:    addq $4, %rsi
1834; SSE42-NEXT:    testb $4, %cl
1835; SSE42-NEXT:    je LBB8_6
1836; SSE42-NEXT:  LBB8_5: ## %cond.load5
1837; SSE42-NEXT:    insertps $32, (%rsi), %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
1838; SSE42-NEXT:    addq $4, %rsi
1839; SSE42-NEXT:    testb $8, %cl
1840; SSE42-NEXT:    je LBB8_8
1841; SSE42-NEXT:  LBB8_7: ## %cond.load9
1842; SSE42-NEXT:    insertps $48, (%rsi), %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
1843; SSE42-NEXT:    addq $4, %rsi
1844; SSE42-NEXT:    testb $16, %cl
1845; SSE42-NEXT:    je LBB8_10
1846; SSE42-NEXT:  LBB8_9: ## %cond.load13
1847; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1848; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3,4,5,6,7]
1849; SSE42-NEXT:    addq $4, %rsi
1850; SSE42-NEXT:    testb $32, %cl
1851; SSE42-NEXT:    je LBB8_12
1852; SSE42-NEXT:  LBB8_11: ## %cond.load17
1853; SSE42-NEXT:    insertps $16, (%rsi), %xmm1 ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
1854; SSE42-NEXT:    addq $4, %rsi
1855; SSE42-NEXT:    testb $64, %cl
1856; SSE42-NEXT:    je LBB8_14
1857; SSE42-NEXT:  LBB8_13: ## %cond.load21
1858; SSE42-NEXT:    insertps $32, (%rsi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
1859; SSE42-NEXT:    addq $4, %rsi
1860; SSE42-NEXT:    testb %cl, %cl
1861; SSE42-NEXT:    jns LBB8_16
1862; SSE42-NEXT:  LBB8_15: ## %cond.load25
1863; SSE42-NEXT:    insertps $48, (%rsi), %xmm1 ## xmm1 = xmm1[0,1,2],mem[0]
1864; SSE42-NEXT:    addq $4, %rsi
1865; SSE42-NEXT:    testl $256, %ecx ## imm = 0x100
1866; SSE42-NEXT:    je LBB8_18
1867; SSE42-NEXT:  LBB8_17: ## %cond.load29
1868; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1869; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3,4,5,6,7]
1870; SSE42-NEXT:    addq $4, %rsi
1871; SSE42-NEXT:    testl $512, %ecx ## imm = 0x200
1872; SSE42-NEXT:    je LBB8_20
1873; SSE42-NEXT:  LBB8_19: ## %cond.load33
1874; SSE42-NEXT:    insertps $16, (%rsi), %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
1875; SSE42-NEXT:    addq $4, %rsi
1876; SSE42-NEXT:    testl $1024, %ecx ## imm = 0x400
1877; SSE42-NEXT:    je LBB8_22
1878; SSE42-NEXT:  LBB8_21: ## %cond.load37
1879; SSE42-NEXT:    insertps $32, (%rsi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
1880; SSE42-NEXT:    addq $4, %rsi
1881; SSE42-NEXT:    testl $2048, %ecx ## imm = 0x800
1882; SSE42-NEXT:    je LBB8_24
1883; SSE42-NEXT:  LBB8_23: ## %cond.load41
1884; SSE42-NEXT:    insertps $48, (%rsi), %xmm2 ## xmm2 = xmm2[0,1,2],mem[0]
1885; SSE42-NEXT:    addq $4, %rsi
1886; SSE42-NEXT:    testl $4096, %ecx ## imm = 0x1000
1887; SSE42-NEXT:    je LBB8_26
1888; SSE42-NEXT:  LBB8_25: ## %cond.load45
1889; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1890; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
1891; SSE42-NEXT:    addq $4, %rsi
1892; SSE42-NEXT:    testl $8192, %ecx ## imm = 0x2000
1893; SSE42-NEXT:    je LBB8_28
1894; SSE42-NEXT:  LBB8_27: ## %cond.load49
1895; SSE42-NEXT:    insertps $16, (%rsi), %xmm3 ## xmm3 = xmm3[0],mem[0],xmm3[2,3]
1896; SSE42-NEXT:    addq $4, %rsi
1897; SSE42-NEXT:    testl $16384, %ecx ## imm = 0x4000
1898; SSE42-NEXT:    je LBB8_30
1899; SSE42-NEXT:  LBB8_29: ## %cond.load53
1900; SSE42-NEXT:    insertps $32, (%rsi), %xmm3 ## xmm3 = xmm3[0,1],mem[0],xmm3[3]
1901; SSE42-NEXT:    addq $4, %rsi
1902; SSE42-NEXT:    testw %cx, %cx
1903; SSE42-NEXT:    jns LBB8_32
1904; SSE42-NEXT:  LBB8_31: ## %cond.load57
1905; SSE42-NEXT:    insertps $48, (%rsi), %xmm3 ## xmm3 = xmm3[0,1,2],mem[0]
1906; SSE42-NEXT:    addq $4, %rsi
1907; SSE42-NEXT:    testl $65536, %ecx ## imm = 0x10000
1908; SSE42-NEXT:    je LBB8_34
1909; SSE42-NEXT:  LBB8_33: ## %cond.load61
1910; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1911; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5,6,7]
1912; SSE42-NEXT:    addq $4, %rsi
1913; SSE42-NEXT:    testl $131072, %ecx ## imm = 0x20000
1914; SSE42-NEXT:    je LBB8_36
1915; SSE42-NEXT:  LBB8_35: ## %cond.load65
1916; SSE42-NEXT:    insertps $16, (%rsi), %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
1917; SSE42-NEXT:    addq $4, %rsi
1918; SSE42-NEXT:    testl $262144, %ecx ## imm = 0x40000
1919; SSE42-NEXT:    je LBB8_38
1920; SSE42-NEXT:  LBB8_37: ## %cond.load69
1921; SSE42-NEXT:    insertps $32, (%rsi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
1922; SSE42-NEXT:    addq $4, %rsi
1923; SSE42-NEXT:    testl $524288, %ecx ## imm = 0x80000
1924; SSE42-NEXT:    je LBB8_40
1925; SSE42-NEXT:  LBB8_39: ## %cond.load73
1926; SSE42-NEXT:    insertps $48, (%rsi), %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
1927; SSE42-NEXT:    addq $4, %rsi
1928; SSE42-NEXT:    testl $1048576, %ecx ## imm = 0x100000
1929; SSE42-NEXT:    je LBB8_42
1930; SSE42-NEXT:  LBB8_41: ## %cond.load77
1931; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1932; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7]
1933; SSE42-NEXT:    addq $4, %rsi
1934; SSE42-NEXT:    testl $2097152, %ecx ## imm = 0x200000
1935; SSE42-NEXT:    je LBB8_44
1936; SSE42-NEXT:  LBB8_43: ## %cond.load81
1937; SSE42-NEXT:    insertps $16, (%rsi), %xmm5 ## xmm5 = xmm5[0],mem[0],xmm5[2,3]
1938; SSE42-NEXT:    addq $4, %rsi
1939; SSE42-NEXT:    testl $4194304, %ecx ## imm = 0x400000
1940; SSE42-NEXT:    je LBB8_46
1941; SSE42-NEXT:  LBB8_45: ## %cond.load85
1942; SSE42-NEXT:    insertps $32, (%rsi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3]
1943; SSE42-NEXT:    addq $4, %rsi
1944; SSE42-NEXT:    testl $8388608, %ecx ## imm = 0x800000
1945; SSE42-NEXT:    je LBB8_48
1946; SSE42-NEXT:  LBB8_47: ## %cond.load89
1947; SSE42-NEXT:    insertps $48, (%rsi), %xmm5 ## xmm5 = xmm5[0,1,2],mem[0]
1948; SSE42-NEXT:    addq $4, %rsi
1949; SSE42-NEXT:    testl $16777216, %ecx ## imm = 0x1000000
1950; SSE42-NEXT:    je LBB8_50
1951; SSE42-NEXT:  LBB8_49: ## %cond.load93
1952; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1953; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4,5,6,7]
1954; SSE42-NEXT:    addq $4, %rsi
1955; SSE42-NEXT:    testl $33554432, %ecx ## imm = 0x2000000
1956; SSE42-NEXT:    je LBB8_52
1957; SSE42-NEXT:  LBB8_51: ## %cond.load97
1958; SSE42-NEXT:    insertps $16, (%rsi), %xmm6 ## xmm6 = xmm6[0],mem[0],xmm6[2,3]
1959; SSE42-NEXT:    addq $4, %rsi
1960; SSE42-NEXT:    testl $67108864, %ecx ## imm = 0x4000000
1961; SSE42-NEXT:    je LBB8_54
1962; SSE42-NEXT:  LBB8_53: ## %cond.load101
1963; SSE42-NEXT:    insertps $32, (%rsi), %xmm6 ## xmm6 = xmm6[0,1],mem[0],xmm6[3]
1964; SSE42-NEXT:    addq $4, %rsi
1965; SSE42-NEXT:    testl $134217728, %ecx ## imm = 0x8000000
1966; SSE42-NEXT:    je LBB8_56
1967; SSE42-NEXT:  LBB8_55: ## %cond.load105
1968; SSE42-NEXT:    insertps $48, (%rsi), %xmm6 ## xmm6 = xmm6[0,1,2],mem[0]
1969; SSE42-NEXT:    addq $4, %rsi
1970; SSE42-NEXT:    testl $268435456, %ecx ## imm = 0x10000000
1971; SSE42-NEXT:    je LBB8_58
1972; SSE42-NEXT:  LBB8_57: ## %cond.load109
1973; SSE42-NEXT:    movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
1974; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7]
1975; SSE42-NEXT:    addq $4, %rsi
1976; SSE42-NEXT:    testl $536870912, %ecx ## imm = 0x20000000
1977; SSE42-NEXT:    je LBB8_60
1978; SSE42-NEXT:  LBB8_59: ## %cond.load113
1979; SSE42-NEXT:    insertps $16, (%rsi), %xmm7 ## xmm7 = xmm7[0],mem[0],xmm7[2,3]
1980; SSE42-NEXT:    addq $4, %rsi
1981; SSE42-NEXT:    testl $1073741824, %ecx ## imm = 0x40000000
1982; SSE42-NEXT:    je LBB8_62
1983; SSE42-NEXT:  LBB8_61: ## %cond.load117
1984; SSE42-NEXT:    insertps $32, (%rsi), %xmm7 ## xmm7 = xmm7[0,1],mem[0],xmm7[3]
1985; SSE42-NEXT:    addq $4, %rsi
1986; SSE42-NEXT:    testl $-2147483648, %ecx ## imm = 0x80000000
1987; SSE42-NEXT:    jne LBB8_63
1988; SSE42-NEXT:    jmp LBB8_64
1989;
1990; AVX1-LABEL: expandload_v32f32_v32i32:
1991; AVX1:       ## %bb.0:
1992; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
1993; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
1994; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm8, %xmm8
1995; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm9, %xmm5
1996; AVX1-NEXT:    vpackssdw %xmm8, %xmm5, %xmm5
1997; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
1998; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm8, %xmm8
1999; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm9, %xmm4
2000; AVX1-NEXT:    vpackssdw %xmm8, %xmm4, %xmm4
2001; AVX1-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
2002; AVX1-NEXT:    vpmovmskb %xmm4, %ecx
2003; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
2004; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm9, %xmm4
2005; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm9, %xmm5
2006; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
2007; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm5
2008; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm9, %xmm5
2009; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm9, %xmm6
2010; AVX1-NEXT:    vpackssdw %xmm5, %xmm6, %xmm5
2011; AVX1-NEXT:    vpacksswb %xmm4, %xmm5, %xmm4
2012; AVX1-NEXT:    vpmovmskb %xmm4, %eax
2013; AVX1-NEXT:    shll $16, %eax
2014; AVX1-NEXT:    orl %ecx, %eax
2015; AVX1-NEXT:    testb $1, %al
2016; AVX1-NEXT:    jne LBB8_1
2017; AVX1-NEXT:  ## %bb.2: ## %else
2018; AVX1-NEXT:    testb $2, %al
2019; AVX1-NEXT:    jne LBB8_3
2020; AVX1-NEXT:  LBB8_4: ## %else2
2021; AVX1-NEXT:    testb $4, %al
2022; AVX1-NEXT:    jne LBB8_5
2023; AVX1-NEXT:  LBB8_6: ## %else6
2024; AVX1-NEXT:    testb $8, %al
2025; AVX1-NEXT:    jne LBB8_7
2026; AVX1-NEXT:  LBB8_8: ## %else10
2027; AVX1-NEXT:    testb $16, %al
2028; AVX1-NEXT:    jne LBB8_9
2029; AVX1-NEXT:  LBB8_10: ## %else14
2030; AVX1-NEXT:    testb $32, %al
2031; AVX1-NEXT:    jne LBB8_11
2032; AVX1-NEXT:  LBB8_12: ## %else18
2033; AVX1-NEXT:    testb $64, %al
2034; AVX1-NEXT:    jne LBB8_13
2035; AVX1-NEXT:  LBB8_14: ## %else22
2036; AVX1-NEXT:    testb %al, %al
2037; AVX1-NEXT:    js LBB8_15
2038; AVX1-NEXT:  LBB8_16: ## %else26
2039; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
2040; AVX1-NEXT:    jne LBB8_17
2041; AVX1-NEXT:  LBB8_18: ## %else30
2042; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
2043; AVX1-NEXT:    jne LBB8_19
2044; AVX1-NEXT:  LBB8_20: ## %else34
2045; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
2046; AVX1-NEXT:    jne LBB8_21
2047; AVX1-NEXT:  LBB8_22: ## %else38
2048; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
2049; AVX1-NEXT:    jne LBB8_23
2050; AVX1-NEXT:  LBB8_24: ## %else42
2051; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
2052; AVX1-NEXT:    jne LBB8_25
2053; AVX1-NEXT:  LBB8_26: ## %else46
2054; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
2055; AVX1-NEXT:    jne LBB8_27
2056; AVX1-NEXT:  LBB8_28: ## %else50
2057; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
2058; AVX1-NEXT:    jne LBB8_29
2059; AVX1-NEXT:  LBB8_30: ## %else54
2060; AVX1-NEXT:    testw %ax, %ax
2061; AVX1-NEXT:    js LBB8_31
2062; AVX1-NEXT:  LBB8_32: ## %else58
2063; AVX1-NEXT:    testl $65536, %eax ## imm = 0x10000
2064; AVX1-NEXT:    jne LBB8_33
2065; AVX1-NEXT:  LBB8_34: ## %else62
2066; AVX1-NEXT:    testl $131072, %eax ## imm = 0x20000
2067; AVX1-NEXT:    jne LBB8_35
2068; AVX1-NEXT:  LBB8_36: ## %else66
2069; AVX1-NEXT:    testl $262144, %eax ## imm = 0x40000
2070; AVX1-NEXT:    jne LBB8_37
2071; AVX1-NEXT:  LBB8_38: ## %else70
2072; AVX1-NEXT:    testl $524288, %eax ## imm = 0x80000
2073; AVX1-NEXT:    jne LBB8_39
2074; AVX1-NEXT:  LBB8_40: ## %else74
2075; AVX1-NEXT:    testl $1048576, %eax ## imm = 0x100000
2076; AVX1-NEXT:    jne LBB8_41
2077; AVX1-NEXT:  LBB8_42: ## %else78
2078; AVX1-NEXT:    testl $2097152, %eax ## imm = 0x200000
2079; AVX1-NEXT:    jne LBB8_43
2080; AVX1-NEXT:  LBB8_44: ## %else82
2081; AVX1-NEXT:    testl $4194304, %eax ## imm = 0x400000
2082; AVX1-NEXT:    jne LBB8_45
2083; AVX1-NEXT:  LBB8_46: ## %else86
2084; AVX1-NEXT:    testl $8388608, %eax ## imm = 0x800000
2085; AVX1-NEXT:    jne LBB8_47
2086; AVX1-NEXT:  LBB8_48: ## %else90
2087; AVX1-NEXT:    testl $16777216, %eax ## imm = 0x1000000
2088; AVX1-NEXT:    jne LBB8_49
2089; AVX1-NEXT:  LBB8_50: ## %else94
2090; AVX1-NEXT:    testl $33554432, %eax ## imm = 0x2000000
2091; AVX1-NEXT:    jne LBB8_51
2092; AVX1-NEXT:  LBB8_52: ## %else98
2093; AVX1-NEXT:    testl $67108864, %eax ## imm = 0x4000000
2094; AVX1-NEXT:    jne LBB8_53
2095; AVX1-NEXT:  LBB8_54: ## %else102
2096; AVX1-NEXT:    testl $134217728, %eax ## imm = 0x8000000
2097; AVX1-NEXT:    jne LBB8_55
2098; AVX1-NEXT:  LBB8_56: ## %else106
2099; AVX1-NEXT:    testl $268435456, %eax ## imm = 0x10000000
2100; AVX1-NEXT:    jne LBB8_57
2101; AVX1-NEXT:  LBB8_58: ## %else110
2102; AVX1-NEXT:    testl $536870912, %eax ## imm = 0x20000000
2103; AVX1-NEXT:    jne LBB8_59
2104; AVX1-NEXT:  LBB8_60: ## %else114
2105; AVX1-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
2106; AVX1-NEXT:    jne LBB8_61
2107; AVX1-NEXT:  LBB8_62: ## %else118
2108; AVX1-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
2109; AVX1-NEXT:    jne LBB8_63
2110; AVX1-NEXT:  LBB8_64: ## %else122
2111; AVX1-NEXT:    retq
2112; AVX1-NEXT:  LBB8_1: ## %cond.load
2113; AVX1-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2114; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
2115; AVX1-NEXT:    addq $4, %rdi
2116; AVX1-NEXT:    testb $2, %al
2117; AVX1-NEXT:    je LBB8_4
2118; AVX1-NEXT:  LBB8_3: ## %cond.load1
2119; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0],mem[0],xmm0[2,3]
2120; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2121; AVX1-NEXT:    addq $4, %rdi
2122; AVX1-NEXT:    testb $4, %al
2123; AVX1-NEXT:    je LBB8_6
2124; AVX1-NEXT:  LBB8_5: ## %cond.load5
2125; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0],xmm0[3]
2126; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2127; AVX1-NEXT:    addq $4, %rdi
2128; AVX1-NEXT:    testb $8, %al
2129; AVX1-NEXT:    je LBB8_8
2130; AVX1-NEXT:  LBB8_7: ## %cond.load9
2131; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1,2],mem[0]
2132; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2133; AVX1-NEXT:    addq $4, %rdi
2134; AVX1-NEXT:    testb $16, %al
2135; AVX1-NEXT:    je LBB8_10
2136; AVX1-NEXT:  LBB8_9: ## %cond.load13
2137; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2138; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
2139; AVX1-NEXT:    addq $4, %rdi
2140; AVX1-NEXT:    testb $32, %al
2141; AVX1-NEXT:    je LBB8_12
2142; AVX1-NEXT:  LBB8_11: ## %cond.load17
2143; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2144; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
2145; AVX1-NEXT:    addq $4, %rdi
2146; AVX1-NEXT:    testb $64, %al
2147; AVX1-NEXT:    je LBB8_14
2148; AVX1-NEXT:  LBB8_13: ## %cond.load21
2149; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2150; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
2151; AVX1-NEXT:    addq $4, %rdi
2152; AVX1-NEXT:    testb %al, %al
2153; AVX1-NEXT:    jns LBB8_16
2154; AVX1-NEXT:  LBB8_15: ## %cond.load25
2155; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2156; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
2157; AVX1-NEXT:    addq $4, %rdi
2158; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
2159; AVX1-NEXT:    je LBB8_18
2160; AVX1-NEXT:  LBB8_17: ## %cond.load29
2161; AVX1-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2162; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
2163; AVX1-NEXT:    addq $4, %rdi
2164; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
2165; AVX1-NEXT:    je LBB8_20
2166; AVX1-NEXT:  LBB8_19: ## %cond.load33
2167; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0],mem[0],xmm1[2,3]
2168; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2169; AVX1-NEXT:    addq $4, %rdi
2170; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
2171; AVX1-NEXT:    je LBB8_22
2172; AVX1-NEXT:  LBB8_21: ## %cond.load37
2173; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0],xmm1[3]
2174; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2175; AVX1-NEXT:    addq $4, %rdi
2176; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
2177; AVX1-NEXT:    je LBB8_24
2178; AVX1-NEXT:  LBB8_23: ## %cond.load41
2179; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1,2],mem[0]
2180; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2181; AVX1-NEXT:    addq $4, %rdi
2182; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
2183; AVX1-NEXT:    je LBB8_26
2184; AVX1-NEXT:  LBB8_25: ## %cond.load45
2185; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2186; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
2187; AVX1-NEXT:    addq $4, %rdi
2188; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
2189; AVX1-NEXT:    je LBB8_28
2190; AVX1-NEXT:  LBB8_27: ## %cond.load49
2191; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2192; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
2193; AVX1-NEXT:    addq $4, %rdi
2194; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
2195; AVX1-NEXT:    je LBB8_30
2196; AVX1-NEXT:  LBB8_29: ## %cond.load53
2197; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2198; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
2199; AVX1-NEXT:    addq $4, %rdi
2200; AVX1-NEXT:    testw %ax, %ax
2201; AVX1-NEXT:    jns LBB8_32
2202; AVX1-NEXT:  LBB8_31: ## %cond.load57
2203; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2204; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
2205; AVX1-NEXT:    addq $4, %rdi
2206; AVX1-NEXT:    testl $65536, %eax ## imm = 0x10000
2207; AVX1-NEXT:    je LBB8_34
2208; AVX1-NEXT:  LBB8_33: ## %cond.load61
2209; AVX1-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2210; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
2211; AVX1-NEXT:    addq $4, %rdi
2212; AVX1-NEXT:    testl $131072, %eax ## imm = 0x20000
2213; AVX1-NEXT:    je LBB8_36
2214; AVX1-NEXT:  LBB8_35: ## %cond.load65
2215; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0],mem[0],xmm2[2,3]
2216; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2217; AVX1-NEXT:    addq $4, %rdi
2218; AVX1-NEXT:    testl $262144, %eax ## imm = 0x40000
2219; AVX1-NEXT:    je LBB8_38
2220; AVX1-NEXT:  LBB8_37: ## %cond.load69
2221; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0],xmm2[3]
2222; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2223; AVX1-NEXT:    addq $4, %rdi
2224; AVX1-NEXT:    testl $524288, %eax ## imm = 0x80000
2225; AVX1-NEXT:    je LBB8_40
2226; AVX1-NEXT:  LBB8_39: ## %cond.load73
2227; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1,2],mem[0]
2228; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2229; AVX1-NEXT:    addq $4, %rdi
2230; AVX1-NEXT:    testl $1048576, %eax ## imm = 0x100000
2231; AVX1-NEXT:    je LBB8_42
2232; AVX1-NEXT:  LBB8_41: ## %cond.load77
2233; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2234; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
2235; AVX1-NEXT:    addq $4, %rdi
2236; AVX1-NEXT:    testl $2097152, %eax ## imm = 0x200000
2237; AVX1-NEXT:    je LBB8_44
2238; AVX1-NEXT:  LBB8_43: ## %cond.load81
2239; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2240; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
2241; AVX1-NEXT:    addq $4, %rdi
2242; AVX1-NEXT:    testl $4194304, %eax ## imm = 0x400000
2243; AVX1-NEXT:    je LBB8_46
2244; AVX1-NEXT:  LBB8_45: ## %cond.load85
2245; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2246; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
2247; AVX1-NEXT:    addq $4, %rdi
2248; AVX1-NEXT:    testl $8388608, %eax ## imm = 0x800000
2249; AVX1-NEXT:    je LBB8_48
2250; AVX1-NEXT:  LBB8_47: ## %cond.load89
2251; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2252; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
2253; AVX1-NEXT:    addq $4, %rdi
2254; AVX1-NEXT:    testl $16777216, %eax ## imm = 0x1000000
2255; AVX1-NEXT:    je LBB8_50
2256; AVX1-NEXT:  LBB8_49: ## %cond.load93
2257; AVX1-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2258; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
2259; AVX1-NEXT:    addq $4, %rdi
2260; AVX1-NEXT:    testl $33554432, %eax ## imm = 0x2000000
2261; AVX1-NEXT:    je LBB8_52
2262; AVX1-NEXT:  LBB8_51: ## %cond.load97
2263; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0],mem[0],xmm3[2,3]
2264; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2265; AVX1-NEXT:    addq $4, %rdi
2266; AVX1-NEXT:    testl $67108864, %eax ## imm = 0x4000000
2267; AVX1-NEXT:    je LBB8_54
2268; AVX1-NEXT:  LBB8_53: ## %cond.load101
2269; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0],xmm3[3]
2270; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2271; AVX1-NEXT:    addq $4, %rdi
2272; AVX1-NEXT:    testl $134217728, %eax ## imm = 0x8000000
2273; AVX1-NEXT:    je LBB8_56
2274; AVX1-NEXT:  LBB8_55: ## %cond.load105
2275; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1,2],mem[0]
2276; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2277; AVX1-NEXT:    addq $4, %rdi
2278; AVX1-NEXT:    testl $268435456, %eax ## imm = 0x10000000
2279; AVX1-NEXT:    je LBB8_58
2280; AVX1-NEXT:  LBB8_57: ## %cond.load109
2281; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2282; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
2283; AVX1-NEXT:    addq $4, %rdi
2284; AVX1-NEXT:    testl $536870912, %eax ## imm = 0x20000000
2285; AVX1-NEXT:    je LBB8_60
2286; AVX1-NEXT:  LBB8_59: ## %cond.load113
2287; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2288; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
2289; AVX1-NEXT:    addq $4, %rdi
2290; AVX1-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
2291; AVX1-NEXT:    je LBB8_62
2292; AVX1-NEXT:  LBB8_61: ## %cond.load117
2293; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2294; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
2295; AVX1-NEXT:    addq $4, %rdi
2296; AVX1-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
2297; AVX1-NEXT:    je LBB8_64
2298; AVX1-NEXT:  LBB8_63: ## %cond.load121
2299; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
2300; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
2301; AVX1-NEXT:    retq
2302;
2303; AVX2-LABEL: expandload_v32f32_v32i32:
2304; AVX2:       ## %bb.0:
2305; AVX2-NEXT:    vpxor %xmm8, %xmm8, %xmm8
2306; AVX2-NEXT:    vpcmpeqd %ymm7, %ymm8, %ymm7
2307; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm8, %ymm6
2308; AVX2-NEXT:    vpackssdw %ymm7, %ymm6, %ymm6
2309; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3]
2310; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm8, %ymm5
2311; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm8, %ymm4
2312; AVX2-NEXT:    vpackssdw %ymm5, %ymm4, %ymm4
2313; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
2314; AVX2-NEXT:    vpacksswb %ymm6, %ymm4, %ymm4
2315; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
2316; AVX2-NEXT:    vpmovmskb %ymm4, %eax
2317; AVX2-NEXT:    testb $1, %al
2318; AVX2-NEXT:    jne LBB8_1
2319; AVX2-NEXT:  ## %bb.2: ## %else
2320; AVX2-NEXT:    testb $2, %al
2321; AVX2-NEXT:    jne LBB8_3
2322; AVX2-NEXT:  LBB8_4: ## %else2
2323; AVX2-NEXT:    testb $4, %al
2324; AVX2-NEXT:    jne LBB8_5
2325; AVX2-NEXT:  LBB8_6: ## %else6
2326; AVX2-NEXT:    testb $8, %al
2327; AVX2-NEXT:    jne LBB8_7
2328; AVX2-NEXT:  LBB8_8: ## %else10
2329; AVX2-NEXT:    testb $16, %al
2330; AVX2-NEXT:    jne LBB8_9
2331; AVX2-NEXT:  LBB8_10: ## %else14
2332; AVX2-NEXT:    testb $32, %al
2333; AVX2-NEXT:    jne LBB8_11
2334; AVX2-NEXT:  LBB8_12: ## %else18
2335; AVX2-NEXT:    testb $64, %al
2336; AVX2-NEXT:    jne LBB8_13
2337; AVX2-NEXT:  LBB8_14: ## %else22
2338; AVX2-NEXT:    testb %al, %al
2339; AVX2-NEXT:    js LBB8_15
2340; AVX2-NEXT:  LBB8_16: ## %else26
2341; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
2342; AVX2-NEXT:    jne LBB8_17
2343; AVX2-NEXT:  LBB8_18: ## %else30
2344; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
2345; AVX2-NEXT:    jne LBB8_19
2346; AVX2-NEXT:  LBB8_20: ## %else34
2347; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
2348; AVX2-NEXT:    jne LBB8_21
2349; AVX2-NEXT:  LBB8_22: ## %else38
2350; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
2351; AVX2-NEXT:    jne LBB8_23
2352; AVX2-NEXT:  LBB8_24: ## %else42
2353; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
2354; AVX2-NEXT:    jne LBB8_25
2355; AVX2-NEXT:  LBB8_26: ## %else46
2356; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
2357; AVX2-NEXT:    jne LBB8_27
2358; AVX2-NEXT:  LBB8_28: ## %else50
2359; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
2360; AVX2-NEXT:    jne LBB8_29
2361; AVX2-NEXT:  LBB8_30: ## %else54
2362; AVX2-NEXT:    testw %ax, %ax
2363; AVX2-NEXT:    js LBB8_31
2364; AVX2-NEXT:  LBB8_32: ## %else58
2365; AVX2-NEXT:    testl $65536, %eax ## imm = 0x10000
2366; AVX2-NEXT:    jne LBB8_33
2367; AVX2-NEXT:  LBB8_34: ## %else62
2368; AVX2-NEXT:    testl $131072, %eax ## imm = 0x20000
2369; AVX2-NEXT:    jne LBB8_35
2370; AVX2-NEXT:  LBB8_36: ## %else66
2371; AVX2-NEXT:    testl $262144, %eax ## imm = 0x40000
2372; AVX2-NEXT:    jne LBB8_37
2373; AVX2-NEXT:  LBB8_38: ## %else70
2374; AVX2-NEXT:    testl $524288, %eax ## imm = 0x80000
2375; AVX2-NEXT:    jne LBB8_39
2376; AVX2-NEXT:  LBB8_40: ## %else74
2377; AVX2-NEXT:    testl $1048576, %eax ## imm = 0x100000
2378; AVX2-NEXT:    jne LBB8_41
2379; AVX2-NEXT:  LBB8_42: ## %else78
2380; AVX2-NEXT:    testl $2097152, %eax ## imm = 0x200000
2381; AVX2-NEXT:    jne LBB8_43
2382; AVX2-NEXT:  LBB8_44: ## %else82
2383; AVX2-NEXT:    testl $4194304, %eax ## imm = 0x400000
2384; AVX2-NEXT:    jne LBB8_45
2385; AVX2-NEXT:  LBB8_46: ## %else86
2386; AVX2-NEXT:    testl $8388608, %eax ## imm = 0x800000
2387; AVX2-NEXT:    jne LBB8_47
2388; AVX2-NEXT:  LBB8_48: ## %else90
2389; AVX2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
2390; AVX2-NEXT:    jne LBB8_49
2391; AVX2-NEXT:  LBB8_50: ## %else94
2392; AVX2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
2393; AVX2-NEXT:    jne LBB8_51
2394; AVX2-NEXT:  LBB8_52: ## %else98
2395; AVX2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
2396; AVX2-NEXT:    jne LBB8_53
2397; AVX2-NEXT:  LBB8_54: ## %else102
2398; AVX2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
2399; AVX2-NEXT:    jne LBB8_55
2400; AVX2-NEXT:  LBB8_56: ## %else106
2401; AVX2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
2402; AVX2-NEXT:    jne LBB8_57
2403; AVX2-NEXT:  LBB8_58: ## %else110
2404; AVX2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
2405; AVX2-NEXT:    jne LBB8_59
2406; AVX2-NEXT:  LBB8_60: ## %else114
2407; AVX2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
2408; AVX2-NEXT:    jne LBB8_61
2409; AVX2-NEXT:  LBB8_62: ## %else118
2410; AVX2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
2411; AVX2-NEXT:    jne LBB8_63
2412; AVX2-NEXT:  LBB8_64: ## %else122
2413; AVX2-NEXT:    retq
2414; AVX2-NEXT:  LBB8_1: ## %cond.load
2415; AVX2-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2416; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
2417; AVX2-NEXT:    addq $4, %rdi
2418; AVX2-NEXT:    testb $2, %al
2419; AVX2-NEXT:    je LBB8_4
2420; AVX2-NEXT:  LBB8_3: ## %cond.load1
2421; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0],mem[0],xmm0[2,3]
2422; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2423; AVX2-NEXT:    addq $4, %rdi
2424; AVX2-NEXT:    testb $4, %al
2425; AVX2-NEXT:    je LBB8_6
2426; AVX2-NEXT:  LBB8_5: ## %cond.load5
2427; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0],xmm0[3]
2428; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2429; AVX2-NEXT:    addq $4, %rdi
2430; AVX2-NEXT:    testb $8, %al
2431; AVX2-NEXT:    je LBB8_8
2432; AVX2-NEXT:  LBB8_7: ## %cond.load9
2433; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1,2],mem[0]
2434; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
2435; AVX2-NEXT:    addq $4, %rdi
2436; AVX2-NEXT:    testb $16, %al
2437; AVX2-NEXT:    je LBB8_10
2438; AVX2-NEXT:  LBB8_9: ## %cond.load13
2439; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2440; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
2441; AVX2-NEXT:    addq $4, %rdi
2442; AVX2-NEXT:    testb $32, %al
2443; AVX2-NEXT:    je LBB8_12
2444; AVX2-NEXT:  LBB8_11: ## %cond.load17
2445; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2446; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
2447; AVX2-NEXT:    addq $4, %rdi
2448; AVX2-NEXT:    testb $64, %al
2449; AVX2-NEXT:    je LBB8_14
2450; AVX2-NEXT:  LBB8_13: ## %cond.load21
2451; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2452; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
2453; AVX2-NEXT:    addq $4, %rdi
2454; AVX2-NEXT:    testb %al, %al
2455; AVX2-NEXT:    jns LBB8_16
2456; AVX2-NEXT:  LBB8_15: ## %cond.load25
2457; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2458; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
2459; AVX2-NEXT:    addq $4, %rdi
2460; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
2461; AVX2-NEXT:    je LBB8_18
2462; AVX2-NEXT:  LBB8_17: ## %cond.load29
2463; AVX2-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2464; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
2465; AVX2-NEXT:    addq $4, %rdi
2466; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
2467; AVX2-NEXT:    je LBB8_20
2468; AVX2-NEXT:  LBB8_19: ## %cond.load33
2469; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0],mem[0],xmm1[2,3]
2470; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2471; AVX2-NEXT:    addq $4, %rdi
2472; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
2473; AVX2-NEXT:    je LBB8_22
2474; AVX2-NEXT:  LBB8_21: ## %cond.load37
2475; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0],xmm1[3]
2476; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2477; AVX2-NEXT:    addq $4, %rdi
2478; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
2479; AVX2-NEXT:    je LBB8_24
2480; AVX2-NEXT:  LBB8_23: ## %cond.load41
2481; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1,2],mem[0]
2482; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2483; AVX2-NEXT:    addq $4, %rdi
2484; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
2485; AVX2-NEXT:    je LBB8_26
2486; AVX2-NEXT:  LBB8_25: ## %cond.load45
2487; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2488; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
2489; AVX2-NEXT:    addq $4, %rdi
2490; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
2491; AVX2-NEXT:    je LBB8_28
2492; AVX2-NEXT:  LBB8_27: ## %cond.load49
2493; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2494; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
2495; AVX2-NEXT:    addq $4, %rdi
2496; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
2497; AVX2-NEXT:    je LBB8_30
2498; AVX2-NEXT:  LBB8_29: ## %cond.load53
2499; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2500; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
2501; AVX2-NEXT:    addq $4, %rdi
2502; AVX2-NEXT:    testw %ax, %ax
2503; AVX2-NEXT:    jns LBB8_32
2504; AVX2-NEXT:  LBB8_31: ## %cond.load57
2505; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2506; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
2507; AVX2-NEXT:    addq $4, %rdi
2508; AVX2-NEXT:    testl $65536, %eax ## imm = 0x10000
2509; AVX2-NEXT:    je LBB8_34
2510; AVX2-NEXT:  LBB8_33: ## %cond.load61
2511; AVX2-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2512; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
2513; AVX2-NEXT:    addq $4, %rdi
2514; AVX2-NEXT:    testl $131072, %eax ## imm = 0x20000
2515; AVX2-NEXT:    je LBB8_36
2516; AVX2-NEXT:  LBB8_35: ## %cond.load65
2517; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0],mem[0],xmm2[2,3]
2518; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2519; AVX2-NEXT:    addq $4, %rdi
2520; AVX2-NEXT:    testl $262144, %eax ## imm = 0x40000
2521; AVX2-NEXT:    je LBB8_38
2522; AVX2-NEXT:  LBB8_37: ## %cond.load69
2523; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0],xmm2[3]
2524; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2525; AVX2-NEXT:    addq $4, %rdi
2526; AVX2-NEXT:    testl $524288, %eax ## imm = 0x80000
2527; AVX2-NEXT:    je LBB8_40
2528; AVX2-NEXT:  LBB8_39: ## %cond.load73
2529; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1,2],mem[0]
2530; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2531; AVX2-NEXT:    addq $4, %rdi
2532; AVX2-NEXT:    testl $1048576, %eax ## imm = 0x100000
2533; AVX2-NEXT:    je LBB8_42
2534; AVX2-NEXT:  LBB8_41: ## %cond.load77
2535; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2536; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
2537; AVX2-NEXT:    addq $4, %rdi
2538; AVX2-NEXT:    testl $2097152, %eax ## imm = 0x200000
2539; AVX2-NEXT:    je LBB8_44
2540; AVX2-NEXT:  LBB8_43: ## %cond.load81
2541; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2542; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
2543; AVX2-NEXT:    addq $4, %rdi
2544; AVX2-NEXT:    testl $4194304, %eax ## imm = 0x400000
2545; AVX2-NEXT:    je LBB8_46
2546; AVX2-NEXT:  LBB8_45: ## %cond.load85
2547; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2548; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
2549; AVX2-NEXT:    addq $4, %rdi
2550; AVX2-NEXT:    testl $8388608, %eax ## imm = 0x800000
2551; AVX2-NEXT:    je LBB8_48
2552; AVX2-NEXT:  LBB8_47: ## %cond.load89
2553; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2554; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
2555; AVX2-NEXT:    addq $4, %rdi
2556; AVX2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
2557; AVX2-NEXT:    je LBB8_50
2558; AVX2-NEXT:  LBB8_49: ## %cond.load93
2559; AVX2-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
2560; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
2561; AVX2-NEXT:    addq $4, %rdi
2562; AVX2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
2563; AVX2-NEXT:    je LBB8_52
2564; AVX2-NEXT:  LBB8_51: ## %cond.load97
2565; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0],mem[0],xmm3[2,3]
2566; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2567; AVX2-NEXT:    addq $4, %rdi
2568; AVX2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
2569; AVX2-NEXT:    je LBB8_54
2570; AVX2-NEXT:  LBB8_53: ## %cond.load101
2571; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0],xmm3[3]
2572; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2573; AVX2-NEXT:    addq $4, %rdi
2574; AVX2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
2575; AVX2-NEXT:    je LBB8_56
2576; AVX2-NEXT:  LBB8_55: ## %cond.load105
2577; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1,2],mem[0]
2578; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2579; AVX2-NEXT:    addq $4, %rdi
2580; AVX2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
2581; AVX2-NEXT:    je LBB8_58
2582; AVX2-NEXT:  LBB8_57: ## %cond.load109
2583; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2584; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
2585; AVX2-NEXT:    addq $4, %rdi
2586; AVX2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
2587; AVX2-NEXT:    je LBB8_60
2588; AVX2-NEXT:  LBB8_59: ## %cond.load113
2589; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2590; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
2591; AVX2-NEXT:    addq $4, %rdi
2592; AVX2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
2593; AVX2-NEXT:    je LBB8_62
2594; AVX2-NEXT:  LBB8_61: ## %cond.load117
2595; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2596; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
2597; AVX2-NEXT:    addq $4, %rdi
2598; AVX2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
2599; AVX2-NEXT:    je LBB8_64
2600; AVX2-NEXT:  LBB8_63: ## %cond.load121
2601; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
2602; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
2603; AVX2-NEXT:    retq
2604;
2605; AVX512-LABEL: expandload_v32f32_v32i32:
2606; AVX512:       ## %bb.0:
2607; AVX512-NEXT:    vptestnmd %zmm3, %zmm3, %k2
2608; AVX512-NEXT:    vptestnmd %zmm2, %zmm2, %k1
2609; AVX512-NEXT:    kmovw %k1, %eax
2610; AVX512-NEXT:    movl %eax, %ecx
2611; AVX512-NEXT:    shrl %ecx
2612; AVX512-NEXT:    andl $21845, %ecx ## imm = 0x5555
2613; AVX512-NEXT:    subl %ecx, %eax
2614; AVX512-NEXT:    movl %eax, %ecx
2615; AVX512-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
2616; AVX512-NEXT:    shrl $2, %eax
2617; AVX512-NEXT:    andl $858993459, %eax ## imm = 0x33333333
2618; AVX512-NEXT:    addl %ecx, %eax
2619; AVX512-NEXT:    movl %eax, %ecx
2620; AVX512-NEXT:    shrl $4, %ecx
2621; AVX512-NEXT:    addl %eax, %ecx
2622; AVX512-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
2623; AVX512-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
2624; AVX512-NEXT:    shrl $24, %eax
2625; AVX512-NEXT:    vexpandps (%rdi,%rax,4), %zmm1 {%k2}
2626; AVX512-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
2627; AVX512-NEXT:    retq
2628  %mask = icmp eq <32 x i32> %trigger, zeroinitializer
2629  %res = call <32 x float> @llvm.masked.expandload.v32f32(ptr %base, <32 x i1> %mask, <32 x float> %src0)
2630  ret <32 x float> %res
2631}
2632
2633;
2634; vXi64
2635;
2636
2637define <2 x i64> @expandload_v2i64_const(ptr %base, <2 x i64> %src0) {
2638; SSE2-LABEL: expandload_v2i64_const:
2639; SSE2:       ## %bb.0:
2640; SSE2-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
2641; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2642; SSE2-NEXT:    retq
2643;
2644; SSE42-LABEL: expandload_v2i64_const:
2645; SSE42:       ## %bb.0:
2646; SSE42-NEXT:    pinsrq $1, (%rdi), %xmm0
2647; SSE42-NEXT:    retq
2648;
2649; AVX1OR2-LABEL: expandload_v2i64_const:
2650; AVX1OR2:       ## %bb.0:
2651; AVX1OR2-NEXT:    vpinsrq $1, (%rdi), %xmm0, %xmm0
2652; AVX1OR2-NEXT:    retq
2653;
2654; AVX512F-LABEL: expandload_v2i64_const:
2655; AVX512F:       ## %bb.0:
2656; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
2657; AVX512F-NEXT:    movb $2, %al
2658; AVX512F-NEXT:    kmovw %eax, %k1
2659; AVX512F-NEXT:    vpexpandq (%rdi), %zmm0 {%k1}
2660; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
2661; AVX512F-NEXT:    vzeroupper
2662; AVX512F-NEXT:    retq
2663;
2664; AVX512VLDQ-LABEL: expandload_v2i64_const:
2665; AVX512VLDQ:       ## %bb.0:
2666; AVX512VLDQ-NEXT:    movb $2, %al
2667; AVX512VLDQ-NEXT:    kmovw %eax, %k1
2668; AVX512VLDQ-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
2669; AVX512VLDQ-NEXT:    retq
2670;
2671; AVX512VLBW-LABEL: expandload_v2i64_const:
2672; AVX512VLBW:       ## %bb.0:
2673; AVX512VLBW-NEXT:    movb $2, %al
2674; AVX512VLBW-NEXT:    kmovd %eax, %k1
2675; AVX512VLBW-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
2676; AVX512VLBW-NEXT:    retq
2677  %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
2678  ret <2 x i64> %res
2679}
2680
2681;
2682; vXi32
2683;
2684
2685define <4 x i32> @expandload_v4i32_v4i32(ptr %base, <4 x i32> %src0, <4 x i32> %trigger) {
2686; SSE2-LABEL: expandload_v4i32_v4i32:
2687; SSE2:       ## %bb.0:
2688; SSE2-NEXT:    pxor %xmm2, %xmm2
2689; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
2690; SSE2-NEXT:    movmskps %xmm2, %eax
2691; SSE2-NEXT:    testb $1, %al
2692; SSE2-NEXT:    jne LBB10_1
2693; SSE2-NEXT:  ## %bb.2: ## %else
2694; SSE2-NEXT:    testb $2, %al
2695; SSE2-NEXT:    jne LBB10_3
2696; SSE2-NEXT:  LBB10_4: ## %else2
2697; SSE2-NEXT:    testb $4, %al
2698; SSE2-NEXT:    jne LBB10_5
2699; SSE2-NEXT:  LBB10_6: ## %else6
2700; SSE2-NEXT:    testb $8, %al
2701; SSE2-NEXT:    jne LBB10_7
2702; SSE2-NEXT:  LBB10_8: ## %else10
2703; SSE2-NEXT:    retq
2704; SSE2-NEXT:  LBB10_1: ## %cond.load
2705; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
2706; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2707; SSE2-NEXT:    addq $4, %rdi
2708; SSE2-NEXT:    testb $2, %al
2709; SSE2-NEXT:    je LBB10_4
2710; SSE2-NEXT:  LBB10_3: ## %cond.load1
2711; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
2712; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2713; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2714; SSE2-NEXT:    addq $4, %rdi
2715; SSE2-NEXT:    movaps %xmm1, %xmm0
2716; SSE2-NEXT:    testb $4, %al
2717; SSE2-NEXT:    je LBB10_6
2718; SSE2-NEXT:  LBB10_5: ## %cond.load5
2719; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
2720; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2721; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2722; SSE2-NEXT:    addq $4, %rdi
2723; SSE2-NEXT:    testb $8, %al
2724; SSE2-NEXT:    je LBB10_8
2725; SSE2-NEXT:  LBB10_7: ## %cond.load9
2726; SSE2-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
2727; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2728; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2729; SSE2-NEXT:    retq
2730;
2731; SSE42-LABEL: expandload_v4i32_v4i32:
2732; SSE42:       ## %bb.0:
2733; SSE42-NEXT:    pxor %xmm2, %xmm2
2734; SSE42-NEXT:    pcmpeqd %xmm1, %xmm2
2735; SSE42-NEXT:    movmskps %xmm2, %eax
2736; SSE42-NEXT:    testb $1, %al
2737; SSE42-NEXT:    jne LBB10_1
2738; SSE42-NEXT:  ## %bb.2: ## %else
2739; SSE42-NEXT:    testb $2, %al
2740; SSE42-NEXT:    jne LBB10_3
2741; SSE42-NEXT:  LBB10_4: ## %else2
2742; SSE42-NEXT:    testb $4, %al
2743; SSE42-NEXT:    jne LBB10_5
2744; SSE42-NEXT:  LBB10_6: ## %else6
2745; SSE42-NEXT:    testb $8, %al
2746; SSE42-NEXT:    jne LBB10_7
2747; SSE42-NEXT:  LBB10_8: ## %else10
2748; SSE42-NEXT:    retq
2749; SSE42-NEXT:  LBB10_1: ## %cond.load
2750; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
2751; SSE42-NEXT:    addq $4, %rdi
2752; SSE42-NEXT:    testb $2, %al
2753; SSE42-NEXT:    je LBB10_4
2754; SSE42-NEXT:  LBB10_3: ## %cond.load1
2755; SSE42-NEXT:    pinsrd $1, (%rdi), %xmm0
2756; SSE42-NEXT:    addq $4, %rdi
2757; SSE42-NEXT:    testb $4, %al
2758; SSE42-NEXT:    je LBB10_6
2759; SSE42-NEXT:  LBB10_5: ## %cond.load5
2760; SSE42-NEXT:    pinsrd $2, (%rdi), %xmm0
2761; SSE42-NEXT:    addq $4, %rdi
2762; SSE42-NEXT:    testb $8, %al
2763; SSE42-NEXT:    je LBB10_8
2764; SSE42-NEXT:  LBB10_7: ## %cond.load9
2765; SSE42-NEXT:    pinsrd $3, (%rdi), %xmm0
2766; SSE42-NEXT:    retq
2767;
2768; AVX1OR2-LABEL: expandload_v4i32_v4i32:
2769; AVX1OR2:       ## %bb.0:
2770; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2771; AVX1OR2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
2772; AVX1OR2-NEXT:    vmovmskps %xmm1, %eax
2773; AVX1OR2-NEXT:    testb $1, %al
2774; AVX1OR2-NEXT:    jne LBB10_1
2775; AVX1OR2-NEXT:  ## %bb.2: ## %else
2776; AVX1OR2-NEXT:    testb $2, %al
2777; AVX1OR2-NEXT:    jne LBB10_3
2778; AVX1OR2-NEXT:  LBB10_4: ## %else2
2779; AVX1OR2-NEXT:    testb $4, %al
2780; AVX1OR2-NEXT:    jne LBB10_5
2781; AVX1OR2-NEXT:  LBB10_6: ## %else6
2782; AVX1OR2-NEXT:    testb $8, %al
2783; AVX1OR2-NEXT:    jne LBB10_7
2784; AVX1OR2-NEXT:  LBB10_8: ## %else10
2785; AVX1OR2-NEXT:    retq
2786; AVX1OR2-NEXT:  LBB10_1: ## %cond.load
2787; AVX1OR2-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
2788; AVX1OR2-NEXT:    addq $4, %rdi
2789; AVX1OR2-NEXT:    testb $2, %al
2790; AVX1OR2-NEXT:    je LBB10_4
2791; AVX1OR2-NEXT:  LBB10_3: ## %cond.load1
2792; AVX1OR2-NEXT:    vpinsrd $1, (%rdi), %xmm0, %xmm0
2793; AVX1OR2-NEXT:    addq $4, %rdi
2794; AVX1OR2-NEXT:    testb $4, %al
2795; AVX1OR2-NEXT:    je LBB10_6
2796; AVX1OR2-NEXT:  LBB10_5: ## %cond.load5
2797; AVX1OR2-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0
2798; AVX1OR2-NEXT:    addq $4, %rdi
2799; AVX1OR2-NEXT:    testb $8, %al
2800; AVX1OR2-NEXT:    je LBB10_8
2801; AVX1OR2-NEXT:  LBB10_7: ## %cond.load9
2802; AVX1OR2-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
2803; AVX1OR2-NEXT:    retq
2804;
2805; AVX512F-LABEL: expandload_v4i32_v4i32:
2806; AVX512F:       ## %bb.0:
2807; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
2808; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
2809; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k0
2810; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
2811; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
2812; AVX512F-NEXT:    vpexpandd (%rdi), %zmm0 {%k1}
2813; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
2814; AVX512F-NEXT:    vzeroupper
2815; AVX512F-NEXT:    retq
2816;
2817; AVX512VL-LABEL: expandload_v4i32_v4i32:
2818; AVX512VL:       ## %bb.0:
2819; AVX512VL-NEXT:    vptestnmd %xmm1, %xmm1, %k1
2820; AVX512VL-NEXT:    vpexpandd (%rdi), %xmm0 {%k1}
2821; AVX512VL-NEXT:    retq
2822  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
2823  %res = call <4 x i32> @llvm.masked.expandload.v4i32(ptr %base, <4 x i1> %mask, <4 x i32> %src0)
2824  ret <4 x i32> %res
2825}
2826
2827;
2828; vXi16
2829;
2830
2831define <8 x i16> @expandload_v8i16_v8i16(ptr %base, <8 x i16> %src0, <8 x i16> %trigger) {
2832; SSE-LABEL: expandload_v8i16_v8i16:
2833; SSE:       ## %bb.0:
2834; SSE-NEXT:    pxor %xmm2, %xmm2
2835; SSE-NEXT:    pcmpeqw %xmm1, %xmm2
2836; SSE-NEXT:    packsswb %xmm2, %xmm2
2837; SSE-NEXT:    pmovmskb %xmm2, %eax
2838; SSE-NEXT:    testb $1, %al
2839; SSE-NEXT:    jne LBB11_1
2840; SSE-NEXT:  ## %bb.2: ## %else
2841; SSE-NEXT:    testb $2, %al
2842; SSE-NEXT:    jne LBB11_3
2843; SSE-NEXT:  LBB11_4: ## %else2
2844; SSE-NEXT:    testb $4, %al
2845; SSE-NEXT:    jne LBB11_5
2846; SSE-NEXT:  LBB11_6: ## %else6
2847; SSE-NEXT:    testb $8, %al
2848; SSE-NEXT:    jne LBB11_7
2849; SSE-NEXT:  LBB11_8: ## %else10
2850; SSE-NEXT:    testb $16, %al
2851; SSE-NEXT:    jne LBB11_9
2852; SSE-NEXT:  LBB11_10: ## %else14
2853; SSE-NEXT:    testb $32, %al
2854; SSE-NEXT:    jne LBB11_11
2855; SSE-NEXT:  LBB11_12: ## %else18
2856; SSE-NEXT:    testb $64, %al
2857; SSE-NEXT:    jne LBB11_13
2858; SSE-NEXT:  LBB11_14: ## %else22
2859; SSE-NEXT:    testb $-128, %al
2860; SSE-NEXT:    jne LBB11_15
2861; SSE-NEXT:  LBB11_16: ## %else26
2862; SSE-NEXT:    retq
2863; SSE-NEXT:  LBB11_1: ## %cond.load
2864; SSE-NEXT:    pinsrw $0, (%rdi), %xmm0
2865; SSE-NEXT:    addq $2, %rdi
2866; SSE-NEXT:    testb $2, %al
2867; SSE-NEXT:    je LBB11_4
2868; SSE-NEXT:  LBB11_3: ## %cond.load1
2869; SSE-NEXT:    pinsrw $1, (%rdi), %xmm0
2870; SSE-NEXT:    addq $2, %rdi
2871; SSE-NEXT:    testb $4, %al
2872; SSE-NEXT:    je LBB11_6
2873; SSE-NEXT:  LBB11_5: ## %cond.load5
2874; SSE-NEXT:    pinsrw $2, (%rdi), %xmm0
2875; SSE-NEXT:    addq $2, %rdi
2876; SSE-NEXT:    testb $8, %al
2877; SSE-NEXT:    je LBB11_8
2878; SSE-NEXT:  LBB11_7: ## %cond.load9
2879; SSE-NEXT:    pinsrw $3, (%rdi), %xmm0
2880; SSE-NEXT:    addq $2, %rdi
2881; SSE-NEXT:    testb $16, %al
2882; SSE-NEXT:    je LBB11_10
2883; SSE-NEXT:  LBB11_9: ## %cond.load13
2884; SSE-NEXT:    pinsrw $4, (%rdi), %xmm0
2885; SSE-NEXT:    addq $2, %rdi
2886; SSE-NEXT:    testb $32, %al
2887; SSE-NEXT:    je LBB11_12
2888; SSE-NEXT:  LBB11_11: ## %cond.load17
2889; SSE-NEXT:    pinsrw $5, (%rdi), %xmm0
2890; SSE-NEXT:    addq $2, %rdi
2891; SSE-NEXT:    testb $64, %al
2892; SSE-NEXT:    je LBB11_14
2893; SSE-NEXT:  LBB11_13: ## %cond.load21
2894; SSE-NEXT:    pinsrw $6, (%rdi), %xmm0
2895; SSE-NEXT:    addq $2, %rdi
2896; SSE-NEXT:    testb $-128, %al
2897; SSE-NEXT:    je LBB11_16
2898; SSE-NEXT:  LBB11_15: ## %cond.load25
2899; SSE-NEXT:    pinsrw $7, (%rdi), %xmm0
2900; SSE-NEXT:    retq
2901;
2902; AVX1OR2-LABEL: expandload_v8i16_v8i16:
2903; AVX1OR2:       ## %bb.0:
2904; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2905; AVX1OR2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
2906; AVX1OR2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
2907; AVX1OR2-NEXT:    vpmovmskb %xmm1, %eax
2908; AVX1OR2-NEXT:    testb $1, %al
2909; AVX1OR2-NEXT:    jne LBB11_1
2910; AVX1OR2-NEXT:  ## %bb.2: ## %else
2911; AVX1OR2-NEXT:    testb $2, %al
2912; AVX1OR2-NEXT:    jne LBB11_3
2913; AVX1OR2-NEXT:  LBB11_4: ## %else2
2914; AVX1OR2-NEXT:    testb $4, %al
2915; AVX1OR2-NEXT:    jne LBB11_5
2916; AVX1OR2-NEXT:  LBB11_6: ## %else6
2917; AVX1OR2-NEXT:    testb $8, %al
2918; AVX1OR2-NEXT:    jne LBB11_7
2919; AVX1OR2-NEXT:  LBB11_8: ## %else10
2920; AVX1OR2-NEXT:    testb $16, %al
2921; AVX1OR2-NEXT:    jne LBB11_9
2922; AVX1OR2-NEXT:  LBB11_10: ## %else14
2923; AVX1OR2-NEXT:    testb $32, %al
2924; AVX1OR2-NEXT:    jne LBB11_11
2925; AVX1OR2-NEXT:  LBB11_12: ## %else18
2926; AVX1OR2-NEXT:    testb $64, %al
2927; AVX1OR2-NEXT:    jne LBB11_13
2928; AVX1OR2-NEXT:  LBB11_14: ## %else22
2929; AVX1OR2-NEXT:    testb $-128, %al
2930; AVX1OR2-NEXT:    jne LBB11_15
2931; AVX1OR2-NEXT:  LBB11_16: ## %else26
2932; AVX1OR2-NEXT:    retq
2933; AVX1OR2-NEXT:  LBB11_1: ## %cond.load
2934; AVX1OR2-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
2935; AVX1OR2-NEXT:    addq $2, %rdi
2936; AVX1OR2-NEXT:    testb $2, %al
2937; AVX1OR2-NEXT:    je LBB11_4
2938; AVX1OR2-NEXT:  LBB11_3: ## %cond.load1
2939; AVX1OR2-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
2940; AVX1OR2-NEXT:    addq $2, %rdi
2941; AVX1OR2-NEXT:    testb $4, %al
2942; AVX1OR2-NEXT:    je LBB11_6
2943; AVX1OR2-NEXT:  LBB11_5: ## %cond.load5
2944; AVX1OR2-NEXT:    vpinsrw $2, (%rdi), %xmm0, %xmm0
2945; AVX1OR2-NEXT:    addq $2, %rdi
2946; AVX1OR2-NEXT:    testb $8, %al
2947; AVX1OR2-NEXT:    je LBB11_8
2948; AVX1OR2-NEXT:  LBB11_7: ## %cond.load9
2949; AVX1OR2-NEXT:    vpinsrw $3, (%rdi), %xmm0, %xmm0
2950; AVX1OR2-NEXT:    addq $2, %rdi
2951; AVX1OR2-NEXT:    testb $16, %al
2952; AVX1OR2-NEXT:    je LBB11_10
2953; AVX1OR2-NEXT:  LBB11_9: ## %cond.load13
2954; AVX1OR2-NEXT:    vpinsrw $4, (%rdi), %xmm0, %xmm0
2955; AVX1OR2-NEXT:    addq $2, %rdi
2956; AVX1OR2-NEXT:    testb $32, %al
2957; AVX1OR2-NEXT:    je LBB11_12
2958; AVX1OR2-NEXT:  LBB11_11: ## %cond.load17
2959; AVX1OR2-NEXT:    vpinsrw $5, (%rdi), %xmm0, %xmm0
2960; AVX1OR2-NEXT:    addq $2, %rdi
2961; AVX1OR2-NEXT:    testb $64, %al
2962; AVX1OR2-NEXT:    je LBB11_14
2963; AVX1OR2-NEXT:  LBB11_13: ## %cond.load21
2964; AVX1OR2-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
2965; AVX1OR2-NEXT:    addq $2, %rdi
2966; AVX1OR2-NEXT:    testb $-128, %al
2967; AVX1OR2-NEXT:    je LBB11_16
2968; AVX1OR2-NEXT:  LBB11_15: ## %cond.load25
2969; AVX1OR2-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
2970; AVX1OR2-NEXT:    retq
2971;
2972; AVX512F-LABEL: expandload_v8i16_v8i16:
2973; AVX512F:       ## %bb.0:
2974; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2975; AVX512F-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
2976; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
2977; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
2978; AVX512F-NEXT:    kmovw %k0, %eax
2979; AVX512F-NEXT:    testb $1, %al
2980; AVX512F-NEXT:    jne LBB11_1
2981; AVX512F-NEXT:  ## %bb.2: ## %else
2982; AVX512F-NEXT:    testb $2, %al
2983; AVX512F-NEXT:    jne LBB11_3
2984; AVX512F-NEXT:  LBB11_4: ## %else2
2985; AVX512F-NEXT:    testb $4, %al
2986; AVX512F-NEXT:    jne LBB11_5
2987; AVX512F-NEXT:  LBB11_6: ## %else6
2988; AVX512F-NEXT:    testb $8, %al
2989; AVX512F-NEXT:    jne LBB11_7
2990; AVX512F-NEXT:  LBB11_8: ## %else10
2991; AVX512F-NEXT:    testb $16, %al
2992; AVX512F-NEXT:    jne LBB11_9
2993; AVX512F-NEXT:  LBB11_10: ## %else14
2994; AVX512F-NEXT:    testb $32, %al
2995; AVX512F-NEXT:    jne LBB11_11
2996; AVX512F-NEXT:  LBB11_12: ## %else18
2997; AVX512F-NEXT:    testb $64, %al
2998; AVX512F-NEXT:    jne LBB11_13
2999; AVX512F-NEXT:  LBB11_14: ## %else22
3000; AVX512F-NEXT:    testb $-128, %al
3001; AVX512F-NEXT:    jne LBB11_15
3002; AVX512F-NEXT:  LBB11_16: ## %else26
3003; AVX512F-NEXT:    vzeroupper
3004; AVX512F-NEXT:    retq
3005; AVX512F-NEXT:  LBB11_1: ## %cond.load
3006; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
3007; AVX512F-NEXT:    addq $2, %rdi
3008; AVX512F-NEXT:    testb $2, %al
3009; AVX512F-NEXT:    je LBB11_4
3010; AVX512F-NEXT:  LBB11_3: ## %cond.load1
3011; AVX512F-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
3012; AVX512F-NEXT:    addq $2, %rdi
3013; AVX512F-NEXT:    testb $4, %al
3014; AVX512F-NEXT:    je LBB11_6
3015; AVX512F-NEXT:  LBB11_5: ## %cond.load5
3016; AVX512F-NEXT:    vpinsrw $2, (%rdi), %xmm0, %xmm0
3017; AVX512F-NEXT:    addq $2, %rdi
3018; AVX512F-NEXT:    testb $8, %al
3019; AVX512F-NEXT:    je LBB11_8
3020; AVX512F-NEXT:  LBB11_7: ## %cond.load9
3021; AVX512F-NEXT:    vpinsrw $3, (%rdi), %xmm0, %xmm0
3022; AVX512F-NEXT:    addq $2, %rdi
3023; AVX512F-NEXT:    testb $16, %al
3024; AVX512F-NEXT:    je LBB11_10
3025; AVX512F-NEXT:  LBB11_9: ## %cond.load13
3026; AVX512F-NEXT:    vpinsrw $4, (%rdi), %xmm0, %xmm0
3027; AVX512F-NEXT:    addq $2, %rdi
3028; AVX512F-NEXT:    testb $32, %al
3029; AVX512F-NEXT:    je LBB11_12
3030; AVX512F-NEXT:  LBB11_11: ## %cond.load17
3031; AVX512F-NEXT:    vpinsrw $5, (%rdi), %xmm0, %xmm0
3032; AVX512F-NEXT:    addq $2, %rdi
3033; AVX512F-NEXT:    testb $64, %al
3034; AVX512F-NEXT:    je LBB11_14
3035; AVX512F-NEXT:  LBB11_13: ## %cond.load21
3036; AVX512F-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
3037; AVX512F-NEXT:    addq $2, %rdi
3038; AVX512F-NEXT:    testb $-128, %al
3039; AVX512F-NEXT:    je LBB11_16
3040; AVX512F-NEXT:  LBB11_15: ## %cond.load25
3041; AVX512F-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
3042; AVX512F-NEXT:    vzeroupper
3043; AVX512F-NEXT:    retq
3044;
3045; AVX512VLDQ-LABEL: expandload_v8i16_v8i16:
3046; AVX512VLDQ:       ## %bb.0:
3047; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3048; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
3049; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
3050; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k0
3051; AVX512VLDQ-NEXT:    kmovw %k0, %eax
3052; AVX512VLDQ-NEXT:    testb $1, %al
3053; AVX512VLDQ-NEXT:    jne LBB11_1
3054; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
3055; AVX512VLDQ-NEXT:    testb $2, %al
3056; AVX512VLDQ-NEXT:    jne LBB11_3
3057; AVX512VLDQ-NEXT:  LBB11_4: ## %else2
3058; AVX512VLDQ-NEXT:    testb $4, %al
3059; AVX512VLDQ-NEXT:    jne LBB11_5
3060; AVX512VLDQ-NEXT:  LBB11_6: ## %else6
3061; AVX512VLDQ-NEXT:    testb $8, %al
3062; AVX512VLDQ-NEXT:    jne LBB11_7
3063; AVX512VLDQ-NEXT:  LBB11_8: ## %else10
3064; AVX512VLDQ-NEXT:    testb $16, %al
3065; AVX512VLDQ-NEXT:    jne LBB11_9
3066; AVX512VLDQ-NEXT:  LBB11_10: ## %else14
3067; AVX512VLDQ-NEXT:    testb $32, %al
3068; AVX512VLDQ-NEXT:    jne LBB11_11
3069; AVX512VLDQ-NEXT:  LBB11_12: ## %else18
3070; AVX512VLDQ-NEXT:    testb $64, %al
3071; AVX512VLDQ-NEXT:    jne LBB11_13
3072; AVX512VLDQ-NEXT:  LBB11_14: ## %else22
3073; AVX512VLDQ-NEXT:    testb $-128, %al
3074; AVX512VLDQ-NEXT:    jne LBB11_15
3075; AVX512VLDQ-NEXT:  LBB11_16: ## %else26
3076; AVX512VLDQ-NEXT:    vzeroupper
3077; AVX512VLDQ-NEXT:    retq
3078; AVX512VLDQ-NEXT:  LBB11_1: ## %cond.load
3079; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
3080; AVX512VLDQ-NEXT:    addq $2, %rdi
3081; AVX512VLDQ-NEXT:    testb $2, %al
3082; AVX512VLDQ-NEXT:    je LBB11_4
3083; AVX512VLDQ-NEXT:  LBB11_3: ## %cond.load1
3084; AVX512VLDQ-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
3085; AVX512VLDQ-NEXT:    addq $2, %rdi
3086; AVX512VLDQ-NEXT:    testb $4, %al
3087; AVX512VLDQ-NEXT:    je LBB11_6
3088; AVX512VLDQ-NEXT:  LBB11_5: ## %cond.load5
3089; AVX512VLDQ-NEXT:    vpinsrw $2, (%rdi), %xmm0, %xmm0
3090; AVX512VLDQ-NEXT:    addq $2, %rdi
3091; AVX512VLDQ-NEXT:    testb $8, %al
3092; AVX512VLDQ-NEXT:    je LBB11_8
3093; AVX512VLDQ-NEXT:  LBB11_7: ## %cond.load9
3094; AVX512VLDQ-NEXT:    vpinsrw $3, (%rdi), %xmm0, %xmm0
3095; AVX512VLDQ-NEXT:    addq $2, %rdi
3096; AVX512VLDQ-NEXT:    testb $16, %al
3097; AVX512VLDQ-NEXT:    je LBB11_10
3098; AVX512VLDQ-NEXT:  LBB11_9: ## %cond.load13
3099; AVX512VLDQ-NEXT:    vpinsrw $4, (%rdi), %xmm0, %xmm0
3100; AVX512VLDQ-NEXT:    addq $2, %rdi
3101; AVX512VLDQ-NEXT:    testb $32, %al
3102; AVX512VLDQ-NEXT:    je LBB11_12
3103; AVX512VLDQ-NEXT:  LBB11_11: ## %cond.load17
3104; AVX512VLDQ-NEXT:    vpinsrw $5, (%rdi), %xmm0, %xmm0
3105; AVX512VLDQ-NEXT:    addq $2, %rdi
3106; AVX512VLDQ-NEXT:    testb $64, %al
3107; AVX512VLDQ-NEXT:    je LBB11_14
3108; AVX512VLDQ-NEXT:  LBB11_13: ## %cond.load21
3109; AVX512VLDQ-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
3110; AVX512VLDQ-NEXT:    addq $2, %rdi
3111; AVX512VLDQ-NEXT:    testb $-128, %al
3112; AVX512VLDQ-NEXT:    je LBB11_16
3113; AVX512VLDQ-NEXT:  LBB11_15: ## %cond.load25
3114; AVX512VLDQ-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
3115; AVX512VLDQ-NEXT:    vzeroupper
3116; AVX512VLDQ-NEXT:    retq
3117;
3118; AVX512VLBW-LABEL: expandload_v8i16_v8i16:
3119; AVX512VLBW:       ## %bb.0:
3120; AVX512VLBW-NEXT:    vptestnmw %xmm1, %xmm1, %k0
3121; AVX512VLBW-NEXT:    kmovd %k0, %eax
3122; AVX512VLBW-NEXT:    testb $1, %al
3123; AVX512VLBW-NEXT:    jne LBB11_1
3124; AVX512VLBW-NEXT:  ## %bb.2: ## %else
3125; AVX512VLBW-NEXT:    testb $2, %al
3126; AVX512VLBW-NEXT:    jne LBB11_3
3127; AVX512VLBW-NEXT:  LBB11_4: ## %else2
3128; AVX512VLBW-NEXT:    testb $4, %al
3129; AVX512VLBW-NEXT:    jne LBB11_5
3130; AVX512VLBW-NEXT:  LBB11_6: ## %else6
3131; AVX512VLBW-NEXT:    testb $8, %al
3132; AVX512VLBW-NEXT:    jne LBB11_7
3133; AVX512VLBW-NEXT:  LBB11_8: ## %else10
3134; AVX512VLBW-NEXT:    testb $16, %al
3135; AVX512VLBW-NEXT:    jne LBB11_9
3136; AVX512VLBW-NEXT:  LBB11_10: ## %else14
3137; AVX512VLBW-NEXT:    testb $32, %al
3138; AVX512VLBW-NEXT:    jne LBB11_11
3139; AVX512VLBW-NEXT:  LBB11_12: ## %else18
3140; AVX512VLBW-NEXT:    testb $64, %al
3141; AVX512VLBW-NEXT:    jne LBB11_13
3142; AVX512VLBW-NEXT:  LBB11_14: ## %else22
3143; AVX512VLBW-NEXT:    testb $-128, %al
3144; AVX512VLBW-NEXT:    jne LBB11_15
3145; AVX512VLBW-NEXT:  LBB11_16: ## %else26
3146; AVX512VLBW-NEXT:    retq
3147; AVX512VLBW-NEXT:  LBB11_1: ## %cond.load
3148; AVX512VLBW-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
3149; AVX512VLBW-NEXT:    addq $2, %rdi
3150; AVX512VLBW-NEXT:    testb $2, %al
3151; AVX512VLBW-NEXT:    je LBB11_4
3152; AVX512VLBW-NEXT:  LBB11_3: ## %cond.load1
3153; AVX512VLBW-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
3154; AVX512VLBW-NEXT:    addq $2, %rdi
3155; AVX512VLBW-NEXT:    testb $4, %al
3156; AVX512VLBW-NEXT:    je LBB11_6
3157; AVX512VLBW-NEXT:  LBB11_5: ## %cond.load5
3158; AVX512VLBW-NEXT:    vpinsrw $2, (%rdi), %xmm0, %xmm0
3159; AVX512VLBW-NEXT:    addq $2, %rdi
3160; AVX512VLBW-NEXT:    testb $8, %al
3161; AVX512VLBW-NEXT:    je LBB11_8
3162; AVX512VLBW-NEXT:  LBB11_7: ## %cond.load9
3163; AVX512VLBW-NEXT:    vpinsrw $3, (%rdi), %xmm0, %xmm0
3164; AVX512VLBW-NEXT:    addq $2, %rdi
3165; AVX512VLBW-NEXT:    testb $16, %al
3166; AVX512VLBW-NEXT:    je LBB11_10
3167; AVX512VLBW-NEXT:  LBB11_9: ## %cond.load13
3168; AVX512VLBW-NEXT:    vpinsrw $4, (%rdi), %xmm0, %xmm0
3169; AVX512VLBW-NEXT:    addq $2, %rdi
3170; AVX512VLBW-NEXT:    testb $32, %al
3171; AVX512VLBW-NEXT:    je LBB11_12
3172; AVX512VLBW-NEXT:  LBB11_11: ## %cond.load17
3173; AVX512VLBW-NEXT:    vpinsrw $5, (%rdi), %xmm0, %xmm0
3174; AVX512VLBW-NEXT:    addq $2, %rdi
3175; AVX512VLBW-NEXT:    testb $64, %al
3176; AVX512VLBW-NEXT:    je LBB11_14
3177; AVX512VLBW-NEXT:  LBB11_13: ## %cond.load21
3178; AVX512VLBW-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
3179; AVX512VLBW-NEXT:    addq $2, %rdi
3180; AVX512VLBW-NEXT:    testb $-128, %al
3181; AVX512VLBW-NEXT:    je LBB11_16
3182; AVX512VLBW-NEXT:  LBB11_15: ## %cond.load25
3183; AVX512VLBW-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
3184; AVX512VLBW-NEXT:    retq
3185  %mask = icmp eq <8 x i16> %trigger, zeroinitializer
3186  %res = call <8 x i16> @llvm.masked.expandload.v8i16(ptr %base, <8 x i1> %mask, <8 x i16> %src0)
3187  ret <8 x i16> %res
3188}
3189
3190;
3191; vXi8
3192;
3193
3194define <16 x i8> @expandload_v16i8_v16i8(ptr %base, <16 x i8> %src0, <16 x i8> %trigger) {
3195; SSE2-LABEL: expandload_v16i8_v16i8:
3196; SSE2:       ## %bb.0:
3197; SSE2-NEXT:    pxor %xmm2, %xmm2
3198; SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
3199; SSE2-NEXT:    pmovmskb %xmm2, %eax
3200; SSE2-NEXT:    testb $1, %al
3201; SSE2-NEXT:    jne LBB12_1
3202; SSE2-NEXT:  ## %bb.2: ## %else
3203; SSE2-NEXT:    testb $2, %al
3204; SSE2-NEXT:    jne LBB12_3
3205; SSE2-NEXT:  LBB12_4: ## %else2
3206; SSE2-NEXT:    testb $4, %al
3207; SSE2-NEXT:    jne LBB12_5
3208; SSE2-NEXT:  LBB12_6: ## %else6
3209; SSE2-NEXT:    testb $8, %al
3210; SSE2-NEXT:    jne LBB12_7
3211; SSE2-NEXT:  LBB12_8: ## %else10
3212; SSE2-NEXT:    testb $16, %al
3213; SSE2-NEXT:    jne LBB12_9
3214; SSE2-NEXT:  LBB12_10: ## %else14
3215; SSE2-NEXT:    testb $32, %al
3216; SSE2-NEXT:    jne LBB12_11
3217; SSE2-NEXT:  LBB12_12: ## %else18
3218; SSE2-NEXT:    testb $64, %al
3219; SSE2-NEXT:    jne LBB12_13
3220; SSE2-NEXT:  LBB12_14: ## %else22
3221; SSE2-NEXT:    testb %al, %al
3222; SSE2-NEXT:    js LBB12_15
3223; SSE2-NEXT:  LBB12_16: ## %else26
3224; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
3225; SSE2-NEXT:    jne LBB12_17
3226; SSE2-NEXT:  LBB12_18: ## %else30
3227; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
3228; SSE2-NEXT:    jne LBB12_19
3229; SSE2-NEXT:  LBB12_20: ## %else34
3230; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
3231; SSE2-NEXT:    jne LBB12_21
3232; SSE2-NEXT:  LBB12_22: ## %else38
3233; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
3234; SSE2-NEXT:    jne LBB12_23
3235; SSE2-NEXT:  LBB12_24: ## %else42
3236; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
3237; SSE2-NEXT:    jne LBB12_25
3238; SSE2-NEXT:  LBB12_26: ## %else46
3239; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
3240; SSE2-NEXT:    jne LBB12_27
3241; SSE2-NEXT:  LBB12_28: ## %else50
3242; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
3243; SSE2-NEXT:    jne LBB12_29
3244; SSE2-NEXT:  LBB12_30: ## %else54
3245; SSE2-NEXT:    testl $32768, %eax ## imm = 0x8000
3246; SSE2-NEXT:    jne LBB12_31
3247; SSE2-NEXT:  LBB12_32: ## %else58
3248; SSE2-NEXT:    retq
3249; SSE2-NEXT:  LBB12_1: ## %cond.load
3250; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3251; SSE2-NEXT:    pand %xmm1, %xmm0
3252; SSE2-NEXT:    movzbl (%rdi), %ecx
3253; SSE2-NEXT:    movd %ecx, %xmm2
3254; SSE2-NEXT:    pandn %xmm2, %xmm1
3255; SSE2-NEXT:    por %xmm1, %xmm0
3256; SSE2-NEXT:    incq %rdi
3257; SSE2-NEXT:    testb $2, %al
3258; SSE2-NEXT:    je LBB12_4
3259; SSE2-NEXT:  LBB12_3: ## %cond.load1
3260; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3261; SSE2-NEXT:    pand %xmm1, %xmm0
3262; SSE2-NEXT:    movzbl (%rdi), %ecx
3263; SSE2-NEXT:    movd %ecx, %xmm2
3264; SSE2-NEXT:    psllw $8, %xmm2
3265; SSE2-NEXT:    pandn %xmm2, %xmm1
3266; SSE2-NEXT:    por %xmm1, %xmm0
3267; SSE2-NEXT:    incq %rdi
3268; SSE2-NEXT:    testb $4, %al
3269; SSE2-NEXT:    je LBB12_6
3270; SSE2-NEXT:  LBB12_5: ## %cond.load5
3271; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
3272; SSE2-NEXT:    pand %xmm1, %xmm0
3273; SSE2-NEXT:    movzbl (%rdi), %ecx
3274; SSE2-NEXT:    movd %ecx, %xmm2
3275; SSE2-NEXT:    pslld $16, %xmm2
3276; SSE2-NEXT:    pandn %xmm2, %xmm1
3277; SSE2-NEXT:    por %xmm1, %xmm0
3278; SSE2-NEXT:    incq %rdi
3279; SSE2-NEXT:    testb $8, %al
3280; SSE2-NEXT:    je LBB12_8
3281; SSE2-NEXT:  LBB12_7: ## %cond.load9
3282; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
3283; SSE2-NEXT:    pand %xmm1, %xmm0
3284; SSE2-NEXT:    movzbl (%rdi), %ecx
3285; SSE2-NEXT:    movd %ecx, %xmm2
3286; SSE2-NEXT:    pslld $24, %xmm2
3287; SSE2-NEXT:    pandn %xmm2, %xmm1
3288; SSE2-NEXT:    por %xmm1, %xmm0
3289; SSE2-NEXT:    incq %rdi
3290; SSE2-NEXT:    testb $16, %al
3291; SSE2-NEXT:    je LBB12_10
3292; SSE2-NEXT:  LBB12_9: ## %cond.load13
3293; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
3294; SSE2-NEXT:    pand %xmm1, %xmm0
3295; SSE2-NEXT:    movzbl (%rdi), %ecx
3296; SSE2-NEXT:    movd %ecx, %xmm2
3297; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
3298; SSE2-NEXT:    pandn %xmm2, %xmm1
3299; SSE2-NEXT:    por %xmm1, %xmm0
3300; SSE2-NEXT:    incq %rdi
3301; SSE2-NEXT:    testb $32, %al
3302; SSE2-NEXT:    je LBB12_12
3303; SSE2-NEXT:  LBB12_11: ## %cond.load17
3304; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
3305; SSE2-NEXT:    pand %xmm1, %xmm0
3306; SSE2-NEXT:    movzbl (%rdi), %ecx
3307; SSE2-NEXT:    movd %ecx, %xmm2
3308; SSE2-NEXT:    psllq $40, %xmm2
3309; SSE2-NEXT:    pandn %xmm2, %xmm1
3310; SSE2-NEXT:    por %xmm1, %xmm0
3311; SSE2-NEXT:    incq %rdi
3312; SSE2-NEXT:    testb $64, %al
3313; SSE2-NEXT:    je LBB12_14
3314; SSE2-NEXT:  LBB12_13: ## %cond.load21
3315; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
3316; SSE2-NEXT:    pand %xmm1, %xmm0
3317; SSE2-NEXT:    movzbl (%rdi), %ecx
3318; SSE2-NEXT:    movd %ecx, %xmm2
3319; SSE2-NEXT:    psllq $48, %xmm2
3320; SSE2-NEXT:    pandn %xmm2, %xmm1
3321; SSE2-NEXT:    por %xmm1, %xmm0
3322; SSE2-NEXT:    incq %rdi
3323; SSE2-NEXT:    testb %al, %al
3324; SSE2-NEXT:    jns LBB12_16
3325; SSE2-NEXT:  LBB12_15: ## %cond.load25
3326; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
3327; SSE2-NEXT:    pand %xmm1, %xmm0
3328; SSE2-NEXT:    movzbl (%rdi), %ecx
3329; SSE2-NEXT:    movd %ecx, %xmm2
3330; SSE2-NEXT:    psllq $56, %xmm2
3331; SSE2-NEXT:    pandn %xmm2, %xmm1
3332; SSE2-NEXT:    por %xmm1, %xmm0
3333; SSE2-NEXT:    incq %rdi
3334; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
3335; SSE2-NEXT:    je LBB12_18
3336; SSE2-NEXT:  LBB12_17: ## %cond.load29
3337; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3338; SSE2-NEXT:    pand %xmm1, %xmm0
3339; SSE2-NEXT:    movzbl (%rdi), %ecx
3340; SSE2-NEXT:    movd %ecx, %xmm2
3341; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
3342; SSE2-NEXT:    pandn %xmm2, %xmm1
3343; SSE2-NEXT:    por %xmm1, %xmm0
3344; SSE2-NEXT:    incq %rdi
3345; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
3346; SSE2-NEXT:    je LBB12_20
3347; SSE2-NEXT:  LBB12_19: ## %cond.load33
3348; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
3349; SSE2-NEXT:    pand %xmm1, %xmm0
3350; SSE2-NEXT:    movzbl (%rdi), %ecx
3351; SSE2-NEXT:    movd %ecx, %xmm2
3352; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
3353; SSE2-NEXT:    pandn %xmm2, %xmm1
3354; SSE2-NEXT:    por %xmm1, %xmm0
3355; SSE2-NEXT:    incq %rdi
3356; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
3357; SSE2-NEXT:    je LBB12_22
3358; SSE2-NEXT:  LBB12_21: ## %cond.load37
3359; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
3360; SSE2-NEXT:    pand %xmm1, %xmm0
3361; SSE2-NEXT:    movzbl (%rdi), %ecx
3362; SSE2-NEXT:    movd %ecx, %xmm2
3363; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
3364; SSE2-NEXT:    pandn %xmm2, %xmm1
3365; SSE2-NEXT:    por %xmm1, %xmm0
3366; SSE2-NEXT:    incq %rdi
3367; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
3368; SSE2-NEXT:    je LBB12_24
3369; SSE2-NEXT:  LBB12_23: ## %cond.load41
3370; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
3371; SSE2-NEXT:    pand %xmm1, %xmm0
3372; SSE2-NEXT:    movzbl (%rdi), %ecx
3373; SSE2-NEXT:    movd %ecx, %xmm2
3374; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
3375; SSE2-NEXT:    pandn %xmm2, %xmm1
3376; SSE2-NEXT:    por %xmm1, %xmm0
3377; SSE2-NEXT:    incq %rdi
3378; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
3379; SSE2-NEXT:    je LBB12_26
3380; SSE2-NEXT:  LBB12_25: ## %cond.load45
3381; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3382; SSE2-NEXT:    pand %xmm1, %xmm0
3383; SSE2-NEXT:    movzbl (%rdi), %ecx
3384; SSE2-NEXT:    movd %ecx, %xmm2
3385; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
3386; SSE2-NEXT:    pandn %xmm2, %xmm1
3387; SSE2-NEXT:    por %xmm1, %xmm0
3388; SSE2-NEXT:    incq %rdi
3389; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
3390; SSE2-NEXT:    je LBB12_28
3391; SSE2-NEXT:  LBB12_27: ## %cond.load49
3392; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
3393; SSE2-NEXT:    pand %xmm1, %xmm0
3394; SSE2-NEXT:    movzbl (%rdi), %ecx
3395; SSE2-NEXT:    movd %ecx, %xmm2
3396; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3397; SSE2-NEXT:    pandn %xmm2, %xmm1
3398; SSE2-NEXT:    por %xmm1, %xmm0
3399; SSE2-NEXT:    incq %rdi
3400; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
3401; SSE2-NEXT:    je LBB12_30
3402; SSE2-NEXT:  LBB12_29: ## %cond.load53
3403; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
3404; SSE2-NEXT:    pand %xmm1, %xmm0
3405; SSE2-NEXT:    movzbl (%rdi), %ecx
3406; SSE2-NEXT:    movd %ecx, %xmm2
3407; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
3408; SSE2-NEXT:    pandn %xmm2, %xmm1
3409; SSE2-NEXT:    por %xmm1, %xmm0
3410; SSE2-NEXT:    incq %rdi
3411; SSE2-NEXT:    testl $32768, %eax ## imm = 0x8000
3412; SSE2-NEXT:    je LBB12_32
3413; SSE2-NEXT:  LBB12_31: ## %cond.load57
3414; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3415; SSE2-NEXT:    movzbl (%rdi), %eax
3416; SSE2-NEXT:    movd %eax, %xmm1
3417; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
3418; SSE2-NEXT:    por %xmm1, %xmm0
3419; SSE2-NEXT:    retq
3420;
3421; SSE42-LABEL: expandload_v16i8_v16i8:
3422; SSE42:       ## %bb.0:
3423; SSE42-NEXT:    pxor %xmm2, %xmm2
3424; SSE42-NEXT:    pcmpeqb %xmm1, %xmm2
3425; SSE42-NEXT:    pmovmskb %xmm2, %eax
3426; SSE42-NEXT:    testb $1, %al
3427; SSE42-NEXT:    jne LBB12_1
3428; SSE42-NEXT:  ## %bb.2: ## %else
3429; SSE42-NEXT:    testb $2, %al
3430; SSE42-NEXT:    jne LBB12_3
3431; SSE42-NEXT:  LBB12_4: ## %else2
3432; SSE42-NEXT:    testb $4, %al
3433; SSE42-NEXT:    jne LBB12_5
3434; SSE42-NEXT:  LBB12_6: ## %else6
3435; SSE42-NEXT:    testb $8, %al
3436; SSE42-NEXT:    jne LBB12_7
3437; SSE42-NEXT:  LBB12_8: ## %else10
3438; SSE42-NEXT:    testb $16, %al
3439; SSE42-NEXT:    jne LBB12_9
3440; SSE42-NEXT:  LBB12_10: ## %else14
3441; SSE42-NEXT:    testb $32, %al
3442; SSE42-NEXT:    jne LBB12_11
3443; SSE42-NEXT:  LBB12_12: ## %else18
3444; SSE42-NEXT:    testb $64, %al
3445; SSE42-NEXT:    jne LBB12_13
3446; SSE42-NEXT:  LBB12_14: ## %else22
3447; SSE42-NEXT:    testb %al, %al
3448; SSE42-NEXT:    js LBB12_15
3449; SSE42-NEXT:  LBB12_16: ## %else26
3450; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
3451; SSE42-NEXT:    jne LBB12_17
3452; SSE42-NEXT:  LBB12_18: ## %else30
3453; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
3454; SSE42-NEXT:    jne LBB12_19
3455; SSE42-NEXT:  LBB12_20: ## %else34
3456; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
3457; SSE42-NEXT:    jne LBB12_21
3458; SSE42-NEXT:  LBB12_22: ## %else38
3459; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
3460; SSE42-NEXT:    jne LBB12_23
3461; SSE42-NEXT:  LBB12_24: ## %else42
3462; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
3463; SSE42-NEXT:    jne LBB12_25
3464; SSE42-NEXT:  LBB12_26: ## %else46
3465; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
3466; SSE42-NEXT:    jne LBB12_27
3467; SSE42-NEXT:  LBB12_28: ## %else50
3468; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
3469; SSE42-NEXT:    jne LBB12_29
3470; SSE42-NEXT:  LBB12_30: ## %else54
3471; SSE42-NEXT:    testl $32768, %eax ## imm = 0x8000
3472; SSE42-NEXT:    jne LBB12_31
3473; SSE42-NEXT:  LBB12_32: ## %else58
3474; SSE42-NEXT:    retq
3475; SSE42-NEXT:  LBB12_1: ## %cond.load
3476; SSE42-NEXT:    pinsrb $0, (%rdi), %xmm0
3477; SSE42-NEXT:    incq %rdi
3478; SSE42-NEXT:    testb $2, %al
3479; SSE42-NEXT:    je LBB12_4
3480; SSE42-NEXT:  LBB12_3: ## %cond.load1
3481; SSE42-NEXT:    pinsrb $1, (%rdi), %xmm0
3482; SSE42-NEXT:    incq %rdi
3483; SSE42-NEXT:    testb $4, %al
3484; SSE42-NEXT:    je LBB12_6
3485; SSE42-NEXT:  LBB12_5: ## %cond.load5
3486; SSE42-NEXT:    pinsrb $2, (%rdi), %xmm0
3487; SSE42-NEXT:    incq %rdi
3488; SSE42-NEXT:    testb $8, %al
3489; SSE42-NEXT:    je LBB12_8
3490; SSE42-NEXT:  LBB12_7: ## %cond.load9
3491; SSE42-NEXT:    pinsrb $3, (%rdi), %xmm0
3492; SSE42-NEXT:    incq %rdi
3493; SSE42-NEXT:    testb $16, %al
3494; SSE42-NEXT:    je LBB12_10
3495; SSE42-NEXT:  LBB12_9: ## %cond.load13
3496; SSE42-NEXT:    pinsrb $4, (%rdi), %xmm0
3497; SSE42-NEXT:    incq %rdi
3498; SSE42-NEXT:    testb $32, %al
3499; SSE42-NEXT:    je LBB12_12
3500; SSE42-NEXT:  LBB12_11: ## %cond.load17
3501; SSE42-NEXT:    pinsrb $5, (%rdi), %xmm0
3502; SSE42-NEXT:    incq %rdi
3503; SSE42-NEXT:    testb $64, %al
3504; SSE42-NEXT:    je LBB12_14
3505; SSE42-NEXT:  LBB12_13: ## %cond.load21
3506; SSE42-NEXT:    pinsrb $6, (%rdi), %xmm0
3507; SSE42-NEXT:    incq %rdi
3508; SSE42-NEXT:    testb %al, %al
3509; SSE42-NEXT:    jns LBB12_16
3510; SSE42-NEXT:  LBB12_15: ## %cond.load25
3511; SSE42-NEXT:    pinsrb $7, (%rdi), %xmm0
3512; SSE42-NEXT:    incq %rdi
3513; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
3514; SSE42-NEXT:    je LBB12_18
3515; SSE42-NEXT:  LBB12_17: ## %cond.load29
3516; SSE42-NEXT:    pinsrb $8, (%rdi), %xmm0
3517; SSE42-NEXT:    incq %rdi
3518; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
3519; SSE42-NEXT:    je LBB12_20
3520; SSE42-NEXT:  LBB12_19: ## %cond.load33
3521; SSE42-NEXT:    pinsrb $9, (%rdi), %xmm0
3522; SSE42-NEXT:    incq %rdi
3523; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
3524; SSE42-NEXT:    je LBB12_22
3525; SSE42-NEXT:  LBB12_21: ## %cond.load37
3526; SSE42-NEXT:    pinsrb $10, (%rdi), %xmm0
3527; SSE42-NEXT:    incq %rdi
3528; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
3529; SSE42-NEXT:    je LBB12_24
3530; SSE42-NEXT:  LBB12_23: ## %cond.load41
3531; SSE42-NEXT:    pinsrb $11, (%rdi), %xmm0
3532; SSE42-NEXT:    incq %rdi
3533; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
3534; SSE42-NEXT:    je LBB12_26
3535; SSE42-NEXT:  LBB12_25: ## %cond.load45
3536; SSE42-NEXT:    pinsrb $12, (%rdi), %xmm0
3537; SSE42-NEXT:    incq %rdi
3538; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
3539; SSE42-NEXT:    je LBB12_28
3540; SSE42-NEXT:  LBB12_27: ## %cond.load49
3541; SSE42-NEXT:    pinsrb $13, (%rdi), %xmm0
3542; SSE42-NEXT:    incq %rdi
3543; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
3544; SSE42-NEXT:    je LBB12_30
3545; SSE42-NEXT:  LBB12_29: ## %cond.load53
3546; SSE42-NEXT:    pinsrb $14, (%rdi), %xmm0
3547; SSE42-NEXT:    incq %rdi
3548; SSE42-NEXT:    testl $32768, %eax ## imm = 0x8000
3549; SSE42-NEXT:    je LBB12_32
3550; SSE42-NEXT:  LBB12_31: ## %cond.load57
3551; SSE42-NEXT:    pinsrb $15, (%rdi), %xmm0
3552; SSE42-NEXT:    retq
3553;
3554; AVX1OR2-LABEL: expandload_v16i8_v16i8:
3555; AVX1OR2:       ## %bb.0:
3556; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3557; AVX1OR2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
3558; AVX1OR2-NEXT:    vpmovmskb %xmm1, %eax
3559; AVX1OR2-NEXT:    testb $1, %al
3560; AVX1OR2-NEXT:    jne LBB12_1
3561; AVX1OR2-NEXT:  ## %bb.2: ## %else
3562; AVX1OR2-NEXT:    testb $2, %al
3563; AVX1OR2-NEXT:    jne LBB12_3
3564; AVX1OR2-NEXT:  LBB12_4: ## %else2
3565; AVX1OR2-NEXT:    testb $4, %al
3566; AVX1OR2-NEXT:    jne LBB12_5
3567; AVX1OR2-NEXT:  LBB12_6: ## %else6
3568; AVX1OR2-NEXT:    testb $8, %al
3569; AVX1OR2-NEXT:    jne LBB12_7
3570; AVX1OR2-NEXT:  LBB12_8: ## %else10
3571; AVX1OR2-NEXT:    testb $16, %al
3572; AVX1OR2-NEXT:    jne LBB12_9
3573; AVX1OR2-NEXT:  LBB12_10: ## %else14
3574; AVX1OR2-NEXT:    testb $32, %al
3575; AVX1OR2-NEXT:    jne LBB12_11
3576; AVX1OR2-NEXT:  LBB12_12: ## %else18
3577; AVX1OR2-NEXT:    testb $64, %al
3578; AVX1OR2-NEXT:    jne LBB12_13
3579; AVX1OR2-NEXT:  LBB12_14: ## %else22
3580; AVX1OR2-NEXT:    testb %al, %al
3581; AVX1OR2-NEXT:    js LBB12_15
3582; AVX1OR2-NEXT:  LBB12_16: ## %else26
3583; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
3584; AVX1OR2-NEXT:    jne LBB12_17
3585; AVX1OR2-NEXT:  LBB12_18: ## %else30
3586; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
3587; AVX1OR2-NEXT:    jne LBB12_19
3588; AVX1OR2-NEXT:  LBB12_20: ## %else34
3589; AVX1OR2-NEXT:    testl $1024, %eax ## imm = 0x400
3590; AVX1OR2-NEXT:    jne LBB12_21
3591; AVX1OR2-NEXT:  LBB12_22: ## %else38
3592; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
3593; AVX1OR2-NEXT:    jne LBB12_23
3594; AVX1OR2-NEXT:  LBB12_24: ## %else42
3595; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
3596; AVX1OR2-NEXT:    jne LBB12_25
3597; AVX1OR2-NEXT:  LBB12_26: ## %else46
3598; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
3599; AVX1OR2-NEXT:    jne LBB12_27
3600; AVX1OR2-NEXT:  LBB12_28: ## %else50
3601; AVX1OR2-NEXT:    testl $16384, %eax ## imm = 0x4000
3602; AVX1OR2-NEXT:    jne LBB12_29
3603; AVX1OR2-NEXT:  LBB12_30: ## %else54
3604; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
3605; AVX1OR2-NEXT:    jne LBB12_31
3606; AVX1OR2-NEXT:  LBB12_32: ## %else58
3607; AVX1OR2-NEXT:    retq
3608; AVX1OR2-NEXT:  LBB12_1: ## %cond.load
3609; AVX1OR2-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
3610; AVX1OR2-NEXT:    incq %rdi
3611; AVX1OR2-NEXT:    testb $2, %al
3612; AVX1OR2-NEXT:    je LBB12_4
3613; AVX1OR2-NEXT:  LBB12_3: ## %cond.load1
3614; AVX1OR2-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
3615; AVX1OR2-NEXT:    incq %rdi
3616; AVX1OR2-NEXT:    testb $4, %al
3617; AVX1OR2-NEXT:    je LBB12_6
3618; AVX1OR2-NEXT:  LBB12_5: ## %cond.load5
3619; AVX1OR2-NEXT:    vpinsrb $2, (%rdi), %xmm0, %xmm0
3620; AVX1OR2-NEXT:    incq %rdi
3621; AVX1OR2-NEXT:    testb $8, %al
3622; AVX1OR2-NEXT:    je LBB12_8
3623; AVX1OR2-NEXT:  LBB12_7: ## %cond.load9
3624; AVX1OR2-NEXT:    vpinsrb $3, (%rdi), %xmm0, %xmm0
3625; AVX1OR2-NEXT:    incq %rdi
3626; AVX1OR2-NEXT:    testb $16, %al
3627; AVX1OR2-NEXT:    je LBB12_10
3628; AVX1OR2-NEXT:  LBB12_9: ## %cond.load13
3629; AVX1OR2-NEXT:    vpinsrb $4, (%rdi), %xmm0, %xmm0
3630; AVX1OR2-NEXT:    incq %rdi
3631; AVX1OR2-NEXT:    testb $32, %al
3632; AVX1OR2-NEXT:    je LBB12_12
3633; AVX1OR2-NEXT:  LBB12_11: ## %cond.load17
3634; AVX1OR2-NEXT:    vpinsrb $5, (%rdi), %xmm0, %xmm0
3635; AVX1OR2-NEXT:    incq %rdi
3636; AVX1OR2-NEXT:    testb $64, %al
3637; AVX1OR2-NEXT:    je LBB12_14
3638; AVX1OR2-NEXT:  LBB12_13: ## %cond.load21
3639; AVX1OR2-NEXT:    vpinsrb $6, (%rdi), %xmm0, %xmm0
3640; AVX1OR2-NEXT:    incq %rdi
3641; AVX1OR2-NEXT:    testb %al, %al
3642; AVX1OR2-NEXT:    jns LBB12_16
3643; AVX1OR2-NEXT:  LBB12_15: ## %cond.load25
3644; AVX1OR2-NEXT:    vpinsrb $7, (%rdi), %xmm0, %xmm0
3645; AVX1OR2-NEXT:    incq %rdi
3646; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
3647; AVX1OR2-NEXT:    je LBB12_18
3648; AVX1OR2-NEXT:  LBB12_17: ## %cond.load29
3649; AVX1OR2-NEXT:    vpinsrb $8, (%rdi), %xmm0, %xmm0
3650; AVX1OR2-NEXT:    incq %rdi
3651; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
3652; AVX1OR2-NEXT:    je LBB12_20
3653; AVX1OR2-NEXT:  LBB12_19: ## %cond.load33
3654; AVX1OR2-NEXT:    vpinsrb $9, (%rdi), %xmm0, %xmm0
3655; AVX1OR2-NEXT:    incq %rdi
3656; AVX1OR2-NEXT:    testl $1024, %eax ## imm = 0x400
3657; AVX1OR2-NEXT:    je LBB12_22
3658; AVX1OR2-NEXT:  LBB12_21: ## %cond.load37
3659; AVX1OR2-NEXT:    vpinsrb $10, (%rdi), %xmm0, %xmm0
3660; AVX1OR2-NEXT:    incq %rdi
3661; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
3662; AVX1OR2-NEXT:    je LBB12_24
3663; AVX1OR2-NEXT:  LBB12_23: ## %cond.load41
3664; AVX1OR2-NEXT:    vpinsrb $11, (%rdi), %xmm0, %xmm0
3665; AVX1OR2-NEXT:    incq %rdi
3666; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
3667; AVX1OR2-NEXT:    je LBB12_26
3668; AVX1OR2-NEXT:  LBB12_25: ## %cond.load45
3669; AVX1OR2-NEXT:    vpinsrb $12, (%rdi), %xmm0, %xmm0
3670; AVX1OR2-NEXT:    incq %rdi
3671; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
3672; AVX1OR2-NEXT:    je LBB12_28
3673; AVX1OR2-NEXT:  LBB12_27: ## %cond.load49
3674; AVX1OR2-NEXT:    vpinsrb $13, (%rdi), %xmm0, %xmm0
3675; AVX1OR2-NEXT:    incq %rdi
3676; AVX1OR2-NEXT:    testl $16384, %eax ## imm = 0x4000
3677; AVX1OR2-NEXT:    je LBB12_30
3678; AVX1OR2-NEXT:  LBB12_29: ## %cond.load53
3679; AVX1OR2-NEXT:    vpinsrb $14, (%rdi), %xmm0, %xmm0
3680; AVX1OR2-NEXT:    incq %rdi
3681; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
3682; AVX1OR2-NEXT:    je LBB12_32
3683; AVX1OR2-NEXT:  LBB12_31: ## %cond.load57
3684; AVX1OR2-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
3685; AVX1OR2-NEXT:    retq
3686;
3687; AVX512F-LABEL: expandload_v16i8_v16i8:
3688; AVX512F:       ## %bb.0:
3689; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3690; AVX512F-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
3691; AVX512F-NEXT:    vpmovmskb %xmm1, %eax
3692; AVX512F-NEXT:    testb $1, %al
3693; AVX512F-NEXT:    jne LBB12_1
3694; AVX512F-NEXT:  ## %bb.2: ## %else
3695; AVX512F-NEXT:    testb $2, %al
3696; AVX512F-NEXT:    jne LBB12_3
3697; AVX512F-NEXT:  LBB12_4: ## %else2
3698; AVX512F-NEXT:    testb $4, %al
3699; AVX512F-NEXT:    jne LBB12_5
3700; AVX512F-NEXT:  LBB12_6: ## %else6
3701; AVX512F-NEXT:    testb $8, %al
3702; AVX512F-NEXT:    jne LBB12_7
3703; AVX512F-NEXT:  LBB12_8: ## %else10
3704; AVX512F-NEXT:    testb $16, %al
3705; AVX512F-NEXT:    jne LBB12_9
3706; AVX512F-NEXT:  LBB12_10: ## %else14
3707; AVX512F-NEXT:    testb $32, %al
3708; AVX512F-NEXT:    jne LBB12_11
3709; AVX512F-NEXT:  LBB12_12: ## %else18
3710; AVX512F-NEXT:    testb $64, %al
3711; AVX512F-NEXT:    jne LBB12_13
3712; AVX512F-NEXT:  LBB12_14: ## %else22
3713; AVX512F-NEXT:    testb %al, %al
3714; AVX512F-NEXT:    js LBB12_15
3715; AVX512F-NEXT:  LBB12_16: ## %else26
3716; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
3717; AVX512F-NEXT:    jne LBB12_17
3718; AVX512F-NEXT:  LBB12_18: ## %else30
3719; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
3720; AVX512F-NEXT:    jne LBB12_19
3721; AVX512F-NEXT:  LBB12_20: ## %else34
3722; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
3723; AVX512F-NEXT:    jne LBB12_21
3724; AVX512F-NEXT:  LBB12_22: ## %else38
3725; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
3726; AVX512F-NEXT:    jne LBB12_23
3727; AVX512F-NEXT:  LBB12_24: ## %else42
3728; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
3729; AVX512F-NEXT:    jne LBB12_25
3730; AVX512F-NEXT:  LBB12_26: ## %else46
3731; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
3732; AVX512F-NEXT:    jne LBB12_27
3733; AVX512F-NEXT:  LBB12_28: ## %else50
3734; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
3735; AVX512F-NEXT:    jne LBB12_29
3736; AVX512F-NEXT:  LBB12_30: ## %else54
3737; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
3738; AVX512F-NEXT:    jne LBB12_31
3739; AVX512F-NEXT:  LBB12_32: ## %else58
3740; AVX512F-NEXT:    retq
3741; AVX512F-NEXT:  LBB12_1: ## %cond.load
3742; AVX512F-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
3743; AVX512F-NEXT:    incq %rdi
3744; AVX512F-NEXT:    testb $2, %al
3745; AVX512F-NEXT:    je LBB12_4
3746; AVX512F-NEXT:  LBB12_3: ## %cond.load1
3747; AVX512F-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
3748; AVX512F-NEXT:    incq %rdi
3749; AVX512F-NEXT:    testb $4, %al
3750; AVX512F-NEXT:    je LBB12_6
3751; AVX512F-NEXT:  LBB12_5: ## %cond.load5
3752; AVX512F-NEXT:    vpinsrb $2, (%rdi), %xmm0, %xmm0
3753; AVX512F-NEXT:    incq %rdi
3754; AVX512F-NEXT:    testb $8, %al
3755; AVX512F-NEXT:    je LBB12_8
3756; AVX512F-NEXT:  LBB12_7: ## %cond.load9
3757; AVX512F-NEXT:    vpinsrb $3, (%rdi), %xmm0, %xmm0
3758; AVX512F-NEXT:    incq %rdi
3759; AVX512F-NEXT:    testb $16, %al
3760; AVX512F-NEXT:    je LBB12_10
3761; AVX512F-NEXT:  LBB12_9: ## %cond.load13
3762; AVX512F-NEXT:    vpinsrb $4, (%rdi), %xmm0, %xmm0
3763; AVX512F-NEXT:    incq %rdi
3764; AVX512F-NEXT:    testb $32, %al
3765; AVX512F-NEXT:    je LBB12_12
3766; AVX512F-NEXT:  LBB12_11: ## %cond.load17
3767; AVX512F-NEXT:    vpinsrb $5, (%rdi), %xmm0, %xmm0
3768; AVX512F-NEXT:    incq %rdi
3769; AVX512F-NEXT:    testb $64, %al
3770; AVX512F-NEXT:    je LBB12_14
3771; AVX512F-NEXT:  LBB12_13: ## %cond.load21
3772; AVX512F-NEXT:    vpinsrb $6, (%rdi), %xmm0, %xmm0
3773; AVX512F-NEXT:    incq %rdi
3774; AVX512F-NEXT:    testb %al, %al
3775; AVX512F-NEXT:    jns LBB12_16
3776; AVX512F-NEXT:  LBB12_15: ## %cond.load25
3777; AVX512F-NEXT:    vpinsrb $7, (%rdi), %xmm0, %xmm0
3778; AVX512F-NEXT:    incq %rdi
3779; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
3780; AVX512F-NEXT:    je LBB12_18
3781; AVX512F-NEXT:  LBB12_17: ## %cond.load29
3782; AVX512F-NEXT:    vpinsrb $8, (%rdi), %xmm0, %xmm0
3783; AVX512F-NEXT:    incq %rdi
3784; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
3785; AVX512F-NEXT:    je LBB12_20
3786; AVX512F-NEXT:  LBB12_19: ## %cond.load33
3787; AVX512F-NEXT:    vpinsrb $9, (%rdi), %xmm0, %xmm0
3788; AVX512F-NEXT:    incq %rdi
3789; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
3790; AVX512F-NEXT:    je LBB12_22
3791; AVX512F-NEXT:  LBB12_21: ## %cond.load37
3792; AVX512F-NEXT:    vpinsrb $10, (%rdi), %xmm0, %xmm0
3793; AVX512F-NEXT:    incq %rdi
3794; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
3795; AVX512F-NEXT:    je LBB12_24
3796; AVX512F-NEXT:  LBB12_23: ## %cond.load41
3797; AVX512F-NEXT:    vpinsrb $11, (%rdi), %xmm0, %xmm0
3798; AVX512F-NEXT:    incq %rdi
3799; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
3800; AVX512F-NEXT:    je LBB12_26
3801; AVX512F-NEXT:  LBB12_25: ## %cond.load45
3802; AVX512F-NEXT:    vpinsrb $12, (%rdi), %xmm0, %xmm0
3803; AVX512F-NEXT:    incq %rdi
3804; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
3805; AVX512F-NEXT:    je LBB12_28
3806; AVX512F-NEXT:  LBB12_27: ## %cond.load49
3807; AVX512F-NEXT:    vpinsrb $13, (%rdi), %xmm0, %xmm0
3808; AVX512F-NEXT:    incq %rdi
3809; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
3810; AVX512F-NEXT:    je LBB12_30
3811; AVX512F-NEXT:  LBB12_29: ## %cond.load53
3812; AVX512F-NEXT:    vpinsrb $14, (%rdi), %xmm0, %xmm0
3813; AVX512F-NEXT:    incq %rdi
3814; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
3815; AVX512F-NEXT:    je LBB12_32
3816; AVX512F-NEXT:  LBB12_31: ## %cond.load57
3817; AVX512F-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
3818; AVX512F-NEXT:    retq
3819;
3820; AVX512VLDQ-LABEL: expandload_v16i8_v16i8:
3821; AVX512VLDQ:       ## %bb.0:
3822; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3823; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
3824; AVX512VLDQ-NEXT:    vpmovmskb %xmm1, %eax
3825; AVX512VLDQ-NEXT:    testb $1, %al
3826; AVX512VLDQ-NEXT:    jne LBB12_1
3827; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
3828; AVX512VLDQ-NEXT:    testb $2, %al
3829; AVX512VLDQ-NEXT:    jne LBB12_3
3830; AVX512VLDQ-NEXT:  LBB12_4: ## %else2
3831; AVX512VLDQ-NEXT:    testb $4, %al
3832; AVX512VLDQ-NEXT:    jne LBB12_5
3833; AVX512VLDQ-NEXT:  LBB12_6: ## %else6
3834; AVX512VLDQ-NEXT:    testb $8, %al
3835; AVX512VLDQ-NEXT:    jne LBB12_7
3836; AVX512VLDQ-NEXT:  LBB12_8: ## %else10
3837; AVX512VLDQ-NEXT:    testb $16, %al
3838; AVX512VLDQ-NEXT:    jne LBB12_9
3839; AVX512VLDQ-NEXT:  LBB12_10: ## %else14
3840; AVX512VLDQ-NEXT:    testb $32, %al
3841; AVX512VLDQ-NEXT:    jne LBB12_11
3842; AVX512VLDQ-NEXT:  LBB12_12: ## %else18
3843; AVX512VLDQ-NEXT:    testb $64, %al
3844; AVX512VLDQ-NEXT:    jne LBB12_13
3845; AVX512VLDQ-NEXT:  LBB12_14: ## %else22
3846; AVX512VLDQ-NEXT:    testb %al, %al
3847; AVX512VLDQ-NEXT:    js LBB12_15
3848; AVX512VLDQ-NEXT:  LBB12_16: ## %else26
3849; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
3850; AVX512VLDQ-NEXT:    jne LBB12_17
3851; AVX512VLDQ-NEXT:  LBB12_18: ## %else30
3852; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
3853; AVX512VLDQ-NEXT:    jne LBB12_19
3854; AVX512VLDQ-NEXT:  LBB12_20: ## %else34
3855; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
3856; AVX512VLDQ-NEXT:    jne LBB12_21
3857; AVX512VLDQ-NEXT:  LBB12_22: ## %else38
3858; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
3859; AVX512VLDQ-NEXT:    jne LBB12_23
3860; AVX512VLDQ-NEXT:  LBB12_24: ## %else42
3861; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
3862; AVX512VLDQ-NEXT:    jne LBB12_25
3863; AVX512VLDQ-NEXT:  LBB12_26: ## %else46
3864; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
3865; AVX512VLDQ-NEXT:    jne LBB12_27
3866; AVX512VLDQ-NEXT:  LBB12_28: ## %else50
3867; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
3868; AVX512VLDQ-NEXT:    jne LBB12_29
3869; AVX512VLDQ-NEXT:  LBB12_30: ## %else54
3870; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
3871; AVX512VLDQ-NEXT:    jne LBB12_31
3872; AVX512VLDQ-NEXT:  LBB12_32: ## %else58
3873; AVX512VLDQ-NEXT:    retq
3874; AVX512VLDQ-NEXT:  LBB12_1: ## %cond.load
3875; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
3876; AVX512VLDQ-NEXT:    incq %rdi
3877; AVX512VLDQ-NEXT:    testb $2, %al
3878; AVX512VLDQ-NEXT:    je LBB12_4
3879; AVX512VLDQ-NEXT:  LBB12_3: ## %cond.load1
3880; AVX512VLDQ-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
3881; AVX512VLDQ-NEXT:    incq %rdi
3882; AVX512VLDQ-NEXT:    testb $4, %al
3883; AVX512VLDQ-NEXT:    je LBB12_6
3884; AVX512VLDQ-NEXT:  LBB12_5: ## %cond.load5
3885; AVX512VLDQ-NEXT:    vpinsrb $2, (%rdi), %xmm0, %xmm0
3886; AVX512VLDQ-NEXT:    incq %rdi
3887; AVX512VLDQ-NEXT:    testb $8, %al
3888; AVX512VLDQ-NEXT:    je LBB12_8
3889; AVX512VLDQ-NEXT:  LBB12_7: ## %cond.load9
3890; AVX512VLDQ-NEXT:    vpinsrb $3, (%rdi), %xmm0, %xmm0
3891; AVX512VLDQ-NEXT:    incq %rdi
3892; AVX512VLDQ-NEXT:    testb $16, %al
3893; AVX512VLDQ-NEXT:    je LBB12_10
3894; AVX512VLDQ-NEXT:  LBB12_9: ## %cond.load13
3895; AVX512VLDQ-NEXT:    vpinsrb $4, (%rdi), %xmm0, %xmm0
3896; AVX512VLDQ-NEXT:    incq %rdi
3897; AVX512VLDQ-NEXT:    testb $32, %al
3898; AVX512VLDQ-NEXT:    je LBB12_12
3899; AVX512VLDQ-NEXT:  LBB12_11: ## %cond.load17
3900; AVX512VLDQ-NEXT:    vpinsrb $5, (%rdi), %xmm0, %xmm0
3901; AVX512VLDQ-NEXT:    incq %rdi
3902; AVX512VLDQ-NEXT:    testb $64, %al
3903; AVX512VLDQ-NEXT:    je LBB12_14
3904; AVX512VLDQ-NEXT:  LBB12_13: ## %cond.load21
3905; AVX512VLDQ-NEXT:    vpinsrb $6, (%rdi), %xmm0, %xmm0
3906; AVX512VLDQ-NEXT:    incq %rdi
3907; AVX512VLDQ-NEXT:    testb %al, %al
3908; AVX512VLDQ-NEXT:    jns LBB12_16
3909; AVX512VLDQ-NEXT:  LBB12_15: ## %cond.load25
3910; AVX512VLDQ-NEXT:    vpinsrb $7, (%rdi), %xmm0, %xmm0
3911; AVX512VLDQ-NEXT:    incq %rdi
3912; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
3913; AVX512VLDQ-NEXT:    je LBB12_18
3914; AVX512VLDQ-NEXT:  LBB12_17: ## %cond.load29
3915; AVX512VLDQ-NEXT:    vpinsrb $8, (%rdi), %xmm0, %xmm0
3916; AVX512VLDQ-NEXT:    incq %rdi
3917; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
3918; AVX512VLDQ-NEXT:    je LBB12_20
3919; AVX512VLDQ-NEXT:  LBB12_19: ## %cond.load33
3920; AVX512VLDQ-NEXT:    vpinsrb $9, (%rdi), %xmm0, %xmm0
3921; AVX512VLDQ-NEXT:    incq %rdi
3922; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
3923; AVX512VLDQ-NEXT:    je LBB12_22
3924; AVX512VLDQ-NEXT:  LBB12_21: ## %cond.load37
3925; AVX512VLDQ-NEXT:    vpinsrb $10, (%rdi), %xmm0, %xmm0
3926; AVX512VLDQ-NEXT:    incq %rdi
3927; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
3928; AVX512VLDQ-NEXT:    je LBB12_24
3929; AVX512VLDQ-NEXT:  LBB12_23: ## %cond.load41
3930; AVX512VLDQ-NEXT:    vpinsrb $11, (%rdi), %xmm0, %xmm0
3931; AVX512VLDQ-NEXT:    incq %rdi
3932; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
3933; AVX512VLDQ-NEXT:    je LBB12_26
3934; AVX512VLDQ-NEXT:  LBB12_25: ## %cond.load45
3935; AVX512VLDQ-NEXT:    vpinsrb $12, (%rdi), %xmm0, %xmm0
3936; AVX512VLDQ-NEXT:    incq %rdi
3937; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
3938; AVX512VLDQ-NEXT:    je LBB12_28
3939; AVX512VLDQ-NEXT:  LBB12_27: ## %cond.load49
3940; AVX512VLDQ-NEXT:    vpinsrb $13, (%rdi), %xmm0, %xmm0
3941; AVX512VLDQ-NEXT:    incq %rdi
3942; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
3943; AVX512VLDQ-NEXT:    je LBB12_30
3944; AVX512VLDQ-NEXT:  LBB12_29: ## %cond.load53
3945; AVX512VLDQ-NEXT:    vpinsrb $14, (%rdi), %xmm0, %xmm0
3946; AVX512VLDQ-NEXT:    incq %rdi
3947; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
3948; AVX512VLDQ-NEXT:    je LBB12_32
3949; AVX512VLDQ-NEXT:  LBB12_31: ## %cond.load57
3950; AVX512VLDQ-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
3951; AVX512VLDQ-NEXT:    retq
3952;
3953; AVX512VLBW-LABEL: expandload_v16i8_v16i8:
3954; AVX512VLBW:       ## %bb.0:
3955; AVX512VLBW-NEXT:    vptestnmb %xmm1, %xmm1, %k0
3956; AVX512VLBW-NEXT:    kmovd %k0, %eax
3957; AVX512VLBW-NEXT:    testb $1, %al
3958; AVX512VLBW-NEXT:    jne LBB12_1
3959; AVX512VLBW-NEXT:  ## %bb.2: ## %else
3960; AVX512VLBW-NEXT:    testb $2, %al
3961; AVX512VLBW-NEXT:    jne LBB12_3
3962; AVX512VLBW-NEXT:  LBB12_4: ## %else2
3963; AVX512VLBW-NEXT:    testb $4, %al
3964; AVX512VLBW-NEXT:    jne LBB12_5
3965; AVX512VLBW-NEXT:  LBB12_6: ## %else6
3966; AVX512VLBW-NEXT:    testb $8, %al
3967; AVX512VLBW-NEXT:    jne LBB12_7
3968; AVX512VLBW-NEXT:  LBB12_8: ## %else10
3969; AVX512VLBW-NEXT:    testb $16, %al
3970; AVX512VLBW-NEXT:    jne LBB12_9
3971; AVX512VLBW-NEXT:  LBB12_10: ## %else14
3972; AVX512VLBW-NEXT:    testb $32, %al
3973; AVX512VLBW-NEXT:    jne LBB12_11
3974; AVX512VLBW-NEXT:  LBB12_12: ## %else18
3975; AVX512VLBW-NEXT:    testb $64, %al
3976; AVX512VLBW-NEXT:    jne LBB12_13
3977; AVX512VLBW-NEXT:  LBB12_14: ## %else22
3978; AVX512VLBW-NEXT:    testb %al, %al
3979; AVX512VLBW-NEXT:    js LBB12_15
3980; AVX512VLBW-NEXT:  LBB12_16: ## %else26
3981; AVX512VLBW-NEXT:    testl $256, %eax ## imm = 0x100
3982; AVX512VLBW-NEXT:    jne LBB12_17
3983; AVX512VLBW-NEXT:  LBB12_18: ## %else30
3984; AVX512VLBW-NEXT:    testl $512, %eax ## imm = 0x200
3985; AVX512VLBW-NEXT:    jne LBB12_19
3986; AVX512VLBW-NEXT:  LBB12_20: ## %else34
3987; AVX512VLBW-NEXT:    testl $1024, %eax ## imm = 0x400
3988; AVX512VLBW-NEXT:    jne LBB12_21
3989; AVX512VLBW-NEXT:  LBB12_22: ## %else38
3990; AVX512VLBW-NEXT:    testl $2048, %eax ## imm = 0x800
3991; AVX512VLBW-NEXT:    jne LBB12_23
3992; AVX512VLBW-NEXT:  LBB12_24: ## %else42
3993; AVX512VLBW-NEXT:    testl $4096, %eax ## imm = 0x1000
3994; AVX512VLBW-NEXT:    jne LBB12_25
3995; AVX512VLBW-NEXT:  LBB12_26: ## %else46
3996; AVX512VLBW-NEXT:    testl $8192, %eax ## imm = 0x2000
3997; AVX512VLBW-NEXT:    jne LBB12_27
3998; AVX512VLBW-NEXT:  LBB12_28: ## %else50
3999; AVX512VLBW-NEXT:    testl $16384, %eax ## imm = 0x4000
4000; AVX512VLBW-NEXT:    jne LBB12_29
4001; AVX512VLBW-NEXT:  LBB12_30: ## %else54
4002; AVX512VLBW-NEXT:    testl $32768, %eax ## imm = 0x8000
4003; AVX512VLBW-NEXT:    jne LBB12_31
4004; AVX512VLBW-NEXT:  LBB12_32: ## %else58
4005; AVX512VLBW-NEXT:    retq
4006; AVX512VLBW-NEXT:  LBB12_1: ## %cond.load
4007; AVX512VLBW-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
4008; AVX512VLBW-NEXT:    incq %rdi
4009; AVX512VLBW-NEXT:    testb $2, %al
4010; AVX512VLBW-NEXT:    je LBB12_4
4011; AVX512VLBW-NEXT:  LBB12_3: ## %cond.load1
4012; AVX512VLBW-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
4013; AVX512VLBW-NEXT:    incq %rdi
4014; AVX512VLBW-NEXT:    testb $4, %al
4015; AVX512VLBW-NEXT:    je LBB12_6
4016; AVX512VLBW-NEXT:  LBB12_5: ## %cond.load5
4017; AVX512VLBW-NEXT:    vpinsrb $2, (%rdi), %xmm0, %xmm0
4018; AVX512VLBW-NEXT:    incq %rdi
4019; AVX512VLBW-NEXT:    testb $8, %al
4020; AVX512VLBW-NEXT:    je LBB12_8
4021; AVX512VLBW-NEXT:  LBB12_7: ## %cond.load9
4022; AVX512VLBW-NEXT:    vpinsrb $3, (%rdi), %xmm0, %xmm0
4023; AVX512VLBW-NEXT:    incq %rdi
4024; AVX512VLBW-NEXT:    testb $16, %al
4025; AVX512VLBW-NEXT:    je LBB12_10
4026; AVX512VLBW-NEXT:  LBB12_9: ## %cond.load13
4027; AVX512VLBW-NEXT:    vpinsrb $4, (%rdi), %xmm0, %xmm0
4028; AVX512VLBW-NEXT:    incq %rdi
4029; AVX512VLBW-NEXT:    testb $32, %al
4030; AVX512VLBW-NEXT:    je LBB12_12
4031; AVX512VLBW-NEXT:  LBB12_11: ## %cond.load17
4032; AVX512VLBW-NEXT:    vpinsrb $5, (%rdi), %xmm0, %xmm0
4033; AVX512VLBW-NEXT:    incq %rdi
4034; AVX512VLBW-NEXT:    testb $64, %al
4035; AVX512VLBW-NEXT:    je LBB12_14
4036; AVX512VLBW-NEXT:  LBB12_13: ## %cond.load21
4037; AVX512VLBW-NEXT:    vpinsrb $6, (%rdi), %xmm0, %xmm0
4038; AVX512VLBW-NEXT:    incq %rdi
4039; AVX512VLBW-NEXT:    testb %al, %al
4040; AVX512VLBW-NEXT:    jns LBB12_16
4041; AVX512VLBW-NEXT:  LBB12_15: ## %cond.load25
4042; AVX512VLBW-NEXT:    vpinsrb $7, (%rdi), %xmm0, %xmm0
4043; AVX512VLBW-NEXT:    incq %rdi
4044; AVX512VLBW-NEXT:    testl $256, %eax ## imm = 0x100
4045; AVX512VLBW-NEXT:    je LBB12_18
4046; AVX512VLBW-NEXT:  LBB12_17: ## %cond.load29
4047; AVX512VLBW-NEXT:    vpinsrb $8, (%rdi), %xmm0, %xmm0
4048; AVX512VLBW-NEXT:    incq %rdi
4049; AVX512VLBW-NEXT:    testl $512, %eax ## imm = 0x200
4050; AVX512VLBW-NEXT:    je LBB12_20
4051; AVX512VLBW-NEXT:  LBB12_19: ## %cond.load33
4052; AVX512VLBW-NEXT:    vpinsrb $9, (%rdi), %xmm0, %xmm0
4053; AVX512VLBW-NEXT:    incq %rdi
4054; AVX512VLBW-NEXT:    testl $1024, %eax ## imm = 0x400
4055; AVX512VLBW-NEXT:    je LBB12_22
4056; AVX512VLBW-NEXT:  LBB12_21: ## %cond.load37
4057; AVX512VLBW-NEXT:    vpinsrb $10, (%rdi), %xmm0, %xmm0
4058; AVX512VLBW-NEXT:    incq %rdi
4059; AVX512VLBW-NEXT:    testl $2048, %eax ## imm = 0x800
4060; AVX512VLBW-NEXT:    je LBB12_24
4061; AVX512VLBW-NEXT:  LBB12_23: ## %cond.load41
4062; AVX512VLBW-NEXT:    vpinsrb $11, (%rdi), %xmm0, %xmm0
4063; AVX512VLBW-NEXT:    incq %rdi
4064; AVX512VLBW-NEXT:    testl $4096, %eax ## imm = 0x1000
4065; AVX512VLBW-NEXT:    je LBB12_26
4066; AVX512VLBW-NEXT:  LBB12_25: ## %cond.load45
4067; AVX512VLBW-NEXT:    vpinsrb $12, (%rdi), %xmm0, %xmm0
4068; AVX512VLBW-NEXT:    incq %rdi
4069; AVX512VLBW-NEXT:    testl $8192, %eax ## imm = 0x2000
4070; AVX512VLBW-NEXT:    je LBB12_28
4071; AVX512VLBW-NEXT:  LBB12_27: ## %cond.load49
4072; AVX512VLBW-NEXT:    vpinsrb $13, (%rdi), %xmm0, %xmm0
4073; AVX512VLBW-NEXT:    incq %rdi
4074; AVX512VLBW-NEXT:    testl $16384, %eax ## imm = 0x4000
4075; AVX512VLBW-NEXT:    je LBB12_30
4076; AVX512VLBW-NEXT:  LBB12_29: ## %cond.load53
4077; AVX512VLBW-NEXT:    vpinsrb $14, (%rdi), %xmm0, %xmm0
4078; AVX512VLBW-NEXT:    incq %rdi
4079; AVX512VLBW-NEXT:    testl $32768, %eax ## imm = 0x8000
4080; AVX512VLBW-NEXT:    je LBB12_32
4081; AVX512VLBW-NEXT:  LBB12_31: ## %cond.load57
4082; AVX512VLBW-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
4083; AVX512VLBW-NEXT:    retq
4084  %mask = icmp eq <16 x i8> %trigger, zeroinitializer
4085  %res = call <16 x i8> @llvm.masked.expandload.v16i8(ptr %base, <16 x i1> %mask, <16 x i8> %src0)
4086  ret <16 x i8> %res
4087}
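4087.1;
4087.2; The scalarized lowerings checked above all implement the same reference
4087.3; semantics: each set mask bit consumes the next consecutive element from
4087.4; %base, while clear lanes keep the corresponding %src0 (passthru) element.
4087.5; A minimal sketch of those semantics follows (C for clarity; it is not part
4087.6; of the test, and the helper name expandload_ref_v16i8 is illustrative only):
4087.7;
4087.8;   #include <stdint.h>
4087.9;
4087.10;   static void expandload_ref_v16i8(const uint8_t *base, uint16_t mask,
4087.11;                                    const uint8_t passthru[16],
4087.12;                                    uint8_t out[16]) {
4087.13;     const uint8_t *p = base;        /* advances past enabled lanes only */
4087.14;     for (int i = 0; i < 16; ++i) {
4087.15;       if (mask & (1u << i))
4087.16;         out[i] = *p++;              /* load the next consecutive element */
4087.17;       else
4087.18;         out[i] = passthru[i];       /* keep the passthru element */
4087.19;     }
4087.20;   }
4087.21;
4087.22; This mirrors the branch chain emitted above: a test of mask bit i (testb or
4087.23; testl with the bit's immediate, or a sign test for bit 7) guards a
4087.24; `vpinsrb $i` from (%rdi), and `incq %rdi` advances the pointer only when
4087.25; that lane was actually loaded.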
4088
4089declare <16 x double> @llvm.masked.expandload.v16f64(ptr, <16 x i1>, <16 x double>)
4090declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
4091declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
4092declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
4093declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
4094
4095declare <32 x float> @llvm.masked.expandload.v32f32(ptr, <32 x i1>, <32 x float>)
4096declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
4097declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
4098declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
4099declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
4100
4101declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
4102declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
4103declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
4104declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
4105
4106declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
4107declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
4108declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
4109declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
4110
4111declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
4112declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
4113declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
4114declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
4115
4116declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
4117declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
4118declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
4119declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
4120