; xref: /llvm-project/llvm/test/CodeGen/X86/masked_load.ll (revision b5d35feacb7246573c6a4ab2bddc4919a4228ed5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2    | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2  | FileCheck %s --check-prefixes=SSE,SSE42
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
8; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
9; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX512
10
11;
12; vXf64
13;
14
15define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %dst) {
16; SSE-LABEL: load_v1f64_v1i64:
17; SSE:       ## %bb.0:
18; SSE-NEXT:    testq %rdi, %rdi
19; SSE-NEXT:    jne LBB0_2
20; SSE-NEXT:  ## %bb.1: ## %cond.load
21; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
22; SSE-NEXT:  LBB0_2: ## %else
23; SSE-NEXT:    retq
24;
25; AVX-LABEL: load_v1f64_v1i64:
26; AVX:       ## %bb.0:
27; AVX-NEXT:    testq %rdi, %rdi
28; AVX-NEXT:    jne LBB0_2
29; AVX-NEXT:  ## %bb.1: ## %cond.load
30; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
31; AVX-NEXT:  LBB0_2: ## %else
32; AVX-NEXT:    retq
33;
34; X86-AVX512-LABEL: load_v1f64_v1i64:
35; X86-AVX512:       ## %bb.0:
36; X86-AVX512-NEXT:    subl $12, %esp
37; X86-AVX512-NEXT:    .cfi_def_cfa_offset 16
38; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
39; X86-AVX512-NEXT:    orl {{[0-9]+}}(%esp), %eax
40; X86-AVX512-NEXT:    jne LBB0_1
41; X86-AVX512-NEXT:  ## %bb.2: ## %cond.load
42; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
43; X86-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
44; X86-AVX512-NEXT:    jmp LBB0_3
45; X86-AVX512-NEXT:  LBB0_1:
46; X86-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
47; X86-AVX512-NEXT:  LBB0_3: ## %else
48; X86-AVX512-NEXT:    vmovsd %xmm0, (%esp)
49; X86-AVX512-NEXT:    fldl (%esp)
50; X86-AVX512-NEXT:    addl $12, %esp
51; X86-AVX512-NEXT:    retl
52  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
53  %res = call <1 x double> @llvm.masked.load.v1f64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x double> %dst)
54  ret <1 x double> %res
55}
56
57define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
58; SSE2-LABEL: load_v2f64_v2i64:
59; SSE2:       ## %bb.0:
60; SSE2-NEXT:    pxor %xmm2, %xmm2
61; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
62; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
63; SSE2-NEXT:    pand %xmm2, %xmm0
64; SSE2-NEXT:    movmskpd %xmm0, %eax
65; SSE2-NEXT:    testb $1, %al
66; SSE2-NEXT:    jne LBB1_1
67; SSE2-NEXT:  ## %bb.2: ## %else
68; SSE2-NEXT:    testb $2, %al
69; SSE2-NEXT:    jne LBB1_3
70; SSE2-NEXT:  LBB1_4: ## %else2
71; SSE2-NEXT:    movaps %xmm1, %xmm0
72; SSE2-NEXT:    retq
73; SSE2-NEXT:  LBB1_1: ## %cond.load
74; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
75; SSE2-NEXT:    testb $2, %al
76; SSE2-NEXT:    je LBB1_4
77; SSE2-NEXT:  LBB1_3: ## %cond.load1
78; SSE2-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
79; SSE2-NEXT:    movaps %xmm1, %xmm0
80; SSE2-NEXT:    retq
81;
82; SSE42-LABEL: load_v2f64_v2i64:
83; SSE42:       ## %bb.0:
84; SSE42-NEXT:    pxor %xmm2, %xmm2
85; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
86; SSE42-NEXT:    movmskpd %xmm2, %eax
87; SSE42-NEXT:    testb $1, %al
88; SSE42-NEXT:    jne LBB1_1
89; SSE42-NEXT:  ## %bb.2: ## %else
90; SSE42-NEXT:    testb $2, %al
91; SSE42-NEXT:    jne LBB1_3
92; SSE42-NEXT:  LBB1_4: ## %else2
93; SSE42-NEXT:    movaps %xmm1, %xmm0
94; SSE42-NEXT:    retq
95; SSE42-NEXT:  LBB1_1: ## %cond.load
96; SSE42-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
97; SSE42-NEXT:    testb $2, %al
98; SSE42-NEXT:    je LBB1_4
99; SSE42-NEXT:  LBB1_3: ## %cond.load1
100; SSE42-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
101; SSE42-NEXT:    movaps %xmm1, %xmm0
102; SSE42-NEXT:    retq
103;
104; AVX1OR2-LABEL: load_v2f64_v2i64:
105; AVX1OR2:       ## %bb.0:
106; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
107; AVX1OR2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
108; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
109; AVX1OR2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
110; AVX1OR2-NEXT:    retq
111;
112; AVX512F-LABEL: load_v2f64_v2i64:
113; AVX512F:       ## %bb.0:
114; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
115; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
116; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
117; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
118; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
119; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
120; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
121; AVX512F-NEXT:    vzeroupper
122; AVX512F-NEXT:    retq
123;
124; AVX512VL-LABEL: load_v2f64_v2i64:
125; AVX512VL:       ## %bb.0:
126; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
127; AVX512VL-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
128; AVX512VL-NEXT:    retq
129;
130; X86-AVX512-LABEL: load_v2f64_v2i64:
131; X86-AVX512:       ## %bb.0:
132; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
133; X86-AVX512-NEXT:    vptestnmq %xmm0, %xmm0, %k1
134; X86-AVX512-NEXT:    vblendmpd (%eax), %xmm1, %xmm0 {%k1}
135; X86-AVX512-NEXT:    retl
136  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
137  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
138  ret <2 x double> %res
139}
140
141define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double> %dst) {
142; SSE-LABEL: load_v4f64_v4i32:
143; SSE:       ## %bb.0:
144; SSE-NEXT:    pxor %xmm3, %xmm3
145; SSE-NEXT:    pcmpeqd %xmm0, %xmm3
146; SSE-NEXT:    movmskps %xmm3, %eax
147; SSE-NEXT:    testb $1, %al
148; SSE-NEXT:    jne LBB2_1
149; SSE-NEXT:  ## %bb.2: ## %else
150; SSE-NEXT:    testb $2, %al
151; SSE-NEXT:    jne LBB2_3
152; SSE-NEXT:  LBB2_4: ## %else2
153; SSE-NEXT:    testb $4, %al
154; SSE-NEXT:    jne LBB2_5
155; SSE-NEXT:  LBB2_6: ## %else5
156; SSE-NEXT:    testb $8, %al
157; SSE-NEXT:    je LBB2_8
158; SSE-NEXT:  LBB2_7: ## %cond.load7
159; SSE-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
160; SSE-NEXT:  LBB2_8: ## %else8
161; SSE-NEXT:    movaps %xmm1, %xmm0
162; SSE-NEXT:    movaps %xmm2, %xmm1
163; SSE-NEXT:    retq
164; SSE-NEXT:  LBB2_1: ## %cond.load
165; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
166; SSE-NEXT:    testb $2, %al
167; SSE-NEXT:    je LBB2_4
168; SSE-NEXT:  LBB2_3: ## %cond.load1
169; SSE-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
170; SSE-NEXT:    testb $4, %al
171; SSE-NEXT:    je LBB2_6
172; SSE-NEXT:  LBB2_5: ## %cond.load4
173; SSE-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
174; SSE-NEXT:    testb $8, %al
175; SSE-NEXT:    jne LBB2_7
176; SSE-NEXT:    jmp LBB2_8
177;
178; AVX1-LABEL: load_v4f64_v4i32:
179; AVX1:       ## %bb.0:
180; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
181; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
182; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
183; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
184; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
185; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
186; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
187; AVX1-NEXT:    retq
188;
189; AVX2-LABEL: load_v4f64_v4i32:
190; AVX2:       ## %bb.0:
191; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
192; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
193; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
194; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
195; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
196; AVX2-NEXT:    retq
197;
198; AVX512F-LABEL: load_v4f64_v4i32:
199; AVX512F:       ## %bb.0:
200; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
201; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
202; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
203; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
204; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
205; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
206; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
207; AVX512F-NEXT:    retq
208;
209; AVX512VL-LABEL: load_v4f64_v4i32:
210; AVX512VL:       ## %bb.0:
211; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
212; AVX512VL-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
213; AVX512VL-NEXT:    retq
214;
215; X86-AVX512-LABEL: load_v4f64_v4i32:
216; X86-AVX512:       ## %bb.0:
217; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
218; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
219; X86-AVX512-NEXT:    vblendmpd (%eax), %ymm1, %ymm0 {%k1}
220; X86-AVX512-NEXT:    retl
221  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
222  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
223  ret <4 x double> %res
224}
225
226define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
227; SSE-LABEL: load_v4f64_v4i32_zero:
228; SSE:       ## %bb.0:
229; SSE-NEXT:    movdqa %xmm0, %xmm1
230; SSE-NEXT:    pxor %xmm0, %xmm0
231; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
232; SSE-NEXT:    movmskps %xmm1, %eax
233; SSE-NEXT:    testb $1, %al
234; SSE-NEXT:    xorps %xmm1, %xmm1
235; SSE-NEXT:    jne LBB3_1
236; SSE-NEXT:  ## %bb.2: ## %else
237; SSE-NEXT:    testb $2, %al
238; SSE-NEXT:    jne LBB3_3
239; SSE-NEXT:  LBB3_4: ## %else2
240; SSE-NEXT:    testb $4, %al
241; SSE-NEXT:    jne LBB3_5
242; SSE-NEXT:  LBB3_6: ## %else5
243; SSE-NEXT:    testb $8, %al
244; SSE-NEXT:    jne LBB3_7
245; SSE-NEXT:  LBB3_8: ## %else8
246; SSE-NEXT:    retq
247; SSE-NEXT:  LBB3_1: ## %cond.load
248; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
249; SSE-NEXT:    testb $2, %al
250; SSE-NEXT:    je LBB3_4
251; SSE-NEXT:  LBB3_3: ## %cond.load1
252; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
253; SSE-NEXT:    testb $4, %al
254; SSE-NEXT:    je LBB3_6
255; SSE-NEXT:  LBB3_5: ## %cond.load4
256; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
257; SSE-NEXT:    testb $8, %al
258; SSE-NEXT:    je LBB3_8
259; SSE-NEXT:  LBB3_7: ## %cond.load7
260; SSE-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
261; SSE-NEXT:    retq
262;
263; AVX1-LABEL: load_v4f64_v4i32_zero:
264; AVX1:       ## %bb.0:
265; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
266; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
267; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
268; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
269; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
270; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
271; AVX1-NEXT:    retq
272;
273; AVX2-LABEL: load_v4f64_v4i32_zero:
274; AVX2:       ## %bb.0:
275; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
276; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
277; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
278; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
279; AVX2-NEXT:    retq
280;
281; AVX512F-LABEL: load_v4f64_v4i32_zero:
282; AVX512F:       ## %bb.0:
283; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
284; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
285; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
286; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
287; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
288; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
289; AVX512F-NEXT:    retq
290;
291; AVX512VL-LABEL: load_v4f64_v4i32_zero:
292; AVX512VL:       ## %bb.0:
293; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
294; AVX512VL-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
295; AVX512VL-NEXT:    retq
296;
297; X86-AVX512-LABEL: load_v4f64_v4i32_zero:
298; X86-AVX512:       ## %bb.0:
299; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
300; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
301; X86-AVX512-NEXT:    vmovapd (%eax), %ymm0 {%k1} {z}
302; X86-AVX512-NEXT:    retl
303  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
304  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
305  ret <4 x double> %res
306}
307
308define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %dst) {
309; SSE2-LABEL: load_v4f64_v4i64:
310; SSE2:       ## %bb.0:
311; SSE2-NEXT:    pxor %xmm4, %xmm4
312; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
313; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
314; SSE2-NEXT:    movdqa %xmm0, %xmm4
315; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3]
316; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
317; SSE2-NEXT:    andps %xmm4, %xmm0
318; SSE2-NEXT:    movmskps %xmm0, %eax
319; SSE2-NEXT:    testb $1, %al
320; SSE2-NEXT:    jne LBB4_1
321; SSE2-NEXT:  ## %bb.2: ## %else
322; SSE2-NEXT:    testb $2, %al
323; SSE2-NEXT:    jne LBB4_3
324; SSE2-NEXT:  LBB4_4: ## %else2
325; SSE2-NEXT:    testb $4, %al
326; SSE2-NEXT:    jne LBB4_5
327; SSE2-NEXT:  LBB4_6: ## %else5
328; SSE2-NEXT:    testb $8, %al
329; SSE2-NEXT:    je LBB4_8
330; SSE2-NEXT:  LBB4_7: ## %cond.load7
331; SSE2-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
332; SSE2-NEXT:  LBB4_8: ## %else8
333; SSE2-NEXT:    movaps %xmm2, %xmm0
334; SSE2-NEXT:    movaps %xmm3, %xmm1
335; SSE2-NEXT:    retq
336; SSE2-NEXT:  LBB4_1: ## %cond.load
337; SSE2-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
338; SSE2-NEXT:    testb $2, %al
339; SSE2-NEXT:    je LBB4_4
340; SSE2-NEXT:  LBB4_3: ## %cond.load1
341; SSE2-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
342; SSE2-NEXT:    testb $4, %al
343; SSE2-NEXT:    je LBB4_6
344; SSE2-NEXT:  LBB4_5: ## %cond.load4
345; SSE2-NEXT:    movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
346; SSE2-NEXT:    testb $8, %al
347; SSE2-NEXT:    jne LBB4_7
348; SSE2-NEXT:    jmp LBB4_8
349;
350; SSE42-LABEL: load_v4f64_v4i64:
351; SSE42:       ## %bb.0:
352; SSE42-NEXT:    pxor %xmm4, %xmm4
353; SSE42-NEXT:    pcmpeqq %xmm4, %xmm1
354; SSE42-NEXT:    pcmpeqq %xmm4, %xmm0
355; SSE42-NEXT:    packssdw %xmm1, %xmm0
356; SSE42-NEXT:    movmskps %xmm0, %eax
357; SSE42-NEXT:    testb $1, %al
358; SSE42-NEXT:    jne LBB4_1
359; SSE42-NEXT:  ## %bb.2: ## %else
360; SSE42-NEXT:    testb $2, %al
361; SSE42-NEXT:    jne LBB4_3
362; SSE42-NEXT:  LBB4_4: ## %else2
363; SSE42-NEXT:    testb $4, %al
364; SSE42-NEXT:    jne LBB4_5
365; SSE42-NEXT:  LBB4_6: ## %else5
366; SSE42-NEXT:    testb $8, %al
367; SSE42-NEXT:    je LBB4_8
368; SSE42-NEXT:  LBB4_7: ## %cond.load7
369; SSE42-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
370; SSE42-NEXT:  LBB4_8: ## %else8
371; SSE42-NEXT:    movaps %xmm2, %xmm0
372; SSE42-NEXT:    movaps %xmm3, %xmm1
373; SSE42-NEXT:    retq
374; SSE42-NEXT:  LBB4_1: ## %cond.load
375; SSE42-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
376; SSE42-NEXT:    testb $2, %al
377; SSE42-NEXT:    je LBB4_4
378; SSE42-NEXT:  LBB4_3: ## %cond.load1
379; SSE42-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
380; SSE42-NEXT:    testb $4, %al
381; SSE42-NEXT:    je LBB4_6
382; SSE42-NEXT:  LBB4_5: ## %cond.load4
383; SSE42-NEXT:    movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
384; SSE42-NEXT:    testb $8, %al
385; SSE42-NEXT:    jne LBB4_7
386; SSE42-NEXT:    jmp LBB4_8
387;
388; AVX1-LABEL: load_v4f64_v4i64:
389; AVX1:       ## %bb.0:
390; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
391; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
392; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
393; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm0, %xmm0
394; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
395; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
396; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
397; AVX1-NEXT:    retq
398;
399; AVX2-LABEL: load_v4f64_v4i64:
400; AVX2:       ## %bb.0:
401; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
402; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm0, %ymm0
403; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
404; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
405; AVX2-NEXT:    retq
406;
407; AVX512F-LABEL: load_v4f64_v4i64:
408; AVX512F:       ## %bb.0:
409; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
410; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
411; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
412; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
413; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
414; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
415; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
416; AVX512F-NEXT:    retq
417;
418; AVX512VL-LABEL: load_v4f64_v4i64:
419; AVX512VL:       ## %bb.0:
420; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k1
421; AVX512VL-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
422; AVX512VL-NEXT:    retq
423;
424; X86-AVX512-LABEL: load_v4f64_v4i64:
425; X86-AVX512:       ## %bb.0:
426; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
427; X86-AVX512-NEXT:    vptestnmq %ymm0, %ymm0, %k1
428; X86-AVX512-NEXT:    vblendmpd (%eax), %ymm1, %ymm0 {%k1}
429; X86-AVX512-NEXT:    retl
430  %mask = icmp eq <4 x i64> %trigger, zeroinitializer
431  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
432  ret <4 x double> %res
433}
434
435define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double> %dst) {
436; SSE-LABEL: load_v8f64_v8i16:
437; SSE:       ## %bb.0:
438; SSE-NEXT:    pxor %xmm5, %xmm5
439; SSE-NEXT:    pcmpeqw %xmm0, %xmm5
440; SSE-NEXT:    packsswb %xmm5, %xmm5
441; SSE-NEXT:    pmovmskb %xmm5, %eax
442; SSE-NEXT:    testb $1, %al
443; SSE-NEXT:    jne LBB5_1
444; SSE-NEXT:  ## %bb.2: ## %else
445; SSE-NEXT:    testb $2, %al
446; SSE-NEXT:    jne LBB5_3
447; SSE-NEXT:  LBB5_4: ## %else2
448; SSE-NEXT:    testb $4, %al
449; SSE-NEXT:    jne LBB5_5
450; SSE-NEXT:  LBB5_6: ## %else5
451; SSE-NEXT:    testb $8, %al
452; SSE-NEXT:    jne LBB5_7
453; SSE-NEXT:  LBB5_8: ## %else8
454; SSE-NEXT:    testb $16, %al
455; SSE-NEXT:    jne LBB5_9
456; SSE-NEXT:  LBB5_10: ## %else11
457; SSE-NEXT:    testb $32, %al
458; SSE-NEXT:    jne LBB5_11
459; SSE-NEXT:  LBB5_12: ## %else14
460; SSE-NEXT:    testb $64, %al
461; SSE-NEXT:    jne LBB5_13
462; SSE-NEXT:  LBB5_14: ## %else17
463; SSE-NEXT:    testb $-128, %al
464; SSE-NEXT:    je LBB5_16
465; SSE-NEXT:  LBB5_15: ## %cond.load19
466; SSE-NEXT:    movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
467; SSE-NEXT:  LBB5_16: ## %else20
468; SSE-NEXT:    movaps %xmm1, %xmm0
469; SSE-NEXT:    movaps %xmm2, %xmm1
470; SSE-NEXT:    movaps %xmm3, %xmm2
471; SSE-NEXT:    movaps %xmm4, %xmm3
472; SSE-NEXT:    retq
473; SSE-NEXT:  LBB5_1: ## %cond.load
474; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
475; SSE-NEXT:    testb $2, %al
476; SSE-NEXT:    je LBB5_4
477; SSE-NEXT:  LBB5_3: ## %cond.load1
478; SSE-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
479; SSE-NEXT:    testb $4, %al
480; SSE-NEXT:    je LBB5_6
481; SSE-NEXT:  LBB5_5: ## %cond.load4
482; SSE-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
483; SSE-NEXT:    testb $8, %al
484; SSE-NEXT:    je LBB5_8
485; SSE-NEXT:  LBB5_7: ## %cond.load7
486; SSE-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
487; SSE-NEXT:    testb $16, %al
488; SSE-NEXT:    je LBB5_10
489; SSE-NEXT:  LBB5_9: ## %cond.load10
490; SSE-NEXT:    movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
491; SSE-NEXT:    testb $32, %al
492; SSE-NEXT:    je LBB5_12
493; SSE-NEXT:  LBB5_11: ## %cond.load13
494; SSE-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
495; SSE-NEXT:    testb $64, %al
496; SSE-NEXT:    je LBB5_14
497; SSE-NEXT:  LBB5_13: ## %cond.load16
498; SSE-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
499; SSE-NEXT:    testb $-128, %al
500; SSE-NEXT:    jne LBB5_15
501; SSE-NEXT:    jmp LBB5_16
502;
503; AVX1-LABEL: load_v8f64_v8i16:
504; AVX1:       ## %bb.0:
505; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
506; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
507; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm3, %xmm3
508; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm5
509; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
510; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
511; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
512; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
513; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
514; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
515; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
516; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
517; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
518; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
519; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm3, %ymm1
520; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
521; AVX1-NEXT:    retq
522;
523; AVX2-LABEL: load_v8f64_v8i16:
524; AVX2:       ## %bb.0:
525; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
526; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
527; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm3, %xmm3
528; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
529; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
530; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
531; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
532; AVX2-NEXT:    vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
533; AVX2-NEXT:    vmaskmovpd 32(%rdi), %ymm3, %ymm1
534; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
535; AVX2-NEXT:    retq
536;
537; AVX512F-LABEL: load_v8f64_v8i16:
538; AVX512F:       ## %bb.0:
539; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
540; AVX512F-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
541; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
542; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
543; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
544; AVX512F-NEXT:    retq
545;
546; AVX512VLDQ-LABEL: load_v8f64_v8i16:
547; AVX512VLDQ:       ## %bb.0:
548; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
549; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
550; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
551; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
552; AVX512VLDQ-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
553; AVX512VLDQ-NEXT:    retq
554;
555; AVX512VLBW-LABEL: load_v8f64_v8i16:
556; AVX512VLBW:       ## %bb.0:
557; AVX512VLBW-NEXT:    vptestnmw %xmm0, %xmm0, %k1
558; AVX512VLBW-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
559; AVX512VLBW-NEXT:    retq
560;
561; X86-AVX512-LABEL: load_v8f64_v8i16:
562; X86-AVX512:       ## %bb.0:
563; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
564; X86-AVX512-NEXT:    vptestnmw %xmm0, %xmm0, %k1
565; X86-AVX512-NEXT:    vblendmpd (%eax), %zmm1, %zmm0 {%k1}
566; X86-AVX512-NEXT:    retl
567  %mask = icmp eq <8 x i16> %trigger, zeroinitializer
568  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
569  ret <8 x double> %res
570}
571
572define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double> %dst) {
573; SSE2-LABEL: load_v8f64_v8i64:
574; SSE2:       ## %bb.0:
575; SSE2-NEXT:    pxor %xmm8, %xmm8
576; SSE2-NEXT:    pcmpeqd %xmm8, %xmm3
577; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
578; SSE2-NEXT:    pand %xmm3, %xmm9
579; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
580; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
581; SSE2-NEXT:    pand %xmm2, %xmm3
582; SSE2-NEXT:    packssdw %xmm9, %xmm3
583; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
584; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
585; SSE2-NEXT:    pand %xmm1, %xmm2
586; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
587; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
588; SSE2-NEXT:    pand %xmm0, %xmm1
589; SSE2-NEXT:    packssdw %xmm2, %xmm1
590; SSE2-NEXT:    packssdw %xmm3, %xmm1
591; SSE2-NEXT:    packsswb %xmm1, %xmm1
592; SSE2-NEXT:    pmovmskb %xmm1, %eax
593; SSE2-NEXT:    testb $1, %al
594; SSE2-NEXT:    jne LBB6_1
595; SSE2-NEXT:  ## %bb.2: ## %else
596; SSE2-NEXT:    testb $2, %al
597; SSE2-NEXT:    jne LBB6_3
598; SSE2-NEXT:  LBB6_4: ## %else2
599; SSE2-NEXT:    testb $4, %al
600; SSE2-NEXT:    jne LBB6_5
601; SSE2-NEXT:  LBB6_6: ## %else5
602; SSE2-NEXT:    testb $8, %al
603; SSE2-NEXT:    jne LBB6_7
604; SSE2-NEXT:  LBB6_8: ## %else8
605; SSE2-NEXT:    testb $16, %al
606; SSE2-NEXT:    jne LBB6_9
607; SSE2-NEXT:  LBB6_10: ## %else11
608; SSE2-NEXT:    testb $32, %al
609; SSE2-NEXT:    jne LBB6_11
610; SSE2-NEXT:  LBB6_12: ## %else14
611; SSE2-NEXT:    testb $64, %al
612; SSE2-NEXT:    jne LBB6_13
613; SSE2-NEXT:  LBB6_14: ## %else17
614; SSE2-NEXT:    testb $-128, %al
615; SSE2-NEXT:    je LBB6_16
616; SSE2-NEXT:  LBB6_15: ## %cond.load19
617; SSE2-NEXT:    movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
618; SSE2-NEXT:  LBB6_16: ## %else20
619; SSE2-NEXT:    movaps %xmm4, %xmm0
620; SSE2-NEXT:    movaps %xmm5, %xmm1
621; SSE2-NEXT:    movaps %xmm6, %xmm2
622; SSE2-NEXT:    movaps %xmm7, %xmm3
623; SSE2-NEXT:    retq
624; SSE2-NEXT:  LBB6_1: ## %cond.load
625; SSE2-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
626; SSE2-NEXT:    testb $2, %al
627; SSE2-NEXT:    je LBB6_4
628; SSE2-NEXT:  LBB6_3: ## %cond.load1
629; SSE2-NEXT:    movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
630; SSE2-NEXT:    testb $4, %al
631; SSE2-NEXT:    je LBB6_6
632; SSE2-NEXT:  LBB6_5: ## %cond.load4
633; SSE2-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
634; SSE2-NEXT:    testb $8, %al
635; SSE2-NEXT:    je LBB6_8
636; SSE2-NEXT:  LBB6_7: ## %cond.load7
637; SSE2-NEXT:    movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
638; SSE2-NEXT:    testb $16, %al
639; SSE2-NEXT:    je LBB6_10
640; SSE2-NEXT:  LBB6_9: ## %cond.load10
641; SSE2-NEXT:    movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
642; SSE2-NEXT:    testb $32, %al
643; SSE2-NEXT:    je LBB6_12
644; SSE2-NEXT:  LBB6_11: ## %cond.load13
645; SSE2-NEXT:    movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
646; SSE2-NEXT:    testb $64, %al
647; SSE2-NEXT:    je LBB6_14
648; SSE2-NEXT:  LBB6_13: ## %cond.load16
649; SSE2-NEXT:    movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
650; SSE2-NEXT:    testb $-128, %al
651; SSE2-NEXT:    jne LBB6_15
652; SSE2-NEXT:    jmp LBB6_16
653;
654; SSE42-LABEL: load_v8f64_v8i64:
655; SSE42:       ## %bb.0:
656; SSE42-NEXT:    pxor %xmm8, %xmm8
657; SSE42-NEXT:    pcmpeqq %xmm8, %xmm3
658; SSE42-NEXT:    pcmpeqq %xmm8, %xmm2
659; SSE42-NEXT:    packssdw %xmm3, %xmm2
660; SSE42-NEXT:    pcmpeqq %xmm8, %xmm1
661; SSE42-NEXT:    pcmpeqq %xmm8, %xmm0
662; SSE42-NEXT:    packssdw %xmm1, %xmm0
663; SSE42-NEXT:    packssdw %xmm2, %xmm0
664; SSE42-NEXT:    packsswb %xmm0, %xmm0
665; SSE42-NEXT:    pmovmskb %xmm0, %eax
666; SSE42-NEXT:    testb $1, %al
667; SSE42-NEXT:    jne LBB6_1
668; SSE42-NEXT:  ## %bb.2: ## %else
669; SSE42-NEXT:    testb $2, %al
670; SSE42-NEXT:    jne LBB6_3
671; SSE42-NEXT:  LBB6_4: ## %else2
672; SSE42-NEXT:    testb $4, %al
673; SSE42-NEXT:    jne LBB6_5
674; SSE42-NEXT:  LBB6_6: ## %else5
675; SSE42-NEXT:    testb $8, %al
676; SSE42-NEXT:    jne LBB6_7
677; SSE42-NEXT:  LBB6_8: ## %else8
678; SSE42-NEXT:    testb $16, %al
679; SSE42-NEXT:    jne LBB6_9
680; SSE42-NEXT:  LBB6_10: ## %else11
681; SSE42-NEXT:    testb $32, %al
682; SSE42-NEXT:    jne LBB6_11
683; SSE42-NEXT:  LBB6_12: ## %else14
684; SSE42-NEXT:    testb $64, %al
685; SSE42-NEXT:    jne LBB6_13
686; SSE42-NEXT:  LBB6_14: ## %else17
687; SSE42-NEXT:    testb $-128, %al
688; SSE42-NEXT:    je LBB6_16
689; SSE42-NEXT:  LBB6_15: ## %cond.load19
690; SSE42-NEXT:    movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
691; SSE42-NEXT:  LBB6_16: ## %else20
692; SSE42-NEXT:    movaps %xmm4, %xmm0
693; SSE42-NEXT:    movaps %xmm5, %xmm1
694; SSE42-NEXT:    movaps %xmm6, %xmm2
695; SSE42-NEXT:    movaps %xmm7, %xmm3
696; SSE42-NEXT:    retq
697; SSE42-NEXT:  LBB6_1: ## %cond.load
698; SSE42-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
699; SSE42-NEXT:    testb $2, %al
700; SSE42-NEXT:    je LBB6_4
701; SSE42-NEXT:  LBB6_3: ## %cond.load1
702; SSE42-NEXT:    movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
703; SSE42-NEXT:    testb $4, %al
704; SSE42-NEXT:    je LBB6_6
705; SSE42-NEXT:  LBB6_5: ## %cond.load4
706; SSE42-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
707; SSE42-NEXT:    testb $8, %al
708; SSE42-NEXT:    je LBB6_8
709; SSE42-NEXT:  LBB6_7: ## %cond.load7
710; SSE42-NEXT:    movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
711; SSE42-NEXT:    testb $16, %al
712; SSE42-NEXT:    je LBB6_10
713; SSE42-NEXT:  LBB6_9: ## %cond.load10
714; SSE42-NEXT:    movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
715; SSE42-NEXT:    testb $32, %al
716; SSE42-NEXT:    je LBB6_12
717; SSE42-NEXT:  LBB6_11: ## %cond.load13
718; SSE42-NEXT:    movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
719; SSE42-NEXT:    testb $64, %al
720; SSE42-NEXT:    je LBB6_14
721; SSE42-NEXT:  LBB6_13: ## %cond.load16
722; SSE42-NEXT:    movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
723; SSE42-NEXT:    testb $-128, %al
724; SSE42-NEXT:    jne LBB6_15
725; SSE42-NEXT:    jmp LBB6_16
726;
727; AVX1-LABEL: load_v8f64_v8i64:
728; AVX1:       ## %bb.0:
729; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
730; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
731; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
732; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm1, %xmm1
733; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
734; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
735; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
736; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm0, %xmm0
737; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
738; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
739; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
740; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm2
741; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
742; AVX1-NEXT:    retq
743;
744; AVX2-LABEL: load_v8f64_v8i64:
745; AVX2:       ## %bb.0:
746; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
747; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm1, %ymm1
748; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm0, %ymm0
749; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
750; AVX2-NEXT:    vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
751; AVX2-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm2
752; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
753; AVX2-NEXT:    retq
754;
755; AVX512-LABEL: load_v8f64_v8i64:
756; AVX512:       ## %bb.0:
757; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
758; AVX512-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
759; AVX512-NEXT:    retq
760;
761; X86-AVX512-LABEL: load_v8f64_v8i64:
762; X86-AVX512:       ## %bb.0:
763; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
764; X86-AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
765; X86-AVX512-NEXT:    vblendmpd (%eax), %zmm1, %zmm0 {%k1}
766; X86-AVX512-NEXT:    retl
767  %mask = icmp eq <8 x i64> %trigger, zeroinitializer
768  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
769  ret <8 x double> %res
770}
771
772;
773; vXf32
774;
775
776define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
777; SSE2-LABEL: load_v2f32_v2i32:
778; SSE2:       ## %bb.0:
779; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
780; SSE2-NEXT:    pxor %xmm2, %xmm2
781; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
782; SSE2-NEXT:    movmskpd %xmm2, %eax
783; SSE2-NEXT:    testb $1, %al
784; SSE2-NEXT:    jne LBB7_1
785; SSE2-NEXT:  ## %bb.2: ## %else
786; SSE2-NEXT:    testb $2, %al
787; SSE2-NEXT:    jne LBB7_3
788; SSE2-NEXT:  LBB7_4: ## %else2
789; SSE2-NEXT:    movaps %xmm1, %xmm0
790; SSE2-NEXT:    retq
791; SSE2-NEXT:  LBB7_1: ## %cond.load
792; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
793; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
794; SSE2-NEXT:    testb $2, %al
795; SSE2-NEXT:    je LBB7_4
796; SSE2-NEXT:  LBB7_3: ## %cond.load1
797; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
798; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
799; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
800; SSE2-NEXT:    movaps %xmm0, %xmm1
801; SSE2-NEXT:    movaps %xmm1, %xmm0
802; SSE2-NEXT:    retq
803;
804; SSE42-LABEL: load_v2f32_v2i32:
805; SSE42:       ## %bb.0:
806; SSE42-NEXT:    pxor %xmm2, %xmm2
807; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
808; SSE42-NEXT:    pmovsxdq %xmm2, %xmm0
809; SSE42-NEXT:    movmskpd %xmm0, %eax
810; SSE42-NEXT:    testb $1, %al
811; SSE42-NEXT:    jne LBB7_1
812; SSE42-NEXT:  ## %bb.2: ## %else
813; SSE42-NEXT:    testb $2, %al
814; SSE42-NEXT:    jne LBB7_3
815; SSE42-NEXT:  LBB7_4: ## %else2
816; SSE42-NEXT:    movaps %xmm1, %xmm0
817; SSE42-NEXT:    retq
818; SSE42-NEXT:  LBB7_1: ## %cond.load
819; SSE42-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
820; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
821; SSE42-NEXT:    testb $2, %al
822; SSE42-NEXT:    je LBB7_4
823; SSE42-NEXT:  LBB7_3: ## %cond.load1
824; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
825; SSE42-NEXT:    movaps %xmm1, %xmm0
826; SSE42-NEXT:    retq
827;
828; AVX1OR2-LABEL: load_v2f32_v2i32:
829; AVX1OR2:       ## %bb.0:
830; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
831; AVX1OR2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
832; AVX1OR2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
833; AVX1OR2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
834; AVX1OR2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
835; AVX1OR2-NEXT:    retq
836;
837; AVX512F-LABEL: load_v2f32_v2i32:
838; AVX512F:       ## %bb.0:
839; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
840; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
841; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
842; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
843; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
844; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
845; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
846; AVX512F-NEXT:    vzeroupper
847; AVX512F-NEXT:    retq
848;
849; AVX512VLDQ-LABEL: load_v2f32_v2i32:
850; AVX512VLDQ:       ## %bb.0:
851; AVX512VLDQ-NEXT:    vptestnmd %xmm0, %xmm0, %k0
852; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
853; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
854; AVX512VLDQ-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
855; AVX512VLDQ-NEXT:    retq
856;
857; AVX512VLBW-LABEL: load_v2f32_v2i32:
858; AVX512VLBW:       ## %bb.0:
859; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k0
860; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
861; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k1
862; AVX512VLBW-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
863; AVX512VLBW-NEXT:    retq
864;
865; X86-AVX512-LABEL: load_v2f32_v2i32:
866; X86-AVX512:       ## %bb.0:
867; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
868; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
869; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
870; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
871; X86-AVX512-NEXT:    vblendmps (%eax), %xmm1, %xmm0 {%k1}
872; X86-AVX512-NEXT:    retl
873  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
874  %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
875  ret <2 x float> %res
876}
877
; Same as load_v2f32_v2i32 but with an undef passthrough, so masked-off lanes
; need not be preserved (e.g. the AVX512 variants can use a zeroing {z} load).
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) {
; SSE2-LABEL: load_v2f32_v2i32_undef:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskpd %xmm1, %eax
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    ## implicit-def: $xmm0
; SSE2-NEXT:    jne LBB8_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB8_3
; SSE2-NEXT:  LBB8_4: ## %else2
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB8_1: ## %cond.load
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB8_4
; SSE2-NEXT:  LBB8_3: ## %cond.load1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_v2f32_v2i32_undef:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE42-NEXT:    pmovsxdq %xmm1, %xmm0
; SSE42-NEXT:    movmskpd %xmm0, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    ## implicit-def: $xmm0
; SSE42-NEXT:    jne LBB8_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB8_3
; SSE42-NEXT:  LBB8_4: ## %else2
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB8_1: ## %cond.load
; SSE42-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB8_4
; SSE42-NEXT:  LBB8_3: ## %cond.load1
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: load_v2f32_v2i32_undef:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_v2f32_v2i32_undef:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    vptestnmd %xmm0, %xmm0, %k0
; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k0
; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: load_v2f32_v2i32_undef:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
  ret <2 x float> %res
}
973
; Masked load of <4 x float> (align 4): lanes where %trigger == 0 come from
; %addr, the rest keep %dst. SSE lowers to a scalarized branch chain; AVX uses
; vmaskmovps + blend; AVX512VL folds the whole thing into a single vblendmps.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x float> %dst) {
; SSE2-LABEL: load_v4f32_v4i32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE2-NEXT:    movmskps %xmm2, %eax
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    jne LBB9_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB9_3
; SSE2-NEXT:  LBB9_4: ## %else2
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    jne LBB9_5
; SSE2-NEXT:  LBB9_6: ## %else5
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    jne LBB9_7
; SSE2-NEXT:  LBB9_8: ## %else8
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB9_1: ## %cond.load
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB9_4
; SSE2-NEXT:  LBB9_3: ## %cond.load1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    je LBB9_6
; SSE2-NEXT:  LBB9_5: ## %cond.load4
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    je LBB9_8
; SSE2-NEXT:  LBB9_7: ## %cond.load7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_v4f32_v4i32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE42-NEXT:    movmskps %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    jne LBB9_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB9_3
; SSE42-NEXT:  LBB9_4: ## %else2
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    jne LBB9_5
; SSE42-NEXT:  LBB9_6: ## %else5
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    jne LBB9_7
; SSE42-NEXT:  LBB9_8: ## %else8
; SSE42-NEXT:    movaps %xmm1, %xmm0
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB9_1: ## %cond.load
; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB9_4
; SSE42-NEXT:  LBB9_3: ## %cond.load1
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    je LBB9_6
; SSE42-NEXT:  LBB9_5: ## %cond.load4
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    je LBB9_8
; SSE42-NEXT:  LBB9_7: ## %cond.load7
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; SSE42-NEXT:    movaps %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: load_v4f32_v4i32:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1OR2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1OR2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_v4f32_v4i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_v4f32_v4i32:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
;
; X86-AVX512-LABEL: load_v4f32_v4i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; X86-AVX512-NEXT:    vblendmps (%eax), %xmm1, %xmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
  ret <4 x float> %res
}
1092
; Masked load of <8 x float> (align 32) with the <8 x i1> mask passed directly
; (no trigger compare) and a zeroinitializer passthrough, so masked-off lanes
; must be zero. AVX512VL targets can use a zero-masked vmovaps {z} directly.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE2-LABEL: load_v8f32_v8i1_zero:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    packsswb %xmm0, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    jne LBB10_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB10_3
; SSE2-NEXT:  LBB10_4: ## %else2
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    jne LBB10_5
; SSE2-NEXT:  LBB10_6: ## %else5
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    jne LBB10_7
; SSE2-NEXT:  LBB10_8: ## %else8
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    jne LBB10_9
; SSE2-NEXT:  LBB10_10: ## %else11
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    jne LBB10_11
; SSE2-NEXT:  LBB10_12: ## %else14
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    jne LBB10_13
; SSE2-NEXT:  LBB10_14: ## %else17
; SSE2-NEXT:    testb $-128, %al
; SSE2-NEXT:    jne LBB10_15
; SSE2-NEXT:  LBB10_16: ## %else20
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB10_1: ## %cond.load
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB10_4
; SSE2-NEXT:  LBB10_3: ## %cond.load1
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    je LBB10_6
; SSE2-NEXT:  LBB10_5: ## %cond.load4
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    je LBB10_8
; SSE2-NEXT:  LBB10_7: ## %cond.load7
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    je LBB10_10
; SSE2-NEXT:  LBB10_9: ## %cond.load10
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    je LBB10_12
; SSE2-NEXT:  LBB10_11: ## %cond.load13
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    je LBB10_14
; SSE2-NEXT:  LBB10_13: ## %cond.load16
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT:    testb $-128, %al
; SSE2-NEXT:    je LBB10_16
; SSE2-NEXT:  LBB10_15: ## %cond.load19
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_v8f32_v8i1_zero:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    psllw $15, %xmm0
; SSE42-NEXT:    packsswb %xmm0, %xmm0
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    pxor %xmm0, %xmm0
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    xorps %xmm1, %xmm1
; SSE42-NEXT:    jne LBB10_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB10_3
; SSE42-NEXT:  LBB10_4: ## %else2
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    jne LBB10_5
; SSE42-NEXT:  LBB10_6: ## %else5
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    jne LBB10_7
; SSE42-NEXT:  LBB10_8: ## %else8
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    jne LBB10_9
; SSE42-NEXT:  LBB10_10: ## %else11
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    jne LBB10_11
; SSE42-NEXT:  LBB10_12: ## %else14
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    jne LBB10_13
; SSE42-NEXT:  LBB10_14: ## %else17
; SSE42-NEXT:    testb $-128, %al
; SSE42-NEXT:    jne LBB10_15
; SSE42-NEXT:  LBB10_16: ## %else20
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB10_1: ## %cond.load
; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB10_4
; SSE42-NEXT:  LBB10_3: ## %cond.load1
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    je LBB10_6
; SSE42-NEXT:  LBB10_5: ## %cond.load4
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    je LBB10_8
; SSE42-NEXT:  LBB10_7: ## %cond.load7
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    je LBB10_10
; SSE42-NEXT:  LBB10_9: ## %cond.load10
; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    je LBB10_12
; SSE42-NEXT:  LBB10_11: ## %cond.load13
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    je LBB10_14
; SSE42-NEXT:  LBB10_13: ## %cond.load16
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE42-NEXT:    testb $-128, %al
; SSE42-NEXT:    je LBB10_16
; SSE42-NEXT:  LBB10_15: ## %cond.load19
; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; SSE42-NEXT:    retq
;
; AVX1-LABEL: load_v8f32_v8i1_zero:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_v8f32_v8i1_zero:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_v8f32_v8i1_zero:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: load_v8f32_v8i1_zero:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: load_v8f32_v8i1_zero:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovaps (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}
1289
; Masked load of <8 x float> (align 32): lanes where %trigger == 0 come from
; %addr, the rest keep %dst. The 256-bit trigger spans two XMMs on SSE
; (packssdw/packsswb mask narrowing) and needs an extract/insert on AVX1.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
; SSE2-LABEL: load_v8f32_v8i32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    packsswb %xmm0, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    jne LBB11_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB11_3
; SSE2-NEXT:  LBB11_4: ## %else2
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    jne LBB11_5
; SSE2-NEXT:  LBB11_6: ## %else5
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    jne LBB11_7
; SSE2-NEXT:  LBB11_8: ## %else8
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    jne LBB11_9
; SSE2-NEXT:  LBB11_10: ## %else11
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    jne LBB11_11
; SSE2-NEXT:  LBB11_12: ## %else14
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    jne LBB11_13
; SSE2-NEXT:  LBB11_14: ## %else17
; SSE2-NEXT:    testb $-128, %al
; SSE2-NEXT:    je LBB11_16
; SSE2-NEXT:  LBB11_15: ## %cond.load19
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
; SSE2-NEXT:  LBB11_16: ## %else20
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB11_1: ## %cond.load
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB11_4
; SSE2-NEXT:  LBB11_3: ## %cond.load1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    je LBB11_6
; SSE2-NEXT:  LBB11_5: ## %cond.load4
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    je LBB11_8
; SSE2-NEXT:  LBB11_7: ## %cond.load7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    je LBB11_10
; SSE2-NEXT:  LBB11_9: ## %cond.load10
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    je LBB11_12
; SSE2-NEXT:  LBB11_11: ## %cond.load13
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    je LBB11_14
; SSE2-NEXT:  LBB11_13: ## %cond.load16
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
; SSE2-NEXT:    testb $-128, %al
; SSE2-NEXT:    jne LBB11_15
; SSE2-NEXT:    jmp LBB11_16
;
; SSE42-LABEL: load_v8f32_v8i32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm4, %xmm4
; SSE42-NEXT:    pcmpeqd %xmm4, %xmm1
; SSE42-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE42-NEXT:    packssdw %xmm1, %xmm0
; SSE42-NEXT:    packsswb %xmm0, %xmm0
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    jne LBB11_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB11_3
; SSE42-NEXT:  LBB11_4: ## %else2
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    jne LBB11_5
; SSE42-NEXT:  LBB11_6: ## %else5
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    jne LBB11_7
; SSE42-NEXT:  LBB11_8: ## %else8
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    jne LBB11_9
; SSE42-NEXT:  LBB11_10: ## %else11
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    jne LBB11_11
; SSE42-NEXT:  LBB11_12: ## %else14
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    jne LBB11_13
; SSE42-NEXT:  LBB11_14: ## %else17
; SSE42-NEXT:    testb $-128, %al
; SSE42-NEXT:    je LBB11_16
; SSE42-NEXT:  LBB11_15: ## %cond.load19
; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; SSE42-NEXT:  LBB11_16: ## %else20
; SSE42-NEXT:    movaps %xmm2, %xmm0
; SSE42-NEXT:    movaps %xmm3, %xmm1
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB11_1: ## %cond.load
; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB11_4
; SSE42-NEXT:  LBB11_3: ## %cond.load1
; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    je LBB11_6
; SSE42-NEXT:  LBB11_5: ## %cond.load4
; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    je LBB11_8
; SSE42-NEXT:  LBB11_7: ## %cond.load7
; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    je LBB11_10
; SSE42-NEXT:  LBB11_9: ## %cond.load10
; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    je LBB11_12
; SSE42-NEXT:  LBB11_11: ## %cond.load13
; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    je LBB11_14
; SSE42-NEXT:  LBB11_13: ## %cond.load16
; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE42-NEXT:    testb $-128, %al
; SSE42-NEXT:    jne LBB11_15
; SSE42-NEXT:    jmp LBB11_16
;
; AVX1-LABEL: load_v8f32_v8i32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_v8f32_v8i32:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_v8f32_v8i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_v8f32_v8i32:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
;
; X86-AVX512-LABEL: load_v8f32_v8i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; X86-AVX512-NEXT:    vblendmps (%eax), %ymm1, %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
  ret <8 x float> %res
}
1489
1490
1491;
1492; vXf64
1493;
1494
; Single-element masked i64 load: lowers to a plain compare-and-branch around a
; scalar load (no vector instructions needed). On 32-bit x86 the i64 trigger,
; load and passthrough are each handled as two 32-bit halves.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x i64> %dst) {
; SSE-LABEL: load_v1i64_v1i64:
; SSE:       ## %bb.0:
; SSE-NEXT:    testq %rdi, %rdi
; SSE-NEXT:    jne LBB12_1
; SSE-NEXT:  ## %bb.2: ## %cond.load
; SSE-NEXT:    movq (%rsi), %rax
; SSE-NEXT:    retq
; SSE-NEXT:  LBB12_1:
; SSE-NEXT:    movq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: load_v1i64_v1i64:
; AVX:       ## %bb.0:
; AVX-NEXT:    testq %rdi, %rdi
; AVX-NEXT:    jne LBB12_1
; AVX-NEXT:  ## %bb.2: ## %cond.load
; AVX-NEXT:    movq (%rsi), %rax
; AVX-NEXT:    retq
; AVX-NEXT:  LBB12_1:
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: load_v1i64_v1i64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    orl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    jne LBB12_1
; X86-AVX512-NEXT:  ## %bb.2: ## %cond.load
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    movl (%ecx), %eax
; X86-AVX512-NEXT:    movl 4(%ecx), %edx
; X86-AVX512-NEXT:    retl
; X86-AVX512-NEXT:  LBB12_1:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    retl
  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
  %res = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst)
  ret <1 x i64> %res
}
1536
; Masked load of <2 x i64> (align 4): lanes where %trigger == 0 come from
; %addr, the rest keep %dst. SSE2 has no 64-bit compare, so the i64 equality
; is synthesized from pcmpeqd plus a shuffled pand; SSE4.2 uses pcmpeqq.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %dst) {
; SSE2-LABEL: load_v2i64_v2i64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    movmskpd %xmm0, %eax
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    jne LBB13_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB13_3
; SSE2-NEXT:  LBB13_4: ## %else2
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB13_1: ## %cond.load
; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB13_4
; SSE2-NEXT:  LBB13_3: ## %cond.load1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_v2i64_v2i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
; SSE42-NEXT:    movmskpd %xmm2, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    jne LBB13_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB13_3
; SSE42-NEXT:  LBB13_4: ## %else2
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB13_1: ## %cond.load
; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm1
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB13_4
; SSE42-NEXT:  LBB13_3: ## %cond.load1
; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm1
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: load_v2i64_v2i64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_v2i64_v2i64:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2
; AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_v2i64_v2i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_v2i64_v2i64:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
;
; X86-AVX512-LABEL: load_v2i64_v2i64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; X86-AVX512-NEXT:    vpblendmq (%eax), %xmm1, %xmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
  ret <2 x i64> %res
}
1629
1630define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %dst) {
1631; SSE2-LABEL: load_v4i64_v4i64:
1632; SSE2:       ## %bb.0:
1633; SSE2-NEXT:    pxor %xmm4, %xmm4
1634; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
1635; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
1636; SSE2-NEXT:    movdqa %xmm0, %xmm4
1637; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3]
1638; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1639; SSE2-NEXT:    andps %xmm4, %xmm0
1640; SSE2-NEXT:    movmskps %xmm0, %eax
1641; SSE2-NEXT:    testb $1, %al
1642; SSE2-NEXT:    jne LBB14_1
1643; SSE2-NEXT:  ## %bb.2: ## %else
1644; SSE2-NEXT:    testb $2, %al
1645; SSE2-NEXT:    jne LBB14_3
1646; SSE2-NEXT:  LBB14_4: ## %else2
1647; SSE2-NEXT:    testb $4, %al
1648; SSE2-NEXT:    jne LBB14_5
1649; SSE2-NEXT:  LBB14_6: ## %else5
1650; SSE2-NEXT:    testb $8, %al
1651; SSE2-NEXT:    je LBB14_8
1652; SSE2-NEXT:  LBB14_7: ## %cond.load7
1653; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1654; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1655; SSE2-NEXT:  LBB14_8: ## %else8
1656; SSE2-NEXT:    movaps %xmm2, %xmm0
1657; SSE2-NEXT:    movaps %xmm3, %xmm1
1658; SSE2-NEXT:    retq
1659; SSE2-NEXT:  LBB14_1: ## %cond.load
1660; SSE2-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1661; SSE2-NEXT:    testb $2, %al
1662; SSE2-NEXT:    je LBB14_4
1663; SSE2-NEXT:  LBB14_3: ## %cond.load1
1664; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1665; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1666; SSE2-NEXT:    testb $4, %al
1667; SSE2-NEXT:    je LBB14_6
1668; SSE2-NEXT:  LBB14_5: ## %cond.load4
1669; SSE2-NEXT:    movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1670; SSE2-NEXT:    testb $8, %al
1671; SSE2-NEXT:    jne LBB14_7
1672; SSE2-NEXT:    jmp LBB14_8
1673;
1674; SSE42-LABEL: load_v4i64_v4i64:
1675; SSE42:       ## %bb.0:
1676; SSE42-NEXT:    pxor %xmm4, %xmm4
1677; SSE42-NEXT:    pcmpeqq %xmm4, %xmm1
1678; SSE42-NEXT:    pcmpeqq %xmm4, %xmm0
1679; SSE42-NEXT:    packssdw %xmm1, %xmm0
1680; SSE42-NEXT:    movmskps %xmm0, %eax
1681; SSE42-NEXT:    testb $1, %al
1682; SSE42-NEXT:    jne LBB14_1
1683; SSE42-NEXT:  ## %bb.2: ## %else
1684; SSE42-NEXT:    testb $2, %al
1685; SSE42-NEXT:    jne LBB14_3
1686; SSE42-NEXT:  LBB14_4: ## %else2
1687; SSE42-NEXT:    testb $4, %al
1688; SSE42-NEXT:    jne LBB14_5
1689; SSE42-NEXT:  LBB14_6: ## %else5
1690; SSE42-NEXT:    testb $8, %al
1691; SSE42-NEXT:    je LBB14_8
1692; SSE42-NEXT:  LBB14_7: ## %cond.load7
1693; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm3
1694; SSE42-NEXT:  LBB14_8: ## %else8
1695; SSE42-NEXT:    movdqa %xmm2, %xmm0
1696; SSE42-NEXT:    movdqa %xmm3, %xmm1
1697; SSE42-NEXT:    retq
1698; SSE42-NEXT:  LBB14_1: ## %cond.load
1699; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm2
1700; SSE42-NEXT:    testb $2, %al
1701; SSE42-NEXT:    je LBB14_4
1702; SSE42-NEXT:  LBB14_3: ## %cond.load1
1703; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm2
1704; SSE42-NEXT:    testb $4, %al
1705; SSE42-NEXT:    je LBB14_6
1706; SSE42-NEXT:  LBB14_5: ## %cond.load4
1707; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm3
1708; SSE42-NEXT:    testb $8, %al
1709; SSE42-NEXT:    jne LBB14_7
1710; SSE42-NEXT:    jmp LBB14_8
1711;
1712; AVX1-LABEL: load_v4i64_v4i64:
1713; AVX1:       ## %bb.0:
1714; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1715; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1716; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
1717; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm0, %xmm0
1718; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1719; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
1720; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1721; AVX1-NEXT:    retq
1722;
1723; AVX2-LABEL: load_v4i64_v4i64:
1724; AVX2:       ## %bb.0:
1725; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1726; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm0, %ymm0
1727; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2
1728; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1729; AVX2-NEXT:    retq
1730;
1731; AVX512F-LABEL: load_v4i64_v4i64:
1732; AVX512F:       ## %bb.0:
1733; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
1734; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
1735; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
1736; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
1737; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
1738; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1739; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
1740; AVX512F-NEXT:    retq
1741;
1742; AVX512VL-LABEL: load_v4i64_v4i64:
1743; AVX512VL:       ## %bb.0:
1744; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1745; AVX512VL-NEXT:    vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
1746; AVX512VL-NEXT:    retq
1747;
1748; X86-AVX512-LABEL: load_v4i64_v4i64:
1749; X86-AVX512:       ## %bb.0:
1750; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
1751; X86-AVX512-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1752; X86-AVX512-NEXT:    vpblendmq (%eax), %ymm1, %ymm0 {%k1}
1753; X86-AVX512-NEXT:    retl
1754  %mask = icmp eq <4 x i64> %trigger, zeroinitializer
1755  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
1756  ret <4 x i64> %res
1757}
1758
1759define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst) {
1760; SSE2-LABEL: load_v8i64_v8i16:
1761; SSE2:       ## %bb.0:
1762; SSE2-NEXT:    pxor %xmm5, %xmm5
1763; SSE2-NEXT:    pcmpeqw %xmm0, %xmm5
1764; SSE2-NEXT:    packsswb %xmm5, %xmm5
1765; SSE2-NEXT:    pmovmskb %xmm5, %eax
1766; SSE2-NEXT:    testb $1, %al
1767; SSE2-NEXT:    jne LBB15_1
1768; SSE2-NEXT:  ## %bb.2: ## %else
1769; SSE2-NEXT:    testb $2, %al
1770; SSE2-NEXT:    jne LBB15_3
1771; SSE2-NEXT:  LBB15_4: ## %else2
1772; SSE2-NEXT:    testb $4, %al
1773; SSE2-NEXT:    jne LBB15_5
1774; SSE2-NEXT:  LBB15_6: ## %else5
1775; SSE2-NEXT:    testb $8, %al
1776; SSE2-NEXT:    jne LBB15_7
1777; SSE2-NEXT:  LBB15_8: ## %else8
1778; SSE2-NEXT:    testb $16, %al
1779; SSE2-NEXT:    jne LBB15_9
1780; SSE2-NEXT:  LBB15_10: ## %else11
1781; SSE2-NEXT:    testb $32, %al
1782; SSE2-NEXT:    jne LBB15_11
1783; SSE2-NEXT:  LBB15_12: ## %else14
1784; SSE2-NEXT:    testb $64, %al
1785; SSE2-NEXT:    jne LBB15_13
1786; SSE2-NEXT:  LBB15_14: ## %else17
1787; SSE2-NEXT:    testb $-128, %al
1788; SSE2-NEXT:    je LBB15_16
1789; SSE2-NEXT:  LBB15_15: ## %cond.load19
1790; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1791; SSE2-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1792; SSE2-NEXT:  LBB15_16: ## %else20
1793; SSE2-NEXT:    movaps %xmm1, %xmm0
1794; SSE2-NEXT:    movaps %xmm2, %xmm1
1795; SSE2-NEXT:    movaps %xmm3, %xmm2
1796; SSE2-NEXT:    movaps %xmm4, %xmm3
1797; SSE2-NEXT:    retq
1798; SSE2-NEXT:  LBB15_1: ## %cond.load
1799; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
1800; SSE2-NEXT:    testb $2, %al
1801; SSE2-NEXT:    je LBB15_4
1802; SSE2-NEXT:  LBB15_3: ## %cond.load1
1803; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1804; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1805; SSE2-NEXT:    testb $4, %al
1806; SSE2-NEXT:    je LBB15_6
1807; SSE2-NEXT:  LBB15_5: ## %cond.load4
1808; SSE2-NEXT:    movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1809; SSE2-NEXT:    testb $8, %al
1810; SSE2-NEXT:    je LBB15_8
1811; SSE2-NEXT:  LBB15_7: ## %cond.load7
1812; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1813; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1814; SSE2-NEXT:    testb $16, %al
1815; SSE2-NEXT:    je LBB15_10
1816; SSE2-NEXT:  LBB15_9: ## %cond.load10
1817; SSE2-NEXT:    movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1818; SSE2-NEXT:    testb $32, %al
1819; SSE2-NEXT:    je LBB15_12
1820; SSE2-NEXT:  LBB15_11: ## %cond.load13
1821; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1822; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1823; SSE2-NEXT:    testb $64, %al
1824; SSE2-NEXT:    je LBB15_14
1825; SSE2-NEXT:  LBB15_13: ## %cond.load16
1826; SSE2-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
1827; SSE2-NEXT:    testb $-128, %al
1828; SSE2-NEXT:    jne LBB15_15
1829; SSE2-NEXT:    jmp LBB15_16
1830;
1831; SSE42-LABEL: load_v8i64_v8i16:
1832; SSE42:       ## %bb.0:
1833; SSE42-NEXT:    pxor %xmm5, %xmm5
1834; SSE42-NEXT:    pcmpeqw %xmm0, %xmm5
1835; SSE42-NEXT:    packsswb %xmm5, %xmm5
1836; SSE42-NEXT:    pmovmskb %xmm5, %eax
1837; SSE42-NEXT:    testb $1, %al
1838; SSE42-NEXT:    jne LBB15_1
1839; SSE42-NEXT:  ## %bb.2: ## %else
1840; SSE42-NEXT:    testb $2, %al
1841; SSE42-NEXT:    jne LBB15_3
1842; SSE42-NEXT:  LBB15_4: ## %else2
1843; SSE42-NEXT:    testb $4, %al
1844; SSE42-NEXT:    jne LBB15_5
1845; SSE42-NEXT:  LBB15_6: ## %else5
1846; SSE42-NEXT:    testb $8, %al
1847; SSE42-NEXT:    jne LBB15_7
1848; SSE42-NEXT:  LBB15_8: ## %else8
1849; SSE42-NEXT:    testb $16, %al
1850; SSE42-NEXT:    jne LBB15_9
1851; SSE42-NEXT:  LBB15_10: ## %else11
1852; SSE42-NEXT:    testb $32, %al
1853; SSE42-NEXT:    jne LBB15_11
1854; SSE42-NEXT:  LBB15_12: ## %else14
1855; SSE42-NEXT:    testb $64, %al
1856; SSE42-NEXT:    jne LBB15_13
1857; SSE42-NEXT:  LBB15_14: ## %else17
1858; SSE42-NEXT:    testb $-128, %al
1859; SSE42-NEXT:    je LBB15_16
1860; SSE42-NEXT:  LBB15_15: ## %cond.load19
1861; SSE42-NEXT:    pinsrq $1, 56(%rdi), %xmm4
1862; SSE42-NEXT:  LBB15_16: ## %else20
1863; SSE42-NEXT:    movdqa %xmm1, %xmm0
1864; SSE42-NEXT:    movdqa %xmm2, %xmm1
1865; SSE42-NEXT:    movdqa %xmm3, %xmm2
1866; SSE42-NEXT:    movdqa %xmm4, %xmm3
1867; SSE42-NEXT:    retq
1868; SSE42-NEXT:  LBB15_1: ## %cond.load
1869; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm1
1870; SSE42-NEXT:    testb $2, %al
1871; SSE42-NEXT:    je LBB15_4
1872; SSE42-NEXT:  LBB15_3: ## %cond.load1
1873; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm1
1874; SSE42-NEXT:    testb $4, %al
1875; SSE42-NEXT:    je LBB15_6
1876; SSE42-NEXT:  LBB15_5: ## %cond.load4
1877; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm2
1878; SSE42-NEXT:    testb $8, %al
1879; SSE42-NEXT:    je LBB15_8
1880; SSE42-NEXT:  LBB15_7: ## %cond.load7
1881; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm2
1882; SSE42-NEXT:    testb $16, %al
1883; SSE42-NEXT:    je LBB15_10
1884; SSE42-NEXT:  LBB15_9: ## %cond.load10
1885; SSE42-NEXT:    pinsrq $0, 32(%rdi), %xmm3
1886; SSE42-NEXT:    testb $32, %al
1887; SSE42-NEXT:    je LBB15_12
1888; SSE42-NEXT:  LBB15_11: ## %cond.load13
1889; SSE42-NEXT:    pinsrq $1, 40(%rdi), %xmm3
1890; SSE42-NEXT:    testb $64, %al
1891; SSE42-NEXT:    je LBB15_14
1892; SSE42-NEXT:  LBB15_13: ## %cond.load16
1893; SSE42-NEXT:    pinsrq $0, 48(%rdi), %xmm4
1894; SSE42-NEXT:    testb $-128, %al
1895; SSE42-NEXT:    jne LBB15_15
1896; SSE42-NEXT:    jmp LBB15_16
1897;
1898; AVX1-LABEL: load_v8i64_v8i16:
1899; AVX1:       ## %bb.0:
1900; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1901; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1902; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm3, %xmm3
1903; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm5
1904; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
1905; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
1906; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
1907; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
1908; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
1909; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1910; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
1911; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
1912; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
1913; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1914; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm3, %ymm1
1915; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1916; AVX1-NEXT:    retq
1917;
1918; AVX2-LABEL: load_v8i64_v8i16:
1919; AVX2:       ## %bb.0:
1920; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1921; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1922; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm3, %xmm3
1923; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
1924; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
1925; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
1926; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm4
1927; AVX2-NEXT:    vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1928; AVX2-NEXT:    vpmaskmovq 32(%rdi), %ymm3, %ymm1
1929; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1930; AVX2-NEXT:    retq
1931;
1932; AVX512F-LABEL: load_v8i64_v8i16:
1933; AVX512F:       ## %bb.0:
1934; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1935; AVX512F-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
1936; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
1937; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
1938; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1939; AVX512F-NEXT:    retq
1940;
1941; AVX512VLDQ-LABEL: load_v8i64_v8i16:
1942; AVX512VLDQ:       ## %bb.0:
1943; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1944; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
1945; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
1946; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
1947; AVX512VLDQ-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1948; AVX512VLDQ-NEXT:    retq
1949;
1950; AVX512VLBW-LABEL: load_v8i64_v8i16:
1951; AVX512VLBW:       ## %bb.0:
1952; AVX512VLBW-NEXT:    vptestnmw %xmm0, %xmm0, %k1
1953; AVX512VLBW-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1954; AVX512VLBW-NEXT:    retq
1955;
1956; X86-AVX512-LABEL: load_v8i64_v8i16:
1957; X86-AVX512:       ## %bb.0:
1958; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
1959; X86-AVX512-NEXT:    vptestnmw %xmm0, %xmm0, %k1
1960; X86-AVX512-NEXT:    vpblendmq (%eax), %zmm1, %zmm0 {%k1}
1961; X86-AVX512-NEXT:    retl
1962  %mask = icmp eq <8 x i16> %trigger, zeroinitializer
1963  %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
1964  ret <8 x i64> %res
1965}
1966
1967define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst) {
1968; SSE2-LABEL: load_v8i64_v8i64:
1969; SSE2:       ## %bb.0:
1970; SSE2-NEXT:    pxor %xmm8, %xmm8
1971; SSE2-NEXT:    pcmpeqd %xmm8, %xmm3
1972; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
1973; SSE2-NEXT:    pand %xmm3, %xmm9
1974; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
1975; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
1976; SSE2-NEXT:    pand %xmm2, %xmm3
1977; SSE2-NEXT:    packssdw %xmm9, %xmm3
1978; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
1979; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
1980; SSE2-NEXT:    pand %xmm1, %xmm2
1981; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
1982; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
1983; SSE2-NEXT:    pand %xmm0, %xmm1
1984; SSE2-NEXT:    packssdw %xmm2, %xmm1
1985; SSE2-NEXT:    packssdw %xmm3, %xmm1
1986; SSE2-NEXT:    packsswb %xmm1, %xmm1
1987; SSE2-NEXT:    pmovmskb %xmm1, %eax
1988; SSE2-NEXT:    testb $1, %al
1989; SSE2-NEXT:    jne LBB16_1
1990; SSE2-NEXT:  ## %bb.2: ## %else
1991; SSE2-NEXT:    testb $2, %al
1992; SSE2-NEXT:    jne LBB16_3
1993; SSE2-NEXT:  LBB16_4: ## %else2
1994; SSE2-NEXT:    testb $4, %al
1995; SSE2-NEXT:    jne LBB16_5
1996; SSE2-NEXT:  LBB16_6: ## %else5
1997; SSE2-NEXT:    testb $8, %al
1998; SSE2-NEXT:    jne LBB16_7
1999; SSE2-NEXT:  LBB16_8: ## %else8
2000; SSE2-NEXT:    testb $16, %al
2001; SSE2-NEXT:    jne LBB16_9
2002; SSE2-NEXT:  LBB16_10: ## %else11
2003; SSE2-NEXT:    testb $32, %al
2004; SSE2-NEXT:    jne LBB16_11
2005; SSE2-NEXT:  LBB16_12: ## %else14
2006; SSE2-NEXT:    testb $64, %al
2007; SSE2-NEXT:    jne LBB16_13
2008; SSE2-NEXT:  LBB16_14: ## %else17
2009; SSE2-NEXT:    testb $-128, %al
2010; SSE2-NEXT:    je LBB16_16
2011; SSE2-NEXT:  LBB16_15: ## %cond.load19
2012; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2013; SSE2-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
2014; SSE2-NEXT:  LBB16_16: ## %else20
2015; SSE2-NEXT:    movaps %xmm4, %xmm0
2016; SSE2-NEXT:    movaps %xmm5, %xmm1
2017; SSE2-NEXT:    movaps %xmm6, %xmm2
2018; SSE2-NEXT:    movaps %xmm7, %xmm3
2019; SSE2-NEXT:    retq
2020; SSE2-NEXT:  LBB16_1: ## %cond.load
2021; SSE2-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
2022; SSE2-NEXT:    testb $2, %al
2023; SSE2-NEXT:    je LBB16_4
2024; SSE2-NEXT:  LBB16_3: ## %cond.load1
2025; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2026; SSE2-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
2027; SSE2-NEXT:    testb $4, %al
2028; SSE2-NEXT:    je LBB16_6
2029; SSE2-NEXT:  LBB16_5: ## %cond.load4
2030; SSE2-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
2031; SSE2-NEXT:    testb $8, %al
2032; SSE2-NEXT:    je LBB16_8
2033; SSE2-NEXT:  LBB16_7: ## %cond.load7
2034; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2035; SSE2-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
2036; SSE2-NEXT:    testb $16, %al
2037; SSE2-NEXT:    je LBB16_10
2038; SSE2-NEXT:  LBB16_9: ## %cond.load10
2039; SSE2-NEXT:    movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
2040; SSE2-NEXT:    testb $32, %al
2041; SSE2-NEXT:    je LBB16_12
2042; SSE2-NEXT:  LBB16_11: ## %cond.load13
2043; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2044; SSE2-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
2045; SSE2-NEXT:    testb $64, %al
2046; SSE2-NEXT:    je LBB16_14
2047; SSE2-NEXT:  LBB16_13: ## %cond.load16
2048; SSE2-NEXT:    movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
2049; SSE2-NEXT:    testb $-128, %al
2050; SSE2-NEXT:    jne LBB16_15
2051; SSE2-NEXT:    jmp LBB16_16
2052;
2053; SSE42-LABEL: load_v8i64_v8i64:
2054; SSE42:       ## %bb.0:
2055; SSE42-NEXT:    pxor %xmm8, %xmm8
2056; SSE42-NEXT:    pcmpeqq %xmm8, %xmm3
2057; SSE42-NEXT:    pcmpeqq %xmm8, %xmm2
2058; SSE42-NEXT:    packssdw %xmm3, %xmm2
2059; SSE42-NEXT:    pcmpeqq %xmm8, %xmm1
2060; SSE42-NEXT:    pcmpeqq %xmm8, %xmm0
2061; SSE42-NEXT:    packssdw %xmm1, %xmm0
2062; SSE42-NEXT:    packssdw %xmm2, %xmm0
2063; SSE42-NEXT:    packsswb %xmm0, %xmm0
2064; SSE42-NEXT:    pmovmskb %xmm0, %eax
2065; SSE42-NEXT:    testb $1, %al
2066; SSE42-NEXT:    jne LBB16_1
2067; SSE42-NEXT:  ## %bb.2: ## %else
2068; SSE42-NEXT:    testb $2, %al
2069; SSE42-NEXT:    jne LBB16_3
2070; SSE42-NEXT:  LBB16_4: ## %else2
2071; SSE42-NEXT:    testb $4, %al
2072; SSE42-NEXT:    jne LBB16_5
2073; SSE42-NEXT:  LBB16_6: ## %else5
2074; SSE42-NEXT:    testb $8, %al
2075; SSE42-NEXT:    jne LBB16_7
2076; SSE42-NEXT:  LBB16_8: ## %else8
2077; SSE42-NEXT:    testb $16, %al
2078; SSE42-NEXT:    jne LBB16_9
2079; SSE42-NEXT:  LBB16_10: ## %else11
2080; SSE42-NEXT:    testb $32, %al
2081; SSE42-NEXT:    jne LBB16_11
2082; SSE42-NEXT:  LBB16_12: ## %else14
2083; SSE42-NEXT:    testb $64, %al
2084; SSE42-NEXT:    jne LBB16_13
2085; SSE42-NEXT:  LBB16_14: ## %else17
2086; SSE42-NEXT:    testb $-128, %al
2087; SSE42-NEXT:    je LBB16_16
2088; SSE42-NEXT:  LBB16_15: ## %cond.load19
2089; SSE42-NEXT:    pinsrq $1, 56(%rdi), %xmm7
2090; SSE42-NEXT:  LBB16_16: ## %else20
2091; SSE42-NEXT:    movdqa %xmm4, %xmm0
2092; SSE42-NEXT:    movdqa %xmm5, %xmm1
2093; SSE42-NEXT:    movdqa %xmm6, %xmm2
2094; SSE42-NEXT:    movdqa %xmm7, %xmm3
2095; SSE42-NEXT:    retq
2096; SSE42-NEXT:  LBB16_1: ## %cond.load
2097; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm4
2098; SSE42-NEXT:    testb $2, %al
2099; SSE42-NEXT:    je LBB16_4
2100; SSE42-NEXT:  LBB16_3: ## %cond.load1
2101; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm4
2102; SSE42-NEXT:    testb $4, %al
2103; SSE42-NEXT:    je LBB16_6
2104; SSE42-NEXT:  LBB16_5: ## %cond.load4
2105; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm5
2106; SSE42-NEXT:    testb $8, %al
2107; SSE42-NEXT:    je LBB16_8
2108; SSE42-NEXT:  LBB16_7: ## %cond.load7
2109; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm5
2110; SSE42-NEXT:    testb $16, %al
2111; SSE42-NEXT:    je LBB16_10
2112; SSE42-NEXT:  LBB16_9: ## %cond.load10
2113; SSE42-NEXT:    pinsrq $0, 32(%rdi), %xmm6
2114; SSE42-NEXT:    testb $32, %al
2115; SSE42-NEXT:    je LBB16_12
2116; SSE42-NEXT:  LBB16_11: ## %cond.load13
2117; SSE42-NEXT:    pinsrq $1, 40(%rdi), %xmm6
2118; SSE42-NEXT:    testb $64, %al
2119; SSE42-NEXT:    je LBB16_14
2120; SSE42-NEXT:  LBB16_13: ## %cond.load16
2121; SSE42-NEXT:    pinsrq $0, 48(%rdi), %xmm7
2122; SSE42-NEXT:    testb $-128, %al
2123; SSE42-NEXT:    jne LBB16_15
2124; SSE42-NEXT:    jmp LBB16_16
2125;
2126; AVX1-LABEL: load_v8i64_v8i64:
2127; AVX1:       ## %bb.0:
2128; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2129; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
2130; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
2131; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm1, %xmm1
2132; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
2133; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
2134; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
2135; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm0, %xmm0
2136; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
2137; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
2138; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2139; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm2
2140; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2141; AVX1-NEXT:    retq
2142;
2143; AVX2-LABEL: load_v8i64_v8i64:
2144; AVX2:       ## %bb.0:
2145; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2146; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm1, %ymm1
2147; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm0, %ymm0
2148; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm4
2149; AVX2-NEXT:    vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2150; AVX2-NEXT:    vpmaskmovq 32(%rdi), %ymm1, %ymm2
2151; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2152; AVX2-NEXT:    retq
2153;
2154; AVX512-LABEL: load_v8i64_v8i64:
2155; AVX512:       ## %bb.0:
2156; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
2157; AVX512-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
2158; AVX512-NEXT:    retq
2159;
2160; X86-AVX512-LABEL: load_v8i64_v8i64:
2161; X86-AVX512:       ## %bb.0:
2162; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
2163; X86-AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
2164; X86-AVX512-NEXT:    vpblendmq (%eax), %zmm1, %zmm0 {%k1}
2165; X86-AVX512-NEXT:    retl
2166  %mask = icmp eq <8 x i64> %trigger, zeroinitializer
2167  %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
2168  ret <8 x i64> %res
2169}
2170
2171;
2172; vXi32
2173;
2174
2175define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
2176; SSE2-LABEL: load_v2i32_v2i32:
2177; SSE2:       ## %bb.0:
2178; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2179; SSE2-NEXT:    pxor %xmm2, %xmm2
2180; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
2181; SSE2-NEXT:    movmskpd %xmm2, %eax
2182; SSE2-NEXT:    testb $1, %al
2183; SSE2-NEXT:    jne LBB17_1
2184; SSE2-NEXT:  ## %bb.2: ## %else
2185; SSE2-NEXT:    testb $2, %al
2186; SSE2-NEXT:    jne LBB17_3
2187; SSE2-NEXT:  LBB17_4: ## %else2
2188; SSE2-NEXT:    movaps %xmm1, %xmm0
2189; SSE2-NEXT:    retq
2190; SSE2-NEXT:  LBB17_1: ## %cond.load
2191; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2192; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2193; SSE2-NEXT:    testb $2, %al
2194; SSE2-NEXT:    je LBB17_4
2195; SSE2-NEXT:  LBB17_3: ## %cond.load1
2196; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2197; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2198; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2199; SSE2-NEXT:    movaps %xmm0, %xmm1
2200; SSE2-NEXT:    movaps %xmm1, %xmm0
2201; SSE2-NEXT:    retq
2202;
2203; SSE42-LABEL: load_v2i32_v2i32:
2204; SSE42:       ## %bb.0:
2205; SSE42-NEXT:    pxor %xmm2, %xmm2
2206; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
2207; SSE42-NEXT:    pmovsxdq %xmm2, %xmm0
2208; SSE42-NEXT:    movmskpd %xmm0, %eax
2209; SSE42-NEXT:    testb $1, %al
2210; SSE42-NEXT:    jne LBB17_1
2211; SSE42-NEXT:  ## %bb.2: ## %else
2212; SSE42-NEXT:    testb $2, %al
2213; SSE42-NEXT:    jne LBB17_3
2214; SSE42-NEXT:  LBB17_4: ## %else2
2215; SSE42-NEXT:    movdqa %xmm1, %xmm0
2216; SSE42-NEXT:    retq
2217; SSE42-NEXT:  LBB17_1: ## %cond.load
2218; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm1
2219; SSE42-NEXT:    testb $2, %al
2220; SSE42-NEXT:    je LBB17_4
2221; SSE42-NEXT:  LBB17_3: ## %cond.load1
2222; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
2223; SSE42-NEXT:    movdqa %xmm1, %xmm0
2224; SSE42-NEXT:    retq
2225;
2226; AVX1-LABEL: load_v2i32_v2i32:
2227; AVX1:       ## %bb.0:
2228; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2229; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
2230; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2231; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
2232; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2233; AVX1-NEXT:    retq
2234;
2235; AVX2-LABEL: load_v2i32_v2i32:
2236; AVX2:       ## %bb.0:
2237; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2238; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
2239; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2240; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
2241; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2242; AVX2-NEXT:    retq
2243;
2244; AVX512F-LABEL: load_v2i32_v2i32:
2245; AVX512F:       ## %bb.0:
2246; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
2247; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
2248; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
2249; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
2250; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
2251; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2252; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
2253; AVX512F-NEXT:    vzeroupper
2254; AVX512F-NEXT:    retq
2255;
2256; AVX512VLDQ-LABEL: load_v2i32_v2i32:
2257; AVX512VLDQ:       ## %bb.0:
2258; AVX512VLDQ-NEXT:    vptestnmd %xmm0, %xmm0, %k0
2259; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
2260; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
2261; AVX512VLDQ-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2262; AVX512VLDQ-NEXT:    retq
2263;
2264; AVX512VLBW-LABEL: load_v2i32_v2i32:
2265; AVX512VLBW:       ## %bb.0:
2266; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k0
2267; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
2268; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k1
2269; AVX512VLBW-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2270; AVX512VLBW-NEXT:    retq
2271;
2272; X86-AVX512-LABEL: load_v2i32_v2i32:
2273; X86-AVX512:       ## %bb.0:
2274; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
2275; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
2276; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
2277; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
2278; X86-AVX512-NEXT:    vpblendmd (%eax), %xmm1, %xmm0 {%k1}
2279; X86-AVX512-NEXT:    retl
2280  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
2281  %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
2282  ret <2 x i32> %res
2283}
2284
2285define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
2286; SSE2-LABEL: load_v4i32_v4i32:
2287; SSE2:       ## %bb.0:
2288; SSE2-NEXT:    pxor %xmm2, %xmm2
2289; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
2290; SSE2-NEXT:    movmskps %xmm2, %eax
2291; SSE2-NEXT:    testb $1, %al
2292; SSE2-NEXT:    jne LBB18_1
2293; SSE2-NEXT:  ## %bb.2: ## %else
2294; SSE2-NEXT:    testb $2, %al
2295; SSE2-NEXT:    jne LBB18_3
2296; SSE2-NEXT:  LBB18_4: ## %else2
2297; SSE2-NEXT:    testb $4, %al
2298; SSE2-NEXT:    jne LBB18_5
2299; SSE2-NEXT:  LBB18_6: ## %else5
2300; SSE2-NEXT:    testb $8, %al
2301; SSE2-NEXT:    jne LBB18_7
2302; SSE2-NEXT:  LBB18_8: ## %else8
2303; SSE2-NEXT:    movaps %xmm1, %xmm0
2304; SSE2-NEXT:    retq
2305; SSE2-NEXT:  LBB18_1: ## %cond.load
2306; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2307; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2308; SSE2-NEXT:    testb $2, %al
2309; SSE2-NEXT:    je LBB18_4
2310; SSE2-NEXT:  LBB18_3: ## %cond.load1
2311; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2312; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2313; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2314; SSE2-NEXT:    movaps %xmm0, %xmm1
2315; SSE2-NEXT:    testb $4, %al
2316; SSE2-NEXT:    je LBB18_6
2317; SSE2-NEXT:  LBB18_5: ## %cond.load4
2318; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2319; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2320; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2321; SSE2-NEXT:    testb $8, %al
2322; SSE2-NEXT:    je LBB18_8
2323; SSE2-NEXT:  LBB18_7: ## %cond.load7
2324; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2325; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2326; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2327; SSE2-NEXT:    movaps %xmm1, %xmm0
2328; SSE2-NEXT:    retq
2329;
2330; SSE42-LABEL: load_v4i32_v4i32:
2331; SSE42:       ## %bb.0:
2332; SSE42-NEXT:    pxor %xmm2, %xmm2
2333; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
2334; SSE42-NEXT:    movmskps %xmm2, %eax
2335; SSE42-NEXT:    testb $1, %al
2336; SSE42-NEXT:    jne LBB18_1
2337; SSE42-NEXT:  ## %bb.2: ## %else
2338; SSE42-NEXT:    testb $2, %al
2339; SSE42-NEXT:    jne LBB18_3
2340; SSE42-NEXT:  LBB18_4: ## %else2
2341; SSE42-NEXT:    testb $4, %al
2342; SSE42-NEXT:    jne LBB18_5
2343; SSE42-NEXT:  LBB18_6: ## %else5
2344; SSE42-NEXT:    testb $8, %al
2345; SSE42-NEXT:    jne LBB18_7
2346; SSE42-NEXT:  LBB18_8: ## %else8
2347; SSE42-NEXT:    movdqa %xmm1, %xmm0
2348; SSE42-NEXT:    retq
2349; SSE42-NEXT:  LBB18_1: ## %cond.load
2350; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm1
2351; SSE42-NEXT:    testb $2, %al
2352; SSE42-NEXT:    je LBB18_4
2353; SSE42-NEXT:  LBB18_3: ## %cond.load1
2354; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
2355; SSE42-NEXT:    testb $4, %al
2356; SSE42-NEXT:    je LBB18_6
2357; SSE42-NEXT:  LBB18_5: ## %cond.load4
2358; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm1
2359; SSE42-NEXT:    testb $8, %al
2360; SSE42-NEXT:    je LBB18_8
2361; SSE42-NEXT:  LBB18_7: ## %cond.load7
2362; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm1
2363; SSE42-NEXT:    movdqa %xmm1, %xmm0
2364; SSE42-NEXT:    retq
2365;
2366; AVX1-LABEL: load_v4i32_v4i32:
2367; AVX1:       ## %bb.0:
2368; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2369; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
2370; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
2371; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2372; AVX1-NEXT:    retq
2373;
2374; AVX2-LABEL: load_v4i32_v4i32:
2375; AVX2:       ## %bb.0:
2376; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2377; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
2378; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
2379; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2380; AVX2-NEXT:    retq
2381;
2382; AVX512F-LABEL: load_v4i32_v4i32:
2383; AVX512F:       ## %bb.0:
2384; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
2385; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
2386; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
2387; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
2388; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
2389; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2390; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
2391; AVX512F-NEXT:    vzeroupper
2392; AVX512F-NEXT:    retq
2393;
2394; AVX512VL-LABEL: load_v4i32_v4i32:
2395; AVX512VL:       ## %bb.0:
2396; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
2397; AVX512VL-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2398; AVX512VL-NEXT:    retq
2399;
2400; X86-AVX512-LABEL: load_v4i32_v4i32:
2401; X86-AVX512:       ## %bb.0:
2402; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
2403; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
2404; X86-AVX512-NEXT:    vpblendmd (%eax), %xmm1, %xmm0 {%k1}
2405; X86-AVX512-NEXT:    retl
2406  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
2407  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
2408  ret <4 x i32> %res
2409}
2410
2411define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, ptr %addr, <8 x i32> %dst) {
2412; SSE2-LABEL: load_v8i32_v8i1:
2413; SSE2:       ## %bb.0:
2414; SSE2-NEXT:    psllw $15, %xmm0
2415; SSE2-NEXT:    packsswb %xmm0, %xmm0
2416; SSE2-NEXT:    pmovmskb %xmm0, %eax
2417; SSE2-NEXT:    testb $1, %al
2418; SSE2-NEXT:    jne LBB19_1
2419; SSE2-NEXT:  ## %bb.2: ## %else
2420; SSE2-NEXT:    testb $2, %al
2421; SSE2-NEXT:    jne LBB19_3
2422; SSE2-NEXT:  LBB19_4: ## %else2
2423; SSE2-NEXT:    testb $4, %al
2424; SSE2-NEXT:    jne LBB19_5
2425; SSE2-NEXT:  LBB19_6: ## %else5
2426; SSE2-NEXT:    testb $8, %al
2427; SSE2-NEXT:    jne LBB19_7
2428; SSE2-NEXT:  LBB19_8: ## %else8
2429; SSE2-NEXT:    testb $16, %al
2430; SSE2-NEXT:    jne LBB19_9
2431; SSE2-NEXT:  LBB19_10: ## %else11
2432; SSE2-NEXT:    testb $32, %al
2433; SSE2-NEXT:    jne LBB19_11
2434; SSE2-NEXT:  LBB19_12: ## %else14
2435; SSE2-NEXT:    testb $64, %al
2436; SSE2-NEXT:    jne LBB19_13
2437; SSE2-NEXT:  LBB19_14: ## %else17
2438; SSE2-NEXT:    testb $-128, %al
2439; SSE2-NEXT:    je LBB19_16
2440; SSE2-NEXT:  LBB19_15: ## %cond.load19
2441; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2442; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2443; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
2444; SSE2-NEXT:  LBB19_16: ## %else20
2445; SSE2-NEXT:    movaps %xmm1, %xmm0
2446; SSE2-NEXT:    movaps %xmm2, %xmm1
2447; SSE2-NEXT:    retq
2448; SSE2-NEXT:  LBB19_1: ## %cond.load
2449; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2450; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2451; SSE2-NEXT:    testb $2, %al
2452; SSE2-NEXT:    je LBB19_4
2453; SSE2-NEXT:  LBB19_3: ## %cond.load1
2454; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2455; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2456; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2457; SSE2-NEXT:    movaps %xmm0, %xmm1
2458; SSE2-NEXT:    testb $4, %al
2459; SSE2-NEXT:    je LBB19_6
2460; SSE2-NEXT:  LBB19_5: ## %cond.load4
2461; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2462; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2463; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2464; SSE2-NEXT:    testb $8, %al
2465; SSE2-NEXT:    je LBB19_8
2466; SSE2-NEXT:  LBB19_7: ## %cond.load7
2467; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2468; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2469; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2470; SSE2-NEXT:    testb $16, %al
2471; SSE2-NEXT:    je LBB19_10
2472; SSE2-NEXT:  LBB19_9: ## %cond.load10
2473; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2474; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2475; SSE2-NEXT:    testb $32, %al
2476; SSE2-NEXT:    je LBB19_12
2477; SSE2-NEXT:  LBB19_11: ## %cond.load13
2478; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2479; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2480; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
2481; SSE2-NEXT:    movaps %xmm0, %xmm2
2482; SSE2-NEXT:    testb $64, %al
2483; SSE2-NEXT:    je LBB19_14
2484; SSE2-NEXT:  LBB19_13: ## %cond.load16
2485; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2486; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
2487; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
2488; SSE2-NEXT:    testb $-128, %al
2489; SSE2-NEXT:    jne LBB19_15
2490; SSE2-NEXT:    jmp LBB19_16
2491;
2492; SSE42-LABEL: load_v8i32_v8i1:
2493; SSE42:       ## %bb.0:
2494; SSE42-NEXT:    psllw $15, %xmm0
2495; SSE42-NEXT:    packsswb %xmm0, %xmm0
2496; SSE42-NEXT:    pmovmskb %xmm0, %eax
2497; SSE42-NEXT:    testb $1, %al
2498; SSE42-NEXT:    jne LBB19_1
2499; SSE42-NEXT:  ## %bb.2: ## %else
2500; SSE42-NEXT:    testb $2, %al
2501; SSE42-NEXT:    jne LBB19_3
2502; SSE42-NEXT:  LBB19_4: ## %else2
2503; SSE42-NEXT:    testb $4, %al
2504; SSE42-NEXT:    jne LBB19_5
2505; SSE42-NEXT:  LBB19_6: ## %else5
2506; SSE42-NEXT:    testb $8, %al
2507; SSE42-NEXT:    jne LBB19_7
2508; SSE42-NEXT:  LBB19_8: ## %else8
2509; SSE42-NEXT:    testb $16, %al
2510; SSE42-NEXT:    jne LBB19_9
2511; SSE42-NEXT:  LBB19_10: ## %else11
2512; SSE42-NEXT:    testb $32, %al
2513; SSE42-NEXT:    jne LBB19_11
2514; SSE42-NEXT:  LBB19_12: ## %else14
2515; SSE42-NEXT:    testb $64, %al
2516; SSE42-NEXT:    jne LBB19_13
2517; SSE42-NEXT:  LBB19_14: ## %else17
2518; SSE42-NEXT:    testb $-128, %al
2519; SSE42-NEXT:    je LBB19_16
2520; SSE42-NEXT:  LBB19_15: ## %cond.load19
2521; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm2
2522; SSE42-NEXT:  LBB19_16: ## %else20
2523; SSE42-NEXT:    movdqa %xmm1, %xmm0
2524; SSE42-NEXT:    movdqa %xmm2, %xmm1
2525; SSE42-NEXT:    retq
2526; SSE42-NEXT:  LBB19_1: ## %cond.load
2527; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm1
2528; SSE42-NEXT:    testb $2, %al
2529; SSE42-NEXT:    je LBB19_4
2530; SSE42-NEXT:  LBB19_3: ## %cond.load1
2531; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
2532; SSE42-NEXT:    testb $4, %al
2533; SSE42-NEXT:    je LBB19_6
2534; SSE42-NEXT:  LBB19_5: ## %cond.load4
2535; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm1
2536; SSE42-NEXT:    testb $8, %al
2537; SSE42-NEXT:    je LBB19_8
2538; SSE42-NEXT:  LBB19_7: ## %cond.load7
2539; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm1
2540; SSE42-NEXT:    testb $16, %al
2541; SSE42-NEXT:    je LBB19_10
2542; SSE42-NEXT:  LBB19_9: ## %cond.load10
2543; SSE42-NEXT:    pinsrd $0, 16(%rdi), %xmm2
2544; SSE42-NEXT:    testb $32, %al
2545; SSE42-NEXT:    je LBB19_12
2546; SSE42-NEXT:  LBB19_11: ## %cond.load13
2547; SSE42-NEXT:    pinsrd $1, 20(%rdi), %xmm2
2548; SSE42-NEXT:    testb $64, %al
2549; SSE42-NEXT:    je LBB19_14
2550; SSE42-NEXT:  LBB19_13: ## %cond.load16
2551; SSE42-NEXT:    pinsrd $2, 24(%rdi), %xmm2
2552; SSE42-NEXT:    testb $-128, %al
2553; SSE42-NEXT:    jne LBB19_15
2554; SSE42-NEXT:    jmp LBB19_16
2555;
2556; AVX1-LABEL: load_v8i32_v8i1:
2557; AVX1:       ## %bb.0:
2558; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2559; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
2560; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2561; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
2562; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
2563; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
2564; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2565; AVX1-NEXT:    retq
2566;
2567; AVX2-LABEL: load_v8i32_v8i1:
2568; AVX2:       ## %bb.0:
2569; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2570; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
2571; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2
2572; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2573; AVX2-NEXT:    retq
2574;
2575; AVX512F-LABEL: load_v8i32_v8i1:
2576; AVX512F:       ## %bb.0:
2577; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
2578; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
2579; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
2580; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
2581; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2582; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
2583; AVX512F-NEXT:    retq
2584;
2585; AVX512VLDQ-LABEL: load_v8i32_v8i1:
2586; AVX512VLDQ:       ## %bb.0:
2587; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
2588; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
2589; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
2590; AVX512VLDQ-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2591; AVX512VLDQ-NEXT:    retq
2592;
2593; AVX512VLBW-LABEL: load_v8i32_v8i1:
2594; AVX512VLBW:       ## %bb.0:
2595; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
2596; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
2597; AVX512VLBW-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2598; AVX512VLBW-NEXT:    retq
2599;
2600; X86-AVX512-LABEL: load_v8i32_v8i1:
2601; X86-AVX512:       ## %bb.0:
2602; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
2603; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
2604; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
2605; X86-AVX512-NEXT:    vpblendmd (%eax), %ymm1, %ymm0 {%k1}
2606; X86-AVX512-NEXT:    retl
2607  %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
2608  ret <8 x i32> %res
2609}
2610
2611define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
2612; SSE2-LABEL: load_v8i32_v8i1_zero:
2613; SSE2:       ## %bb.0:
2614; SSE2-NEXT:    psllw $15, %xmm0
2615; SSE2-NEXT:    packsswb %xmm0, %xmm0
2616; SSE2-NEXT:    pmovmskb %xmm0, %eax
2617; SSE2-NEXT:    pxor %xmm0, %xmm0
2618; SSE2-NEXT:    testb $1, %al
2619; SSE2-NEXT:    xorps %xmm1, %xmm1
2620; SSE2-NEXT:    jne LBB20_1
2621; SSE2-NEXT:  ## %bb.2: ## %else
2622; SSE2-NEXT:    testb $2, %al
2623; SSE2-NEXT:    jne LBB20_3
2624; SSE2-NEXT:  LBB20_4: ## %else2
2625; SSE2-NEXT:    testb $4, %al
2626; SSE2-NEXT:    jne LBB20_5
2627; SSE2-NEXT:  LBB20_6: ## %else5
2628; SSE2-NEXT:    testb $8, %al
2629; SSE2-NEXT:    jne LBB20_7
2630; SSE2-NEXT:  LBB20_8: ## %else8
2631; SSE2-NEXT:    testb $16, %al
2632; SSE2-NEXT:    jne LBB20_9
2633; SSE2-NEXT:  LBB20_10: ## %else11
2634; SSE2-NEXT:    testb $32, %al
2635; SSE2-NEXT:    jne LBB20_11
2636; SSE2-NEXT:  LBB20_12: ## %else14
2637; SSE2-NEXT:    testb $64, %al
2638; SSE2-NEXT:    jne LBB20_13
2639; SSE2-NEXT:  LBB20_14: ## %else17
2640; SSE2-NEXT:    testb $-128, %al
2641; SSE2-NEXT:    jne LBB20_15
2642; SSE2-NEXT:  LBB20_16: ## %else20
2643; SSE2-NEXT:    retq
2644; SSE2-NEXT:  LBB20_1: ## %cond.load
2645; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2646; SSE2-NEXT:    testb $2, %al
2647; SSE2-NEXT:    je LBB20_4
2648; SSE2-NEXT:  LBB20_3: ## %cond.load1
2649; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2650; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2651; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
2652; SSE2-NEXT:    movaps %xmm2, %xmm0
2653; SSE2-NEXT:    testb $4, %al
2654; SSE2-NEXT:    je LBB20_6
2655; SSE2-NEXT:  LBB20_5: ## %cond.load4
2656; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2657; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
2658; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
2659; SSE2-NEXT:    testb $8, %al
2660; SSE2-NEXT:    je LBB20_8
2661; SSE2-NEXT:  LBB20_7: ## %cond.load7
2662; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2663; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
2664; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
2665; SSE2-NEXT:    testb $16, %al
2666; SSE2-NEXT:    je LBB20_10
2667; SSE2-NEXT:  LBB20_9: ## %cond.load10
2668; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2669; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2670; SSE2-NEXT:    testb $32, %al
2671; SSE2-NEXT:    je LBB20_12
2672; SSE2-NEXT:  LBB20_11: ## %cond.load13
2673; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2674; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2675; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
2676; SSE2-NEXT:    movaps %xmm2, %xmm1
2677; SSE2-NEXT:    testb $64, %al
2678; SSE2-NEXT:    je LBB20_14
2679; SSE2-NEXT:  LBB20_13: ## %cond.load16
2680; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2681; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
2682; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
2683; SSE2-NEXT:    testb $-128, %al
2684; SSE2-NEXT:    je LBB20_16
2685; SSE2-NEXT:  LBB20_15: ## %cond.load19
2686; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2687; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
2688; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
2689; SSE2-NEXT:    retq
2690;
2691; SSE42-LABEL: load_v8i32_v8i1_zero:
2692; SSE42:       ## %bb.0:
2693; SSE42-NEXT:    psllw $15, %xmm0
2694; SSE42-NEXT:    packsswb %xmm0, %xmm0
2695; SSE42-NEXT:    pmovmskb %xmm0, %eax
2696; SSE42-NEXT:    pxor %xmm0, %xmm0
2697; SSE42-NEXT:    testb $1, %al
2698; SSE42-NEXT:    pxor %xmm1, %xmm1
2699; SSE42-NEXT:    jne LBB20_1
2700; SSE42-NEXT:  ## %bb.2: ## %else
2701; SSE42-NEXT:    testb $2, %al
2702; SSE42-NEXT:    jne LBB20_3
2703; SSE42-NEXT:  LBB20_4: ## %else2
2704; SSE42-NEXT:    testb $4, %al
2705; SSE42-NEXT:    jne LBB20_5
2706; SSE42-NEXT:  LBB20_6: ## %else5
2707; SSE42-NEXT:    testb $8, %al
2708; SSE42-NEXT:    jne LBB20_7
2709; SSE42-NEXT:  LBB20_8: ## %else8
2710; SSE42-NEXT:    testb $16, %al
2711; SSE42-NEXT:    jne LBB20_9
2712; SSE42-NEXT:  LBB20_10: ## %else11
2713; SSE42-NEXT:    testb $32, %al
2714; SSE42-NEXT:    jne LBB20_11
2715; SSE42-NEXT:  LBB20_12: ## %else14
2716; SSE42-NEXT:    testb $64, %al
2717; SSE42-NEXT:    jne LBB20_13
2718; SSE42-NEXT:  LBB20_14: ## %else17
2719; SSE42-NEXT:    testb $-128, %al
2720; SSE42-NEXT:    jne LBB20_15
2721; SSE42-NEXT:  LBB20_16: ## %else20
2722; SSE42-NEXT:    retq
2723; SSE42-NEXT:  LBB20_1: ## %cond.load
2724; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2725; SSE42-NEXT:    testb $2, %al
2726; SSE42-NEXT:    je LBB20_4
2727; SSE42-NEXT:  LBB20_3: ## %cond.load1
2728; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
2729; SSE42-NEXT:    testb $4, %al
2730; SSE42-NEXT:    je LBB20_6
2731; SSE42-NEXT:  LBB20_5: ## %cond.load4
2732; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
2733; SSE42-NEXT:    testb $8, %al
2734; SSE42-NEXT:    je LBB20_8
2735; SSE42-NEXT:  LBB20_7: ## %cond.load7
2736; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm0
2737; SSE42-NEXT:    testb $16, %al
2738; SSE42-NEXT:    je LBB20_10
2739; SSE42-NEXT:  LBB20_9: ## %cond.load10
2740; SSE42-NEXT:    pinsrd $0, 16(%rdi), %xmm1
2741; SSE42-NEXT:    testb $32, %al
2742; SSE42-NEXT:    je LBB20_12
2743; SSE42-NEXT:  LBB20_11: ## %cond.load13
2744; SSE42-NEXT:    pinsrd $1, 20(%rdi), %xmm1
2745; SSE42-NEXT:    testb $64, %al
2746; SSE42-NEXT:    je LBB20_14
2747; SSE42-NEXT:  LBB20_13: ## %cond.load16
2748; SSE42-NEXT:    pinsrd $2, 24(%rdi), %xmm1
2749; SSE42-NEXT:    testb $-128, %al
2750; SSE42-NEXT:    je LBB20_16
2751; SSE42-NEXT:  LBB20_15: ## %cond.load19
2752; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
2753; SSE42-NEXT:    retq
2754;
2755; AVX1-LABEL: load_v8i32_v8i1_zero:
2756; AVX1:       ## %bb.0:
2757; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2758; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
2759; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2760; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
2761; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2762; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
2763; AVX1-NEXT:    retq
2764;
2765; AVX2-LABEL: load_v8i32_v8i1_zero:
2766; AVX2:       ## %bb.0:
2767; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2768; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
2769; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
2770; AVX2-NEXT:    retq
2771;
2772; AVX512F-LABEL: load_v8i32_v8i1_zero:
2773; AVX512F:       ## %bb.0:
2774; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
2775; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
2776; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
2777; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
2778; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
2779; AVX512F-NEXT:    retq
2780;
2781; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero:
2782; AVX512VLDQ:       ## %bb.0:
2783; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
2784; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
2785; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
2786; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2787; AVX512VLDQ-NEXT:    retq
2788;
2789; AVX512VLBW-LABEL: load_v8i32_v8i1_zero:
2790; AVX512VLBW:       ## %bb.0:
2791; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
2792; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
2793; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2794; AVX512VLBW-NEXT:    retq
2795;
2796; X86-AVX512-LABEL: load_v8i32_v8i1_zero:
2797; X86-AVX512:       ## %bb.0:
2798; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
2799; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
2800; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
2801; X86-AVX512-NEXT:    vmovdqu32 (%eax), %ymm0 {%k1} {z}
2802; X86-AVX512-NEXT:    retl
2803  %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
2804  ret <8 x i32> %res
2805}
2806
2807;
2808; vXi16
2809;
2810
2811define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst) {
2812; SSE-LABEL: load_v8i16_v8i16:
2813; SSE:       ## %bb.0:
2814; SSE-NEXT:    packsswb %xmm0, %xmm0
2815; SSE-NEXT:    pmovmskb %xmm0, %eax
2816; SSE-NEXT:    testb $1, %al
2817; SSE-NEXT:    jne LBB21_1
2818; SSE-NEXT:  ## %bb.2: ## %else
2819; SSE-NEXT:    testb $2, %al
2820; SSE-NEXT:    jne LBB21_3
2821; SSE-NEXT:  LBB21_4: ## %else2
2822; SSE-NEXT:    testb $4, %al
2823; SSE-NEXT:    jne LBB21_5
2824; SSE-NEXT:  LBB21_6: ## %else5
2825; SSE-NEXT:    testb $8, %al
2826; SSE-NEXT:    jne LBB21_7
2827; SSE-NEXT:  LBB21_8: ## %else8
2828; SSE-NEXT:    testb $16, %al
2829; SSE-NEXT:    jne LBB21_9
2830; SSE-NEXT:  LBB21_10: ## %else11
2831; SSE-NEXT:    testb $32, %al
2832; SSE-NEXT:    jne LBB21_11
2833; SSE-NEXT:  LBB21_12: ## %else14
2834; SSE-NEXT:    testb $64, %al
2835; SSE-NEXT:    jne LBB21_13
2836; SSE-NEXT:  LBB21_14: ## %else17
2837; SSE-NEXT:    testb $-128, %al
2838; SSE-NEXT:    jne LBB21_15
2839; SSE-NEXT:  LBB21_16: ## %else20
2840; SSE-NEXT:    movdqa %xmm1, %xmm0
2841; SSE-NEXT:    retq
2842; SSE-NEXT:  LBB21_1: ## %cond.load
2843; SSE-NEXT:    pinsrw $0, (%rdi), %xmm1
2844; SSE-NEXT:    testb $2, %al
2845; SSE-NEXT:    je LBB21_4
2846; SSE-NEXT:  LBB21_3: ## %cond.load1
2847; SSE-NEXT:    pinsrw $1, 2(%rdi), %xmm1
2848; SSE-NEXT:    testb $4, %al
2849; SSE-NEXT:    je LBB21_6
2850; SSE-NEXT:  LBB21_5: ## %cond.load4
2851; SSE-NEXT:    pinsrw $2, 4(%rdi), %xmm1
2852; SSE-NEXT:    testb $8, %al
2853; SSE-NEXT:    je LBB21_8
2854; SSE-NEXT:  LBB21_7: ## %cond.load7
2855; SSE-NEXT:    pinsrw $3, 6(%rdi), %xmm1
2856; SSE-NEXT:    testb $16, %al
2857; SSE-NEXT:    je LBB21_10
2858; SSE-NEXT:  LBB21_9: ## %cond.load10
2859; SSE-NEXT:    pinsrw $4, 8(%rdi), %xmm1
2860; SSE-NEXT:    testb $32, %al
2861; SSE-NEXT:    je LBB21_12
2862; SSE-NEXT:  LBB21_11: ## %cond.load13
2863; SSE-NEXT:    pinsrw $5, 10(%rdi), %xmm1
2864; SSE-NEXT:    testb $64, %al
2865; SSE-NEXT:    je LBB21_14
2866; SSE-NEXT:  LBB21_13: ## %cond.load16
2867; SSE-NEXT:    pinsrw $6, 12(%rdi), %xmm1
2868; SSE-NEXT:    testb $-128, %al
2869; SSE-NEXT:    je LBB21_16
2870; SSE-NEXT:  LBB21_15: ## %cond.load19
2871; SSE-NEXT:    pinsrw $7, 14(%rdi), %xmm1
2872; SSE-NEXT:    movdqa %xmm1, %xmm0
2873; SSE-NEXT:    retq
2874;
2875; AVX1OR2-LABEL: load_v8i16_v8i16:
2876; AVX1OR2:       ## %bb.0:
2877; AVX1OR2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
2878; AVX1OR2-NEXT:    vpmovmskb %xmm0, %eax
2879; AVX1OR2-NEXT:    testb $1, %al
2880; AVX1OR2-NEXT:    jne LBB21_1
2881; AVX1OR2-NEXT:  ## %bb.2: ## %else
2882; AVX1OR2-NEXT:    testb $2, %al
2883; AVX1OR2-NEXT:    jne LBB21_3
2884; AVX1OR2-NEXT:  LBB21_4: ## %else2
2885; AVX1OR2-NEXT:    testb $4, %al
2886; AVX1OR2-NEXT:    jne LBB21_5
2887; AVX1OR2-NEXT:  LBB21_6: ## %else5
2888; AVX1OR2-NEXT:    testb $8, %al
2889; AVX1OR2-NEXT:    jne LBB21_7
2890; AVX1OR2-NEXT:  LBB21_8: ## %else8
2891; AVX1OR2-NEXT:    testb $16, %al
2892; AVX1OR2-NEXT:    jne LBB21_9
2893; AVX1OR2-NEXT:  LBB21_10: ## %else11
2894; AVX1OR2-NEXT:    testb $32, %al
2895; AVX1OR2-NEXT:    jne LBB21_11
2896; AVX1OR2-NEXT:  LBB21_12: ## %else14
2897; AVX1OR2-NEXT:    testb $64, %al
2898; AVX1OR2-NEXT:    jne LBB21_13
2899; AVX1OR2-NEXT:  LBB21_14: ## %else17
2900; AVX1OR2-NEXT:    testb $-128, %al
2901; AVX1OR2-NEXT:    jne LBB21_15
2902; AVX1OR2-NEXT:  LBB21_16: ## %else20
2903; AVX1OR2-NEXT:    vmovdqa %xmm1, %xmm0
2904; AVX1OR2-NEXT:    retq
2905; AVX1OR2-NEXT:  LBB21_1: ## %cond.load
2906; AVX1OR2-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm1
2907; AVX1OR2-NEXT:    testb $2, %al
2908; AVX1OR2-NEXT:    je LBB21_4
2909; AVX1OR2-NEXT:  LBB21_3: ## %cond.load1
2910; AVX1OR2-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2911; AVX1OR2-NEXT:    testb $4, %al
2912; AVX1OR2-NEXT:    je LBB21_6
2913; AVX1OR2-NEXT:  LBB21_5: ## %cond.load4
2914; AVX1OR2-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2915; AVX1OR2-NEXT:    testb $8, %al
2916; AVX1OR2-NEXT:    je LBB21_8
2917; AVX1OR2-NEXT:  LBB21_7: ## %cond.load7
2918; AVX1OR2-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2919; AVX1OR2-NEXT:    testb $16, %al
2920; AVX1OR2-NEXT:    je LBB21_10
2921; AVX1OR2-NEXT:  LBB21_9: ## %cond.load10
2922; AVX1OR2-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2923; AVX1OR2-NEXT:    testb $32, %al
2924; AVX1OR2-NEXT:    je LBB21_12
2925; AVX1OR2-NEXT:  LBB21_11: ## %cond.load13
2926; AVX1OR2-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2927; AVX1OR2-NEXT:    testb $64, %al
2928; AVX1OR2-NEXT:    je LBB21_14
2929; AVX1OR2-NEXT:  LBB21_13: ## %cond.load16
2930; AVX1OR2-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2931; AVX1OR2-NEXT:    testb $-128, %al
2932; AVX1OR2-NEXT:    je LBB21_16
2933; AVX1OR2-NEXT:  LBB21_15: ## %cond.load19
2934; AVX1OR2-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm1
2935; AVX1OR2-NEXT:    vmovdqa %xmm1, %xmm0
2936; AVX1OR2-NEXT:    retq
2937;
2938; AVX512F-LABEL: load_v8i16_v8i16:
2939; AVX512F:       ## %bb.0:
2940; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2941; AVX512F-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
2942; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
2943; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
2944; AVX512F-NEXT:    kmovw %k0, %eax
2945; AVX512F-NEXT:    testb $1, %al
2946; AVX512F-NEXT:    jne LBB21_1
2947; AVX512F-NEXT:  ## %bb.2: ## %else
2948; AVX512F-NEXT:    testb $2, %al
2949; AVX512F-NEXT:    jne LBB21_3
2950; AVX512F-NEXT:  LBB21_4: ## %else2
2951; AVX512F-NEXT:    testb $4, %al
2952; AVX512F-NEXT:    jne LBB21_5
2953; AVX512F-NEXT:  LBB21_6: ## %else5
2954; AVX512F-NEXT:    testb $8, %al
2955; AVX512F-NEXT:    jne LBB21_7
2956; AVX512F-NEXT:  LBB21_8: ## %else8
2957; AVX512F-NEXT:    testb $16, %al
2958; AVX512F-NEXT:    jne LBB21_9
2959; AVX512F-NEXT:  LBB21_10: ## %else11
2960; AVX512F-NEXT:    testb $32, %al
2961; AVX512F-NEXT:    jne LBB21_11
2962; AVX512F-NEXT:  LBB21_12: ## %else14
2963; AVX512F-NEXT:    testb $64, %al
2964; AVX512F-NEXT:    jne LBB21_13
2965; AVX512F-NEXT:  LBB21_14: ## %else17
2966; AVX512F-NEXT:    testb $-128, %al
2967; AVX512F-NEXT:    jne LBB21_15
2968; AVX512F-NEXT:  LBB21_16: ## %else20
2969; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
2970; AVX512F-NEXT:    vzeroupper
2971; AVX512F-NEXT:    retq
2972; AVX512F-NEXT:  LBB21_1: ## %cond.load
2973; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm1
2974; AVX512F-NEXT:    testb $2, %al
2975; AVX512F-NEXT:    je LBB21_4
2976; AVX512F-NEXT:  LBB21_3: ## %cond.load1
2977; AVX512F-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2978; AVX512F-NEXT:    testb $4, %al
2979; AVX512F-NEXT:    je LBB21_6
2980; AVX512F-NEXT:  LBB21_5: ## %cond.load4
2981; AVX512F-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2982; AVX512F-NEXT:    testb $8, %al
2983; AVX512F-NEXT:    je LBB21_8
2984; AVX512F-NEXT:  LBB21_7: ## %cond.load7
2985; AVX512F-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2986; AVX512F-NEXT:    testb $16, %al
2987; AVX512F-NEXT:    je LBB21_10
2988; AVX512F-NEXT:  LBB21_9: ## %cond.load10
2989; AVX512F-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2990; AVX512F-NEXT:    testb $32, %al
2991; AVX512F-NEXT:    je LBB21_12
2992; AVX512F-NEXT:  LBB21_11: ## %cond.load13
2993; AVX512F-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2994; AVX512F-NEXT:    testb $64, %al
2995; AVX512F-NEXT:    je LBB21_14
2996; AVX512F-NEXT:  LBB21_13: ## %cond.load16
2997; AVX512F-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2998; AVX512F-NEXT:    testb $-128, %al
2999; AVX512F-NEXT:    je LBB21_16
3000; AVX512F-NEXT:  LBB21_15: ## %cond.load19
3001; AVX512F-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm1
3002; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
3003; AVX512F-NEXT:    vzeroupper
3004; AVX512F-NEXT:    retq
3005;
3006; AVX512VLDQ-LABEL: load_v8i16_v8i16:
3007; AVX512VLDQ:       ## %bb.0:
3008; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3009; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
3010; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
3011; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k0
3012; AVX512VLDQ-NEXT:    kmovw %k0, %eax
3013; AVX512VLDQ-NEXT:    testb $1, %al
3014; AVX512VLDQ-NEXT:    jne LBB21_1
3015; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
3016; AVX512VLDQ-NEXT:    testb $2, %al
3017; AVX512VLDQ-NEXT:    jne LBB21_3
3018; AVX512VLDQ-NEXT:  LBB21_4: ## %else2
3019; AVX512VLDQ-NEXT:    testb $4, %al
3020; AVX512VLDQ-NEXT:    jne LBB21_5
3021; AVX512VLDQ-NEXT:  LBB21_6: ## %else5
3022; AVX512VLDQ-NEXT:    testb $8, %al
3023; AVX512VLDQ-NEXT:    jne LBB21_7
3024; AVX512VLDQ-NEXT:  LBB21_8: ## %else8
3025; AVX512VLDQ-NEXT:    testb $16, %al
3026; AVX512VLDQ-NEXT:    jne LBB21_9
3027; AVX512VLDQ-NEXT:  LBB21_10: ## %else11
3028; AVX512VLDQ-NEXT:    testb $32, %al
3029; AVX512VLDQ-NEXT:    jne LBB21_11
3030; AVX512VLDQ-NEXT:  LBB21_12: ## %else14
3031; AVX512VLDQ-NEXT:    testb $64, %al
3032; AVX512VLDQ-NEXT:    jne LBB21_13
3033; AVX512VLDQ-NEXT:  LBB21_14: ## %else17
3034; AVX512VLDQ-NEXT:    testb $-128, %al
3035; AVX512VLDQ-NEXT:    jne LBB21_15
3036; AVX512VLDQ-NEXT:  LBB21_16: ## %else20
3037; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
3038; AVX512VLDQ-NEXT:    vzeroupper
3039; AVX512VLDQ-NEXT:    retq
3040; AVX512VLDQ-NEXT:  LBB21_1: ## %cond.load
3041; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm1
3042; AVX512VLDQ-NEXT:    testb $2, %al
3043; AVX512VLDQ-NEXT:    je LBB21_4
3044; AVX512VLDQ-NEXT:  LBB21_3: ## %cond.load1
3045; AVX512VLDQ-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm1
3046; AVX512VLDQ-NEXT:    testb $4, %al
3047; AVX512VLDQ-NEXT:    je LBB21_6
3048; AVX512VLDQ-NEXT:  LBB21_5: ## %cond.load4
3049; AVX512VLDQ-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm1
3050; AVX512VLDQ-NEXT:    testb $8, %al
3051; AVX512VLDQ-NEXT:    je LBB21_8
3052; AVX512VLDQ-NEXT:  LBB21_7: ## %cond.load7
3053; AVX512VLDQ-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
3054; AVX512VLDQ-NEXT:    testb $16, %al
3055; AVX512VLDQ-NEXT:    je LBB21_10
3056; AVX512VLDQ-NEXT:  LBB21_9: ## %cond.load10
3057; AVX512VLDQ-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm1
3058; AVX512VLDQ-NEXT:    testb $32, %al
3059; AVX512VLDQ-NEXT:    je LBB21_12
3060; AVX512VLDQ-NEXT:  LBB21_11: ## %cond.load13
3061; AVX512VLDQ-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm1
3062; AVX512VLDQ-NEXT:    testb $64, %al
3063; AVX512VLDQ-NEXT:    je LBB21_14
3064; AVX512VLDQ-NEXT:  LBB21_13: ## %cond.load16
3065; AVX512VLDQ-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm1
3066; AVX512VLDQ-NEXT:    testb $-128, %al
3067; AVX512VLDQ-NEXT:    je LBB21_16
3068; AVX512VLDQ-NEXT:  LBB21_15: ## %cond.load19
3069; AVX512VLDQ-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm1
3070; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
3071; AVX512VLDQ-NEXT:    vzeroupper
3072; AVX512VLDQ-NEXT:    retq
3073;
3074; AVX512VLBW-LABEL: load_v8i16_v8i16:
3075; AVX512VLBW:       ## %bb.0:
3076; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
3077; AVX512VLBW-NEXT:    vpblendmw (%rdi), %xmm1, %xmm0 {%k1}
3078; AVX512VLBW-NEXT:    retq
3079;
3080; X86-AVX512-LABEL: load_v8i16_v8i16:
3081; X86-AVX512:       ## %bb.0:
3082; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
3083; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
3084; X86-AVX512-NEXT:    vpblendmw (%eax), %xmm1, %xmm0 {%k1}
3085; X86-AVX512-NEXT:    retl
3086  %mask = icmp slt <8 x i16> %trigger, zeroinitializer
3087  %res = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst)
3088  ret <8 x i16> %res
3089}
3090
3091define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %dst) {
3092; SSE-LABEL: load_v16i16_v16i16:
3093; SSE:       ## %bb.0:
3094; SSE-NEXT:    packsswb %xmm1, %xmm0
3095; SSE-NEXT:    pmovmskb %xmm0, %eax
3096; SSE-NEXT:    testb $1, %al
3097; SSE-NEXT:    jne LBB22_1
3098; SSE-NEXT:  ## %bb.2: ## %else
3099; SSE-NEXT:    testb $2, %al
3100; SSE-NEXT:    jne LBB22_3
3101; SSE-NEXT:  LBB22_4: ## %else2
3102; SSE-NEXT:    testb $4, %al
3103; SSE-NEXT:    jne LBB22_5
3104; SSE-NEXT:  LBB22_6: ## %else5
3105; SSE-NEXT:    testb $8, %al
3106; SSE-NEXT:    jne LBB22_7
3107; SSE-NEXT:  LBB22_8: ## %else8
3108; SSE-NEXT:    testb $16, %al
3109; SSE-NEXT:    jne LBB22_9
3110; SSE-NEXT:  LBB22_10: ## %else11
3111; SSE-NEXT:    testb $32, %al
3112; SSE-NEXT:    jne LBB22_11
3113; SSE-NEXT:  LBB22_12: ## %else14
3114; SSE-NEXT:    testb $64, %al
3115; SSE-NEXT:    jne LBB22_13
3116; SSE-NEXT:  LBB22_14: ## %else17
3117; SSE-NEXT:    testb %al, %al
3118; SSE-NEXT:    js LBB22_15
3119; SSE-NEXT:  LBB22_16: ## %else20
3120; SSE-NEXT:    testl $256, %eax ## imm = 0x100
3121; SSE-NEXT:    jne LBB22_17
3122; SSE-NEXT:  LBB22_18: ## %else23
3123; SSE-NEXT:    testl $512, %eax ## imm = 0x200
3124; SSE-NEXT:    jne LBB22_19
3125; SSE-NEXT:  LBB22_20: ## %else26
3126; SSE-NEXT:    testl $1024, %eax ## imm = 0x400
3127; SSE-NEXT:    jne LBB22_21
3128; SSE-NEXT:  LBB22_22: ## %else29
3129; SSE-NEXT:    testl $2048, %eax ## imm = 0x800
3130; SSE-NEXT:    jne LBB22_23
3131; SSE-NEXT:  LBB22_24: ## %else32
3132; SSE-NEXT:    testl $4096, %eax ## imm = 0x1000
3133; SSE-NEXT:    jne LBB22_25
3134; SSE-NEXT:  LBB22_26: ## %else35
3135; SSE-NEXT:    testl $8192, %eax ## imm = 0x2000
3136; SSE-NEXT:    jne LBB22_27
3137; SSE-NEXT:  LBB22_28: ## %else38
3138; SSE-NEXT:    testl $16384, %eax ## imm = 0x4000
3139; SSE-NEXT:    jne LBB22_29
3140; SSE-NEXT:  LBB22_30: ## %else41
3141; SSE-NEXT:    testl $32768, %eax ## imm = 0x8000
3142; SSE-NEXT:    je LBB22_32
3143; SSE-NEXT:  LBB22_31: ## %cond.load43
3144; SSE-NEXT:    pinsrw $7, 30(%rdi), %xmm3
3145; SSE-NEXT:  LBB22_32: ## %else44
3146; SSE-NEXT:    movdqa %xmm2, %xmm0
3147; SSE-NEXT:    movdqa %xmm3, %xmm1
3148; SSE-NEXT:    retq
3149; SSE-NEXT:  LBB22_1: ## %cond.load
3150; SSE-NEXT:    pinsrw $0, (%rdi), %xmm2
3151; SSE-NEXT:    testb $2, %al
3152; SSE-NEXT:    je LBB22_4
3153; SSE-NEXT:  LBB22_3: ## %cond.load1
3154; SSE-NEXT:    pinsrw $1, 2(%rdi), %xmm2
3155; SSE-NEXT:    testb $4, %al
3156; SSE-NEXT:    je LBB22_6
3157; SSE-NEXT:  LBB22_5: ## %cond.load4
3158; SSE-NEXT:    pinsrw $2, 4(%rdi), %xmm2
3159; SSE-NEXT:    testb $8, %al
3160; SSE-NEXT:    je LBB22_8
3161; SSE-NEXT:  LBB22_7: ## %cond.load7
3162; SSE-NEXT:    pinsrw $3, 6(%rdi), %xmm2
3163; SSE-NEXT:    testb $16, %al
3164; SSE-NEXT:    je LBB22_10
3165; SSE-NEXT:  LBB22_9: ## %cond.load10
3166; SSE-NEXT:    pinsrw $4, 8(%rdi), %xmm2
3167; SSE-NEXT:    testb $32, %al
3168; SSE-NEXT:    je LBB22_12
3169; SSE-NEXT:  LBB22_11: ## %cond.load13
3170; SSE-NEXT:    pinsrw $5, 10(%rdi), %xmm2
3171; SSE-NEXT:    testb $64, %al
3172; SSE-NEXT:    je LBB22_14
3173; SSE-NEXT:  LBB22_13: ## %cond.load16
3174; SSE-NEXT:    pinsrw $6, 12(%rdi), %xmm2
3175; SSE-NEXT:    testb %al, %al
3176; SSE-NEXT:    jns LBB22_16
3177; SSE-NEXT:  LBB22_15: ## %cond.load19
3178; SSE-NEXT:    pinsrw $7, 14(%rdi), %xmm2
3179; SSE-NEXT:    testl $256, %eax ## imm = 0x100
3180; SSE-NEXT:    je LBB22_18
3181; SSE-NEXT:  LBB22_17: ## %cond.load22
3182; SSE-NEXT:    pinsrw $0, 16(%rdi), %xmm3
3183; SSE-NEXT:    testl $512, %eax ## imm = 0x200
3184; SSE-NEXT:    je LBB22_20
3185; SSE-NEXT:  LBB22_19: ## %cond.load25
3186; SSE-NEXT:    pinsrw $1, 18(%rdi), %xmm3
3187; SSE-NEXT:    testl $1024, %eax ## imm = 0x400
3188; SSE-NEXT:    je LBB22_22
3189; SSE-NEXT:  LBB22_21: ## %cond.load28
3190; SSE-NEXT:    pinsrw $2, 20(%rdi), %xmm3
3191; SSE-NEXT:    testl $2048, %eax ## imm = 0x800
3192; SSE-NEXT:    je LBB22_24
3193; SSE-NEXT:  LBB22_23: ## %cond.load31
3194; SSE-NEXT:    pinsrw $3, 22(%rdi), %xmm3
3195; SSE-NEXT:    testl $4096, %eax ## imm = 0x1000
3196; SSE-NEXT:    je LBB22_26
3197; SSE-NEXT:  LBB22_25: ## %cond.load34
3198; SSE-NEXT:    pinsrw $4, 24(%rdi), %xmm3
3199; SSE-NEXT:    testl $8192, %eax ## imm = 0x2000
3200; SSE-NEXT:    je LBB22_28
3201; SSE-NEXT:  LBB22_27: ## %cond.load37
3202; SSE-NEXT:    pinsrw $5, 26(%rdi), %xmm3
3203; SSE-NEXT:    testl $16384, %eax ## imm = 0x4000
3204; SSE-NEXT:    je LBB22_30
3205; SSE-NEXT:  LBB22_29: ## %cond.load40
3206; SSE-NEXT:    pinsrw $6, 28(%rdi), %xmm3
3207; SSE-NEXT:    testl $32768, %eax ## imm = 0x8000
3208; SSE-NEXT:    jne LBB22_31
3209; SSE-NEXT:    jmp LBB22_32
3210;
3211; AVX1-LABEL: load_v16i16_v16i16:
3212; AVX1:       ## %bb.0:
3213; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3214; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
3215; AVX1-NEXT:    vpmovmskb %xmm0, %eax
3216; AVX1-NEXT:    testb $1, %al
3217; AVX1-NEXT:    jne LBB22_1
3218; AVX1-NEXT:  ## %bb.2: ## %else
3219; AVX1-NEXT:    testb $2, %al
3220; AVX1-NEXT:    jne LBB22_3
3221; AVX1-NEXT:  LBB22_4: ## %else2
3222; AVX1-NEXT:    testb $4, %al
3223; AVX1-NEXT:    jne LBB22_5
3224; AVX1-NEXT:  LBB22_6: ## %else5
3225; AVX1-NEXT:    testb $8, %al
3226; AVX1-NEXT:    jne LBB22_7
3227; AVX1-NEXT:  LBB22_8: ## %else8
3228; AVX1-NEXT:    testb $16, %al
3229; AVX1-NEXT:    jne LBB22_9
3230; AVX1-NEXT:  LBB22_10: ## %else11
3231; AVX1-NEXT:    testb $32, %al
3232; AVX1-NEXT:    jne LBB22_11
3233; AVX1-NEXT:  LBB22_12: ## %else14
3234; AVX1-NEXT:    testb $64, %al
3235; AVX1-NEXT:    jne LBB22_13
3236; AVX1-NEXT:  LBB22_14: ## %else17
3237; AVX1-NEXT:    testb %al, %al
3238; AVX1-NEXT:    js LBB22_15
3239; AVX1-NEXT:  LBB22_16: ## %else20
3240; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
3241; AVX1-NEXT:    jne LBB22_17
3242; AVX1-NEXT:  LBB22_18: ## %else23
3243; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
3244; AVX1-NEXT:    jne LBB22_19
3245; AVX1-NEXT:  LBB22_20: ## %else26
3246; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
3247; AVX1-NEXT:    jne LBB22_21
3248; AVX1-NEXT:  LBB22_22: ## %else29
3249; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
3250; AVX1-NEXT:    jne LBB22_23
3251; AVX1-NEXT:  LBB22_24: ## %else32
3252; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
3253; AVX1-NEXT:    jne LBB22_25
3254; AVX1-NEXT:  LBB22_26: ## %else35
3255; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
3256; AVX1-NEXT:    jne LBB22_27
3257; AVX1-NEXT:  LBB22_28: ## %else38
3258; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
3259; AVX1-NEXT:    jne LBB22_29
3260; AVX1-NEXT:  LBB22_30: ## %else41
3261; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
3262; AVX1-NEXT:    jne LBB22_31
3263; AVX1-NEXT:  LBB22_32: ## %else44
3264; AVX1-NEXT:    vmovaps %ymm1, %ymm0
3265; AVX1-NEXT:    retq
3266; AVX1-NEXT:  LBB22_1: ## %cond.load
3267; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm0
3268; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3269; AVX1-NEXT:    testb $2, %al
3270; AVX1-NEXT:    je LBB22_4
3271; AVX1-NEXT:  LBB22_3: ## %cond.load1
3272; AVX1-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3273; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3274; AVX1-NEXT:    testb $4, %al
3275; AVX1-NEXT:    je LBB22_6
3276; AVX1-NEXT:  LBB22_5: ## %cond.load4
3277; AVX1-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3278; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3279; AVX1-NEXT:    testb $8, %al
3280; AVX1-NEXT:    je LBB22_8
3281; AVX1-NEXT:  LBB22_7: ## %cond.load7
3282; AVX1-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3283; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3284; AVX1-NEXT:    testb $16, %al
3285; AVX1-NEXT:    je LBB22_10
3286; AVX1-NEXT:  LBB22_9: ## %cond.load10
3287; AVX1-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3288; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3289; AVX1-NEXT:    testb $32, %al
3290; AVX1-NEXT:    je LBB22_12
3291; AVX1-NEXT:  LBB22_11: ## %cond.load13
3292; AVX1-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3293; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3294; AVX1-NEXT:    testb $64, %al
3295; AVX1-NEXT:    je LBB22_14
3296; AVX1-NEXT:  LBB22_13: ## %cond.load16
3297; AVX1-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3298; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3299; AVX1-NEXT:    testb %al, %al
3300; AVX1-NEXT:    jns LBB22_16
3301; AVX1-NEXT:  LBB22_15: ## %cond.load19
3302; AVX1-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3303; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3304; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
3305; AVX1-NEXT:    je LBB22_18
3306; AVX1-NEXT:  LBB22_17: ## %cond.load22
3307; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3308; AVX1-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3309; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3310; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
3311; AVX1-NEXT:    je LBB22_20
3312; AVX1-NEXT:  LBB22_19: ## %cond.load25
3313; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3314; AVX1-NEXT:    vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3315; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3316; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
3317; AVX1-NEXT:    je LBB22_22
3318; AVX1-NEXT:  LBB22_21: ## %cond.load28
3319; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3320; AVX1-NEXT:    vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3321; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3322; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
3323; AVX1-NEXT:    je LBB22_24
3324; AVX1-NEXT:  LBB22_23: ## %cond.load31
3325; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3326; AVX1-NEXT:    vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3327; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3328; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
3329; AVX1-NEXT:    je LBB22_26
3330; AVX1-NEXT:  LBB22_25: ## %cond.load34
3331; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3332; AVX1-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3333; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3334; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
3335; AVX1-NEXT:    je LBB22_28
3336; AVX1-NEXT:  LBB22_27: ## %cond.load37
3337; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3338; AVX1-NEXT:    vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3339; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3340; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
3341; AVX1-NEXT:    je LBB22_30
3342; AVX1-NEXT:  LBB22_29: ## %cond.load40
3343; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3344; AVX1-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3345; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3346; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
3347; AVX1-NEXT:    je LBB22_32
3348; AVX1-NEXT:  LBB22_31: ## %cond.load43
3349; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
3350; AVX1-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3351; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
3352; AVX1-NEXT:    vmovaps %ymm1, %ymm0
3353; AVX1-NEXT:    retq
3354;
3355; AVX2-LABEL: load_v16i16_v16i16:
3356; AVX2:       ## %bb.0:
3357; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
3358; AVX2-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
3359; AVX2-NEXT:    vpmovmskb %xmm0, %eax
3360; AVX2-NEXT:    testb $1, %al
3361; AVX2-NEXT:    jne LBB22_1
3362; AVX2-NEXT:  ## %bb.2: ## %else
3363; AVX2-NEXT:    testb $2, %al
3364; AVX2-NEXT:    jne LBB22_3
3365; AVX2-NEXT:  LBB22_4: ## %else2
3366; AVX2-NEXT:    testb $4, %al
3367; AVX2-NEXT:    jne LBB22_5
3368; AVX2-NEXT:  LBB22_6: ## %else5
3369; AVX2-NEXT:    testb $8, %al
3370; AVX2-NEXT:    jne LBB22_7
3371; AVX2-NEXT:  LBB22_8: ## %else8
3372; AVX2-NEXT:    testb $16, %al
3373; AVX2-NEXT:    jne LBB22_9
3374; AVX2-NEXT:  LBB22_10: ## %else11
3375; AVX2-NEXT:    testb $32, %al
3376; AVX2-NEXT:    jne LBB22_11
3377; AVX2-NEXT:  LBB22_12: ## %else14
3378; AVX2-NEXT:    testb $64, %al
3379; AVX2-NEXT:    jne LBB22_13
3380; AVX2-NEXT:  LBB22_14: ## %else17
3381; AVX2-NEXT:    testb %al, %al
3382; AVX2-NEXT:    js LBB22_15
3383; AVX2-NEXT:  LBB22_16: ## %else20
3384; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
3385; AVX2-NEXT:    jne LBB22_17
3386; AVX2-NEXT:  LBB22_18: ## %else23
3387; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
3388; AVX2-NEXT:    jne LBB22_19
3389; AVX2-NEXT:  LBB22_20: ## %else26
3390; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
3391; AVX2-NEXT:    jne LBB22_21
3392; AVX2-NEXT:  LBB22_22: ## %else29
3393; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
3394; AVX2-NEXT:    jne LBB22_23
3395; AVX2-NEXT:  LBB22_24: ## %else32
3396; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
3397; AVX2-NEXT:    jne LBB22_25
3398; AVX2-NEXT:  LBB22_26: ## %else35
3399; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
3400; AVX2-NEXT:    jne LBB22_27
3401; AVX2-NEXT:  LBB22_28: ## %else38
3402; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
3403; AVX2-NEXT:    jne LBB22_29
3404; AVX2-NEXT:  LBB22_30: ## %else41
3405; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
3406; AVX2-NEXT:    jne LBB22_31
3407; AVX2-NEXT:  LBB22_32: ## %else44
3408; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
3409; AVX2-NEXT:    retq
3410; AVX2-NEXT:  LBB22_1: ## %cond.load
3411; AVX2-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm0
3412; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3413; AVX2-NEXT:    testb $2, %al
3414; AVX2-NEXT:    je LBB22_4
3415; AVX2-NEXT:  LBB22_3: ## %cond.load1
3416; AVX2-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3417; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3418; AVX2-NEXT:    testb $4, %al
3419; AVX2-NEXT:    je LBB22_6
3420; AVX2-NEXT:  LBB22_5: ## %cond.load4
3421; AVX2-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3422; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3423; AVX2-NEXT:    testb $8, %al
3424; AVX2-NEXT:    je LBB22_8
3425; AVX2-NEXT:  LBB22_7: ## %cond.load7
3426; AVX2-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3427; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3428; AVX2-NEXT:    testb $16, %al
3429; AVX2-NEXT:    je LBB22_10
3430; AVX2-NEXT:  LBB22_9: ## %cond.load10
3431; AVX2-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3432; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3433; AVX2-NEXT:    testb $32, %al
3434; AVX2-NEXT:    je LBB22_12
3435; AVX2-NEXT:  LBB22_11: ## %cond.load13
3436; AVX2-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3437; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3438; AVX2-NEXT:    testb $64, %al
3439; AVX2-NEXT:    je LBB22_14
3440; AVX2-NEXT:  LBB22_13: ## %cond.load16
3441; AVX2-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3442; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3443; AVX2-NEXT:    testb %al, %al
3444; AVX2-NEXT:    jns LBB22_16
3445; AVX2-NEXT:  LBB22_15: ## %cond.load19
3446; AVX2-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3447; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3448; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
3449; AVX2-NEXT:    je LBB22_18
3450; AVX2-NEXT:  LBB22_17: ## %cond.load22
3451; AVX2-NEXT:    vpbroadcastw 16(%rdi), %ymm0
3452; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3453; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3454; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
3455; AVX2-NEXT:    je LBB22_20
3456; AVX2-NEXT:  LBB22_19: ## %cond.load25
3457; AVX2-NEXT:    vpbroadcastw 18(%rdi), %ymm0
3458; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3459; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3460; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
3461; AVX2-NEXT:    je LBB22_22
3462; AVX2-NEXT:  LBB22_21: ## %cond.load28
3463; AVX2-NEXT:    vpbroadcastw 20(%rdi), %ymm0
3464; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3465; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3466; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
3467; AVX2-NEXT:    je LBB22_24
3468; AVX2-NEXT:  LBB22_23: ## %cond.load31
3469; AVX2-NEXT:    vpbroadcastw 22(%rdi), %ymm0
3470; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3471; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3472; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
3473; AVX2-NEXT:    je LBB22_26
3474; AVX2-NEXT:  LBB22_25: ## %cond.load34
3475; AVX2-NEXT:    vpbroadcastw 24(%rdi), %ymm0
3476; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3477; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3478; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
3479; AVX2-NEXT:    je LBB22_28
3480; AVX2-NEXT:  LBB22_27: ## %cond.load37
3481; AVX2-NEXT:    vpbroadcastw 26(%rdi), %ymm0
3482; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3483; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3484; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
3485; AVX2-NEXT:    je LBB22_30
3486; AVX2-NEXT:  LBB22_29: ## %cond.load40
3487; AVX2-NEXT:    vpbroadcastw 28(%rdi), %ymm0
3488; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3489; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3490; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
3491; AVX2-NEXT:    je LBB22_32
3492; AVX2-NEXT:  LBB22_31: ## %cond.load43
3493; AVX2-NEXT:    vpbroadcastw 30(%rdi), %ymm0
3494; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3495; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3496; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
3497; AVX2-NEXT:    retq
3498;
3499; AVX512F-LABEL: load_v16i16_v16i16:
3500; AVX512F:       ## %bb.0:
3501; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3502; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
3503; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3504; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
3505; AVX512F-NEXT:    kmovw %k0, %eax
3506; AVX512F-NEXT:    testb $1, %al
3507; AVX512F-NEXT:    jne LBB22_1
3508; AVX512F-NEXT:  ## %bb.2: ## %else
3509; AVX512F-NEXT:    testb $2, %al
3510; AVX512F-NEXT:    jne LBB22_3
3511; AVX512F-NEXT:  LBB22_4: ## %else2
3512; AVX512F-NEXT:    testb $4, %al
3513; AVX512F-NEXT:    jne LBB22_5
3514; AVX512F-NEXT:  LBB22_6: ## %else5
3515; AVX512F-NEXT:    testb $8, %al
3516; AVX512F-NEXT:    jne LBB22_7
3517; AVX512F-NEXT:  LBB22_8: ## %else8
3518; AVX512F-NEXT:    testb $16, %al
3519; AVX512F-NEXT:    jne LBB22_9
3520; AVX512F-NEXT:  LBB22_10: ## %else11
3521; AVX512F-NEXT:    testb $32, %al
3522; AVX512F-NEXT:    jne LBB22_11
3523; AVX512F-NEXT:  LBB22_12: ## %else14
3524; AVX512F-NEXT:    testb $64, %al
3525; AVX512F-NEXT:    jne LBB22_13
3526; AVX512F-NEXT:  LBB22_14: ## %else17
3527; AVX512F-NEXT:    testb %al, %al
3528; AVX512F-NEXT:    js LBB22_15
3529; AVX512F-NEXT:  LBB22_16: ## %else20
3530; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
3531; AVX512F-NEXT:    jne LBB22_17
3532; AVX512F-NEXT:  LBB22_18: ## %else23
3533; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
3534; AVX512F-NEXT:    jne LBB22_19
3535; AVX512F-NEXT:  LBB22_20: ## %else26
3536; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
3537; AVX512F-NEXT:    jne LBB22_21
3538; AVX512F-NEXT:  LBB22_22: ## %else29
3539; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
3540; AVX512F-NEXT:    jne LBB22_23
3541; AVX512F-NEXT:  LBB22_24: ## %else32
3542; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
3543; AVX512F-NEXT:    jne LBB22_25
3544; AVX512F-NEXT:  LBB22_26: ## %else35
3545; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
3546; AVX512F-NEXT:    jne LBB22_27
3547; AVX512F-NEXT:  LBB22_28: ## %else38
3548; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
3549; AVX512F-NEXT:    jne LBB22_29
3550; AVX512F-NEXT:  LBB22_30: ## %else41
3551; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
3552; AVX512F-NEXT:    jne LBB22_31
3553; AVX512F-NEXT:  LBB22_32: ## %else44
3554; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
3555; AVX512F-NEXT:    retq
3556; AVX512F-NEXT:  LBB22_1: ## %cond.load
3557; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm0
3558; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3559; AVX512F-NEXT:    testb $2, %al
3560; AVX512F-NEXT:    je LBB22_4
3561; AVX512F-NEXT:  LBB22_3: ## %cond.load1
3562; AVX512F-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3563; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3564; AVX512F-NEXT:    testb $4, %al
3565; AVX512F-NEXT:    je LBB22_6
3566; AVX512F-NEXT:  LBB22_5: ## %cond.load4
3567; AVX512F-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3568; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3569; AVX512F-NEXT:    testb $8, %al
3570; AVX512F-NEXT:    je LBB22_8
3571; AVX512F-NEXT:  LBB22_7: ## %cond.load7
3572; AVX512F-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3573; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3574; AVX512F-NEXT:    testb $16, %al
3575; AVX512F-NEXT:    je LBB22_10
3576; AVX512F-NEXT:  LBB22_9: ## %cond.load10
3577; AVX512F-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3578; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3579; AVX512F-NEXT:    testb $32, %al
3580; AVX512F-NEXT:    je LBB22_12
3581; AVX512F-NEXT:  LBB22_11: ## %cond.load13
3582; AVX512F-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3583; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3584; AVX512F-NEXT:    testb $64, %al
3585; AVX512F-NEXT:    je LBB22_14
3586; AVX512F-NEXT:  LBB22_13: ## %cond.load16
3587; AVX512F-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3588; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3589; AVX512F-NEXT:    testb %al, %al
3590; AVX512F-NEXT:    jns LBB22_16
3591; AVX512F-NEXT:  LBB22_15: ## %cond.load19
3592; AVX512F-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3593; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3594; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
3595; AVX512F-NEXT:    je LBB22_18
3596; AVX512F-NEXT:  LBB22_17: ## %cond.load22
3597; AVX512F-NEXT:    vpbroadcastw 16(%rdi), %ymm0
3598; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3599; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3600; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
3601; AVX512F-NEXT:    je LBB22_20
3602; AVX512F-NEXT:  LBB22_19: ## %cond.load25
3603; AVX512F-NEXT:    vpbroadcastw 18(%rdi), %ymm0
3604; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3605; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3606; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
3607; AVX512F-NEXT:    je LBB22_22
3608; AVX512F-NEXT:  LBB22_21: ## %cond.load28
3609; AVX512F-NEXT:    vpbroadcastw 20(%rdi), %ymm0
3610; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3611; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3612; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
3613; AVX512F-NEXT:    je LBB22_24
3614; AVX512F-NEXT:  LBB22_23: ## %cond.load31
3615; AVX512F-NEXT:    vpbroadcastw 22(%rdi), %ymm0
3616; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3617; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3618; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
3619; AVX512F-NEXT:    je LBB22_26
3620; AVX512F-NEXT:  LBB22_25: ## %cond.load34
3621; AVX512F-NEXT:    vpbroadcastw 24(%rdi), %ymm0
3622; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3623; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3624; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
3625; AVX512F-NEXT:    je LBB22_28
3626; AVX512F-NEXT:  LBB22_27: ## %cond.load37
3627; AVX512F-NEXT:    vpbroadcastw 26(%rdi), %ymm0
3628; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3629; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3630; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
3631; AVX512F-NEXT:    je LBB22_30
3632; AVX512F-NEXT:  LBB22_29: ## %cond.load40
3633; AVX512F-NEXT:    vpbroadcastw 28(%rdi), %ymm0
3634; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3635; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3636; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
3637; AVX512F-NEXT:    je LBB22_32
3638; AVX512F-NEXT:  LBB22_31: ## %cond.load43
3639; AVX512F-NEXT:    vpbroadcastw 30(%rdi), %ymm0
3640; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3641; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3642; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
3643; AVX512F-NEXT:    retq
3644;
3645; AVX512VLDQ-LABEL: load_v16i16_v16i16:
3646; AVX512VLDQ:       ## %bb.0:
3647; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3648; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
3649; AVX512VLDQ-NEXT:    vpmovsxwd %ymm0, %zmm0
3650; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
3651; AVX512VLDQ-NEXT:    kmovw %k0, %eax
3652; AVX512VLDQ-NEXT:    testb $1, %al
3653; AVX512VLDQ-NEXT:    jne LBB22_1
3654; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
3655; AVX512VLDQ-NEXT:    testb $2, %al
3656; AVX512VLDQ-NEXT:    jne LBB22_3
3657; AVX512VLDQ-NEXT:  LBB22_4: ## %else2
3658; AVX512VLDQ-NEXT:    testb $4, %al
3659; AVX512VLDQ-NEXT:    jne LBB22_5
3660; AVX512VLDQ-NEXT:  LBB22_6: ## %else5
3661; AVX512VLDQ-NEXT:    testb $8, %al
3662; AVX512VLDQ-NEXT:    jne LBB22_7
3663; AVX512VLDQ-NEXT:  LBB22_8: ## %else8
3664; AVX512VLDQ-NEXT:    testb $16, %al
3665; AVX512VLDQ-NEXT:    jne LBB22_9
3666; AVX512VLDQ-NEXT:  LBB22_10: ## %else11
3667; AVX512VLDQ-NEXT:    testb $32, %al
3668; AVX512VLDQ-NEXT:    jne LBB22_11
3669; AVX512VLDQ-NEXT:  LBB22_12: ## %else14
3670; AVX512VLDQ-NEXT:    testb $64, %al
3671; AVX512VLDQ-NEXT:    jne LBB22_13
3672; AVX512VLDQ-NEXT:  LBB22_14: ## %else17
3673; AVX512VLDQ-NEXT:    testb %al, %al
3674; AVX512VLDQ-NEXT:    js LBB22_15
3675; AVX512VLDQ-NEXT:  LBB22_16: ## %else20
3676; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
3677; AVX512VLDQ-NEXT:    jne LBB22_17
3678; AVX512VLDQ-NEXT:  LBB22_18: ## %else23
3679; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
3680; AVX512VLDQ-NEXT:    jne LBB22_19
3681; AVX512VLDQ-NEXT:  LBB22_20: ## %else26
3682; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
3683; AVX512VLDQ-NEXT:    jne LBB22_21
3684; AVX512VLDQ-NEXT:  LBB22_22: ## %else29
3685; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
3686; AVX512VLDQ-NEXT:    jne LBB22_23
3687; AVX512VLDQ-NEXT:  LBB22_24: ## %else32
3688; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
3689; AVX512VLDQ-NEXT:    jne LBB22_25
3690; AVX512VLDQ-NEXT:  LBB22_26: ## %else35
3691; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
3692; AVX512VLDQ-NEXT:    jne LBB22_27
3693; AVX512VLDQ-NEXT:  LBB22_28: ## %else38
3694; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
3695; AVX512VLDQ-NEXT:    jne LBB22_29
3696; AVX512VLDQ-NEXT:  LBB22_30: ## %else41
3697; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
3698; AVX512VLDQ-NEXT:    jne LBB22_31
3699; AVX512VLDQ-NEXT:  LBB22_32: ## %else44
3700; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
3701; AVX512VLDQ-NEXT:    retq
3702; AVX512VLDQ-NEXT:  LBB22_1: ## %cond.load
3703; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm0
3704; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3705; AVX512VLDQ-NEXT:    testb $2, %al
3706; AVX512VLDQ-NEXT:    je LBB22_4
3707; AVX512VLDQ-NEXT:  LBB22_3: ## %cond.load1
3708; AVX512VLDQ-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3709; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3710; AVX512VLDQ-NEXT:    testb $4, %al
3711; AVX512VLDQ-NEXT:    je LBB22_6
3712; AVX512VLDQ-NEXT:  LBB22_5: ## %cond.load4
3713; AVX512VLDQ-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3714; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3715; AVX512VLDQ-NEXT:    testb $8, %al
3716; AVX512VLDQ-NEXT:    je LBB22_8
3717; AVX512VLDQ-NEXT:  LBB22_7: ## %cond.load7
3718; AVX512VLDQ-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3719; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3720; AVX512VLDQ-NEXT:    testb $16, %al
3721; AVX512VLDQ-NEXT:    je LBB22_10
3722; AVX512VLDQ-NEXT:  LBB22_9: ## %cond.load10
3723; AVX512VLDQ-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3724; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3725; AVX512VLDQ-NEXT:    testb $32, %al
3726; AVX512VLDQ-NEXT:    je LBB22_12
3727; AVX512VLDQ-NEXT:  LBB22_11: ## %cond.load13
3728; AVX512VLDQ-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3729; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3730; AVX512VLDQ-NEXT:    testb $64, %al
3731; AVX512VLDQ-NEXT:    je LBB22_14
3732; AVX512VLDQ-NEXT:  LBB22_13: ## %cond.load16
3733; AVX512VLDQ-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3734; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3735; AVX512VLDQ-NEXT:    testb %al, %al
3736; AVX512VLDQ-NEXT:    jns LBB22_16
3737; AVX512VLDQ-NEXT:  LBB22_15: ## %cond.load19
3738; AVX512VLDQ-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3739; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3740; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
3741; AVX512VLDQ-NEXT:    je LBB22_18
3742; AVX512VLDQ-NEXT:  LBB22_17: ## %cond.load22
3743; AVX512VLDQ-NEXT:    vpbroadcastw 16(%rdi), %ymm0
3744; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3745; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3746; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
3747; AVX512VLDQ-NEXT:    je LBB22_20
3748; AVX512VLDQ-NEXT:  LBB22_19: ## %cond.load25
3749; AVX512VLDQ-NEXT:    vpbroadcastw 18(%rdi), %ymm0
3750; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3751; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3752; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
3753; AVX512VLDQ-NEXT:    je LBB22_22
3754; AVX512VLDQ-NEXT:  LBB22_21: ## %cond.load28
3755; AVX512VLDQ-NEXT:    vpbroadcastw 20(%rdi), %ymm0
3756; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3757; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3758; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
3759; AVX512VLDQ-NEXT:    je LBB22_24
3760; AVX512VLDQ-NEXT:  LBB22_23: ## %cond.load31
3761; AVX512VLDQ-NEXT:    vpbroadcastw 22(%rdi), %ymm0
3762; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3763; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3764; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
3765; AVX512VLDQ-NEXT:    je LBB22_26
3766; AVX512VLDQ-NEXT:  LBB22_25: ## %cond.load34
3767; AVX512VLDQ-NEXT:    vpbroadcastw 24(%rdi), %ymm0
3768; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3769; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3770; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
3771; AVX512VLDQ-NEXT:    je LBB22_28
3772; AVX512VLDQ-NEXT:  LBB22_27: ## %cond.load37
3773; AVX512VLDQ-NEXT:    vpbroadcastw 26(%rdi), %ymm0
3774; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3775; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3776; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
3777; AVX512VLDQ-NEXT:    je LBB22_30
3778; AVX512VLDQ-NEXT:  LBB22_29: ## %cond.load40
3779; AVX512VLDQ-NEXT:    vpbroadcastw 28(%rdi), %ymm0
3780; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3781; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3782; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
3783; AVX512VLDQ-NEXT:    je LBB22_32
3784; AVX512VLDQ-NEXT:  LBB22_31: ## %cond.load43
3785; AVX512VLDQ-NEXT:    vpbroadcastw 30(%rdi), %ymm0
3786; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3787; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3788; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
3789; AVX512VLDQ-NEXT:    retq
3790;
3791; AVX512VLBW-LABEL: load_v16i16_v16i16:
3792; AVX512VLBW:       ## %bb.0:
3793; AVX512VLBW-NEXT:    vpmovw2m %ymm0, %k1
3794; AVX512VLBW-NEXT:    vpblendmw (%rdi), %ymm1, %ymm0 {%k1}
3795; AVX512VLBW-NEXT:    retq
3796;
3797; X86-AVX512-LABEL: load_v16i16_v16i16:
3798; X86-AVX512:       ## %bb.0:
3799; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
3800; X86-AVX512-NEXT:    vpmovw2m %ymm0, %k1
3801; X86-AVX512-NEXT:    vpblendmw (%eax), %ymm1, %ymm0 {%k1}
3802; X86-AVX512-NEXT:    retl
3803  %mask = icmp slt <16 x i16> %trigger, zeroinitializer
3804  %res = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst)
3805  ret <16 x i16> %res
3806}
3807
3808;
3809; vXi8
3810;
3811
; Masked load of <16 x i8>: lane i is loaded from %addr iff the sign bit of
; trigger byte i is set; otherwise the lane is taken from %dst.  Targets
; without AVX512BW/VL lower this to pmovmskb plus one conditional scalar
; load-and-insert per lane; AVX512VLBW (and the i686 AVX512 run) use
; vpmovb2m + vpblendmb directly.  CHECK lines below are autogenerated by
; update_llc_test_checks.py -- regenerate rather than hand-edit them.
define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %dst) {
; SSE2-LABEL: load_v16i8_v16i8:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    testb $1, %al
; SSE2-NEXT:    jne LBB23_1
; SSE2-NEXT:  ## %bb.2: ## %else
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    jne LBB23_3
; SSE2-NEXT:  LBB23_4: ## %else2
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    jne LBB23_5
; SSE2-NEXT:  LBB23_6: ## %else5
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    jne LBB23_7
; SSE2-NEXT:  LBB23_8: ## %else8
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    jne LBB23_9
; SSE2-NEXT:  LBB23_10: ## %else11
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    jne LBB23_11
; SSE2-NEXT:  LBB23_12: ## %else14
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    jne LBB23_13
; SSE2-NEXT:  LBB23_14: ## %else17
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    js LBB23_15
; SSE2-NEXT:  LBB23_16: ## %else20
; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
; SSE2-NEXT:    jne LBB23_17
; SSE2-NEXT:  LBB23_18: ## %else23
; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
; SSE2-NEXT:    jne LBB23_19
; SSE2-NEXT:  LBB23_20: ## %else26
; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
; SSE2-NEXT:    jne LBB23_21
; SSE2-NEXT:  LBB23_22: ## %else29
; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
; SSE2-NEXT:    jne LBB23_23
; SSE2-NEXT:  LBB23_24: ## %else32
; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT:    jne LBB23_25
; SSE2-NEXT:  LBB23_26: ## %else35
; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT:    jne LBB23_27
; SSE2-NEXT:  LBB23_28: ## %else38
; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT:    jne LBB23_29
; SSE2-NEXT:  LBB23_30: ## %else41
; SSE2-NEXT:    testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT:    jne LBB23_31
; SSE2-NEXT:  LBB23_32: ## %else44
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  LBB23_1: ## %cond.load
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl (%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $2, %al
; SSE2-NEXT:    je LBB23_4
; SSE2-NEXT:  LBB23_3: ## %cond.load1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 1(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    psllw $8, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $4, %al
; SSE2-NEXT:    je LBB23_6
; SSE2-NEXT:  LBB23_5: ## %cond.load4
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 2(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $8, %al
; SSE2-NEXT:    je LBB23_8
; SSE2-NEXT:  LBB23_7: ## %cond.load7
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 3(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslld $24, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $16, %al
; SSE2-NEXT:    je LBB23_10
; SSE2-NEXT:  LBB23_9: ## %cond.load10
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 4(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $32, %al
; SSE2-NEXT:    je LBB23_12
; SSE2-NEXT:  LBB23_11: ## %cond.load13
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 5(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    psllq $40, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb $64, %al
; SSE2-NEXT:    je LBB23_14
; SSE2-NEXT:  LBB23_13: ## %cond.load16
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 6(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    psllq $48, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jns LBB23_16
; SSE2-NEXT:  LBB23_15: ## %cond.load19
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 7(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    psllq $56, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
; SSE2-NEXT:    je LBB23_18
; SSE2-NEXT:  LBB23_17: ## %cond.load22
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 8(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
; SSE2-NEXT:    je LBB23_20
; SSE2-NEXT:  LBB23_19: ## %cond.load25
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 9(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
; SSE2-NEXT:    je LBB23_22
; SSE2-NEXT:  LBB23_21: ## %cond.load28
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 10(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
; SSE2-NEXT:    je LBB23_24
; SSE2-NEXT:  LBB23_23: ## %cond.load31
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 11(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT:    je LBB23_26
; SSE2-NEXT:  LBB23_25: ## %cond.load34
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 12(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT:    je LBB23_28
; SSE2-NEXT:  LBB23_27: ## %cond.load37
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 13(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT:    je LBB23_30
; SSE2-NEXT:  LBB23_29: ## %cond.load40
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movzbl 14(%rdi), %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT:    je LBB23_32
; SSE2-NEXT:  LBB23_31: ## %cond.load43
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movzbl 15(%rdi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_v16i8_v16i8:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    testb $1, %al
; SSE42-NEXT:    jne LBB23_1
; SSE42-NEXT:  ## %bb.2: ## %else
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    jne LBB23_3
; SSE42-NEXT:  LBB23_4: ## %else2
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    jne LBB23_5
; SSE42-NEXT:  LBB23_6: ## %else5
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    jne LBB23_7
; SSE42-NEXT:  LBB23_8: ## %else8
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    jne LBB23_9
; SSE42-NEXT:  LBB23_10: ## %else11
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    jne LBB23_11
; SSE42-NEXT:  LBB23_12: ## %else14
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    jne LBB23_13
; SSE42-NEXT:  LBB23_14: ## %else17
; SSE42-NEXT:    testb %al, %al
; SSE42-NEXT:    js LBB23_15
; SSE42-NEXT:  LBB23_16: ## %else20
; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
; SSE42-NEXT:    jne LBB23_17
; SSE42-NEXT:  LBB23_18: ## %else23
; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
; SSE42-NEXT:    jne LBB23_19
; SSE42-NEXT:  LBB23_20: ## %else26
; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
; SSE42-NEXT:    jne LBB23_21
; SSE42-NEXT:  LBB23_22: ## %else29
; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
; SSE42-NEXT:    jne LBB23_23
; SSE42-NEXT:  LBB23_24: ## %else32
; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT:    jne LBB23_25
; SSE42-NEXT:  LBB23_26: ## %else35
; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT:    jne LBB23_27
; SSE42-NEXT:  LBB23_28: ## %else38
; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT:    jne LBB23_29
; SSE42-NEXT:  LBB23_30: ## %else41
; SSE42-NEXT:    testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT:    jne LBB23_31
; SSE42-NEXT:  LBB23_32: ## %else44
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    retq
; SSE42-NEXT:  LBB23_1: ## %cond.load
; SSE42-NEXT:    pinsrb $0, (%rdi), %xmm1
; SSE42-NEXT:    testb $2, %al
; SSE42-NEXT:    je LBB23_4
; SSE42-NEXT:  LBB23_3: ## %cond.load1
; SSE42-NEXT:    pinsrb $1, 1(%rdi), %xmm1
; SSE42-NEXT:    testb $4, %al
; SSE42-NEXT:    je LBB23_6
; SSE42-NEXT:  LBB23_5: ## %cond.load4
; SSE42-NEXT:    pinsrb $2, 2(%rdi), %xmm1
; SSE42-NEXT:    testb $8, %al
; SSE42-NEXT:    je LBB23_8
; SSE42-NEXT:  LBB23_7: ## %cond.load7
; SSE42-NEXT:    pinsrb $3, 3(%rdi), %xmm1
; SSE42-NEXT:    testb $16, %al
; SSE42-NEXT:    je LBB23_10
; SSE42-NEXT:  LBB23_9: ## %cond.load10
; SSE42-NEXT:    pinsrb $4, 4(%rdi), %xmm1
; SSE42-NEXT:    testb $32, %al
; SSE42-NEXT:    je LBB23_12
; SSE42-NEXT:  LBB23_11: ## %cond.load13
; SSE42-NEXT:    pinsrb $5, 5(%rdi), %xmm1
; SSE42-NEXT:    testb $64, %al
; SSE42-NEXT:    je LBB23_14
; SSE42-NEXT:  LBB23_13: ## %cond.load16
; SSE42-NEXT:    pinsrb $6, 6(%rdi), %xmm1
; SSE42-NEXT:    testb %al, %al
; SSE42-NEXT:    jns LBB23_16
; SSE42-NEXT:  LBB23_15: ## %cond.load19
; SSE42-NEXT:    pinsrb $7, 7(%rdi), %xmm1
; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
; SSE42-NEXT:    je LBB23_18
; SSE42-NEXT:  LBB23_17: ## %cond.load22
; SSE42-NEXT:    pinsrb $8, 8(%rdi), %xmm1
; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
; SSE42-NEXT:    je LBB23_20
; SSE42-NEXT:  LBB23_19: ## %cond.load25
; SSE42-NEXT:    pinsrb $9, 9(%rdi), %xmm1
; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
; SSE42-NEXT:    je LBB23_22
; SSE42-NEXT:  LBB23_21: ## %cond.load28
; SSE42-NEXT:    pinsrb $10, 10(%rdi), %xmm1
; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
; SSE42-NEXT:    je LBB23_24
; SSE42-NEXT:  LBB23_23: ## %cond.load31
; SSE42-NEXT:    pinsrb $11, 11(%rdi), %xmm1
; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT:    je LBB23_26
; SSE42-NEXT:  LBB23_25: ## %cond.load34
; SSE42-NEXT:    pinsrb $12, 12(%rdi), %xmm1
; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT:    je LBB23_28
; SSE42-NEXT:  LBB23_27: ## %cond.load37
; SSE42-NEXT:    pinsrb $13, 13(%rdi), %xmm1
; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT:    je LBB23_30
; SSE42-NEXT:  LBB23_29: ## %cond.load40
; SSE42-NEXT:    pinsrb $14, 14(%rdi), %xmm1
; SSE42-NEXT:    testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT:    je LBB23_32
; SSE42-NEXT:  LBB23_31: ## %cond.load43
; SSE42-NEXT:    pinsrb $15, 15(%rdi), %xmm1
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: load_v16i8_v16i8:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT:    testb $1, %al
; AVX1OR2-NEXT:    jne LBB23_1
; AVX1OR2-NEXT:  ## %bb.2: ## %else
; AVX1OR2-NEXT:    testb $2, %al
; AVX1OR2-NEXT:    jne LBB23_3
; AVX1OR2-NEXT:  LBB23_4: ## %else2
; AVX1OR2-NEXT:    testb $4, %al
; AVX1OR2-NEXT:    jne LBB23_5
; AVX1OR2-NEXT:  LBB23_6: ## %else5
; AVX1OR2-NEXT:    testb $8, %al
; AVX1OR2-NEXT:    jne LBB23_7
; AVX1OR2-NEXT:  LBB23_8: ## %else8
; AVX1OR2-NEXT:    testb $16, %al
; AVX1OR2-NEXT:    jne LBB23_9
; AVX1OR2-NEXT:  LBB23_10: ## %else11
; AVX1OR2-NEXT:    testb $32, %al
; AVX1OR2-NEXT:    jne LBB23_11
; AVX1OR2-NEXT:  LBB23_12: ## %else14
; AVX1OR2-NEXT:    testb $64, %al
; AVX1OR2-NEXT:    jne LBB23_13
; AVX1OR2-NEXT:  LBB23_14: ## %else17
; AVX1OR2-NEXT:    testb %al, %al
; AVX1OR2-NEXT:    js LBB23_15
; AVX1OR2-NEXT:  LBB23_16: ## %else20
; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT:    jne LBB23_17
; AVX1OR2-NEXT:  LBB23_18: ## %else23
; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT:    jne LBB23_19
; AVX1OR2-NEXT:  LBB23_20: ## %else26
; AVX1OR2-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT:    jne LBB23_21
; AVX1OR2-NEXT:  LBB23_22: ## %else29
; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT:    jne LBB23_23
; AVX1OR2-NEXT:  LBB23_24: ## %else32
; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT:    jne LBB23_25
; AVX1OR2-NEXT:  LBB23_26: ## %else35
; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT:    jne LBB23_27
; AVX1OR2-NEXT:  LBB23_28: ## %else38
; AVX1OR2-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT:    jne LBB23_29
; AVX1OR2-NEXT:  LBB23_30: ## %else41
; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT:    jne LBB23_31
; AVX1OR2-NEXT:  LBB23_32: ## %else44
; AVX1OR2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT:    retq
; AVX1OR2-NEXT:  LBB23_1: ## %cond.load
; AVX1OR2-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $2, %al
; AVX1OR2-NEXT:    je LBB23_4
; AVX1OR2-NEXT:  LBB23_3: ## %cond.load1
; AVX1OR2-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $4, %al
; AVX1OR2-NEXT:    je LBB23_6
; AVX1OR2-NEXT:  LBB23_5: ## %cond.load4
; AVX1OR2-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $8, %al
; AVX1OR2-NEXT:    je LBB23_8
; AVX1OR2-NEXT:  LBB23_7: ## %cond.load7
; AVX1OR2-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $16, %al
; AVX1OR2-NEXT:    je LBB23_10
; AVX1OR2-NEXT:  LBB23_9: ## %cond.load10
; AVX1OR2-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $32, %al
; AVX1OR2-NEXT:    je LBB23_12
; AVX1OR2-NEXT:  LBB23_11: ## %cond.load13
; AVX1OR2-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb $64, %al
; AVX1OR2-NEXT:    je LBB23_14
; AVX1OR2-NEXT:  LBB23_13: ## %cond.load16
; AVX1OR2-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testb %al, %al
; AVX1OR2-NEXT:    jns LBB23_16
; AVX1OR2-NEXT:  LBB23_15: ## %cond.load19
; AVX1OR2-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT:    je LBB23_18
; AVX1OR2-NEXT:  LBB23_17: ## %cond.load22
; AVX1OR2-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT:    je LBB23_20
; AVX1OR2-NEXT:  LBB23_19: ## %cond.load25
; AVX1OR2-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT:    je LBB23_22
; AVX1OR2-NEXT:  LBB23_21: ## %cond.load28
; AVX1OR2-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT:    je LBB23_24
; AVX1OR2-NEXT:  LBB23_23: ## %cond.load31
; AVX1OR2-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT:    je LBB23_26
; AVX1OR2-NEXT:  LBB23_25: ## %cond.load34
; AVX1OR2-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT:    je LBB23_28
; AVX1OR2-NEXT:  LBB23_27: ## %cond.load37
; AVX1OR2-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT:    je LBB23_30
; AVX1OR2-NEXT:  LBB23_29: ## %cond.load40
; AVX1OR2-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT:    je LBB23_32
; AVX1OR2-NEXT:  LBB23_31: ## %cond.load43
; AVX1OR2-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_v16i8_v16i8:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovmskb %xmm0, %eax
; AVX512F-NEXT:    testb $1, %al
; AVX512F-NEXT:    jne LBB23_1
; AVX512F-NEXT:  ## %bb.2: ## %else
; AVX512F-NEXT:    testb $2, %al
; AVX512F-NEXT:    jne LBB23_3
; AVX512F-NEXT:  LBB23_4: ## %else2
; AVX512F-NEXT:    testb $4, %al
; AVX512F-NEXT:    jne LBB23_5
; AVX512F-NEXT:  LBB23_6: ## %else5
; AVX512F-NEXT:    testb $8, %al
; AVX512F-NEXT:    jne LBB23_7
; AVX512F-NEXT:  LBB23_8: ## %else8
; AVX512F-NEXT:    testb $16, %al
; AVX512F-NEXT:    jne LBB23_9
; AVX512F-NEXT:  LBB23_10: ## %else11
; AVX512F-NEXT:    testb $32, %al
; AVX512F-NEXT:    jne LBB23_11
; AVX512F-NEXT:  LBB23_12: ## %else14
; AVX512F-NEXT:    testb $64, %al
; AVX512F-NEXT:    jne LBB23_13
; AVX512F-NEXT:  LBB23_14: ## %else17
; AVX512F-NEXT:    testb %al, %al
; AVX512F-NEXT:    js LBB23_15
; AVX512F-NEXT:  LBB23_16: ## %else20
; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
; AVX512F-NEXT:    jne LBB23_17
; AVX512F-NEXT:  LBB23_18: ## %else23
; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
; AVX512F-NEXT:    jne LBB23_19
; AVX512F-NEXT:  LBB23_20: ## %else26
; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT:    jne LBB23_21
; AVX512F-NEXT:  LBB23_22: ## %else29
; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT:    jne LBB23_23
; AVX512F-NEXT:  LBB23_24: ## %else32
; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT:    jne LBB23_25
; AVX512F-NEXT:  LBB23_26: ## %else35
; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT:    jne LBB23_27
; AVX512F-NEXT:  LBB23_28: ## %else38
; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT:    jne LBB23_29
; AVX512F-NEXT:  LBB23_30: ## %else41
; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT:    jne LBB23_31
; AVX512F-NEXT:  LBB23_32: ## %else44
; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512F-NEXT:    retq
; AVX512F-NEXT:  LBB23_1: ## %cond.load
; AVX512F-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $2, %al
; AVX512F-NEXT:    je LBB23_4
; AVX512F-NEXT:  LBB23_3: ## %cond.load1
; AVX512F-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $4, %al
; AVX512F-NEXT:    je LBB23_6
; AVX512F-NEXT:  LBB23_5: ## %cond.load4
; AVX512F-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $8, %al
; AVX512F-NEXT:    je LBB23_8
; AVX512F-NEXT:  LBB23_7: ## %cond.load7
; AVX512F-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $16, %al
; AVX512F-NEXT:    je LBB23_10
; AVX512F-NEXT:  LBB23_9: ## %cond.load10
; AVX512F-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $32, %al
; AVX512F-NEXT:    je LBB23_12
; AVX512F-NEXT:  LBB23_11: ## %cond.load13
; AVX512F-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb $64, %al
; AVX512F-NEXT:    je LBB23_14
; AVX512F-NEXT:  LBB23_13: ## %cond.load16
; AVX512F-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testb %al, %al
; AVX512F-NEXT:    jns LBB23_16
; AVX512F-NEXT:  LBB23_15: ## %cond.load19
; AVX512F-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
; AVX512F-NEXT:    je LBB23_18
; AVX512F-NEXT:  LBB23_17: ## %cond.load22
; AVX512F-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
; AVX512F-NEXT:    je LBB23_20
; AVX512F-NEXT:  LBB23_19: ## %cond.load25
; AVX512F-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT:    je LBB23_22
; AVX512F-NEXT:  LBB23_21: ## %cond.load28
; AVX512F-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT:    je LBB23_24
; AVX512F-NEXT:  LBB23_23: ## %cond.load31
; AVX512F-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT:    je LBB23_26
; AVX512F-NEXT:  LBB23_25: ## %cond.load34
; AVX512F-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT:    je LBB23_28
; AVX512F-NEXT:  LBB23_27: ## %cond.load37
; AVX512F-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT:    je LBB23_30
; AVX512F-NEXT:  LBB23_29: ## %cond.load40
; AVX512F-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT:    je LBB23_32
; AVX512F-NEXT:  LBB23_31: ## %cond.load43
; AVX512F-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: load_v16i8_v16i8:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    vpmovmskb %xmm0, %eax
; AVX512VLDQ-NEXT:    testb $1, %al
; AVX512VLDQ-NEXT:    jne LBB23_1
; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
; AVX512VLDQ-NEXT:    testb $2, %al
; AVX512VLDQ-NEXT:    jne LBB23_3
; AVX512VLDQ-NEXT:  LBB23_4: ## %else2
; AVX512VLDQ-NEXT:    testb $4, %al
; AVX512VLDQ-NEXT:    jne LBB23_5
; AVX512VLDQ-NEXT:  LBB23_6: ## %else5
; AVX512VLDQ-NEXT:    testb $8, %al
; AVX512VLDQ-NEXT:    jne LBB23_7
; AVX512VLDQ-NEXT:  LBB23_8: ## %else8
; AVX512VLDQ-NEXT:    testb $16, %al
; AVX512VLDQ-NEXT:    jne LBB23_9
; AVX512VLDQ-NEXT:  LBB23_10: ## %else11
; AVX512VLDQ-NEXT:    testb $32, %al
; AVX512VLDQ-NEXT:    jne LBB23_11
; AVX512VLDQ-NEXT:  LBB23_12: ## %else14
; AVX512VLDQ-NEXT:    testb $64, %al
; AVX512VLDQ-NEXT:    jne LBB23_13
; AVX512VLDQ-NEXT:  LBB23_14: ## %else17
; AVX512VLDQ-NEXT:    testb %al, %al
; AVX512VLDQ-NEXT:    js LBB23_15
; AVX512VLDQ-NEXT:  LBB23_16: ## %else20
; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT:    jne LBB23_17
; AVX512VLDQ-NEXT:  LBB23_18: ## %else23
; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT:    jne LBB23_19
; AVX512VLDQ-NEXT:  LBB23_20: ## %else26
; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT:    jne LBB23_21
; AVX512VLDQ-NEXT:  LBB23_22: ## %else29
; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT:    jne LBB23_23
; AVX512VLDQ-NEXT:  LBB23_24: ## %else32
; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT:    jne LBB23_25
; AVX512VLDQ-NEXT:  LBB23_26: ## %else35
; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT:    jne LBB23_27
; AVX512VLDQ-NEXT:  LBB23_28: ## %else38
; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT:    jne LBB23_29
; AVX512VLDQ-NEXT:  LBB23_30: ## %else41
; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT:    jne LBB23_31
; AVX512VLDQ-NEXT:  LBB23_32: ## %else44
; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT:    retq
; AVX512VLDQ-NEXT:  LBB23_1: ## %cond.load
; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $2, %al
; AVX512VLDQ-NEXT:    je LBB23_4
; AVX512VLDQ-NEXT:  LBB23_3: ## %cond.load1
; AVX512VLDQ-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $4, %al
; AVX512VLDQ-NEXT:    je LBB23_6
; AVX512VLDQ-NEXT:  LBB23_5: ## %cond.load4
; AVX512VLDQ-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $8, %al
; AVX512VLDQ-NEXT:    je LBB23_8
; AVX512VLDQ-NEXT:  LBB23_7: ## %cond.load7
; AVX512VLDQ-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $16, %al
; AVX512VLDQ-NEXT:    je LBB23_10
; AVX512VLDQ-NEXT:  LBB23_9: ## %cond.load10
; AVX512VLDQ-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $32, %al
; AVX512VLDQ-NEXT:    je LBB23_12
; AVX512VLDQ-NEXT:  LBB23_11: ## %cond.load13
; AVX512VLDQ-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb $64, %al
; AVX512VLDQ-NEXT:    je LBB23_14
; AVX512VLDQ-NEXT:  LBB23_13: ## %cond.load16
; AVX512VLDQ-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testb %al, %al
; AVX512VLDQ-NEXT:    jns LBB23_16
; AVX512VLDQ-NEXT:  LBB23_15: ## %cond.load19
; AVX512VLDQ-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT:    je LBB23_18
; AVX512VLDQ-NEXT:  LBB23_17: ## %cond.load22
; AVX512VLDQ-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT:    je LBB23_20
; AVX512VLDQ-NEXT:  LBB23_19: ## %cond.load25
; AVX512VLDQ-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT:    je LBB23_22
; AVX512VLDQ-NEXT:  LBB23_21: ## %cond.load28
; AVX512VLDQ-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT:    je LBB23_24
; AVX512VLDQ-NEXT:  LBB23_23: ## %cond.load31
; AVX512VLDQ-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT:    je LBB23_26
; AVX512VLDQ-NEXT:  LBB23_25: ## %cond.load34
; AVX512VLDQ-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT:    je LBB23_28
; AVX512VLDQ-NEXT:  LBB23_27: ## %cond.load37
; AVX512VLDQ-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT:    je LBB23_30
; AVX512VLDQ-NEXT:  LBB23_29: ## %cond.load40
; AVX512VLDQ-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT:    je LBB23_32
; AVX512VLDQ-NEXT:  LBB23_31: ## %cond.load43
; AVX512VLDQ-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: load_v16i8_v16i8:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    vpmovb2m %xmm0, %k1
; AVX512VLBW-NEXT:    vpblendmb (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: load_v16i8_v16i8:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vpmovb2m %xmm0, %k1
; X86-AVX512-NEXT:    vpblendmb (%eax), %xmm1, %xmm0 {%k1}
; X86-AVX512-NEXT:    retl
; The mask is the per-byte sign bit of %trigger (signed less-than zero).
  %mask = icmp slt <16 x i8> %trigger, zeroinitializer
  %res = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst)
  ret <16 x i8> %res
}
4512
4513define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %dst) {
4514; SSE2-LABEL: load_v32i8_v32i8:
4515; SSE2:       ## %bb.0:
4516; SSE2-NEXT:    pmovmskb %xmm0, %ecx
4517; SSE2-NEXT:    pmovmskb %xmm1, %eax
4518; SSE2-NEXT:    shll $16, %eax
4519; SSE2-NEXT:    orl %ecx, %eax
4520; SSE2-NEXT:    testb $1, %al
4521; SSE2-NEXT:    jne LBB24_1
4522; SSE2-NEXT:  ## %bb.2: ## %else
4523; SSE2-NEXT:    testb $2, %al
4524; SSE2-NEXT:    jne LBB24_3
4525; SSE2-NEXT:  LBB24_4: ## %else2
4526; SSE2-NEXT:    testb $4, %al
4527; SSE2-NEXT:    jne LBB24_5
4528; SSE2-NEXT:  LBB24_6: ## %else5
4529; SSE2-NEXT:    testb $8, %al
4530; SSE2-NEXT:    jne LBB24_7
4531; SSE2-NEXT:  LBB24_8: ## %else8
4532; SSE2-NEXT:    testb $16, %al
4533; SSE2-NEXT:    jne LBB24_9
4534; SSE2-NEXT:  LBB24_10: ## %else11
4535; SSE2-NEXT:    testb $32, %al
4536; SSE2-NEXT:    jne LBB24_11
4537; SSE2-NEXT:  LBB24_12: ## %else14
4538; SSE2-NEXT:    testb $64, %al
4539; SSE2-NEXT:    jne LBB24_13
4540; SSE2-NEXT:  LBB24_14: ## %else17
4541; SSE2-NEXT:    testb %al, %al
4542; SSE2-NEXT:    js LBB24_15
4543; SSE2-NEXT:  LBB24_16: ## %else20
4544; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
4545; SSE2-NEXT:    jne LBB24_17
4546; SSE2-NEXT:  LBB24_18: ## %else23
4547; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
4548; SSE2-NEXT:    jne LBB24_19
4549; SSE2-NEXT:  LBB24_20: ## %else26
4550; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
4551; SSE2-NEXT:    jne LBB24_21
4552; SSE2-NEXT:  LBB24_22: ## %else29
4553; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
4554; SSE2-NEXT:    jne LBB24_23
4555; SSE2-NEXT:  LBB24_24: ## %else32
4556; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
4557; SSE2-NEXT:    jne LBB24_25
4558; SSE2-NEXT:  LBB24_26: ## %else35
4559; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
4560; SSE2-NEXT:    jne LBB24_27
4561; SSE2-NEXT:  LBB24_28: ## %else38
4562; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
4563; SSE2-NEXT:    jne LBB24_29
4564; SSE2-NEXT:  LBB24_30: ## %else41
4565; SSE2-NEXT:    testw %ax, %ax
4566; SSE2-NEXT:    js LBB24_31
4567; SSE2-NEXT:  LBB24_32: ## %else44
4568; SSE2-NEXT:    testl $65536, %eax ## imm = 0x10000
4569; SSE2-NEXT:    jne LBB24_33
4570; SSE2-NEXT:  LBB24_34: ## %else47
4571; SSE2-NEXT:    testl $131072, %eax ## imm = 0x20000
4572; SSE2-NEXT:    jne LBB24_35
4573; SSE2-NEXT:  LBB24_36: ## %else50
4574; SSE2-NEXT:    testl $262144, %eax ## imm = 0x40000
4575; SSE2-NEXT:    jne LBB24_37
4576; SSE2-NEXT:  LBB24_38: ## %else53
4577; SSE2-NEXT:    testl $524288, %eax ## imm = 0x80000
4578; SSE2-NEXT:    jne LBB24_39
4579; SSE2-NEXT:  LBB24_40: ## %else56
4580; SSE2-NEXT:    testl $1048576, %eax ## imm = 0x100000
4581; SSE2-NEXT:    jne LBB24_41
4582; SSE2-NEXT:  LBB24_42: ## %else59
4583; SSE2-NEXT:    testl $2097152, %eax ## imm = 0x200000
4584; SSE2-NEXT:    jne LBB24_43
4585; SSE2-NEXT:  LBB24_44: ## %else62
4586; SSE2-NEXT:    testl $4194304, %eax ## imm = 0x400000
4587; SSE2-NEXT:    jne LBB24_45
4588; SSE2-NEXT:  LBB24_46: ## %else65
4589; SSE2-NEXT:    testl $8388608, %eax ## imm = 0x800000
4590; SSE2-NEXT:    jne LBB24_47
4591; SSE2-NEXT:  LBB24_48: ## %else68
4592; SSE2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
4593; SSE2-NEXT:    jne LBB24_49
4594; SSE2-NEXT:  LBB24_50: ## %else71
4595; SSE2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
4596; SSE2-NEXT:    jne LBB24_51
4597; SSE2-NEXT:  LBB24_52: ## %else74
4598; SSE2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
4599; SSE2-NEXT:    jne LBB24_53
4600; SSE2-NEXT:  LBB24_54: ## %else77
4601; SSE2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
4602; SSE2-NEXT:    jne LBB24_55
4603; SSE2-NEXT:  LBB24_56: ## %else80
4604; SSE2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
4605; SSE2-NEXT:    jne LBB24_57
4606; SSE2-NEXT:  LBB24_58: ## %else83
4607; SSE2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
4608; SSE2-NEXT:    jne LBB24_59
4609; SSE2-NEXT:  LBB24_60: ## %else86
4610; SSE2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
4611; SSE2-NEXT:    jne LBB24_61
4612; SSE2-NEXT:  LBB24_62: ## %else89
4613; SSE2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
4614; SSE2-NEXT:    je LBB24_64
4615; SSE2-NEXT:  LBB24_63: ## %cond.load91
4616; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
4617; SSE2-NEXT:    movzbl 31(%rdi), %eax
4618; SSE2-NEXT:    movd %eax, %xmm0
4619; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4620; SSE2-NEXT:    por %xmm0, %xmm3
4621; SSE2-NEXT:  LBB24_64: ## %else92
4622; SSE2-NEXT:    movdqa %xmm2, %xmm0
4623; SSE2-NEXT:    movdqa %xmm3, %xmm1
4624; SSE2-NEXT:    retq
4625; SSE2-NEXT:  LBB24_1: ## %cond.load
4626; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4627; SSE2-NEXT:    pand %xmm0, %xmm2
4628; SSE2-NEXT:    movzbl (%rdi), %ecx
4629; SSE2-NEXT:    movd %ecx, %xmm1
4630; SSE2-NEXT:    pandn %xmm1, %xmm0
4631; SSE2-NEXT:    por %xmm0, %xmm2
4632; SSE2-NEXT:    testb $2, %al
4633; SSE2-NEXT:    je LBB24_4
4634; SSE2-NEXT:  LBB24_3: ## %cond.load1
4635; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4636; SSE2-NEXT:    pand %xmm0, %xmm2
4637; SSE2-NEXT:    movzbl 1(%rdi), %ecx
4638; SSE2-NEXT:    movd %ecx, %xmm1
4639; SSE2-NEXT:    psllw $8, %xmm1
4640; SSE2-NEXT:    pandn %xmm1, %xmm0
4641; SSE2-NEXT:    por %xmm0, %xmm2
4642; SSE2-NEXT:    testb $4, %al
4643; SSE2-NEXT:    je LBB24_6
4644; SSE2-NEXT:  LBB24_5: ## %cond.load4
4645; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4646; SSE2-NEXT:    pand %xmm0, %xmm2
4647; SSE2-NEXT:    movzbl 2(%rdi), %ecx
4648; SSE2-NEXT:    movd %ecx, %xmm1
4649; SSE2-NEXT:    pslld $16, %xmm1
4650; SSE2-NEXT:    pandn %xmm1, %xmm0
4651; SSE2-NEXT:    por %xmm0, %xmm2
4652; SSE2-NEXT:    testb $8, %al
4653; SSE2-NEXT:    je LBB24_8
4654; SSE2-NEXT:  LBB24_7: ## %cond.load7
4655; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4656; SSE2-NEXT:    pand %xmm0, %xmm2
4657; SSE2-NEXT:    movzbl 3(%rdi), %ecx
4658; SSE2-NEXT:    movd %ecx, %xmm1
4659; SSE2-NEXT:    pslld $24, %xmm1
4660; SSE2-NEXT:    pandn %xmm1, %xmm0
4661; SSE2-NEXT:    por %xmm0, %xmm2
4662; SSE2-NEXT:    testb $16, %al
4663; SSE2-NEXT:    je LBB24_10
4664; SSE2-NEXT:  LBB24_9: ## %cond.load10
4665; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4666; SSE2-NEXT:    pand %xmm0, %xmm2
4667; SSE2-NEXT:    movzbl 4(%rdi), %ecx
4668; SSE2-NEXT:    movd %ecx, %xmm1
4669; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4670; SSE2-NEXT:    pandn %xmm1, %xmm0
4671; SSE2-NEXT:    por %xmm0, %xmm2
4672; SSE2-NEXT:    testb $32, %al
4673; SSE2-NEXT:    je LBB24_12
4674; SSE2-NEXT:  LBB24_11: ## %cond.load13
4675; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4676; SSE2-NEXT:    pand %xmm0, %xmm2
4677; SSE2-NEXT:    movzbl 5(%rdi), %ecx
4678; SSE2-NEXT:    movd %ecx, %xmm1
4679; SSE2-NEXT:    psllq $40, %xmm1
4680; SSE2-NEXT:    pandn %xmm1, %xmm0
4681; SSE2-NEXT:    por %xmm0, %xmm2
4682; SSE2-NEXT:    testb $64, %al
4683; SSE2-NEXT:    je LBB24_14
4684; SSE2-NEXT:  LBB24_13: ## %cond.load16
4685; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4686; SSE2-NEXT:    pand %xmm0, %xmm2
4687; SSE2-NEXT:    movzbl 6(%rdi), %ecx
4688; SSE2-NEXT:    movd %ecx, %xmm1
4689; SSE2-NEXT:    psllq $48, %xmm1
4690; SSE2-NEXT:    pandn %xmm1, %xmm0
4691; SSE2-NEXT:    por %xmm0, %xmm2
4692; SSE2-NEXT:    testb %al, %al
4693; SSE2-NEXT:    jns LBB24_16
4694; SSE2-NEXT:  LBB24_15: ## %cond.load19
4695; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4696; SSE2-NEXT:    pand %xmm0, %xmm2
4697; SSE2-NEXT:    movzbl 7(%rdi), %ecx
4698; SSE2-NEXT:    movd %ecx, %xmm1
4699; SSE2-NEXT:    psllq $56, %xmm1
4700; SSE2-NEXT:    pandn %xmm1, %xmm0
4701; SSE2-NEXT:    por %xmm0, %xmm2
4702; SSE2-NEXT:    testl $256, %eax ## imm = 0x100
4703; SSE2-NEXT:    je LBB24_18
4704; SSE2-NEXT:  LBB24_17: ## %cond.load22
4705; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4706; SSE2-NEXT:    pand %xmm0, %xmm2
4707; SSE2-NEXT:    movzbl 8(%rdi), %ecx
4708; SSE2-NEXT:    movd %ecx, %xmm1
4709; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4710; SSE2-NEXT:    pandn %xmm1, %xmm0
4711; SSE2-NEXT:    por %xmm0, %xmm2
4712; SSE2-NEXT:    testl $512, %eax ## imm = 0x200
4713; SSE2-NEXT:    je LBB24_20
4714; SSE2-NEXT:  LBB24_19: ## %cond.load25
4715; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4716; SSE2-NEXT:    pand %xmm0, %xmm2
4717; SSE2-NEXT:    movzbl 9(%rdi), %ecx
4718; SSE2-NEXT:    movd %ecx, %xmm1
4719; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4720; SSE2-NEXT:    pandn %xmm1, %xmm0
4721; SSE2-NEXT:    por %xmm0, %xmm2
4722; SSE2-NEXT:    testl $1024, %eax ## imm = 0x400
4723; SSE2-NEXT:    je LBB24_22
4724; SSE2-NEXT:  LBB24_21: ## %cond.load28
4725; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4726; SSE2-NEXT:    pand %xmm0, %xmm2
4727; SSE2-NEXT:    movzbl 10(%rdi), %ecx
4728; SSE2-NEXT:    movd %ecx, %xmm1
4729; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4730; SSE2-NEXT:    pandn %xmm1, %xmm0
4731; SSE2-NEXT:    por %xmm0, %xmm2
4732; SSE2-NEXT:    testl $2048, %eax ## imm = 0x800
4733; SSE2-NEXT:    je LBB24_24
4734; SSE2-NEXT:  LBB24_23: ## %cond.load31
4735; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4736; SSE2-NEXT:    pand %xmm0, %xmm2
4737; SSE2-NEXT:    movzbl 11(%rdi), %ecx
4738; SSE2-NEXT:    movd %ecx, %xmm1
4739; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4740; SSE2-NEXT:    pandn %xmm1, %xmm0
4741; SSE2-NEXT:    por %xmm0, %xmm2
4742; SSE2-NEXT:    testl $4096, %eax ## imm = 0x1000
4743; SSE2-NEXT:    je LBB24_26
4744; SSE2-NEXT:  LBB24_25: ## %cond.load34
4745; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4746; SSE2-NEXT:    pand %xmm0, %xmm2
4747; SSE2-NEXT:    movzbl 12(%rdi), %ecx
4748; SSE2-NEXT:    movd %ecx, %xmm1
4749; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4750; SSE2-NEXT:    pandn %xmm1, %xmm0
4751; SSE2-NEXT:    por %xmm0, %xmm2
4752; SSE2-NEXT:    testl $8192, %eax ## imm = 0x2000
4753; SSE2-NEXT:    je LBB24_28
4754; SSE2-NEXT:  LBB24_27: ## %cond.load37
4755; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4756; SSE2-NEXT:    pand %xmm0, %xmm2
4757; SSE2-NEXT:    movzbl 13(%rdi), %ecx
4758; SSE2-NEXT:    movd %ecx, %xmm1
4759; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4760; SSE2-NEXT:    pandn %xmm1, %xmm0
4761; SSE2-NEXT:    por %xmm0, %xmm2
4762; SSE2-NEXT:    testl $16384, %eax ## imm = 0x4000
4763; SSE2-NEXT:    je LBB24_30
4764; SSE2-NEXT:  LBB24_29: ## %cond.load40
4765; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4766; SSE2-NEXT:    pand %xmm0, %xmm2
4767; SSE2-NEXT:    movzbl 14(%rdi), %ecx
4768; SSE2-NEXT:    movd %ecx, %xmm1
4769; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4770; SSE2-NEXT:    pandn %xmm1, %xmm0
4771; SSE2-NEXT:    por %xmm0, %xmm2
4772; SSE2-NEXT:    testw %ax, %ax
4773; SSE2-NEXT:    jns LBB24_32
4774; SSE2-NEXT:  LBB24_31: ## %cond.load43
4775; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
4776; SSE2-NEXT:    movzbl 15(%rdi), %ecx
4777; SSE2-NEXT:    movd %ecx, %xmm0
4778; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4779; SSE2-NEXT:    por %xmm0, %xmm2
4780; SSE2-NEXT:    testl $65536, %eax ## imm = 0x10000
4781; SSE2-NEXT:    je LBB24_34
4782; SSE2-NEXT:  LBB24_33: ## %cond.load46
4783; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4784; SSE2-NEXT:    pand %xmm0, %xmm3
4785; SSE2-NEXT:    movzbl 16(%rdi), %ecx
4786; SSE2-NEXT:    movd %ecx, %xmm1
4787; SSE2-NEXT:    pandn %xmm1, %xmm0
4788; SSE2-NEXT:    por %xmm0, %xmm3
4789; SSE2-NEXT:    testl $131072, %eax ## imm = 0x20000
4790; SSE2-NEXT:    je LBB24_36
4791; SSE2-NEXT:  LBB24_35: ## %cond.load49
4792; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4793; SSE2-NEXT:    pand %xmm0, %xmm3
4794; SSE2-NEXT:    movzbl 17(%rdi), %ecx
4795; SSE2-NEXT:    movd %ecx, %xmm1
4796; SSE2-NEXT:    psllw $8, %xmm1
4797; SSE2-NEXT:    pandn %xmm1, %xmm0
4798; SSE2-NEXT:    por %xmm0, %xmm3
4799; SSE2-NEXT:    testl $262144, %eax ## imm = 0x40000
4800; SSE2-NEXT:    je LBB24_38
4801; SSE2-NEXT:  LBB24_37: ## %cond.load52
4802; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4803; SSE2-NEXT:    pand %xmm0, %xmm3
4804; SSE2-NEXT:    movzbl 18(%rdi), %ecx
4805; SSE2-NEXT:    movd %ecx, %xmm1
4806; SSE2-NEXT:    pslld $16, %xmm1
4807; SSE2-NEXT:    pandn %xmm1, %xmm0
4808; SSE2-NEXT:    por %xmm0, %xmm3
4809; SSE2-NEXT:    testl $524288, %eax ## imm = 0x80000
4810; SSE2-NEXT:    je LBB24_40
4811; SSE2-NEXT:  LBB24_39: ## %cond.load55
4812; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4813; SSE2-NEXT:    pand %xmm0, %xmm3
4814; SSE2-NEXT:    movzbl 19(%rdi), %ecx
4815; SSE2-NEXT:    movd %ecx, %xmm1
4816; SSE2-NEXT:    pslld $24, %xmm1
4817; SSE2-NEXT:    pandn %xmm1, %xmm0
4818; SSE2-NEXT:    por %xmm0, %xmm3
4819; SSE2-NEXT:    testl $1048576, %eax ## imm = 0x100000
4820; SSE2-NEXT:    je LBB24_42
4821; SSE2-NEXT:  LBB24_41: ## %cond.load58
4822; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4823; SSE2-NEXT:    pand %xmm0, %xmm3
4824; SSE2-NEXT:    movzbl 20(%rdi), %ecx
4825; SSE2-NEXT:    movd %ecx, %xmm1
4826; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4827; SSE2-NEXT:    pandn %xmm1, %xmm0
4828; SSE2-NEXT:    por %xmm0, %xmm3
4829; SSE2-NEXT:    testl $2097152, %eax ## imm = 0x200000
4830; SSE2-NEXT:    je LBB24_44
4831; SSE2-NEXT:  LBB24_43: ## %cond.load61
4832; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4833; SSE2-NEXT:    pand %xmm0, %xmm3
4834; SSE2-NEXT:    movzbl 21(%rdi), %ecx
4835; SSE2-NEXT:    movd %ecx, %xmm1
4836; SSE2-NEXT:    psllq $40, %xmm1
4837; SSE2-NEXT:    pandn %xmm1, %xmm0
4838; SSE2-NEXT:    por %xmm0, %xmm3
4839; SSE2-NEXT:    testl $4194304, %eax ## imm = 0x400000
4840; SSE2-NEXT:    je LBB24_46
4841; SSE2-NEXT:  LBB24_45: ## %cond.load64
4842; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4843; SSE2-NEXT:    pand %xmm0, %xmm3
4844; SSE2-NEXT:    movzbl 22(%rdi), %ecx
4845; SSE2-NEXT:    movd %ecx, %xmm1
4846; SSE2-NEXT:    psllq $48, %xmm1
4847; SSE2-NEXT:    pandn %xmm1, %xmm0
4848; SSE2-NEXT:    por %xmm0, %xmm3
4849; SSE2-NEXT:    testl $8388608, %eax ## imm = 0x800000
4850; SSE2-NEXT:    je LBB24_48
4851; SSE2-NEXT:  LBB24_47: ## %cond.load67
4852; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4853; SSE2-NEXT:    pand %xmm0, %xmm3
4854; SSE2-NEXT:    movzbl 23(%rdi), %ecx
4855; SSE2-NEXT:    movd %ecx, %xmm1
4856; SSE2-NEXT:    psllq $56, %xmm1
4857; SSE2-NEXT:    pandn %xmm1, %xmm0
4858; SSE2-NEXT:    por %xmm0, %xmm3
4859; SSE2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
4860; SSE2-NEXT:    je LBB24_50
4861; SSE2-NEXT:  LBB24_49: ## %cond.load70
4862; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4863; SSE2-NEXT:    pand %xmm0, %xmm3
4864; SSE2-NEXT:    movzbl 24(%rdi), %ecx
4865; SSE2-NEXT:    movd %ecx, %xmm1
4866; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4867; SSE2-NEXT:    pandn %xmm1, %xmm0
4868; SSE2-NEXT:    por %xmm0, %xmm3
4869; SSE2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
4870; SSE2-NEXT:    je LBB24_52
4871; SSE2-NEXT:  LBB24_51: ## %cond.load73
4872; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4873; SSE2-NEXT:    pand %xmm0, %xmm3
4874; SSE2-NEXT:    movzbl 25(%rdi), %ecx
4875; SSE2-NEXT:    movd %ecx, %xmm1
4876; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4877; SSE2-NEXT:    pandn %xmm1, %xmm0
4878; SSE2-NEXT:    por %xmm0, %xmm3
4879; SSE2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
4880; SSE2-NEXT:    je LBB24_54
4881; SSE2-NEXT:  LBB24_53: ## %cond.load76
4882; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4883; SSE2-NEXT:    pand %xmm0, %xmm3
4884; SSE2-NEXT:    movzbl 26(%rdi), %ecx
4885; SSE2-NEXT:    movd %ecx, %xmm1
4886; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4887; SSE2-NEXT:    pandn %xmm1, %xmm0
4888; SSE2-NEXT:    por %xmm0, %xmm3
4889; SSE2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
4890; SSE2-NEXT:    je LBB24_56
4891; SSE2-NEXT:  LBB24_55: ## %cond.load79
4892; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4893; SSE2-NEXT:    pand %xmm0, %xmm3
4894; SSE2-NEXT:    movzbl 27(%rdi), %ecx
4895; SSE2-NEXT:    movd %ecx, %xmm1
4896; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4897; SSE2-NEXT:    pandn %xmm1, %xmm0
4898; SSE2-NEXT:    por %xmm0, %xmm3
4899; SSE2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
4900; SSE2-NEXT:    je LBB24_58
4901; SSE2-NEXT:  LBB24_57: ## %cond.load82
4902; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4903; SSE2-NEXT:    pand %xmm0, %xmm3
4904; SSE2-NEXT:    movzbl 28(%rdi), %ecx
4905; SSE2-NEXT:    movd %ecx, %xmm1
4906; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4907; SSE2-NEXT:    pandn %xmm1, %xmm0
4908; SSE2-NEXT:    por %xmm0, %xmm3
4909; SSE2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
4910; SSE2-NEXT:    je LBB24_60
4911; SSE2-NEXT:  LBB24_59: ## %cond.load85
4912; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4913; SSE2-NEXT:    pand %xmm0, %xmm3
4914; SSE2-NEXT:    movzbl 29(%rdi), %ecx
4915; SSE2-NEXT:    movd %ecx, %xmm1
4916; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4917; SSE2-NEXT:    pandn %xmm1, %xmm0
4918; SSE2-NEXT:    por %xmm0, %xmm3
4919; SSE2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
4920; SSE2-NEXT:    je LBB24_62
4921; SSE2-NEXT:  LBB24_61: ## %cond.load88
4922; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4923; SSE2-NEXT:    pand %xmm0, %xmm3
4924; SSE2-NEXT:    movzbl 30(%rdi), %ecx
4925; SSE2-NEXT:    movd %ecx, %xmm1
4926; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4927; SSE2-NEXT:    pandn %xmm1, %xmm0
4928; SSE2-NEXT:    por %xmm0, %xmm3
4929; SSE2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
4930; SSE2-NEXT:    jne LBB24_63
4931; SSE2-NEXT:    jmp LBB24_64
4932;
4933; SSE42-LABEL: load_v32i8_v32i8:
4934; SSE42:       ## %bb.0:
4935; SSE42-NEXT:    pmovmskb %xmm0, %ecx
4936; SSE42-NEXT:    pmovmskb %xmm1, %eax
4937; SSE42-NEXT:    shll $16, %eax
4938; SSE42-NEXT:    orl %ecx, %eax
4939; SSE42-NEXT:    testb $1, %al
4940; SSE42-NEXT:    jne LBB24_1
4941; SSE42-NEXT:  ## %bb.2: ## %else
4942; SSE42-NEXT:    testb $2, %al
4943; SSE42-NEXT:    jne LBB24_3
4944; SSE42-NEXT:  LBB24_4: ## %else2
4945; SSE42-NEXT:    testb $4, %al
4946; SSE42-NEXT:    jne LBB24_5
4947; SSE42-NEXT:  LBB24_6: ## %else5
4948; SSE42-NEXT:    testb $8, %al
4949; SSE42-NEXT:    jne LBB24_7
4950; SSE42-NEXT:  LBB24_8: ## %else8
4951; SSE42-NEXT:    testb $16, %al
4952; SSE42-NEXT:    jne LBB24_9
4953; SSE42-NEXT:  LBB24_10: ## %else11
4954; SSE42-NEXT:    testb $32, %al
4955; SSE42-NEXT:    jne LBB24_11
4956; SSE42-NEXT:  LBB24_12: ## %else14
4957; SSE42-NEXT:    testb $64, %al
4958; SSE42-NEXT:    jne LBB24_13
4959; SSE42-NEXT:  LBB24_14: ## %else17
4960; SSE42-NEXT:    testb %al, %al
4961; SSE42-NEXT:    js LBB24_15
4962; SSE42-NEXT:  LBB24_16: ## %else20
4963; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
4964; SSE42-NEXT:    jne LBB24_17
4965; SSE42-NEXT:  LBB24_18: ## %else23
4966; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
4967; SSE42-NEXT:    jne LBB24_19
4968; SSE42-NEXT:  LBB24_20: ## %else26
4969; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
4970; SSE42-NEXT:    jne LBB24_21
4971; SSE42-NEXT:  LBB24_22: ## %else29
4972; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
4973; SSE42-NEXT:    jne LBB24_23
4974; SSE42-NEXT:  LBB24_24: ## %else32
4975; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
4976; SSE42-NEXT:    jne LBB24_25
4977; SSE42-NEXT:  LBB24_26: ## %else35
4978; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
4979; SSE42-NEXT:    jne LBB24_27
4980; SSE42-NEXT:  LBB24_28: ## %else38
4981; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
4982; SSE42-NEXT:    jne LBB24_29
4983; SSE42-NEXT:  LBB24_30: ## %else41
4984; SSE42-NEXT:    testw %ax, %ax
4985; SSE42-NEXT:    js LBB24_31
4986; SSE42-NEXT:  LBB24_32: ## %else44
4987; SSE42-NEXT:    testl $65536, %eax ## imm = 0x10000
4988; SSE42-NEXT:    jne LBB24_33
4989; SSE42-NEXT:  LBB24_34: ## %else47
4990; SSE42-NEXT:    testl $131072, %eax ## imm = 0x20000
4991; SSE42-NEXT:    jne LBB24_35
4992; SSE42-NEXT:  LBB24_36: ## %else50
4993; SSE42-NEXT:    testl $262144, %eax ## imm = 0x40000
4994; SSE42-NEXT:    jne LBB24_37
4995; SSE42-NEXT:  LBB24_38: ## %else53
4996; SSE42-NEXT:    testl $524288, %eax ## imm = 0x80000
4997; SSE42-NEXT:    jne LBB24_39
4998; SSE42-NEXT:  LBB24_40: ## %else56
4999; SSE42-NEXT:    testl $1048576, %eax ## imm = 0x100000
5000; SSE42-NEXT:    jne LBB24_41
5001; SSE42-NEXT:  LBB24_42: ## %else59
5002; SSE42-NEXT:    testl $2097152, %eax ## imm = 0x200000
5003; SSE42-NEXT:    jne LBB24_43
5004; SSE42-NEXT:  LBB24_44: ## %else62
5005; SSE42-NEXT:    testl $4194304, %eax ## imm = 0x400000
5006; SSE42-NEXT:    jne LBB24_45
5007; SSE42-NEXT:  LBB24_46: ## %else65
5008; SSE42-NEXT:    testl $8388608, %eax ## imm = 0x800000
5009; SSE42-NEXT:    jne LBB24_47
5010; SSE42-NEXT:  LBB24_48: ## %else68
5011; SSE42-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5012; SSE42-NEXT:    jne LBB24_49
5013; SSE42-NEXT:  LBB24_50: ## %else71
5014; SSE42-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5015; SSE42-NEXT:    jne LBB24_51
5016; SSE42-NEXT:  LBB24_52: ## %else74
5017; SSE42-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5018; SSE42-NEXT:    jne LBB24_53
5019; SSE42-NEXT:  LBB24_54: ## %else77
5020; SSE42-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5021; SSE42-NEXT:    jne LBB24_55
5022; SSE42-NEXT:  LBB24_56: ## %else80
5023; SSE42-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5024; SSE42-NEXT:    jne LBB24_57
5025; SSE42-NEXT:  LBB24_58: ## %else83
5026; SSE42-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5027; SSE42-NEXT:    jne LBB24_59
5028; SSE42-NEXT:  LBB24_60: ## %else86
5029; SSE42-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5030; SSE42-NEXT:    jne LBB24_61
5031; SSE42-NEXT:  LBB24_62: ## %else89
5032; SSE42-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5033; SSE42-NEXT:    je LBB24_64
5034; SSE42-NEXT:  LBB24_63: ## %cond.load91
5035; SSE42-NEXT:    pinsrb $15, 31(%rdi), %xmm3
5036; SSE42-NEXT:  LBB24_64: ## %else92
5037; SSE42-NEXT:    movdqa %xmm2, %xmm0
5038; SSE42-NEXT:    movdqa %xmm3, %xmm1
5039; SSE42-NEXT:    retq
5040; SSE42-NEXT:  LBB24_1: ## %cond.load
5041; SSE42-NEXT:    pinsrb $0, (%rdi), %xmm2
5042; SSE42-NEXT:    testb $2, %al
5043; SSE42-NEXT:    je LBB24_4
5044; SSE42-NEXT:  LBB24_3: ## %cond.load1
5045; SSE42-NEXT:    pinsrb $1, 1(%rdi), %xmm2
5046; SSE42-NEXT:    testb $4, %al
5047; SSE42-NEXT:    je LBB24_6
5048; SSE42-NEXT:  LBB24_5: ## %cond.load4
5049; SSE42-NEXT:    pinsrb $2, 2(%rdi), %xmm2
5050; SSE42-NEXT:    testb $8, %al
5051; SSE42-NEXT:    je LBB24_8
5052; SSE42-NEXT:  LBB24_7: ## %cond.load7
5053; SSE42-NEXT:    pinsrb $3, 3(%rdi), %xmm2
5054; SSE42-NEXT:    testb $16, %al
5055; SSE42-NEXT:    je LBB24_10
5056; SSE42-NEXT:  LBB24_9: ## %cond.load10
5057; SSE42-NEXT:    pinsrb $4, 4(%rdi), %xmm2
5058; SSE42-NEXT:    testb $32, %al
5059; SSE42-NEXT:    je LBB24_12
5060; SSE42-NEXT:  LBB24_11: ## %cond.load13
5061; SSE42-NEXT:    pinsrb $5, 5(%rdi), %xmm2
5062; SSE42-NEXT:    testb $64, %al
5063; SSE42-NEXT:    je LBB24_14
5064; SSE42-NEXT:  LBB24_13: ## %cond.load16
5065; SSE42-NEXT:    pinsrb $6, 6(%rdi), %xmm2
5066; SSE42-NEXT:    testb %al, %al
5067; SSE42-NEXT:    jns LBB24_16
5068; SSE42-NEXT:  LBB24_15: ## %cond.load19
5069; SSE42-NEXT:    pinsrb $7, 7(%rdi), %xmm2
5070; SSE42-NEXT:    testl $256, %eax ## imm = 0x100
5071; SSE42-NEXT:    je LBB24_18
5072; SSE42-NEXT:  LBB24_17: ## %cond.load22
5073; SSE42-NEXT:    pinsrb $8, 8(%rdi), %xmm2
5074; SSE42-NEXT:    testl $512, %eax ## imm = 0x200
5075; SSE42-NEXT:    je LBB24_20
5076; SSE42-NEXT:  LBB24_19: ## %cond.load25
5077; SSE42-NEXT:    pinsrb $9, 9(%rdi), %xmm2
5078; SSE42-NEXT:    testl $1024, %eax ## imm = 0x400
5079; SSE42-NEXT:    je LBB24_22
5080; SSE42-NEXT:  LBB24_21: ## %cond.load28
5081; SSE42-NEXT:    pinsrb $10, 10(%rdi), %xmm2
5082; SSE42-NEXT:    testl $2048, %eax ## imm = 0x800
5083; SSE42-NEXT:    je LBB24_24
5084; SSE42-NEXT:  LBB24_23: ## %cond.load31
5085; SSE42-NEXT:    pinsrb $11, 11(%rdi), %xmm2
5086; SSE42-NEXT:    testl $4096, %eax ## imm = 0x1000
5087; SSE42-NEXT:    je LBB24_26
5088; SSE42-NEXT:  LBB24_25: ## %cond.load34
5089; SSE42-NEXT:    pinsrb $12, 12(%rdi), %xmm2
5090; SSE42-NEXT:    testl $8192, %eax ## imm = 0x2000
5091; SSE42-NEXT:    je LBB24_28
5092; SSE42-NEXT:  LBB24_27: ## %cond.load37
5093; SSE42-NEXT:    pinsrb $13, 13(%rdi), %xmm2
5094; SSE42-NEXT:    testl $16384, %eax ## imm = 0x4000
5095; SSE42-NEXT:    je LBB24_30
5096; SSE42-NEXT:  LBB24_29: ## %cond.load40
5097; SSE42-NEXT:    pinsrb $14, 14(%rdi), %xmm2
5098; SSE42-NEXT:    testw %ax, %ax
5099; SSE42-NEXT:    jns LBB24_32
5100; SSE42-NEXT:  LBB24_31: ## %cond.load43
5101; SSE42-NEXT:    pinsrb $15, 15(%rdi), %xmm2
5102; SSE42-NEXT:    testl $65536, %eax ## imm = 0x10000
5103; SSE42-NEXT:    je LBB24_34
5104; SSE42-NEXT:  LBB24_33: ## %cond.load46
5105; SSE42-NEXT:    pinsrb $0, 16(%rdi), %xmm3
5106; SSE42-NEXT:    testl $131072, %eax ## imm = 0x20000
5107; SSE42-NEXT:    je LBB24_36
5108; SSE42-NEXT:  LBB24_35: ## %cond.load49
5109; SSE42-NEXT:    pinsrb $1, 17(%rdi), %xmm3
5110; SSE42-NEXT:    testl $262144, %eax ## imm = 0x40000
5111; SSE42-NEXT:    je LBB24_38
5112; SSE42-NEXT:  LBB24_37: ## %cond.load52
5113; SSE42-NEXT:    pinsrb $2, 18(%rdi), %xmm3
5114; SSE42-NEXT:    testl $524288, %eax ## imm = 0x80000
5115; SSE42-NEXT:    je LBB24_40
5116; SSE42-NEXT:  LBB24_39: ## %cond.load55
5117; SSE42-NEXT:    pinsrb $3, 19(%rdi), %xmm3
5118; SSE42-NEXT:    testl $1048576, %eax ## imm = 0x100000
5119; SSE42-NEXT:    je LBB24_42
5120; SSE42-NEXT:  LBB24_41: ## %cond.load58
5121; SSE42-NEXT:    pinsrb $4, 20(%rdi), %xmm3
5122; SSE42-NEXT:    testl $2097152, %eax ## imm = 0x200000
5123; SSE42-NEXT:    je LBB24_44
5124; SSE42-NEXT:  LBB24_43: ## %cond.load61
5125; SSE42-NEXT:    pinsrb $5, 21(%rdi), %xmm3
5126; SSE42-NEXT:    testl $4194304, %eax ## imm = 0x400000
5127; SSE42-NEXT:    je LBB24_46
5128; SSE42-NEXT:  LBB24_45: ## %cond.load64
5129; SSE42-NEXT:    pinsrb $6, 22(%rdi), %xmm3
5130; SSE42-NEXT:    testl $8388608, %eax ## imm = 0x800000
5131; SSE42-NEXT:    je LBB24_48
5132; SSE42-NEXT:  LBB24_47: ## %cond.load67
5133; SSE42-NEXT:    pinsrb $7, 23(%rdi), %xmm3
5134; SSE42-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5135; SSE42-NEXT:    je LBB24_50
5136; SSE42-NEXT:  LBB24_49: ## %cond.load70
5137; SSE42-NEXT:    pinsrb $8, 24(%rdi), %xmm3
5138; SSE42-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5139; SSE42-NEXT:    je LBB24_52
5140; SSE42-NEXT:  LBB24_51: ## %cond.load73
5141; SSE42-NEXT:    pinsrb $9, 25(%rdi), %xmm3
5142; SSE42-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5143; SSE42-NEXT:    je LBB24_54
5144; SSE42-NEXT:  LBB24_53: ## %cond.load76
5145; SSE42-NEXT:    pinsrb $10, 26(%rdi), %xmm3
5146; SSE42-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5147; SSE42-NEXT:    je LBB24_56
5148; SSE42-NEXT:  LBB24_55: ## %cond.load79
5149; SSE42-NEXT:    pinsrb $11, 27(%rdi), %xmm3
5150; SSE42-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5151; SSE42-NEXT:    je LBB24_58
5152; SSE42-NEXT:  LBB24_57: ## %cond.load82
5153; SSE42-NEXT:    pinsrb $12, 28(%rdi), %xmm3
5154; SSE42-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5155; SSE42-NEXT:    je LBB24_60
5156; SSE42-NEXT:  LBB24_59: ## %cond.load85
5157; SSE42-NEXT:    pinsrb $13, 29(%rdi), %xmm3
5158; SSE42-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5159; SSE42-NEXT:    je LBB24_62
5160; SSE42-NEXT:  LBB24_61: ## %cond.load88
5161; SSE42-NEXT:    pinsrb $14, 30(%rdi), %xmm3
5162; SSE42-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5163; SSE42-NEXT:    jne LBB24_63
5164; SSE42-NEXT:    jmp LBB24_64
5165;
5166; AVX1-LABEL: load_v32i8_v32i8:
5167; AVX1:       ## %bb.0:
5168; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
5169; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
5170; AVX1-NEXT:    vpmovmskb %xmm0, %eax
5171; AVX1-NEXT:    shll $16, %eax
5172; AVX1-NEXT:    orl %ecx, %eax
5173; AVX1-NEXT:    testb $1, %al
5174; AVX1-NEXT:    jne LBB24_1
5175; AVX1-NEXT:  ## %bb.2: ## %else
5176; AVX1-NEXT:    testb $2, %al
5177; AVX1-NEXT:    jne LBB24_3
5178; AVX1-NEXT:  LBB24_4: ## %else2
5179; AVX1-NEXT:    testb $4, %al
5180; AVX1-NEXT:    jne LBB24_5
5181; AVX1-NEXT:  LBB24_6: ## %else5
5182; AVX1-NEXT:    testb $8, %al
5183; AVX1-NEXT:    jne LBB24_7
5184; AVX1-NEXT:  LBB24_8: ## %else8
5185; AVX1-NEXT:    testb $16, %al
5186; AVX1-NEXT:    jne LBB24_9
5187; AVX1-NEXT:  LBB24_10: ## %else11
5188; AVX1-NEXT:    testb $32, %al
5189; AVX1-NEXT:    jne LBB24_11
5190; AVX1-NEXT:  LBB24_12: ## %else14
5191; AVX1-NEXT:    testb $64, %al
5192; AVX1-NEXT:    jne LBB24_13
5193; AVX1-NEXT:  LBB24_14: ## %else17
5194; AVX1-NEXT:    testb %al, %al
5195; AVX1-NEXT:    js LBB24_15
5196; AVX1-NEXT:  LBB24_16: ## %else20
5197; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
5198; AVX1-NEXT:    jne LBB24_17
5199; AVX1-NEXT:  LBB24_18: ## %else23
5200; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
5201; AVX1-NEXT:    jne LBB24_19
5202; AVX1-NEXT:  LBB24_20: ## %else26
5203; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
5204; AVX1-NEXT:    jne LBB24_21
5205; AVX1-NEXT:  LBB24_22: ## %else29
5206; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
5207; AVX1-NEXT:    jne LBB24_23
5208; AVX1-NEXT:  LBB24_24: ## %else32
5209; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
5210; AVX1-NEXT:    jne LBB24_25
5211; AVX1-NEXT:  LBB24_26: ## %else35
5212; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
5213; AVX1-NEXT:    jne LBB24_27
5214; AVX1-NEXT:  LBB24_28: ## %else38
5215; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
5216; AVX1-NEXT:    jne LBB24_29
5217; AVX1-NEXT:  LBB24_30: ## %else41
5218; AVX1-NEXT:    testw %ax, %ax
5219; AVX1-NEXT:    js LBB24_31
5220; AVX1-NEXT:  LBB24_32: ## %else44
5221; AVX1-NEXT:    testl $65536, %eax ## imm = 0x10000
5222; AVX1-NEXT:    jne LBB24_33
5223; AVX1-NEXT:  LBB24_34: ## %else47
5224; AVX1-NEXT:    testl $131072, %eax ## imm = 0x20000
5225; AVX1-NEXT:    jne LBB24_35
5226; AVX1-NEXT:  LBB24_36: ## %else50
5227; AVX1-NEXT:    testl $262144, %eax ## imm = 0x40000
5228; AVX1-NEXT:    jne LBB24_37
5229; AVX1-NEXT:  LBB24_38: ## %else53
5230; AVX1-NEXT:    testl $524288, %eax ## imm = 0x80000
5231; AVX1-NEXT:    jne LBB24_39
5232; AVX1-NEXT:  LBB24_40: ## %else56
5233; AVX1-NEXT:    testl $1048576, %eax ## imm = 0x100000
5234; AVX1-NEXT:    jne LBB24_41
5235; AVX1-NEXT:  LBB24_42: ## %else59
5236; AVX1-NEXT:    testl $2097152, %eax ## imm = 0x200000
5237; AVX1-NEXT:    jne LBB24_43
5238; AVX1-NEXT:  LBB24_44: ## %else62
5239; AVX1-NEXT:    testl $4194304, %eax ## imm = 0x400000
5240; AVX1-NEXT:    jne LBB24_45
5241; AVX1-NEXT:  LBB24_46: ## %else65
5242; AVX1-NEXT:    testl $8388608, %eax ## imm = 0x800000
5243; AVX1-NEXT:    jne LBB24_47
5244; AVX1-NEXT:  LBB24_48: ## %else68
5245; AVX1-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5246; AVX1-NEXT:    jne LBB24_49
5247; AVX1-NEXT:  LBB24_50: ## %else71
5248; AVX1-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5249; AVX1-NEXT:    jne LBB24_51
5250; AVX1-NEXT:  LBB24_52: ## %else74
5251; AVX1-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5252; AVX1-NEXT:    jne LBB24_53
5253; AVX1-NEXT:  LBB24_54: ## %else77
5254; AVX1-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5255; AVX1-NEXT:    jne LBB24_55
5256; AVX1-NEXT:  LBB24_56: ## %else80
5257; AVX1-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5258; AVX1-NEXT:    jne LBB24_57
5259; AVX1-NEXT:  LBB24_58: ## %else83
5260; AVX1-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5261; AVX1-NEXT:    jne LBB24_59
5262; AVX1-NEXT:  LBB24_60: ## %else86
5263; AVX1-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5264; AVX1-NEXT:    jne LBB24_61
5265; AVX1-NEXT:  LBB24_62: ## %else89
5266; AVX1-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5267; AVX1-NEXT:    jne LBB24_63
5268; AVX1-NEXT:  LBB24_64: ## %else92
5269; AVX1-NEXT:    vmovaps %ymm1, %ymm0
5270; AVX1-NEXT:    retq
5271; AVX1-NEXT:  LBB24_1: ## %cond.load
5272; AVX1-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm0
5273; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5274; AVX1-NEXT:    testb $2, %al
5275; AVX1-NEXT:    je LBB24_4
5276; AVX1-NEXT:  LBB24_3: ## %cond.load1
5277; AVX1-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5278; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5279; AVX1-NEXT:    testb $4, %al
5280; AVX1-NEXT:    je LBB24_6
5281; AVX1-NEXT:  LBB24_5: ## %cond.load4
5282; AVX1-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5283; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5284; AVX1-NEXT:    testb $8, %al
5285; AVX1-NEXT:    je LBB24_8
5286; AVX1-NEXT:  LBB24_7: ## %cond.load7
5287; AVX1-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5288; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5289; AVX1-NEXT:    testb $16, %al
5290; AVX1-NEXT:    je LBB24_10
5291; AVX1-NEXT:  LBB24_9: ## %cond.load10
5292; AVX1-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5293; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5294; AVX1-NEXT:    testb $32, %al
5295; AVX1-NEXT:    je LBB24_12
5296; AVX1-NEXT:  LBB24_11: ## %cond.load13
5297; AVX1-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5298; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5299; AVX1-NEXT:    testb $64, %al
5300; AVX1-NEXT:    je LBB24_14
5301; AVX1-NEXT:  LBB24_13: ## %cond.load16
5302; AVX1-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5303; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5304; AVX1-NEXT:    testb %al, %al
5305; AVX1-NEXT:    jns LBB24_16
5306; AVX1-NEXT:  LBB24_15: ## %cond.load19
5307; AVX1-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5308; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5309; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
5310; AVX1-NEXT:    je LBB24_18
5311; AVX1-NEXT:  LBB24_17: ## %cond.load22
5312; AVX1-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5313; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5314; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
5315; AVX1-NEXT:    je LBB24_20
5316; AVX1-NEXT:  LBB24_19: ## %cond.load25
5317; AVX1-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5318; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5319; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
5320; AVX1-NEXT:    je LBB24_22
5321; AVX1-NEXT:  LBB24_21: ## %cond.load28
5322; AVX1-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5323; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5324; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
5325; AVX1-NEXT:    je LBB24_24
5326; AVX1-NEXT:  LBB24_23: ## %cond.load31
5327; AVX1-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5328; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5329; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
5330; AVX1-NEXT:    je LBB24_26
5331; AVX1-NEXT:  LBB24_25: ## %cond.load34
5332; AVX1-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5333; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5334; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
5335; AVX1-NEXT:    je LBB24_28
5336; AVX1-NEXT:  LBB24_27: ## %cond.load37
5337; AVX1-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5338; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5339; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
5340; AVX1-NEXT:    je LBB24_30
5341; AVX1-NEXT:  LBB24_29: ## %cond.load40
5342; AVX1-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5343; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5344; AVX1-NEXT:    testw %ax, %ax
5345; AVX1-NEXT:    jns LBB24_32
5346; AVX1-NEXT:  LBB24_31: ## %cond.load43
5347; AVX1-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5348; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5349; AVX1-NEXT:    testl $65536, %eax ## imm = 0x10000
5350; AVX1-NEXT:    je LBB24_34
5351; AVX1-NEXT:  LBB24_33: ## %cond.load46
5352; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5353; AVX1-NEXT:    vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5354; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5355; AVX1-NEXT:    testl $131072, %eax ## imm = 0x20000
5356; AVX1-NEXT:    je LBB24_36
5357; AVX1-NEXT:  LBB24_35: ## %cond.load49
5358; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5359; AVX1-NEXT:    vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5360; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5361; AVX1-NEXT:    testl $262144, %eax ## imm = 0x40000
5362; AVX1-NEXT:    je LBB24_38
5363; AVX1-NEXT:  LBB24_37: ## %cond.load52
5364; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5365; AVX1-NEXT:    vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5366; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5367; AVX1-NEXT:    testl $524288, %eax ## imm = 0x80000
5368; AVX1-NEXT:    je LBB24_40
5369; AVX1-NEXT:  LBB24_39: ## %cond.load55
5370; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5371; AVX1-NEXT:    vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5372; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5373; AVX1-NEXT:    testl $1048576, %eax ## imm = 0x100000
5374; AVX1-NEXT:    je LBB24_42
5375; AVX1-NEXT:  LBB24_41: ## %cond.load58
5376; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5377; AVX1-NEXT:    vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5378; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5379; AVX1-NEXT:    testl $2097152, %eax ## imm = 0x200000
5380; AVX1-NEXT:    je LBB24_44
5381; AVX1-NEXT:  LBB24_43: ## %cond.load61
5382; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5383; AVX1-NEXT:    vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5384; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5385; AVX1-NEXT:    testl $4194304, %eax ## imm = 0x400000
5386; AVX1-NEXT:    je LBB24_46
5387; AVX1-NEXT:  LBB24_45: ## %cond.load64
5388; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5389; AVX1-NEXT:    vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5390; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5391; AVX1-NEXT:    testl $8388608, %eax ## imm = 0x800000
5392; AVX1-NEXT:    je LBB24_48
5393; AVX1-NEXT:  LBB24_47: ## %cond.load67
5394; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5395; AVX1-NEXT:    vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5396; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5397; AVX1-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5398; AVX1-NEXT:    je LBB24_50
5399; AVX1-NEXT:  LBB24_49: ## %cond.load70
5400; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5401; AVX1-NEXT:    vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5402; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5403; AVX1-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5404; AVX1-NEXT:    je LBB24_52
5405; AVX1-NEXT:  LBB24_51: ## %cond.load73
5406; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5407; AVX1-NEXT:    vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5408; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5409; AVX1-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5410; AVX1-NEXT:    je LBB24_54
5411; AVX1-NEXT:  LBB24_53: ## %cond.load76
5412; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5413; AVX1-NEXT:    vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5414; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5415; AVX1-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5416; AVX1-NEXT:    je LBB24_56
5417; AVX1-NEXT:  LBB24_55: ## %cond.load79
5418; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5419; AVX1-NEXT:    vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5420; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5421; AVX1-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5422; AVX1-NEXT:    je LBB24_58
5423; AVX1-NEXT:  LBB24_57: ## %cond.load82
5424; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5425; AVX1-NEXT:    vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5426; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5427; AVX1-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5428; AVX1-NEXT:    je LBB24_60
5429; AVX1-NEXT:  LBB24_59: ## %cond.load85
5430; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5431; AVX1-NEXT:    vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5432; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5433; AVX1-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5434; AVX1-NEXT:    je LBB24_62
5435; AVX1-NEXT:  LBB24_61: ## %cond.load88
5436; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5437; AVX1-NEXT:    vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5438; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5439; AVX1-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5440; AVX1-NEXT:    je LBB24_64
5441; AVX1-NEXT:  LBB24_63: ## %cond.load91
5442; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
5443; AVX1-NEXT:    vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5444; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
5445; AVX1-NEXT:    vmovaps %ymm1, %ymm0
5446; AVX1-NEXT:    retq
5447;
5448; AVX2-LABEL: load_v32i8_v32i8:
5449; AVX2:       ## %bb.0:
5450; AVX2-NEXT:    vpmovmskb %ymm0, %eax
5451; AVX2-NEXT:    testb $1, %al
5452; AVX2-NEXT:    jne LBB24_1
5453; AVX2-NEXT:  ## %bb.2: ## %else
5454; AVX2-NEXT:    testb $2, %al
5455; AVX2-NEXT:    jne LBB24_3
5456; AVX2-NEXT:  LBB24_4: ## %else2
5457; AVX2-NEXT:    testb $4, %al
5458; AVX2-NEXT:    jne LBB24_5
5459; AVX2-NEXT:  LBB24_6: ## %else5
5460; AVX2-NEXT:    testb $8, %al
5461; AVX2-NEXT:    jne LBB24_7
5462; AVX2-NEXT:  LBB24_8: ## %else8
5463; AVX2-NEXT:    testb $16, %al
5464; AVX2-NEXT:    jne LBB24_9
5465; AVX2-NEXT:  LBB24_10: ## %else11
5466; AVX2-NEXT:    testb $32, %al
5467; AVX2-NEXT:    jne LBB24_11
5468; AVX2-NEXT:  LBB24_12: ## %else14
5469; AVX2-NEXT:    testb $64, %al
5470; AVX2-NEXT:    jne LBB24_13
5471; AVX2-NEXT:  LBB24_14: ## %else17
5472; AVX2-NEXT:    testb %al, %al
5473; AVX2-NEXT:    js LBB24_15
5474; AVX2-NEXT:  LBB24_16: ## %else20
5475; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
5476; AVX2-NEXT:    jne LBB24_17
5477; AVX2-NEXT:  LBB24_18: ## %else23
5478; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
5479; AVX2-NEXT:    jne LBB24_19
5480; AVX2-NEXT:  LBB24_20: ## %else26
5481; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
5482; AVX2-NEXT:    jne LBB24_21
5483; AVX2-NEXT:  LBB24_22: ## %else29
5484; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
5485; AVX2-NEXT:    jne LBB24_23
5486; AVX2-NEXT:  LBB24_24: ## %else32
5487; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
5488; AVX2-NEXT:    jne LBB24_25
5489; AVX2-NEXT:  LBB24_26: ## %else35
5490; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
5491; AVX2-NEXT:    jne LBB24_27
5492; AVX2-NEXT:  LBB24_28: ## %else38
5493; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
5494; AVX2-NEXT:    jne LBB24_29
5495; AVX2-NEXT:  LBB24_30: ## %else41
5496; AVX2-NEXT:    testw %ax, %ax
5497; AVX2-NEXT:    js LBB24_31
5498; AVX2-NEXT:  LBB24_32: ## %else44
5499; AVX2-NEXT:    testl $65536, %eax ## imm = 0x10000
5500; AVX2-NEXT:    jne LBB24_33
5501; AVX2-NEXT:  LBB24_34: ## %else47
5502; AVX2-NEXT:    testl $131072, %eax ## imm = 0x20000
5503; AVX2-NEXT:    jne LBB24_35
5504; AVX2-NEXT:  LBB24_36: ## %else50
5505; AVX2-NEXT:    testl $262144, %eax ## imm = 0x40000
5506; AVX2-NEXT:    jne LBB24_37
5507; AVX2-NEXT:  LBB24_38: ## %else53
5508; AVX2-NEXT:    testl $524288, %eax ## imm = 0x80000
5509; AVX2-NEXT:    jne LBB24_39
5510; AVX2-NEXT:  LBB24_40: ## %else56
5511; AVX2-NEXT:    testl $1048576, %eax ## imm = 0x100000
5512; AVX2-NEXT:    jne LBB24_41
5513; AVX2-NEXT:  LBB24_42: ## %else59
5514; AVX2-NEXT:    testl $2097152, %eax ## imm = 0x200000
5515; AVX2-NEXT:    jne LBB24_43
5516; AVX2-NEXT:  LBB24_44: ## %else62
5517; AVX2-NEXT:    testl $4194304, %eax ## imm = 0x400000
5518; AVX2-NEXT:    jne LBB24_45
5519; AVX2-NEXT:  LBB24_46: ## %else65
5520; AVX2-NEXT:    testl $8388608, %eax ## imm = 0x800000
5521; AVX2-NEXT:    jne LBB24_47
5522; AVX2-NEXT:  LBB24_48: ## %else68
5523; AVX2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5524; AVX2-NEXT:    jne LBB24_49
5525; AVX2-NEXT:  LBB24_50: ## %else71
5526; AVX2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5527; AVX2-NEXT:    jne LBB24_51
5528; AVX2-NEXT:  LBB24_52: ## %else74
5529; AVX2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5530; AVX2-NEXT:    jne LBB24_53
5531; AVX2-NEXT:  LBB24_54: ## %else77
5532; AVX2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5533; AVX2-NEXT:    jne LBB24_55
5534; AVX2-NEXT:  LBB24_56: ## %else80
5535; AVX2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5536; AVX2-NEXT:    jne LBB24_57
5537; AVX2-NEXT:  LBB24_58: ## %else83
5538; AVX2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5539; AVX2-NEXT:    jne LBB24_59
5540; AVX2-NEXT:  LBB24_60: ## %else86
5541; AVX2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5542; AVX2-NEXT:    jne LBB24_61
5543; AVX2-NEXT:  LBB24_62: ## %else89
5544; AVX2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5545; AVX2-NEXT:    jne LBB24_63
5546; AVX2-NEXT:  LBB24_64: ## %else92
5547; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
5548; AVX2-NEXT:    retq
5549; AVX2-NEXT:  LBB24_1: ## %cond.load
5550; AVX2-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm0
5551; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5552; AVX2-NEXT:    testb $2, %al
5553; AVX2-NEXT:    je LBB24_4
5554; AVX2-NEXT:  LBB24_3: ## %cond.load1
5555; AVX2-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5556; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5557; AVX2-NEXT:    testb $4, %al
5558; AVX2-NEXT:    je LBB24_6
5559; AVX2-NEXT:  LBB24_5: ## %cond.load4
5560; AVX2-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5561; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5562; AVX2-NEXT:    testb $8, %al
5563; AVX2-NEXT:    je LBB24_8
5564; AVX2-NEXT:  LBB24_7: ## %cond.load7
5565; AVX2-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5566; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5567; AVX2-NEXT:    testb $16, %al
5568; AVX2-NEXT:    je LBB24_10
5569; AVX2-NEXT:  LBB24_9: ## %cond.load10
5570; AVX2-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5571; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5572; AVX2-NEXT:    testb $32, %al
5573; AVX2-NEXT:    je LBB24_12
5574; AVX2-NEXT:  LBB24_11: ## %cond.load13
5575; AVX2-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5576; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5577; AVX2-NEXT:    testb $64, %al
5578; AVX2-NEXT:    je LBB24_14
5579; AVX2-NEXT:  LBB24_13: ## %cond.load16
5580; AVX2-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5581; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5582; AVX2-NEXT:    testb %al, %al
5583; AVX2-NEXT:    jns LBB24_16
5584; AVX2-NEXT:  LBB24_15: ## %cond.load19
5585; AVX2-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5586; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5587; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
5588; AVX2-NEXT:    je LBB24_18
5589; AVX2-NEXT:  LBB24_17: ## %cond.load22
5590; AVX2-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5591; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5592; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
5593; AVX2-NEXT:    je LBB24_20
5594; AVX2-NEXT:  LBB24_19: ## %cond.load25
5595; AVX2-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5596; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5597; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
5598; AVX2-NEXT:    je LBB24_22
5599; AVX2-NEXT:  LBB24_21: ## %cond.load28
5600; AVX2-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5601; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5602; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
5603; AVX2-NEXT:    je LBB24_24
5604; AVX2-NEXT:  LBB24_23: ## %cond.load31
5605; AVX2-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5606; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5607; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
5608; AVX2-NEXT:    je LBB24_26
5609; AVX2-NEXT:  LBB24_25: ## %cond.load34
5610; AVX2-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5611; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5612; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
5613; AVX2-NEXT:    je LBB24_28
5614; AVX2-NEXT:  LBB24_27: ## %cond.load37
5615; AVX2-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5616; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5617; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
5618; AVX2-NEXT:    je LBB24_30
5619; AVX2-NEXT:  LBB24_29: ## %cond.load40
5620; AVX2-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5621; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5622; AVX2-NEXT:    testw %ax, %ax
5623; AVX2-NEXT:    jns LBB24_32
5624; AVX2-NEXT:  LBB24_31: ## %cond.load43
5625; AVX2-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5626; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5627; AVX2-NEXT:    testl $65536, %eax ## imm = 0x10000
5628; AVX2-NEXT:    je LBB24_34
5629; AVX2-NEXT:  LBB24_33: ## %cond.load46
5630; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5631; AVX2-NEXT:    vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5632; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5633; AVX2-NEXT:    testl $131072, %eax ## imm = 0x20000
5634; AVX2-NEXT:    je LBB24_36
5635; AVX2-NEXT:  LBB24_35: ## %cond.load49
5636; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5637; AVX2-NEXT:    vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5638; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5639; AVX2-NEXT:    testl $262144, %eax ## imm = 0x40000
5640; AVX2-NEXT:    je LBB24_38
5641; AVX2-NEXT:  LBB24_37: ## %cond.load52
5642; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5643; AVX2-NEXT:    vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5644; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5645; AVX2-NEXT:    testl $524288, %eax ## imm = 0x80000
5646; AVX2-NEXT:    je LBB24_40
5647; AVX2-NEXT:  LBB24_39: ## %cond.load55
5648; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5649; AVX2-NEXT:    vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5650; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5651; AVX2-NEXT:    testl $1048576, %eax ## imm = 0x100000
5652; AVX2-NEXT:    je LBB24_42
5653; AVX2-NEXT:  LBB24_41: ## %cond.load58
5654; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5655; AVX2-NEXT:    vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5656; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5657; AVX2-NEXT:    testl $2097152, %eax ## imm = 0x200000
5658; AVX2-NEXT:    je LBB24_44
5659; AVX2-NEXT:  LBB24_43: ## %cond.load61
5660; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5661; AVX2-NEXT:    vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5662; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5663; AVX2-NEXT:    testl $4194304, %eax ## imm = 0x400000
5664; AVX2-NEXT:    je LBB24_46
5665; AVX2-NEXT:  LBB24_45: ## %cond.load64
5666; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5667; AVX2-NEXT:    vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5668; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5669; AVX2-NEXT:    testl $8388608, %eax ## imm = 0x800000
5670; AVX2-NEXT:    je LBB24_48
5671; AVX2-NEXT:  LBB24_47: ## %cond.load67
5672; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5673; AVX2-NEXT:    vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5674; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5675; AVX2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5676; AVX2-NEXT:    je LBB24_50
5677; AVX2-NEXT:  LBB24_49: ## %cond.load70
5678; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5679; AVX2-NEXT:    vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5680; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5681; AVX2-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5682; AVX2-NEXT:    je LBB24_52
5683; AVX2-NEXT:  LBB24_51: ## %cond.load73
5684; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5685; AVX2-NEXT:    vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5686; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5687; AVX2-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5688; AVX2-NEXT:    je LBB24_54
5689; AVX2-NEXT:  LBB24_53: ## %cond.load76
5690; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5691; AVX2-NEXT:    vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5692; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5693; AVX2-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5694; AVX2-NEXT:    je LBB24_56
5695; AVX2-NEXT:  LBB24_55: ## %cond.load79
5696; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5697; AVX2-NEXT:    vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5698; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5699; AVX2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5700; AVX2-NEXT:    je LBB24_58
5701; AVX2-NEXT:  LBB24_57: ## %cond.load82
5702; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5703; AVX2-NEXT:    vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5704; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5705; AVX2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5706; AVX2-NEXT:    je LBB24_60
5707; AVX2-NEXT:  LBB24_59: ## %cond.load85
5708; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5709; AVX2-NEXT:    vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5710; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5711; AVX2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5712; AVX2-NEXT:    je LBB24_62
5713; AVX2-NEXT:  LBB24_61: ## %cond.load88
5714; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5715; AVX2-NEXT:    vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5716; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5717; AVX2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5718; AVX2-NEXT:    je LBB24_64
5719; AVX2-NEXT:  LBB24_63: ## %cond.load91
5720; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
5721; AVX2-NEXT:    vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5722; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5723; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
5724; AVX2-NEXT:    retq
5725;
5726; AVX512F-LABEL: load_v32i8_v32i8:
5727; AVX512F:       ## %bb.0:
5728; AVX512F-NEXT:    vpmovmskb %ymm0, %eax
5729; AVX512F-NEXT:    testb $1, %al
5730; AVX512F-NEXT:    jne LBB24_1
5731; AVX512F-NEXT:  ## %bb.2: ## %else
5732; AVX512F-NEXT:    testb $2, %al
5733; AVX512F-NEXT:    jne LBB24_3
5734; AVX512F-NEXT:  LBB24_4: ## %else2
5735; AVX512F-NEXT:    testb $4, %al
5736; AVX512F-NEXT:    jne LBB24_5
5737; AVX512F-NEXT:  LBB24_6: ## %else5
5738; AVX512F-NEXT:    testb $8, %al
5739; AVX512F-NEXT:    jne LBB24_7
5740; AVX512F-NEXT:  LBB24_8: ## %else8
5741; AVX512F-NEXT:    testb $16, %al
5742; AVX512F-NEXT:    jne LBB24_9
5743; AVX512F-NEXT:  LBB24_10: ## %else11
5744; AVX512F-NEXT:    testb $32, %al
5745; AVX512F-NEXT:    jne LBB24_11
5746; AVX512F-NEXT:  LBB24_12: ## %else14
5747; AVX512F-NEXT:    testb $64, %al
5748; AVX512F-NEXT:    jne LBB24_13
5749; AVX512F-NEXT:  LBB24_14: ## %else17
5750; AVX512F-NEXT:    testb %al, %al
5751; AVX512F-NEXT:    js LBB24_15
5752; AVX512F-NEXT:  LBB24_16: ## %else20
5753; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
5754; AVX512F-NEXT:    jne LBB24_17
5755; AVX512F-NEXT:  LBB24_18: ## %else23
5756; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
5757; AVX512F-NEXT:    jne LBB24_19
5758; AVX512F-NEXT:  LBB24_20: ## %else26
5759; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
5760; AVX512F-NEXT:    jne LBB24_21
5761; AVX512F-NEXT:  LBB24_22: ## %else29
5762; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
5763; AVX512F-NEXT:    jne LBB24_23
5764; AVX512F-NEXT:  LBB24_24: ## %else32
5765; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
5766; AVX512F-NEXT:    jne LBB24_25
5767; AVX512F-NEXT:  LBB24_26: ## %else35
5768; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
5769; AVX512F-NEXT:    jne LBB24_27
5770; AVX512F-NEXT:  LBB24_28: ## %else38
5771; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
5772; AVX512F-NEXT:    jne LBB24_29
5773; AVX512F-NEXT:  LBB24_30: ## %else41
5774; AVX512F-NEXT:    testw %ax, %ax
5775; AVX512F-NEXT:    js LBB24_31
5776; AVX512F-NEXT:  LBB24_32: ## %else44
5777; AVX512F-NEXT:    testl $65536, %eax ## imm = 0x10000
5778; AVX512F-NEXT:    jne LBB24_33
5779; AVX512F-NEXT:  LBB24_34: ## %else47
5780; AVX512F-NEXT:    testl $131072, %eax ## imm = 0x20000
5781; AVX512F-NEXT:    jne LBB24_35
5782; AVX512F-NEXT:  LBB24_36: ## %else50
5783; AVX512F-NEXT:    testl $262144, %eax ## imm = 0x40000
5784; AVX512F-NEXT:    jne LBB24_37
5785; AVX512F-NEXT:  LBB24_38: ## %else53
5786; AVX512F-NEXT:    testl $524288, %eax ## imm = 0x80000
5787; AVX512F-NEXT:    jne LBB24_39
5788; AVX512F-NEXT:  LBB24_40: ## %else56
5789; AVX512F-NEXT:    testl $1048576, %eax ## imm = 0x100000
5790; AVX512F-NEXT:    jne LBB24_41
5791; AVX512F-NEXT:  LBB24_42: ## %else59
5792; AVX512F-NEXT:    testl $2097152, %eax ## imm = 0x200000
5793; AVX512F-NEXT:    jne LBB24_43
5794; AVX512F-NEXT:  LBB24_44: ## %else62
5795; AVX512F-NEXT:    testl $4194304, %eax ## imm = 0x400000
5796; AVX512F-NEXT:    jne LBB24_45
5797; AVX512F-NEXT:  LBB24_46: ## %else65
5798; AVX512F-NEXT:    testl $8388608, %eax ## imm = 0x800000
5799; AVX512F-NEXT:    jne LBB24_47
5800; AVX512F-NEXT:  LBB24_48: ## %else68
5801; AVX512F-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5802; AVX512F-NEXT:    jne LBB24_49
5803; AVX512F-NEXT:  LBB24_50: ## %else71
5804; AVX512F-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5805; AVX512F-NEXT:    jne LBB24_51
5806; AVX512F-NEXT:  LBB24_52: ## %else74
5807; AVX512F-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5808; AVX512F-NEXT:    jne LBB24_53
5809; AVX512F-NEXT:  LBB24_54: ## %else77
5810; AVX512F-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5811; AVX512F-NEXT:    jne LBB24_55
5812; AVX512F-NEXT:  LBB24_56: ## %else80
5813; AVX512F-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5814; AVX512F-NEXT:    jne LBB24_57
5815; AVX512F-NEXT:  LBB24_58: ## %else83
5816; AVX512F-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5817; AVX512F-NEXT:    jne LBB24_59
5818; AVX512F-NEXT:  LBB24_60: ## %else86
5819; AVX512F-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5820; AVX512F-NEXT:    jne LBB24_61
5821; AVX512F-NEXT:  LBB24_62: ## %else89
5822; AVX512F-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5823; AVX512F-NEXT:    jne LBB24_63
5824; AVX512F-NEXT:  LBB24_64: ## %else92
5825; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
5826; AVX512F-NEXT:    retq
5827; AVX512F-NEXT:  LBB24_1: ## %cond.load
5828; AVX512F-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm0
5829; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5830; AVX512F-NEXT:    testb $2, %al
5831; AVX512F-NEXT:    je LBB24_4
5832; AVX512F-NEXT:  LBB24_3: ## %cond.load1
5833; AVX512F-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5834; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5835; AVX512F-NEXT:    testb $4, %al
5836; AVX512F-NEXT:    je LBB24_6
5837; AVX512F-NEXT:  LBB24_5: ## %cond.load4
5838; AVX512F-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5839; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5840; AVX512F-NEXT:    testb $8, %al
5841; AVX512F-NEXT:    je LBB24_8
5842; AVX512F-NEXT:  LBB24_7: ## %cond.load7
5843; AVX512F-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5844; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5845; AVX512F-NEXT:    testb $16, %al
5846; AVX512F-NEXT:    je LBB24_10
5847; AVX512F-NEXT:  LBB24_9: ## %cond.load10
5848; AVX512F-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5849; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5850; AVX512F-NEXT:    testb $32, %al
5851; AVX512F-NEXT:    je LBB24_12
5852; AVX512F-NEXT:  LBB24_11: ## %cond.load13
5853; AVX512F-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5854; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5855; AVX512F-NEXT:    testb $64, %al
5856; AVX512F-NEXT:    je LBB24_14
5857; AVX512F-NEXT:  LBB24_13: ## %cond.load16
5858; AVX512F-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5859; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5860; AVX512F-NEXT:    testb %al, %al
5861; AVX512F-NEXT:    jns LBB24_16
5862; AVX512F-NEXT:  LBB24_15: ## %cond.load19
5863; AVX512F-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5864; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5865; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
5866; AVX512F-NEXT:    je LBB24_18
5867; AVX512F-NEXT:  LBB24_17: ## %cond.load22
5868; AVX512F-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5869; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5870; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
5871; AVX512F-NEXT:    je LBB24_20
5872; AVX512F-NEXT:  LBB24_19: ## %cond.load25
5873; AVX512F-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5874; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5875; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
5876; AVX512F-NEXT:    je LBB24_22
5877; AVX512F-NEXT:  LBB24_21: ## %cond.load28
5878; AVX512F-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5879; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5880; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
5881; AVX512F-NEXT:    je LBB24_24
5882; AVX512F-NEXT:  LBB24_23: ## %cond.load31
5883; AVX512F-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5884; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5885; AVX512F-NEXT:    testl $4096, %eax ## imm = 0x1000
5886; AVX512F-NEXT:    je LBB24_26
5887; AVX512F-NEXT:  LBB24_25: ## %cond.load34
5888; AVX512F-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5889; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5890; AVX512F-NEXT:    testl $8192, %eax ## imm = 0x2000
5891; AVX512F-NEXT:    je LBB24_28
5892; AVX512F-NEXT:  LBB24_27: ## %cond.load37
5893; AVX512F-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5894; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5895; AVX512F-NEXT:    testl $16384, %eax ## imm = 0x4000
5896; AVX512F-NEXT:    je LBB24_30
5897; AVX512F-NEXT:  LBB24_29: ## %cond.load40
5898; AVX512F-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5899; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5900; AVX512F-NEXT:    testw %ax, %ax
5901; AVX512F-NEXT:    jns LBB24_32
5902; AVX512F-NEXT:  LBB24_31: ## %cond.load43
5903; AVX512F-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5904; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5905; AVX512F-NEXT:    testl $65536, %eax ## imm = 0x10000
5906; AVX512F-NEXT:    je LBB24_34
5907; AVX512F-NEXT:  LBB24_33: ## %cond.load46
5908; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5909; AVX512F-NEXT:    vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5910; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5911; AVX512F-NEXT:    testl $131072, %eax ## imm = 0x20000
5912; AVX512F-NEXT:    je LBB24_36
5913; AVX512F-NEXT:  LBB24_35: ## %cond.load49
5914; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5915; AVX512F-NEXT:    vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5916; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5917; AVX512F-NEXT:    testl $262144, %eax ## imm = 0x40000
5918; AVX512F-NEXT:    je LBB24_38
5919; AVX512F-NEXT:  LBB24_37: ## %cond.load52
5920; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5921; AVX512F-NEXT:    vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5922; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5923; AVX512F-NEXT:    testl $524288, %eax ## imm = 0x80000
5924; AVX512F-NEXT:    je LBB24_40
5925; AVX512F-NEXT:  LBB24_39: ## %cond.load55
5926; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5927; AVX512F-NEXT:    vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5928; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5929; AVX512F-NEXT:    testl $1048576, %eax ## imm = 0x100000
5930; AVX512F-NEXT:    je LBB24_42
5931; AVX512F-NEXT:  LBB24_41: ## %cond.load58
5932; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5933; AVX512F-NEXT:    vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5934; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5935; AVX512F-NEXT:    testl $2097152, %eax ## imm = 0x200000
5936; AVX512F-NEXT:    je LBB24_44
5937; AVX512F-NEXT:  LBB24_43: ## %cond.load61
5938; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5939; AVX512F-NEXT:    vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5940; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5941; AVX512F-NEXT:    testl $4194304, %eax ## imm = 0x400000
5942; AVX512F-NEXT:    je LBB24_46
5943; AVX512F-NEXT:  LBB24_45: ## %cond.load64
5944; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5945; AVX512F-NEXT:    vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5946; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5947; AVX512F-NEXT:    testl $8388608, %eax ## imm = 0x800000
5948; AVX512F-NEXT:    je LBB24_48
5949; AVX512F-NEXT:  LBB24_47: ## %cond.load67
5950; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5951; AVX512F-NEXT:    vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5952; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5953; AVX512F-NEXT:    testl $16777216, %eax ## imm = 0x1000000
5954; AVX512F-NEXT:    je LBB24_50
5955; AVX512F-NEXT:  LBB24_49: ## %cond.load70
5956; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5957; AVX512F-NEXT:    vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5958; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5959; AVX512F-NEXT:    testl $33554432, %eax ## imm = 0x2000000
5960; AVX512F-NEXT:    je LBB24_52
5961; AVX512F-NEXT:  LBB24_51: ## %cond.load73
5962; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5963; AVX512F-NEXT:    vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5964; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5965; AVX512F-NEXT:    testl $67108864, %eax ## imm = 0x4000000
5966; AVX512F-NEXT:    je LBB24_54
5967; AVX512F-NEXT:  LBB24_53: ## %cond.load76
5968; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5969; AVX512F-NEXT:    vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5970; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5971; AVX512F-NEXT:    testl $134217728, %eax ## imm = 0x8000000
5972; AVX512F-NEXT:    je LBB24_56
5973; AVX512F-NEXT:  LBB24_55: ## %cond.load79
5974; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5975; AVX512F-NEXT:    vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5976; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5977; AVX512F-NEXT:    testl $268435456, %eax ## imm = 0x10000000
5978; AVX512F-NEXT:    je LBB24_58
5979; AVX512F-NEXT:  LBB24_57: ## %cond.load82
5980; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5981; AVX512F-NEXT:    vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5982; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5983; AVX512F-NEXT:    testl $536870912, %eax ## imm = 0x20000000
5984; AVX512F-NEXT:    je LBB24_60
5985; AVX512F-NEXT:  LBB24_59: ## %cond.load85
5986; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5987; AVX512F-NEXT:    vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5988; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5989; AVX512F-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
5990; AVX512F-NEXT:    je LBB24_62
5991; AVX512F-NEXT:  LBB24_61: ## %cond.load88
5992; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5993; AVX512F-NEXT:    vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5994; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
5995; AVX512F-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
5996; AVX512F-NEXT:    je LBB24_64
5997; AVX512F-NEXT:  LBB24_63: ## %cond.load91
5998; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
5999; AVX512F-NEXT:    vpinsrb $15, 31(%rdi), %xmm0, %xmm0
6000; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6001; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
6002; AVX512F-NEXT:    retq
6003;
6004; AVX512VLDQ-LABEL: load_v32i8_v32i8:
6005; AVX512VLDQ:       ## %bb.0:
6006; AVX512VLDQ-NEXT:    vpmovmskb %ymm0, %eax
6007; AVX512VLDQ-NEXT:    testb $1, %al
6008; AVX512VLDQ-NEXT:    jne LBB24_1
6009; AVX512VLDQ-NEXT:  ## %bb.2: ## %else
6010; AVX512VLDQ-NEXT:    testb $2, %al
6011; AVX512VLDQ-NEXT:    jne LBB24_3
6012; AVX512VLDQ-NEXT:  LBB24_4: ## %else2
6013; AVX512VLDQ-NEXT:    testb $4, %al
6014; AVX512VLDQ-NEXT:    jne LBB24_5
6015; AVX512VLDQ-NEXT:  LBB24_6: ## %else5
6016; AVX512VLDQ-NEXT:    testb $8, %al
6017; AVX512VLDQ-NEXT:    jne LBB24_7
6018; AVX512VLDQ-NEXT:  LBB24_8: ## %else8
6019; AVX512VLDQ-NEXT:    testb $16, %al
6020; AVX512VLDQ-NEXT:    jne LBB24_9
6021; AVX512VLDQ-NEXT:  LBB24_10: ## %else11
6022; AVX512VLDQ-NEXT:    testb $32, %al
6023; AVX512VLDQ-NEXT:    jne LBB24_11
6024; AVX512VLDQ-NEXT:  LBB24_12: ## %else14
6025; AVX512VLDQ-NEXT:    testb $64, %al
6026; AVX512VLDQ-NEXT:    jne LBB24_13
6027; AVX512VLDQ-NEXT:  LBB24_14: ## %else17
6028; AVX512VLDQ-NEXT:    testb %al, %al
6029; AVX512VLDQ-NEXT:    js LBB24_15
6030; AVX512VLDQ-NEXT:  LBB24_16: ## %else20
6031; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
6032; AVX512VLDQ-NEXT:    jne LBB24_17
6033; AVX512VLDQ-NEXT:  LBB24_18: ## %else23
6034; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
6035; AVX512VLDQ-NEXT:    jne LBB24_19
6036; AVX512VLDQ-NEXT:  LBB24_20: ## %else26
6037; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
6038; AVX512VLDQ-NEXT:    jne LBB24_21
6039; AVX512VLDQ-NEXT:  LBB24_22: ## %else29
6040; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
6041; AVX512VLDQ-NEXT:    jne LBB24_23
6042; AVX512VLDQ-NEXT:  LBB24_24: ## %else32
6043; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
6044; AVX512VLDQ-NEXT:    jne LBB24_25
6045; AVX512VLDQ-NEXT:  LBB24_26: ## %else35
6046; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
6047; AVX512VLDQ-NEXT:    jne LBB24_27
6048; AVX512VLDQ-NEXT:  LBB24_28: ## %else38
6049; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
6050; AVX512VLDQ-NEXT:    jne LBB24_29
6051; AVX512VLDQ-NEXT:  LBB24_30: ## %else41
6052; AVX512VLDQ-NEXT:    testw %ax, %ax
6053; AVX512VLDQ-NEXT:    js LBB24_31
6054; AVX512VLDQ-NEXT:  LBB24_32: ## %else44
6055; AVX512VLDQ-NEXT:    testl $65536, %eax ## imm = 0x10000
6056; AVX512VLDQ-NEXT:    jne LBB24_33
6057; AVX512VLDQ-NEXT:  LBB24_34: ## %else47
6058; AVX512VLDQ-NEXT:    testl $131072, %eax ## imm = 0x20000
6059; AVX512VLDQ-NEXT:    jne LBB24_35
6060; AVX512VLDQ-NEXT:  LBB24_36: ## %else50
6061; AVX512VLDQ-NEXT:    testl $262144, %eax ## imm = 0x40000
6062; AVX512VLDQ-NEXT:    jne LBB24_37
6063; AVX512VLDQ-NEXT:  LBB24_38: ## %else53
6064; AVX512VLDQ-NEXT:    testl $524288, %eax ## imm = 0x80000
6065; AVX512VLDQ-NEXT:    jne LBB24_39
6066; AVX512VLDQ-NEXT:  LBB24_40: ## %else56
6067; AVX512VLDQ-NEXT:    testl $1048576, %eax ## imm = 0x100000
6068; AVX512VLDQ-NEXT:    jne LBB24_41
6069; AVX512VLDQ-NEXT:  LBB24_42: ## %else59
6070; AVX512VLDQ-NEXT:    testl $2097152, %eax ## imm = 0x200000
6071; AVX512VLDQ-NEXT:    jne LBB24_43
6072; AVX512VLDQ-NEXT:  LBB24_44: ## %else62
6073; AVX512VLDQ-NEXT:    testl $4194304, %eax ## imm = 0x400000
6074; AVX512VLDQ-NEXT:    jne LBB24_45
6075; AVX512VLDQ-NEXT:  LBB24_46: ## %else65
6076; AVX512VLDQ-NEXT:    testl $8388608, %eax ## imm = 0x800000
6077; AVX512VLDQ-NEXT:    jne LBB24_47
6078; AVX512VLDQ-NEXT:  LBB24_48: ## %else68
6079; AVX512VLDQ-NEXT:    testl $16777216, %eax ## imm = 0x1000000
6080; AVX512VLDQ-NEXT:    jne LBB24_49
6081; AVX512VLDQ-NEXT:  LBB24_50: ## %else71
6082; AVX512VLDQ-NEXT:    testl $33554432, %eax ## imm = 0x2000000
6083; AVX512VLDQ-NEXT:    jne LBB24_51
6084; AVX512VLDQ-NEXT:  LBB24_52: ## %else74
6085; AVX512VLDQ-NEXT:    testl $67108864, %eax ## imm = 0x4000000
6086; AVX512VLDQ-NEXT:    jne LBB24_53
6087; AVX512VLDQ-NEXT:  LBB24_54: ## %else77
6088; AVX512VLDQ-NEXT:    testl $134217728, %eax ## imm = 0x8000000
6089; AVX512VLDQ-NEXT:    jne LBB24_55
6090; AVX512VLDQ-NEXT:  LBB24_56: ## %else80
6091; AVX512VLDQ-NEXT:    testl $268435456, %eax ## imm = 0x10000000
6092; AVX512VLDQ-NEXT:    jne LBB24_57
6093; AVX512VLDQ-NEXT:  LBB24_58: ## %else83
6094; AVX512VLDQ-NEXT:    testl $536870912, %eax ## imm = 0x20000000
6095; AVX512VLDQ-NEXT:    jne LBB24_59
6096; AVX512VLDQ-NEXT:  LBB24_60: ## %else86
6097; AVX512VLDQ-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
6098; AVX512VLDQ-NEXT:    jne LBB24_61
6099; AVX512VLDQ-NEXT:  LBB24_62: ## %else89
6100; AVX512VLDQ-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
6101; AVX512VLDQ-NEXT:    jne LBB24_63
6102; AVX512VLDQ-NEXT:  LBB24_64: ## %else92
6103; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
6104; AVX512VLDQ-NEXT:    retq
6105; AVX512VLDQ-NEXT:  LBB24_1: ## %cond.load
6106; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm0
6107; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6108; AVX512VLDQ-NEXT:    testb $2, %al
6109; AVX512VLDQ-NEXT:    je LBB24_4
6110; AVX512VLDQ-NEXT:  LBB24_3: ## %cond.load1
6111; AVX512VLDQ-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm0
6112; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6113; AVX512VLDQ-NEXT:    testb $4, %al
6114; AVX512VLDQ-NEXT:    je LBB24_6
6115; AVX512VLDQ-NEXT:  LBB24_5: ## %cond.load4
6116; AVX512VLDQ-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm0
6117; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6118; AVX512VLDQ-NEXT:    testb $8, %al
6119; AVX512VLDQ-NEXT:    je LBB24_8
6120; AVX512VLDQ-NEXT:  LBB24_7: ## %cond.load7
6121; AVX512VLDQ-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm0
6122; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6123; AVX512VLDQ-NEXT:    testb $16, %al
6124; AVX512VLDQ-NEXT:    je LBB24_10
6125; AVX512VLDQ-NEXT:  LBB24_9: ## %cond.load10
6126; AVX512VLDQ-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm0
6127; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6128; AVX512VLDQ-NEXT:    testb $32, %al
6129; AVX512VLDQ-NEXT:    je LBB24_12
6130; AVX512VLDQ-NEXT:  LBB24_11: ## %cond.load13
6131; AVX512VLDQ-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm0
6132; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6133; AVX512VLDQ-NEXT:    testb $64, %al
6134; AVX512VLDQ-NEXT:    je LBB24_14
6135; AVX512VLDQ-NEXT:  LBB24_13: ## %cond.load16
6136; AVX512VLDQ-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm0
6137; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6138; AVX512VLDQ-NEXT:    testb %al, %al
6139; AVX512VLDQ-NEXT:    jns LBB24_16
6140; AVX512VLDQ-NEXT:  LBB24_15: ## %cond.load19
6141; AVX512VLDQ-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm0
6142; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6143; AVX512VLDQ-NEXT:    testl $256, %eax ## imm = 0x100
6144; AVX512VLDQ-NEXT:    je LBB24_18
6145; AVX512VLDQ-NEXT:  LBB24_17: ## %cond.load22
6146; AVX512VLDQ-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm0
6147; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6148; AVX512VLDQ-NEXT:    testl $512, %eax ## imm = 0x200
6149; AVX512VLDQ-NEXT:    je LBB24_20
6150; AVX512VLDQ-NEXT:  LBB24_19: ## %cond.load25
6151; AVX512VLDQ-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm0
6152; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6153; AVX512VLDQ-NEXT:    testl $1024, %eax ## imm = 0x400
6154; AVX512VLDQ-NEXT:    je LBB24_22
6155; AVX512VLDQ-NEXT:  LBB24_21: ## %cond.load28
6156; AVX512VLDQ-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm0
6157; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6158; AVX512VLDQ-NEXT:    testl $2048, %eax ## imm = 0x800
6159; AVX512VLDQ-NEXT:    je LBB24_24
6160; AVX512VLDQ-NEXT:  LBB24_23: ## %cond.load31
6161; AVX512VLDQ-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm0
6162; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6163; AVX512VLDQ-NEXT:    testl $4096, %eax ## imm = 0x1000
6164; AVX512VLDQ-NEXT:    je LBB24_26
6165; AVX512VLDQ-NEXT:  LBB24_25: ## %cond.load34
6166; AVX512VLDQ-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm0
6167; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6168; AVX512VLDQ-NEXT:    testl $8192, %eax ## imm = 0x2000
6169; AVX512VLDQ-NEXT:    je LBB24_28
6170; AVX512VLDQ-NEXT:  LBB24_27: ## %cond.load37
6171; AVX512VLDQ-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm0
6172; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6173; AVX512VLDQ-NEXT:    testl $16384, %eax ## imm = 0x4000
6174; AVX512VLDQ-NEXT:    je LBB24_30
6175; AVX512VLDQ-NEXT:  LBB24_29: ## %cond.load40
6176; AVX512VLDQ-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm0
6177; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6178; AVX512VLDQ-NEXT:    testw %ax, %ax
6179; AVX512VLDQ-NEXT:    jns LBB24_32
6180; AVX512VLDQ-NEXT:  LBB24_31: ## %cond.load43
6181; AVX512VLDQ-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm0
6182; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6183; AVX512VLDQ-NEXT:    testl $65536, %eax ## imm = 0x10000
6184; AVX512VLDQ-NEXT:    je LBB24_34
6185; AVX512VLDQ-NEXT:  LBB24_33: ## %cond.load46
6186; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6187; AVX512VLDQ-NEXT:    vpinsrb $0, 16(%rdi), %xmm0, %xmm0
6188; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6189; AVX512VLDQ-NEXT:    testl $131072, %eax ## imm = 0x20000
6190; AVX512VLDQ-NEXT:    je LBB24_36
6191; AVX512VLDQ-NEXT:  LBB24_35: ## %cond.load49
6192; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6193; AVX512VLDQ-NEXT:    vpinsrb $1, 17(%rdi), %xmm0, %xmm0
6194; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6195; AVX512VLDQ-NEXT:    testl $262144, %eax ## imm = 0x40000
6196; AVX512VLDQ-NEXT:    je LBB24_38
6197; AVX512VLDQ-NEXT:  LBB24_37: ## %cond.load52
6198; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6199; AVX512VLDQ-NEXT:    vpinsrb $2, 18(%rdi), %xmm0, %xmm0
6200; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6201; AVX512VLDQ-NEXT:    testl $524288, %eax ## imm = 0x80000
6202; AVX512VLDQ-NEXT:    je LBB24_40
6203; AVX512VLDQ-NEXT:  LBB24_39: ## %cond.load55
6204; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6205; AVX512VLDQ-NEXT:    vpinsrb $3, 19(%rdi), %xmm0, %xmm0
6206; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6207; AVX512VLDQ-NEXT:    testl $1048576, %eax ## imm = 0x100000
6208; AVX512VLDQ-NEXT:    je LBB24_42
6209; AVX512VLDQ-NEXT:  LBB24_41: ## %cond.load58
6210; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6211; AVX512VLDQ-NEXT:    vpinsrb $4, 20(%rdi), %xmm0, %xmm0
6212; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6213; AVX512VLDQ-NEXT:    testl $2097152, %eax ## imm = 0x200000
6214; AVX512VLDQ-NEXT:    je LBB24_44
6215; AVX512VLDQ-NEXT:  LBB24_43: ## %cond.load61
6216; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6217; AVX512VLDQ-NEXT:    vpinsrb $5, 21(%rdi), %xmm0, %xmm0
6218; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6219; AVX512VLDQ-NEXT:    testl $4194304, %eax ## imm = 0x400000
6220; AVX512VLDQ-NEXT:    je LBB24_46
6221; AVX512VLDQ-NEXT:  LBB24_45: ## %cond.load64
6222; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6223; AVX512VLDQ-NEXT:    vpinsrb $6, 22(%rdi), %xmm0, %xmm0
6224; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6225; AVX512VLDQ-NEXT:    testl $8388608, %eax ## imm = 0x800000
6226; AVX512VLDQ-NEXT:    je LBB24_48
6227; AVX512VLDQ-NEXT:  LBB24_47: ## %cond.load67
6228; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6229; AVX512VLDQ-NEXT:    vpinsrb $7, 23(%rdi), %xmm0, %xmm0
6230; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6231; AVX512VLDQ-NEXT:    testl $16777216, %eax ## imm = 0x1000000
6232; AVX512VLDQ-NEXT:    je LBB24_50
6233; AVX512VLDQ-NEXT:  LBB24_49: ## %cond.load70
6234; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6235; AVX512VLDQ-NEXT:    vpinsrb $8, 24(%rdi), %xmm0, %xmm0
6236; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6237; AVX512VLDQ-NEXT:    testl $33554432, %eax ## imm = 0x2000000
6238; AVX512VLDQ-NEXT:    je LBB24_52
6239; AVX512VLDQ-NEXT:  LBB24_51: ## %cond.load73
6240; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6241; AVX512VLDQ-NEXT:    vpinsrb $9, 25(%rdi), %xmm0, %xmm0
6242; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6243; AVX512VLDQ-NEXT:    testl $67108864, %eax ## imm = 0x4000000
6244; AVX512VLDQ-NEXT:    je LBB24_54
6245; AVX512VLDQ-NEXT:  LBB24_53: ## %cond.load76
6246; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6247; AVX512VLDQ-NEXT:    vpinsrb $10, 26(%rdi), %xmm0, %xmm0
6248; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6249; AVX512VLDQ-NEXT:    testl $134217728, %eax ## imm = 0x8000000
6250; AVX512VLDQ-NEXT:    je LBB24_56
6251; AVX512VLDQ-NEXT:  LBB24_55: ## %cond.load79
6252; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6253; AVX512VLDQ-NEXT:    vpinsrb $11, 27(%rdi), %xmm0, %xmm0
6254; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6255; AVX512VLDQ-NEXT:    testl $268435456, %eax ## imm = 0x10000000
6256; AVX512VLDQ-NEXT:    je LBB24_58
6257; AVX512VLDQ-NEXT:  LBB24_57: ## %cond.load82
6258; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6259; AVX512VLDQ-NEXT:    vpinsrb $12, 28(%rdi), %xmm0, %xmm0
6260; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6261; AVX512VLDQ-NEXT:    testl $536870912, %eax ## imm = 0x20000000
6262; AVX512VLDQ-NEXT:    je LBB24_60
6263; AVX512VLDQ-NEXT:  LBB24_59: ## %cond.load85
6264; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6265; AVX512VLDQ-NEXT:    vpinsrb $13, 29(%rdi), %xmm0, %xmm0
6266; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6267; AVX512VLDQ-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
6268; AVX512VLDQ-NEXT:    je LBB24_62
6269; AVX512VLDQ-NEXT:  LBB24_61: ## %cond.load88
6270; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6271; AVX512VLDQ-NEXT:    vpinsrb $14, 30(%rdi), %xmm0, %xmm0
6272; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6273; AVX512VLDQ-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
6274; AVX512VLDQ-NEXT:    je LBB24_64
6275; AVX512VLDQ-NEXT:  LBB24_63: ## %cond.load91
6276; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
6277; AVX512VLDQ-NEXT:    vpinsrb $15, 31(%rdi), %xmm0, %xmm0
6278; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
6279; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
6280; AVX512VLDQ-NEXT:    retq
6281;
6282; AVX512VLBW-LABEL: load_v32i8_v32i8:
6283; AVX512VLBW:       ## %bb.0:
6284; AVX512VLBW-NEXT:    vpmovb2m %ymm0, %k1
6285; AVX512VLBW-NEXT:    vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
6286; AVX512VLBW-NEXT:    retq
6287;
6288; X86-AVX512-LABEL: load_v32i8_v32i8:
6289; X86-AVX512:       ## %bb.0:
6290; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6291; X86-AVX512-NEXT:    vpmovb2m %ymm0, %k1
6292; X86-AVX512-NEXT:    vpblendmb (%eax), %ymm1, %ymm0 {%k1}
6293; X86-AVX512-NEXT:    retl
6294  %mask = icmp slt <32 x i8> %trigger, zeroinitializer
6295  %res = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst)
6296  ret <32 x i8> %res
6297}
6298
6299;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
6300
6301; 128-bit FP vectors are supported with AVX.
6302
; Constant mask <1,0,1,1> (bitmask 0b1101 = 13): lanes 0, 2 and 3 come from
; memory, lane 1 is preserved from %dst. SSE targets use scalar loads plus
; shuffles/blends; AVX512 targets fold the whole thing into a single masked
; vmovups with k1 = 13 (AVX512F widens to zmm since there is no VL).
define <4 x float> @mload_constmask_v4f32(ptr %addr, <4 x float> %dst) {
; SSE2-LABEL: mload_constmask_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v4f32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v4f32:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    movw $13, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f32:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $13, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f32:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $13, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4f32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $13, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
  ret <4 x float> %res
}
6361
; All-ones constant mask with an undef passthru: the masked load must be
; simplified to a plain unmasked, unaligned load (movups/vmovups) on every
; target.
define <4 x float> @mload_constmask_v4f32_all(ptr %addr) {
; SSE-LABEL: mload_constmask_v4f32_all:
; SSE:       ## %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mload_constmask_v4f32_all:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4f32_all:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovups (%eax), %xmm0
; X86-AVX512-NEXT:    retl
  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
  ret <4 x float> %res
}
6381
; Constant mask <0,1>: only the high f64 lane is loaded, so every target can
; lower this to a single movhps/vmovhps that merges mem into the upper half
; of %dst — no mask register needed.
define <2 x double> @mload_constmask_v2f64(ptr %addr, <2 x double> %dst) {
; SSE-LABEL: mload_constmask_v2f64:
; SSE:       ## %bb.0:
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: mload_constmask_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v2f64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512-NEXT:    retl
  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
  ret <2 x double> %res
}
6401
6402; 128-bit integer vectors are supported with AVX2.
6403
; Constant mask <0,1,1,1> (bitmask 0b1110 = 14): lane 0 is kept from %dst,
; lanes 1-3 load from memory. SSE4.2 uses three pinsrd inserts; AVX1/AVX2 use
; vmaskmovps/vpmaskmovd plus a blend; AVX512 uses one masked vmovdqu32 with
; k1 = 14.
define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v4i32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v4i32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: mload_constmask_v4i32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mload_constmask_v4i32:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    movw $14, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i32:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $14, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i32:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $14, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $14, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovdqu32 (%eax), %xmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
  ret <4 x i32> %res
}
6471
; Constant mask <0,1>: only the high i64 lane is loaded. 64-bit targets use a
; single element insert (pinsrq / vpinsrq of 8(%rdi)); the 32-bit X86 target
; has no pinsrq, so it falls back to vmovhps.
define <2 x i64> @mload_constmask_v2i64(ptr %addr, <2 x i64> %dst) {
; SSE2-LABEL: mload_constmask_v2i64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v2i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: mload_constmask_v2i64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v2i64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512-NEXT:    retl
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
  ret <2 x i64> %res
}
6497
6498; 256-bit FP vectors are supported with AVX.
6499
; Constant mask <1,1,1,0,...,0> (low three lanes, bitmask 7): lanes 0-2 load
; from memory, lanes 3-7 are preserved from %dst. AVX1/AVX2 use vmaskmovps
; plus a blend; AVX512 uses one masked vmovups with k1 = 7. SSE only needs to
; touch the low xmm half since the high half is passthru.
define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movw $7, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8f32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
  ret <8 x float> %res
}
6560
; Same low-three-lane constant mask as above, but with a zeroinitializer
; passthru: AVX512 targets should use zero-masking ({k1} {z}) instead of
; merge-masking, AVX1/AVX2 rely on vmaskmovps zeroing the unselected lanes,
; and SSE explicitly xors the upper half.
define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32_zero:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v8f32_zero:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero
; SSE42-NEXT:    xorps %xmm1, %xmm1
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f32_zero:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movw $7, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
  ret <8 x float> %res
}
6615
; Constant mask <1,1,1,0> (bitmask 0b0111 = 7): the low two f64 lanes load as
; a full 128-bit movups and lane 2 as a movlps merge; lane 3 is preserved from
; %dst. AVX1/AVX2 use vmaskmovpd + vblendpd, AVX512 a masked vmovupd, k1 = 7.
define <4 x double> @mload_constmask_v4f64(ptr %addr, <4 x double> %dst) {
; SSE-LABEL: mload_constmask_v4f64:
; SSE:       ## %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f64:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f64:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4f64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
  ret <4 x double> %res
}
6663
6664; 256-bit integer vectors are supported with AVX2.
6665
6666define <8 x i32> @mload_constmask_v8i32(ptr %addr, <8 x i32> %dst) {
; Constant mask <1,1,1,0,0,0,0,1> (mask byte 0x87 = -121): loads lanes 0-2 and 7,
; keeps the rest of %dst. Autogenerated expectations; do not hand-edit.
6667; SSE2-LABEL: mload_constmask_v8i32:
6668; SSE2:       ## %bb.0:
6669; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6670; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6671; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6672; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6673; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
6674; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
6675; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6676; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6677; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
6678; SSE2-NEXT:    movaps %xmm2, %xmm0
6679; SSE2-NEXT:    retq
6680;
6681; SSE42-LABEL: mload_constmask_v8i32:
6682; SSE42:       ## %bb.0:
6683; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
6684; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
6685; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
6686; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
6687; SSE42-NEXT:    retq
6688;
6689; AVX1OR2-LABEL: mload_constmask_v8i32:
6690; AVX1OR2:       ## %bb.0:
6691; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
6692; AVX1OR2-NEXT:    retq
6693;
6694; AVX512F-LABEL: mload_constmask_v8i32:
6695; AVX512F:       ## %bb.0:
6696; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
6697; AVX512F-NEXT:    movw $135, %ax
6698; AVX512F-NEXT:    kmovw %eax, %k1
6699; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
6700; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
6701; AVX512F-NEXT:    retq
6702;
6703; AVX512VLDQ-LABEL: mload_constmask_v8i32:
6704; AVX512VLDQ:       ## %bb.0:
6705; AVX512VLDQ-NEXT:    movb $-121, %al
6706; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6707; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
6708; AVX512VLDQ-NEXT:    retq
6709;
6710; AVX512VLBW-LABEL: mload_constmask_v8i32:
6711; AVX512VLBW:       ## %bb.0:
6712; AVX512VLBW-NEXT:    movb $-121, %al
6713; AVX512VLBW-NEXT:    kmovd %eax, %k1
6714; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
6715; AVX512VLBW-NEXT:    retq
6716;
6717; X86-AVX512-LABEL: mload_constmask_v8i32:
6718; X86-AVX512:       ## %bb.0:
6719; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6720; X86-AVX512-NEXT:    movb $-121, %cl
6721; X86-AVX512-NEXT:    kmovd %ecx, %k1
6722; X86-AVX512-NEXT:    vmovdqu32 (%eax), %ymm0 {%k1}
6723; X86-AVX512-NEXT:    retl
6724  %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
6725  ret <8 x i32> %res
6726}
6727
6728define <4 x i64> @mload_constmask_v4i64(ptr %addr, <4 x i64> %dst) {
; Constant mask <1,0,0,1> (mask byte 9): loads lanes 0 and 3, keeps lanes 1-2 of %dst.
; Autogenerated expectations; do not hand-edit.
6729; SSE2-LABEL: mload_constmask_v4i64:
6730; SSE2:       ## %bb.0:
6731; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
6732; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
6733; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6734; SSE2-NEXT:    retq
6735;
6736; SSE42-LABEL: mload_constmask_v4i64:
6737; SSE42:       ## %bb.0:
6738; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm0
6739; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm1
6740; SSE42-NEXT:    retq
6741;
6742; AVX1OR2-LABEL: mload_constmask_v4i64:
6743; AVX1OR2:       ## %bb.0:
6744; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
6745; AVX1OR2-NEXT:    retq
6746;
6747; AVX512F-LABEL: mload_constmask_v4i64:
6748; AVX512F:       ## %bb.0:
6749; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
6750; AVX512F-NEXT:    movb $9, %al
6751; AVX512F-NEXT:    kmovw %eax, %k1
6752; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1}
6753; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
6754; AVX512F-NEXT:    retq
6755;
6756; AVX512VLDQ-LABEL: mload_constmask_v4i64:
6757; AVX512VLDQ:       ## %bb.0:
6758; AVX512VLDQ-NEXT:    movb $9, %al
6759; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6760; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
6761; AVX512VLDQ-NEXT:    retq
6762;
6763; AVX512VLBW-LABEL: mload_constmask_v4i64:
6764; AVX512VLBW:       ## %bb.0:
6765; AVX512VLBW-NEXT:    movb $9, %al
6766; AVX512VLBW-NEXT:    kmovd %eax, %k1
6767; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
6768; AVX512VLBW-NEXT:    retq
6769;
6770; X86-AVX512-LABEL: mload_constmask_v4i64:
6771; X86-AVX512:       ## %bb.0:
6772; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6773; X86-AVX512-NEXT:    movb $9, %cl
6774; X86-AVX512-NEXT:    kmovd %ecx, %k1
6775; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1}
6776; X86-AVX512-NEXT:    retl
6777  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
6778  ret <4 x i64> %res
6779}
6780
6781; 512-bit FP vectors are supported with AVX512.
6782
6783define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
; 512-bit FP masked load, mask <1,1,1,0,0,0,0,1> (0x87): lanes 0-2 and 7 from memory.
; Autogenerated expectations; do not hand-edit.
6784; SSE-LABEL: mload_constmask_v8f64:
6785; SSE:       ## %bb.0:
6786; SSE-NEXT:    movups (%rdi), %xmm0
6787; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6788; SSE-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
6789; SSE-NEXT:    retq
6790;
6791; AVX1OR2-LABEL: mload_constmask_v8f64:
6792; AVX1OR2:       ## %bb.0:
6793; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
6794; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
6795; AVX1OR2-NEXT:    retq
6796;
6797; AVX512F-LABEL: mload_constmask_v8f64:
6798; AVX512F:       ## %bb.0:
6799; AVX512F-NEXT:    movb $-121, %al
6800; AVX512F-NEXT:    kmovw %eax, %k1
6801; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
6802; AVX512F-NEXT:    retq
6803;
6804; AVX512VLDQ-LABEL: mload_constmask_v8f64:
6805; AVX512VLDQ:       ## %bb.0:
6806; AVX512VLDQ-NEXT:    movb $-121, %al
6807; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6808; AVX512VLDQ-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
6809; AVX512VLDQ-NEXT:    retq
6810;
6811; AVX512VLBW-LABEL: mload_constmask_v8f64:
6812; AVX512VLBW:       ## %bb.0:
6813; AVX512VLBW-NEXT:    movb $-121, %al
6814; AVX512VLBW-NEXT:    kmovd %eax, %k1
6815; AVX512VLBW-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
6816; AVX512VLBW-NEXT:    retq
6817;
6818; X86-AVX512-LABEL: mload_constmask_v8f64:
6819; X86-AVX512:       ## %bb.0:
6820; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6821; X86-AVX512-NEXT:    movb $-121, %cl
6822; X86-AVX512-NEXT:    kmovd %ecx, %k1
6823; X86-AVX512-NEXT:    vmovupd (%eax), %zmm0 {%k1}
6824; X86-AVX512-NEXT:    retl
6825  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
6826  ret <8 x double> %res
6827}
6828
6829; Make sure we detect the mask is all ones after type
6830; legalization to use an unmasked load for some of the avx512 instructions.
6831define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x double> %dst) {
; Lower 8 lanes of the mask are all ones, so after legalization the low half
; should become a plain unmasked load; the high half keeps mask 0x55.
; Autogenerated expectations; do not hand-edit.
6832; SSE-LABEL: mload_constmask_v16f64_allones_split:
6833; SSE:       ## %bb.0:
6834; SSE-NEXT:    movq %rdi, %rax
6835; SSE-NEXT:    movups (%rsi), %xmm0
6836; SSE-NEXT:    movups 16(%rsi), %xmm1
6837; SSE-NEXT:    movups 32(%rsi), %xmm2
6838; SSE-NEXT:    movups 48(%rsi), %xmm3
6839; SSE-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
6840; SSE-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
6841; SSE-NEXT:    movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
6842; SSE-NEXT:    movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
6843; SSE-NEXT:    movaps %xmm7, 112(%rdi)
6844; SSE-NEXT:    movaps %xmm6, 96(%rdi)
6845; SSE-NEXT:    movaps %xmm5, 80(%rdi)
6846; SSE-NEXT:    movaps %xmm4, 64(%rdi)
6847; SSE-NEXT:    movaps %xmm3, 48(%rdi)
6848; SSE-NEXT:    movaps %xmm2, 32(%rdi)
6849; SSE-NEXT:    movaps %xmm1, 16(%rdi)
6850; SSE-NEXT:    movaps %xmm0, (%rdi)
6851; SSE-NEXT:    retq
6852;
6853; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split:
6854; AVX1OR2:       ## %bb.0:
6855; AVX1OR2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
6856; AVX1OR2-NEXT:    ## ymm0 = mem[0,1,0,1]
6857; AVX1OR2-NEXT:    vmaskmovpd 64(%rdi), %ymm0, %ymm1
6858; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
6859; AVX1OR2-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm0
6860; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
6861; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
6862; AVX1OR2-NEXT:    vmovups 32(%rdi), %ymm1
6863; AVX1OR2-NEXT:    retq
6864;
6865; AVX512F-LABEL: mload_constmask_v16f64_allones_split:
6866; AVX512F:       ## %bb.0:
6867; AVX512F-NEXT:    movb $85, %al
6868; AVX512F-NEXT:    kmovw %eax, %k1
6869; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
6870; AVX512F-NEXT:    vmovups (%rdi), %zmm0
6871; AVX512F-NEXT:    retq
6872;
6873; AVX512VLDQ-LABEL: mload_constmask_v16f64_allones_split:
6874; AVX512VLDQ:       ## %bb.0:
6875; AVX512VLDQ-NEXT:    movb $85, %al
6876; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6877; AVX512VLDQ-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
6878; AVX512VLDQ-NEXT:    vmovups (%rdi), %zmm0
6879; AVX512VLDQ-NEXT:    retq
6880;
6881; AVX512VLBW-LABEL: mload_constmask_v16f64_allones_split:
6882; AVX512VLBW:       ## %bb.0:
6883; AVX512VLBW-NEXT:    movb $85, %al
6884; AVX512VLBW-NEXT:    kmovd %eax, %k1
6885; AVX512VLBW-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
6886; AVX512VLBW-NEXT:    vmovups (%rdi), %zmm0
6887; AVX512VLBW-NEXT:    retq
6888;
6889; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split:
6890; X86-AVX512:       ## %bb.0:
6891; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6892; X86-AVX512-NEXT:    movb $85, %cl
6893; X86-AVX512-NEXT:    kmovd %ecx, %k1
6894; X86-AVX512-NEXT:    vmovupd 64(%eax), %zmm1 {%k1}
6895; X86-AVX512-NEXT:    vmovups (%eax), %zmm0
6896; X86-AVX512-NEXT:    retl
6897  %res = call <16 x double> @llvm.masked.load.v16f64.p0(ptr %addr, i32 4, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x double> %dst)
6898  ret <16 x double> %res
6899}
6900
6901; If the pass-through operand is undef, no blend is needed.
6902
6903define <4 x double> @mload_constmask_v4f64_undef_passthrough(ptr %addr) {
; Undef pass-through: no blend with a destination is needed; AVX512 forms use {z}.
; Autogenerated expectations; do not hand-edit.
6904; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
6905; SSE:       ## %bb.0:
6906; SSE-NEXT:    movups (%rdi), %xmm0
6907; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
6908; SSE-NEXT:    retq
6909;
6910; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
6911; AVX1OR2:       ## %bb.0:
6912; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
6913; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
6914; AVX1OR2-NEXT:    retq
6915;
6916; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
6917; AVX512F:       ## %bb.0:
6918; AVX512F-NEXT:    movb $7, %al
6919; AVX512F-NEXT:    kmovw %eax, %k1
6920; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
6921; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
6922; AVX512F-NEXT:    retq
6923;
6924; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
6925; AVX512VLDQ:       ## %bb.0:
6926; AVX512VLDQ-NEXT:    movb $7, %al
6927; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6928; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
6929; AVX512VLDQ-NEXT:    retq
6930;
6931; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
6932; AVX512VLBW:       ## %bb.0:
6933; AVX512VLBW-NEXT:    movb $7, %al
6934; AVX512VLBW-NEXT:    kmovd %eax, %k1
6935; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
6936; AVX512VLBW-NEXT:    retq
6937;
6938; X86-AVX512-LABEL: mload_constmask_v4f64_undef_passthrough:
6939; X86-AVX512:       ## %bb.0:
6940; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6941; X86-AVX512-NEXT:    movb $7, %cl
6942; X86-AVX512-NEXT:    kmovd %ecx, %k1
6943; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1} {z}
6944; X86-AVX512-NEXT:    retl
6945  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
6946  ret <4 x double> %res
6947}
6948
6949define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) {
; Integer variant with undef pass-through, mask <0,1,1,0> (mask byte 6); no blend needed.
; Autogenerated expectations; do not hand-edit.
6950; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
6951; SSE:       ## %bb.0:
6952; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
6953; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
6954; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6955; SSE-NEXT:    retq
6956;
6957; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
6958; AVX1:       ## %bb.0:
6959; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6960; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
6961; AVX1-NEXT:    retq
6962;
6963; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
6964; AVX2:       ## %bb.0:
6965; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6966; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
6967; AVX2-NEXT:    retq
6968;
6969; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
6970; AVX512F:       ## %bb.0:
6971; AVX512F-NEXT:    movb $6, %al
6972; AVX512F-NEXT:    kmovw %eax, %k1
6973; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
6974; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
6975; AVX512F-NEXT:    retq
6976;
6977; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
6978; AVX512VLDQ:       ## %bb.0:
6979; AVX512VLDQ-NEXT:    movb $6, %al
6980; AVX512VLDQ-NEXT:    kmovw %eax, %k1
6981; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6982; AVX512VLDQ-NEXT:    retq
6983;
6984; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
6985; AVX512VLBW:       ## %bb.0:
6986; AVX512VLBW-NEXT:    movb $6, %al
6987; AVX512VLBW-NEXT:    kmovd %eax, %k1
6988; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6989; AVX512VLBW-NEXT:    retq
6990;
6991; X86-AVX512-LABEL: mload_constmask_v4i64_undef_passthrough:
6992; X86-AVX512:       ## %bb.0:
6993; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
6994; X86-AVX512-NEXT:    movb $6, %cl
6995; X86-AVX512-NEXT:    kmovd %ecx, %k1
6996; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1} {z}
6997; X86-AVX512-NEXT:    retl
6998  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
6999  ret <4 x i64> %res
7000}
7001
7002;  When only one element of the mask is set, reduce to a scalar load.
7003
7004define <4 x i32> @load_one_mask_bit_set1(ptr %addr, <4 x i32> %val) {
; Single mask bit (lane 0) set: should reduce to a scalar load/insert.
; Autogenerated expectations; do not hand-edit.
7005; SSE2-LABEL: load_one_mask_bit_set1:
7006; SSE2:       ## %bb.0:
7007; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
7008; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
7009; SSE2-NEXT:    retq
7010;
7011; SSE42-LABEL: load_one_mask_bit_set1:
7012; SSE42:       ## %bb.0:
7013; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
7014; SSE42-NEXT:    retq
7015;
7016; AVX-LABEL: load_one_mask_bit_set1:
7017; AVX:       ## %bb.0:
7018; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
7019; AVX-NEXT:    retq
7020;
7021; X86-AVX512-LABEL: load_one_mask_bit_set1:
7022; X86-AVX512:       ## %bb.0:
7023; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7024; X86-AVX512-NEXT:    vpinsrd $0, (%eax), %xmm0, %xmm0
7025; X86-AVX512-NEXT:    retl
7026  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
7027  ret <4 x i32> %res
7028}
7029
7030; Choose a different element to show that the correct address offset is produced.
7031
7032define <4 x float> @load_one_mask_bit_set2(ptr %addr, <4 x float> %val) {
; Single mask bit set at lane 2: checks the correct address offset (8) is produced.
; Autogenerated expectations; do not hand-edit.
7033; SSE2-LABEL: load_one_mask_bit_set2:
7034; SSE2:       ## %bb.0:
7035; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
7036; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
7037; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
7038; SSE2-NEXT:    retq
7039;
7040; SSE42-LABEL: load_one_mask_bit_set2:
7041; SSE42:       ## %bb.0:
7042; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7043; SSE42-NEXT:    retq
7044;
7045; AVX-LABEL: load_one_mask_bit_set2:
7046; AVX:       ## %bb.0:
7047; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7048; AVX-NEXT:    retq
7049;
7050; X86-AVX512-LABEL: load_one_mask_bit_set2:
7051; X86-AVX512:       ## %bb.0:
7052; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7053; X86-AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7054; X86-AVX512-NEXT:    retl
7055  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
7056  ret <4 x float> %res
7057}
7058
7059; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
7060
7061define <4 x i64> @load_one_mask_bit_set3(ptr %addr, <4 x i64> %val) {
; i64 element in the high half of a 256-bit vector (lane 2, offset 16) —
; a case AVX does not handle evenly. Autogenerated expectations; do not hand-edit.
7062; SSE2-LABEL: load_one_mask_bit_set3:
7063; SSE2:       ## %bb.0:
7064; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
7065; SSE2-NEXT:    retq
7066;
7067; SSE42-LABEL: load_one_mask_bit_set3:
7068; SSE42:       ## %bb.0:
7069; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm1
7070; SSE42-NEXT:    retq
7071;
7072; AVX-LABEL: load_one_mask_bit_set3:
7073; AVX:       ## %bb.0:
7074; AVX-NEXT:    vbroadcastsd 16(%rdi), %ymm1
7075; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
7076; AVX-NEXT:    retq
7077;
7078; X86-AVX512-LABEL: load_one_mask_bit_set3:
7079; X86-AVX512:       ## %bb.0:
7080; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7081; X86-AVX512-NEXT:    vbroadcastsd 16(%eax), %ymm1
7082; X86-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
7083; X86-AVX512-NEXT:    retl
7084  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
7085  ret <4 x i64> %res
7086}
7087
7088; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
7089
7090define <4 x double> @load_one_mask_bit_set4(ptr %addr, <4 x double> %val) {
; f64 element in the top lane (lane 3, offset 24) of a 256-bit vector.
; Autogenerated expectations; do not hand-edit.
7091; SSE-LABEL: load_one_mask_bit_set4:
7092; SSE:       ## %bb.0:
7093; SSE-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
7094; SSE-NEXT:    retq
7095;
7096; AVX-LABEL: load_one_mask_bit_set4:
7097; AVX:       ## %bb.0:
7098; AVX-NEXT:    vbroadcastsd 24(%rdi), %ymm1
7099; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7100; AVX-NEXT:    retq
7101;
7102; X86-AVX512-LABEL: load_one_mask_bit_set4:
7103; X86-AVX512:       ## %bb.0:
7104; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7105; X86-AVX512-NEXT:    vbroadcastsd 24(%eax), %ymm1
7106; X86-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7107; X86-AVX512-NEXT:    retl
7108  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
7109  ret <4 x double> %res
7110}
7111
7112; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
7113
7114define <8 x double> @load_one_mask_bit_set5(ptr %addr, <8 x double> %val) {
; 512-bit vector, single bit set at lane 7 (offset 56); AVX512 uses a masked broadcast.
; Autogenerated expectations; do not hand-edit.
7115; SSE-LABEL: load_one_mask_bit_set5:
7116; SSE:       ## %bb.0:
7117; SSE-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
7118; SSE-NEXT:    retq
7119;
7120; AVX1OR2-LABEL: load_one_mask_bit_set5:
7121; AVX1OR2:       ## %bb.0:
7122; AVX1OR2-NEXT:    vbroadcastsd 56(%rdi), %ymm2
7123; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
7124; AVX1OR2-NEXT:    retq
7125;
7126; AVX512F-LABEL: load_one_mask_bit_set5:
7127; AVX512F:       ## %bb.0:
7128; AVX512F-NEXT:    movb $-128, %al
7129; AVX512F-NEXT:    kmovw %eax, %k1
7130; AVX512F-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
7131; AVX512F-NEXT:    retq
7132;
7133; AVX512VLDQ-LABEL: load_one_mask_bit_set5:
7134; AVX512VLDQ:       ## %bb.0:
7135; AVX512VLDQ-NEXT:    movb $-128, %al
7136; AVX512VLDQ-NEXT:    kmovw %eax, %k1
7137; AVX512VLDQ-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
7138; AVX512VLDQ-NEXT:    retq
7139;
7140; AVX512VLBW-LABEL: load_one_mask_bit_set5:
7141; AVX512VLBW:       ## %bb.0:
7142; AVX512VLBW-NEXT:    movb $-128, %al
7143; AVX512VLBW-NEXT:    kmovd %eax, %k1
7144; AVX512VLBW-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
7145; AVX512VLBW-NEXT:    retq
7146;
7147; X86-AVX512-LABEL: load_one_mask_bit_set5:
7148; X86-AVX512:       ## %bb.0:
7149; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7150; X86-AVX512-NEXT:    movb $-128, %cl
7151; X86-AVX512-NEXT:    kmovd %ecx, %k1
7152; X86-AVX512-NEXT:    vbroadcastsd 56(%eax), %zmm0 {%k1}
7153; X86-AVX512-NEXT:    retl
7154  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
7155  ret <8 x double> %res
7156}
7157
7158define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
; 1024-bit vector (returned via sret pointer in %rdi on SSE/x86-64), sparse mask
; with bits 2, 10 and 13 set; legalized into per-512-bit-half masked ops on AVX512.
; Autogenerated expectations; do not hand-edit.
7159; SSE2-LABEL: load_one_mask_bit_set6:
7160; SSE2:       ## %bb.0:
7161; SSE2-NEXT:    movq %rdi, %rax
7162; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
7163; SSE2-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
7164; SSE2-NEXT:    movsd {{.*#+}} xmm8 = mem[0],zero
7165; SSE2-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
7166; SSE2-NEXT:    movaps %xmm7, 112(%rdi)
7167; SSE2-NEXT:    movaps %xmm5, 80(%rdi)
7168; SSE2-NEXT:    movaps %xmm4, 64(%rdi)
7169; SSE2-NEXT:    movaps %xmm3, 48(%rdi)
7170; SSE2-NEXT:    movaps %xmm2, 32(%rdi)
7171; SSE2-NEXT:    movaps %xmm1, 16(%rdi)
7172; SSE2-NEXT:    movaps %xmm0, (%rdi)
7173; SSE2-NEXT:    movaps %xmm6, 96(%rdi)
7174; SSE2-NEXT:    retq
7175;
7176; SSE42-LABEL: load_one_mask_bit_set6:
7177; SSE42:       ## %bb.0:
7178; SSE42-NEXT:    movq %rdi, %rax
7179; SSE42-NEXT:    pinsrq $0, 16(%rsi), %xmm1
7180; SSE42-NEXT:    pinsrq $0, 80(%rsi), %xmm5
7181; SSE42-NEXT:    pinsrq $1, 104(%rsi), %xmm6
7182; SSE42-NEXT:    movaps %xmm7, 112(%rdi)
7183; SSE42-NEXT:    movdqa %xmm6, 96(%rdi)
7184; SSE42-NEXT:    movdqa %xmm5, 80(%rdi)
7185; SSE42-NEXT:    movaps %xmm4, 64(%rdi)
7186; SSE42-NEXT:    movaps %xmm3, 48(%rdi)
7187; SSE42-NEXT:    movaps %xmm2, 32(%rdi)
7188; SSE42-NEXT:    movdqa %xmm1, 16(%rdi)
7189; SSE42-NEXT:    movaps %xmm0, (%rdi)
7190; SSE42-NEXT:    retq
7191;
7192; AVX1-LABEL: load_one_mask_bit_set6:
7193; AVX1:       ## %bb.0:
7194; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
7195; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm4, %ymm5
7196; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3]
7197; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm4, %ymm4
7198; AVX1-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
7199; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
7200; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm4, %ymm4
7201; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
7202; AVX1-NEXT:    retq
7203;
7204; AVX2-LABEL: load_one_mask_bit_set6:
7205; AVX2:       ## %bb.0:
7206; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
7207; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm4, %ymm5
7208; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
7209; AVX2-NEXT:    vpmaskmovq 64(%rdi), %ymm4, %ymm4
7210; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
7211; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
7212; AVX2-NEXT:    vpmaskmovq 96(%rdi), %ymm4, %ymm4
7213; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
7214; AVX2-NEXT:    retq
7215;
7216; AVX512F-LABEL: load_one_mask_bit_set6:
7217; AVX512F:       ## %bb.0:
7218; AVX512F-NEXT:    movb $4, %al
7219; AVX512F-NEXT:    kmovw %eax, %k1
7220; AVX512F-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
7221; AVX512F-NEXT:    movb $36, %al
7222; AVX512F-NEXT:    kmovw %eax, %k1
7223; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
7224; AVX512F-NEXT:    retq
7225;
7226; AVX512VLDQ-LABEL: load_one_mask_bit_set6:
7227; AVX512VLDQ:       ## %bb.0:
7228; AVX512VLDQ-NEXT:    movb $4, %al
7229; AVX512VLDQ-NEXT:    kmovw %eax, %k1
7230; AVX512VLDQ-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
7231; AVX512VLDQ-NEXT:    movb $36, %al
7232; AVX512VLDQ-NEXT:    kmovw %eax, %k1
7233; AVX512VLDQ-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
7234; AVX512VLDQ-NEXT:    retq
7235;
7236; AVX512VLBW-LABEL: load_one_mask_bit_set6:
7237; AVX512VLBW:       ## %bb.0:
7238; AVX512VLBW-NEXT:    movb $4, %al
7239; AVX512VLBW-NEXT:    kmovd %eax, %k1
7240; AVX512VLBW-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
7241; AVX512VLBW-NEXT:    movb $36, %al
7242; AVX512VLBW-NEXT:    kmovd %eax, %k1
7243; AVX512VLBW-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
7244; AVX512VLBW-NEXT:    retq
7245;
7246; X86-AVX512-LABEL: load_one_mask_bit_set6:
7247; X86-AVX512:       ## %bb.0:
7248; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7249; X86-AVX512-NEXT:    movb $4, %cl
7250; X86-AVX512-NEXT:    kmovd %ecx, %k1
7251; X86-AVX512-NEXT:    vbroadcastsd 16(%eax), %zmm0 {%k1}
7252; X86-AVX512-NEXT:    movb $36, %cl
7253; X86-AVX512-NEXT:    kmovd %ecx, %k1
7254; X86-AVX512-NEXT:    vmovdqu64 64(%eax), %zmm1 {%k1}
7255; X86-AVX512-NEXT:    retl
7256  %res = call <16 x i64> @llvm.masked.load.v16i64.p0(ptr %addr, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>, <16 x i64> %val)
7257  ret <16 x i64> %res
7258}
7259
7260define i32 @pr38986(i1 %c, ptr %p) {
; PR38986: <1 x i32> masked load with a dynamic i1 mask lowers to a branch
; around a scalar load; result is bitcast back to i32.
; Autogenerated expectations; do not hand-edit.
7261; SSE-LABEL: pr38986:
7262; SSE:       ## %bb.0:
7263; SSE-NEXT:    testb $1, %dil
7264; SSE-NEXT:    ## implicit-def: $eax
7265; SSE-NEXT:    je LBB45_2
7266; SSE-NEXT:  ## %bb.1: ## %cond.load
7267; SSE-NEXT:    movl (%rsi), %eax
7268; SSE-NEXT:  LBB45_2: ## %else
7269; SSE-NEXT:    retq
7270;
7271; AVX-LABEL: pr38986:
7272; AVX:       ## %bb.0:
7273; AVX-NEXT:    testb $1, %dil
7274; AVX-NEXT:    ## implicit-def: $eax
7275; AVX-NEXT:    je LBB45_2
7276; AVX-NEXT:  ## %bb.1: ## %cond.load
7277; AVX-NEXT:    movl (%rsi), %eax
7278; AVX-NEXT:  LBB45_2: ## %else
7279; AVX-NEXT:    retq
7280;
7281; X86-AVX512-LABEL: pr38986:
7282; X86-AVX512:       ## %bb.0:
7283; X86-AVX512-NEXT:    testb $1, {{[0-9]+}}(%esp)
7284; X86-AVX512-NEXT:    ## implicit-def: $eax
7285; X86-AVX512-NEXT:    je LBB45_2
7286; X86-AVX512-NEXT:  ## %bb.1: ## %cond.load
7287; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
7288; X86-AVX512-NEXT:    movl (%eax), %eax
7289; X86-AVX512-NEXT:  LBB45_2: ## %else
7290; X86-AVX512-NEXT:    retl
7291 %vc = insertelement <1 x i1> undef, i1 %c, i32 0
7292 %L = call <1 x i32> @llvm.masked.load.v1i32.p0 (ptr %p, i32 4, <1 x i1> %vc, <1 x i32> undef)
7293 %ret = bitcast <1 x i32> %L to i32
7294 ret i32 %ret
7295}
7296
7297define <2 x double> @zero_mask(ptr %addr, <2 x double> %dst) {
; All-zero mask: the load folds away entirely and %dst is returned as-is.
; Autogenerated expectations; do not hand-edit.
7298; SSE-LABEL: zero_mask:
7299; SSE:       ## %bb.0:
7300; SSE-NEXT:    retq
7301;
7302; AVX-LABEL: zero_mask:
7303; AVX:       ## %bb.0:
7304; AVX-NEXT:    retq
7305;
7306; X86-AVX512-LABEL: zero_mask:
7307; X86-AVX512:       ## %bb.0:
7308; X86-AVX512-NEXT:    retl
7309  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
7310  ret <2 x double> %res
7311}
7312
7313declare <16 x double> @llvm.masked.load.v16f64.p0(ptr, i32, <16 x i1>, <16 x double>)
7314declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
7315declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
7316declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
7317declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
7318
7319declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
7320declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
7321declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
7322declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
7323
7324declare <16 x i64> @llvm.masked.load.v16i64.p0(ptr, i32, <16 x i1>, <16 x i64>)
7325declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
7326declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
7327declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
7328declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
7329
7330declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
7331declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
7332declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
7333declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
7334declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
7335
7336declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
7337declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
7338declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
7339declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
7340
7341declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
7342declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
7343declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
7344declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
7345