xref: /llvm-project/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll (revision 5c181a9191bfb758575329ff7eb8db4fc46ffac9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP
6
7; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
8; This is a larger-than-usual regression test to verify that several backend
9; transforms are working together. We want to hoist the expansion of non-uniform
10; vector shifts out of a loop if we do not have real vector shift instructions.
11; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
12; sequence.
13
; vector_variable_shift_left_loop: for each i in [0, %count),
;   arr[i] = arr[i] << (control[i] == 0 ? %amt0 : %amt1)
; The IR is already in vectorized form: %vector.body handles 32 elements per
; iteration as four <8 x i32> parts, and %for.body is the scalar loop used for
; counts below 32 and for the elements left over after the vector loop.
; The assertions below show two lowerings of the vector body:
;  - SSE4.1 and AVX1 shift every loaded vector by both uniform amounts
;    (pslld / vpslld) and blend the two results with the compare mask; the
;    zero-extended shift amounts are set up in %vector.ph, before the loop.
;  - AVX2 and XOP blend the two splatted amounts first and then issue a single
;    per-element shift (vpsllvd / vpshld).
14define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
15; SSE-LABEL: vector_variable_shift_left_loop:
16; SSE:       # %bb.0: # %entry
17; SSE-NEXT:    testl %edx, %edx
18; SSE-NEXT:    jle .LBB0_9
19; SSE-NEXT:  # %bb.1: # %for.body.preheader
20; SSE-NEXT:    movl %ecx, %eax
21; SSE-NEXT:    movl %edx, %r9d
22; SSE-NEXT:    cmpl $31, %edx
23; SSE-NEXT:    ja .LBB0_3
24; SSE-NEXT:  # %bb.2:
25; SSE-NEXT:    xorl %edx, %edx
26; SSE-NEXT:    jmp .LBB0_6
27; SSE-NEXT:  .LBB0_3: # %vector.ph
28; SSE-NEXT:    movl %r9d, %edx
29; SSE-NEXT:    andl $-32, %edx
30; SSE-NEXT:    movd %eax, %xmm0
31; SSE-NEXT:    movd %r8d, %xmm1
32; SSE-NEXT:    xorl %ecx, %ecx
33; SSE-NEXT:    pxor %xmm8, %xmm8
34; SSE-NEXT:    pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
35; SSE-NEXT:    pmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
36; SSE-NEXT:    .p2align 4
37; SSE-NEXT:  .LBB0_4: # %vector.body
38; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
39; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
40; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
41; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
42; SSE-NEXT:    movq {{.*#+}} xmm11 = mem[0],zero
43; SSE-NEXT:    pcmpeqb %xmm8, %xmm0
44; SSE-NEXT:    pmovsxbd %xmm0, %xmm7
45; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
46; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
47; SSE-NEXT:    pcmpeqb %xmm8, %xmm1
48; SSE-NEXT:    pmovsxbd %xmm1, %xmm5
49; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
50; SSE-NEXT:    pmovsxbd %xmm1, %xmm6
51; SSE-NEXT:    pcmpeqb %xmm8, %xmm2
52; SSE-NEXT:    pmovsxbd %xmm2, %xmm3
53; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
54; SSE-NEXT:    pmovsxbd %xmm1, %xmm4
55; SSE-NEXT:    pcmpeqb %xmm8, %xmm11
56; SSE-NEXT:    pmovsxbd %xmm11, %xmm1
57; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1]
58; SSE-NEXT:    pmovsxbd %xmm2, %xmm2
59; SSE-NEXT:    movdqu 16(%rdi,%rcx,4), %xmm11
60; SSE-NEXT:    movdqa %xmm11, %xmm12
61; SSE-NEXT:    pslld %xmm9, %xmm12
62; SSE-NEXT:    pslld %xmm10, %xmm11
63; SSE-NEXT:    blendvps %xmm0, %xmm12, %xmm11
64; SSE-NEXT:    movdqu (%rdi,%rcx,4), %xmm12
65; SSE-NEXT:    movdqa %xmm12, %xmm13
66; SSE-NEXT:    pslld %xmm9, %xmm13
67; SSE-NEXT:    pslld %xmm10, %xmm12
68; SSE-NEXT:    movdqa %xmm7, %xmm0
69; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm12
70; SSE-NEXT:    movdqu 48(%rdi,%rcx,4), %xmm7
71; SSE-NEXT:    movdqa %xmm7, %xmm13
72; SSE-NEXT:    pslld %xmm9, %xmm13
73; SSE-NEXT:    pslld %xmm10, %xmm7
74; SSE-NEXT:    movdqa %xmm6, %xmm0
75; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm7
76; SSE-NEXT:    movdqu 32(%rdi,%rcx,4), %xmm6
77; SSE-NEXT:    movdqa %xmm6, %xmm13
78; SSE-NEXT:    pslld %xmm9, %xmm13
79; SSE-NEXT:    pslld %xmm10, %xmm6
80; SSE-NEXT:    movdqa %xmm5, %xmm0
81; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm6
82; SSE-NEXT:    movdqu 80(%rdi,%rcx,4), %xmm5
83; SSE-NEXT:    movdqa %xmm5, %xmm13
84; SSE-NEXT:    pslld %xmm9, %xmm13
85; SSE-NEXT:    pslld %xmm10, %xmm5
86; SSE-NEXT:    movdqa %xmm4, %xmm0
87; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm5
88; SSE-NEXT:    movdqu 64(%rdi,%rcx,4), %xmm4
89; SSE-NEXT:    movdqa %xmm4, %xmm13
90; SSE-NEXT:    pslld %xmm9, %xmm13
91; SSE-NEXT:    pslld %xmm10, %xmm4
92; SSE-NEXT:    movdqa %xmm3, %xmm0
93; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm4
94; SSE-NEXT:    movdqu 112(%rdi,%rcx,4), %xmm3
95; SSE-NEXT:    movdqa %xmm3, %xmm13
96; SSE-NEXT:    pslld %xmm9, %xmm13
97; SSE-NEXT:    pslld %xmm10, %xmm3
98; SSE-NEXT:    movdqa %xmm2, %xmm0
99; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm3
100; SSE-NEXT:    movdqu 96(%rdi,%rcx,4), %xmm2
101; SSE-NEXT:    movdqa %xmm2, %xmm13
102; SSE-NEXT:    pslld %xmm9, %xmm13
103; SSE-NEXT:    pslld %xmm10, %xmm2
104; SSE-NEXT:    movdqa %xmm1, %xmm0
105; SSE-NEXT:    blendvps %xmm0, %xmm13, %xmm2
106; SSE-NEXT:    movups %xmm12, (%rdi,%rcx,4)
107; SSE-NEXT:    movups %xmm11, 16(%rdi,%rcx,4)
108; SSE-NEXT:    movups %xmm6, 32(%rdi,%rcx,4)
109; SSE-NEXT:    movups %xmm7, 48(%rdi,%rcx,4)
110; SSE-NEXT:    movups %xmm4, 64(%rdi,%rcx,4)
111; SSE-NEXT:    movups %xmm5, 80(%rdi,%rcx,4)
112; SSE-NEXT:    movups %xmm2, 96(%rdi,%rcx,4)
113; SSE-NEXT:    movups %xmm3, 112(%rdi,%rcx,4)
114; SSE-NEXT:    addq $32, %rcx
115; SSE-NEXT:    cmpq %rcx, %rdx
116; SSE-NEXT:    jne .LBB0_4
117; SSE-NEXT:  # %bb.5: # %middle.block
118; SSE-NEXT:    cmpl %r9d, %edx
119; SSE-NEXT:    jne .LBB0_6
120; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
121; SSE-NEXT:    retq
122; SSE-NEXT:    .p2align 4
123; SSE-NEXT:  .LBB0_8: # %for.body
124; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
125; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
126; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
127; SSE-NEXT:    incq %rdx
128; SSE-NEXT:    cmpq %rdx, %r9
129; SSE-NEXT:    je .LBB0_9
130; SSE-NEXT:  .LBB0_6: # %for.body
131; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
132; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
133; SSE-NEXT:    movl %eax, %ecx
134; SSE-NEXT:    je .LBB0_8
135; SSE-NEXT:  # %bb.7: # %for.body
136; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
137; SSE-NEXT:    movl %r8d, %ecx
138; SSE-NEXT:    jmp .LBB0_8
139;
140; AVX1-LABEL: vector_variable_shift_left_loop:
141; AVX1:       # %bb.0: # %entry
142; AVX1-NEXT:    testl %edx, %edx
143; AVX1-NEXT:    jle .LBB0_9
144; AVX1-NEXT:  # %bb.1: # %for.body.preheader
145; AVX1-NEXT:    movl %ecx, %eax
146; AVX1-NEXT:    movl %edx, %r9d
147; AVX1-NEXT:    cmpl $31, %edx
148; AVX1-NEXT:    ja .LBB0_3
149; AVX1-NEXT:  # %bb.2:
150; AVX1-NEXT:    xorl %edx, %edx
151; AVX1-NEXT:    jmp .LBB0_6
152; AVX1-NEXT:  .LBB0_3: # %vector.ph
153; AVX1-NEXT:    movl %r9d, %edx
154; AVX1-NEXT:    andl $-32, %edx
155; AVX1-NEXT:    vmovd %eax, %xmm7
156; AVX1-NEXT:    vmovd %r8d, %xmm8
157; AVX1-NEXT:    xorl %ecx, %ecx
158; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero
159; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
160; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero
161; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
162; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero
163; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
164; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero
165; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
166; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero
167; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm8[0],zero,xmm8[1],zero
168; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
169; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
170; AVX1-NEXT:    .p2align 4
171; AVX1-NEXT:  .LBB0_4: # %vector.body
172; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
173; AVX1-NEXT:    vmovq {{.*#+}} xmm9 = mem[0],zero
174; AVX1-NEXT:    vmovq {{.*#+}} xmm10 = mem[0],zero
175; AVX1-NEXT:    vmovq {{.*#+}} xmm11 = mem[0],zero
176; AVX1-NEXT:    vmovq {{.*#+}} xmm12 = mem[0],zero
177; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
178; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm9, %xmm9
179; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm14
180; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
181; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm15
182; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm10, %xmm9
183; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm0
184; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
185; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm1
186; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm11, %xmm9
187; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm13
188; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
189; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm11
190; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm12, %xmm9
191; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm10
192; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
193; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm9
194; AVX1-NEXT:    vmovdqu (%rdi,%rcx,4), %xmm12
195; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
196; AVX1-NEXT:    vpslld %xmm3, %xmm12, %xmm2
197; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
198; AVX1-NEXT:    vpslld %xmm4, %xmm12, %xmm12
199; AVX1-NEXT:    vblendvps %xmm14, %xmm2, %xmm12, %xmm12
200; AVX1-NEXT:    vmovdqu 16(%rdi,%rcx,4), %xmm2
201; AVX1-NEXT:    vpslld %xmm3, %xmm2, %xmm14
202; AVX1-NEXT:    vpslld %xmm4, %xmm2, %xmm2
203; AVX1-NEXT:    vblendvps %xmm15, %xmm14, %xmm2, %xmm2
204; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,4), %xmm14
205; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
206; AVX1-NEXT:    vpslld %xmm3, %xmm14, %xmm15
207; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
208; AVX1-NEXT:    vpslld %xmm4, %xmm14, %xmm14
209; AVX1-NEXT:    vblendvps %xmm0, %xmm15, %xmm14, %xmm0
210; AVX1-NEXT:    vmovdqu 48(%rdi,%rcx,4), %xmm14
211; AVX1-NEXT:    vpslld %xmm3, %xmm14, %xmm15
212; AVX1-NEXT:    vpslld %xmm4, %xmm14, %xmm14
213; AVX1-NEXT:    vblendvps %xmm1, %xmm15, %xmm14, %xmm1
214; AVX1-NEXT:    vmovdqu 64(%rdi,%rcx,4), %xmm14
215; AVX1-NEXT:    vpslld %xmm5, %xmm14, %xmm15
216; AVX1-NEXT:    vpslld %xmm6, %xmm14, %xmm14
217; AVX1-NEXT:    vblendvps %xmm13, %xmm15, %xmm14, %xmm13
218; AVX1-NEXT:    vmovdqu 80(%rdi,%rcx,4), %xmm14
219; AVX1-NEXT:    vpslld %xmm5, %xmm14, %xmm15
220; AVX1-NEXT:    vpslld %xmm6, %xmm14, %xmm14
221; AVX1-NEXT:    vblendvps %xmm11, %xmm15, %xmm14, %xmm11
222; AVX1-NEXT:    vmovdqu 96(%rdi,%rcx,4), %xmm14
223; AVX1-NEXT:    vpslld %xmm7, %xmm14, %xmm15
224; AVX1-NEXT:    vpslld %xmm8, %xmm14, %xmm14
225; AVX1-NEXT:    vblendvps %xmm10, %xmm15, %xmm14, %xmm10
226; AVX1-NEXT:    vmovdqu 112(%rdi,%rcx,4), %xmm14
227; AVX1-NEXT:    vpslld %xmm7, %xmm14, %xmm15
228; AVX1-NEXT:    vpslld %xmm8, %xmm14, %xmm14
229; AVX1-NEXT:    vblendvps %xmm9, %xmm15, %xmm14, %xmm9
230; AVX1-NEXT:    vmovups %xmm12, (%rdi,%rcx,4)
231; AVX1-NEXT:    vmovups %xmm2, 16(%rdi,%rcx,4)
232; AVX1-NEXT:    vmovups %xmm0, 32(%rdi,%rcx,4)
233; AVX1-NEXT:    vmovups %xmm1, 48(%rdi,%rcx,4)
234; AVX1-NEXT:    vmovups %xmm13, 64(%rdi,%rcx,4)
235; AVX1-NEXT:    vmovups %xmm11, 80(%rdi,%rcx,4)
236; AVX1-NEXT:    vmovups %xmm10, 96(%rdi,%rcx,4)
237; AVX1-NEXT:    vmovups %xmm9, 112(%rdi,%rcx,4)
238; AVX1-NEXT:    addq $32, %rcx
239; AVX1-NEXT:    cmpq %rcx, %rdx
240; AVX1-NEXT:    jne .LBB0_4
241; AVX1-NEXT:  # %bb.5: # %middle.block
242; AVX1-NEXT:    cmpl %r9d, %edx
243; AVX1-NEXT:    jne .LBB0_6
244; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
245; AVX1-NEXT:    vzeroupper
246; AVX1-NEXT:    retq
247; AVX1-NEXT:    .p2align 4
248; AVX1-NEXT:  .LBB0_8: # %for.body
249; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
250; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
251; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
252; AVX1-NEXT:    incq %rdx
253; AVX1-NEXT:    cmpq %rdx, %r9
254; AVX1-NEXT:    je .LBB0_9
255; AVX1-NEXT:  .LBB0_6: # %for.body
256; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
257; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
258; AVX1-NEXT:    movl %eax, %ecx
259; AVX1-NEXT:    je .LBB0_8
260; AVX1-NEXT:  # %bb.7: # %for.body
261; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
262; AVX1-NEXT:    movl %r8d, %ecx
263; AVX1-NEXT:    jmp .LBB0_8
264;
265; AVX2-LABEL: vector_variable_shift_left_loop:
266; AVX2:       # %bb.0: # %entry
267; AVX2-NEXT:    testl %edx, %edx
268; AVX2-NEXT:    jle .LBB0_9
269; AVX2-NEXT:  # %bb.1: # %for.body.preheader
270; AVX2-NEXT:    movl %ecx, %eax
271; AVX2-NEXT:    movl %edx, %r9d
272; AVX2-NEXT:    cmpl $31, %edx
273; AVX2-NEXT:    ja .LBB0_3
274; AVX2-NEXT:  # %bb.2:
275; AVX2-NEXT:    xorl %edx, %edx
276; AVX2-NEXT:    jmp .LBB0_6
277; AVX2-NEXT:  .LBB0_3: # %vector.ph
278; AVX2-NEXT:    movl %r9d, %edx
279; AVX2-NEXT:    andl $-32, %edx
280; AVX2-NEXT:    vmovd %eax, %xmm0
281; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
282; AVX2-NEXT:    vmovd %r8d, %xmm1
283; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
284; AVX2-NEXT:    xorl %ecx, %ecx
285; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
286; AVX2-NEXT:    .p2align 4
287; AVX2-NEXT:  .LBB0_4: # %vector.body
288; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
289; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
290; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
291; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
292; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
293; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm3, %ymm3
294; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm3
295; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm4, %ymm4
296; AVX2-NEXT:    vblendvps %ymm4, %ymm0, %ymm1, %ymm4
297; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm5
298; AVX2-NEXT:    vblendvps %ymm5, %ymm0, %ymm1, %ymm5
299; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm6, %ymm6
300; AVX2-NEXT:    vblendvps %ymm6, %ymm0, %ymm1, %ymm6
301; AVX2-NEXT:    vmovdqu (%rdi,%rcx,4), %ymm7
302; AVX2-NEXT:    vpsllvd %ymm3, %ymm7, %ymm3
303; AVX2-NEXT:    vmovdqu 32(%rdi,%rcx,4), %ymm7
304; AVX2-NEXT:    vpsllvd %ymm4, %ymm7, %ymm4
305; AVX2-NEXT:    vmovdqu 64(%rdi,%rcx,4), %ymm7
306; AVX2-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
307; AVX2-NEXT:    vmovdqu 96(%rdi,%rcx,4), %ymm7
308; AVX2-NEXT:    vpsllvd %ymm6, %ymm7, %ymm6
309; AVX2-NEXT:    vmovdqu %ymm3, (%rdi,%rcx,4)
310; AVX2-NEXT:    vmovdqu %ymm4, 32(%rdi,%rcx,4)
311; AVX2-NEXT:    vmovdqu %ymm5, 64(%rdi,%rcx,4)
312; AVX2-NEXT:    vmovdqu %ymm6, 96(%rdi,%rcx,4)
313; AVX2-NEXT:    addq $32, %rcx
314; AVX2-NEXT:    cmpq %rcx, %rdx
315; AVX2-NEXT:    jne .LBB0_4
316; AVX2-NEXT:  # %bb.5: # %middle.block
317; AVX2-NEXT:    cmpl %r9d, %edx
318; AVX2-NEXT:    jne .LBB0_6
319; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
320; AVX2-NEXT:    vzeroupper
321; AVX2-NEXT:    retq
322; AVX2-NEXT:    .p2align 4
323; AVX2-NEXT:  .LBB0_8: # %for.body
324; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
325; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
326; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
327; AVX2-NEXT:    incq %rdx
328; AVX2-NEXT:    cmpq %rdx, %r9
329; AVX2-NEXT:    je .LBB0_9
330; AVX2-NEXT:  .LBB0_6: # %for.body
331; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
332; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
333; AVX2-NEXT:    movl %eax, %ecx
334; AVX2-NEXT:    je .LBB0_8
335; AVX2-NEXT:  # %bb.7: # %for.body
336; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
337; AVX2-NEXT:    movl %r8d, %ecx
338; AVX2-NEXT:    jmp .LBB0_8
339;
340; XOP-LABEL: vector_variable_shift_left_loop:
341; XOP:       # %bb.0: # %entry
342; XOP-NEXT:    testl %edx, %edx
343; XOP-NEXT:    jle .LBB0_9
344; XOP-NEXT:  # %bb.1: # %for.body.preheader
345; XOP-NEXT:    movl %ecx, %eax
346; XOP-NEXT:    movl %edx, %r9d
347; XOP-NEXT:    cmpl $31, %edx
348; XOP-NEXT:    ja .LBB0_3
349; XOP-NEXT:  # %bb.2:
350; XOP-NEXT:    xorl %edx, %edx
351; XOP-NEXT:    jmp .LBB0_6
352; XOP-NEXT:  .LBB0_3: # %vector.ph
353; XOP-NEXT:    movl %r9d, %edx
354; XOP-NEXT:    andl $-32, %edx
355; XOP-NEXT:    vmovd %eax, %xmm0
356; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
357; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
358; XOP-NEXT:    vmovd %r8d, %xmm1
359; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
360; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
361; XOP-NEXT:    xorl %ecx, %ecx
362; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
363; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
364; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm4
365; XOP-NEXT:    .p2align 4
366; XOP-NEXT:  .LBB0_4: # %vector.body
367; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
368; XOP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
369; XOP-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
370; XOP-NEXT:    vmovq {{.*#+}} xmm7 = mem[0],zero
371; XOP-NEXT:    vmovq {{.*#+}} xmm8 = mem[0],zero
372; XOP-NEXT:    vpcomeqb %xmm2, %xmm5, %xmm5
373; XOP-NEXT:    vpmovsxbd %xmm5, %xmm9
374; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
375; XOP-NEXT:    vpmovsxbd %xmm5, %xmm5
376; XOP-NEXT:    vpcomeqb %xmm2, %xmm6, %xmm6
377; XOP-NEXT:    vpmovsxbd %xmm6, %xmm10
378; XOP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
379; XOP-NEXT:    vpmovsxbd %xmm6, %xmm6
380; XOP-NEXT:    vpcomeqb %xmm2, %xmm7, %xmm7
381; XOP-NEXT:    vpmovsxbd %xmm7, %xmm11
382; XOP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
383; XOP-NEXT:    vpmovsxbd %xmm7, %xmm7
384; XOP-NEXT:    vpcomeqb %xmm2, %xmm8, %xmm8
385; XOP-NEXT:    vpmovsxbd %xmm8, %xmm12
386; XOP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1]
387; XOP-NEXT:    vpmovsxbd %xmm8, %xmm8
388; XOP-NEXT:    vblendvps %xmm5, %xmm3, %xmm4, %xmm5
389; XOP-NEXT:    vpshld %xmm5, 16(%rdi,%rcx,4), %xmm5
390; XOP-NEXT:    vblendvps %xmm9, %xmm0, %xmm1, %xmm9
391; XOP-NEXT:    vpshld %xmm9, (%rdi,%rcx,4), %xmm9
392; XOP-NEXT:    vblendvps %xmm6, %xmm3, %xmm4, %xmm6
393; XOP-NEXT:    vpshld %xmm6, 48(%rdi,%rcx,4), %xmm6
394; XOP-NEXT:    vblendvps %xmm10, %xmm0, %xmm1, %xmm10
395; XOP-NEXT:    vpshld %xmm10, 32(%rdi,%rcx,4), %xmm10
396; XOP-NEXT:    vblendvps %xmm7, %xmm3, %xmm4, %xmm7
397; XOP-NEXT:    vpshld %xmm7, 80(%rdi,%rcx,4), %xmm7
398; XOP-NEXT:    vblendvps %xmm11, %xmm0, %xmm1, %xmm11
399; XOP-NEXT:    vpshld %xmm11, 64(%rdi,%rcx,4), %xmm11
400; XOP-NEXT:    vblendvps %xmm8, %xmm3, %xmm4, %xmm8
401; XOP-NEXT:    vpshld %xmm8, 112(%rdi,%rcx,4), %xmm8
402; XOP-NEXT:    vblendvps %xmm12, %xmm0, %xmm1, %xmm12
403; XOP-NEXT:    vpshld %xmm12, 96(%rdi,%rcx,4), %xmm12
404; XOP-NEXT:    vmovdqu %xmm9, (%rdi,%rcx,4)
405; XOP-NEXT:    vmovdqu %xmm5, 16(%rdi,%rcx,4)
406; XOP-NEXT:    vmovdqu %xmm10, 32(%rdi,%rcx,4)
407; XOP-NEXT:    vmovdqu %xmm6, 48(%rdi,%rcx,4)
408; XOP-NEXT:    vmovdqu %xmm11, 64(%rdi,%rcx,4)
409; XOP-NEXT:    vmovdqu %xmm7, 80(%rdi,%rcx,4)
410; XOP-NEXT:    vmovdqu %xmm12, 96(%rdi,%rcx,4)
411; XOP-NEXT:    vmovdqu %xmm8, 112(%rdi,%rcx,4)
412; XOP-NEXT:    addq $32, %rcx
413; XOP-NEXT:    cmpq %rcx, %rdx
414; XOP-NEXT:    jne .LBB0_4
415; XOP-NEXT:  # %bb.5: # %middle.block
416; XOP-NEXT:    cmpl %r9d, %edx
417; XOP-NEXT:    jne .LBB0_6
418; XOP-NEXT:  .LBB0_9: # %for.cond.cleanup
419; XOP-NEXT:    vzeroupper
420; XOP-NEXT:    retq
421; XOP-NEXT:    .p2align 4
422; XOP-NEXT:  .LBB0_8: # %for.body
423; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
424; XOP-NEXT:    # kill: def $cl killed $cl killed $ecx
425; XOP-NEXT:    shll %cl, (%rdi,%rdx,4)
426; XOP-NEXT:    incq %rdx
427; XOP-NEXT:    cmpq %rdx, %r9
428; XOP-NEXT:    je .LBB0_9
429; XOP-NEXT:  .LBB0_6: # %for.body
430; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
431; XOP-NEXT:    cmpb $0, (%rsi,%rdx)
432; XOP-NEXT:    movl %eax, %ecx
433; XOP-NEXT:    je .LBB0_8
434; XOP-NEXT:  # %bb.7: # %for.body
435; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
436; XOP-NEXT:    movl %r8d, %ecx
437; XOP-NEXT:    jmp .LBB0_8
; Nothing to do when %count is not positive.
438entry:
439  %cmp12 = icmp sgt i32 %count, 0
440  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
441
; Counts below 32 bypass the vector loop and go straight to the scalar loop.
442for.body.preheader:
443  %wide.trip.count = zext i32 %count to i64
444  %min.iters.check = icmp ult i32 %count, 32
445  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph
446
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec when
; arriving from %middle.block.
447for.body.preheader40:
448  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
449  br label %for.body
450
; %n.vec is the trip count rounded down to a multiple of 32 (mask 0xFFFFFFE0).
; Each of the four unrolled parts gets its own splat of %amt0 and of %amt1.
451vector.ph:
452  %n.vec = and i64 %wide.trip.count, 4294967264
453  %broadcast.splatinsert20 = insertelement <8 x i32> poison, i32 %amt0, i32 0
454  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> poison, <8 x i32> zeroinitializer
455  %broadcast.splatinsert22 = insertelement <8 x i32> poison, i32 %amt1, i32 0
456  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> poison, <8 x i32> zeroinitializer
457  %broadcast.splatinsert24 = insertelement <8 x i32> poison, i32 %amt0, i32 0
458  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> poison, <8 x i32> zeroinitializer
459  %broadcast.splatinsert26 = insertelement <8 x i32> poison, i32 %amt1, i32 0
460  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> poison, <8 x i32> zeroinitializer
461  %broadcast.splatinsert28 = insertelement <8 x i32> poison, i32 %amt0, i32 0
462  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> poison, <8 x i32> zeroinitializer
463  %broadcast.splatinsert30 = insertelement <8 x i32> poison, i32 %amt1, i32 0
464  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> poison, <8 x i32> zeroinitializer
465  %broadcast.splatinsert32 = insertelement <8 x i32> poison, i32 %amt0, i32 0
466  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> poison, <8 x i32> zeroinitializer
467  %broadcast.splatinsert34 = insertelement <8 x i32> poison, i32 %amt1, i32 0
468  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> poison, <8 x i32> zeroinitializer
469  br label %vector.body
470
; Per iteration: load 4 x 8 control bytes, turn each "byte == 0" mask into a
; select between the %amt0 and %amt1 splats, then shift the matching 4 x 8 i32
; elements of %arr left by the selected amounts and store them back in place.
471vector.body:
472  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
473  %0 = getelementptr inbounds i8, ptr %control, i64 %index
474  %wide.load = load <8 x i8>, ptr %0, align 1
475  %1 = getelementptr inbounds i8, ptr %0, i64 8
476  %wide.load17 = load <8 x i8>, ptr %1, align 1
477  %2 = getelementptr inbounds i8, ptr %0, i64 16
478  %wide.load18 = load <8 x i8>, ptr %2, align 1
479  %3 = getelementptr inbounds i8, ptr %0, i64 24
480  %wide.load19 = load <8 x i8>, ptr %3, align 1
481  %4 = icmp eq <8 x i8> %wide.load, zeroinitializer
482  %5 = icmp eq <8 x i8> %wide.load17, zeroinitializer
483  %6 = icmp eq <8 x i8> %wide.load18, zeroinitializer
484  %7 = icmp eq <8 x i8> %wide.load19, zeroinitializer
485  %8 = select <8 x i1> %4, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
486  %9 = select <8 x i1> %5, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
487  %10 = select <8 x i1> %6, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
488  %11 = select <8 x i1> %7, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
489  %12 = getelementptr inbounds i32, ptr %arr, i64 %index
490  %wide.load36 = load <8 x i32>, ptr %12, align 4
491  %13 = getelementptr inbounds i32, ptr %12, i64 8
492  %wide.load37 = load <8 x i32>, ptr %13, align 4
493  %14 = getelementptr inbounds i32, ptr %12, i64 16
494  %wide.load38 = load <8 x i32>, ptr %14, align 4
495  %15 = getelementptr inbounds i32, ptr %12, i64 24
496  %wide.load39 = load <8 x i32>, ptr %15, align 4
497  %16 = shl <8 x i32> %wide.load36, %8
498  %17 = shl <8 x i32> %wide.load37, %9
499  %18 = shl <8 x i32> %wide.load38, %10
500  %19 = shl <8 x i32> %wide.load39, %11
501  store <8 x i32> %16, ptr %12, align 4
502  store <8 x i32> %17, ptr %13, align 4
503  store <8 x i32> %18, ptr %14, align 4
504  store <8 x i32> %19, ptr %15, align 4
505  %index.next = add i64 %index, 32
506  %20 = icmp eq i64 %index.next, %n.vec
507  br i1 %20, label %middle.block, label %vector.body
508
; Enter the scalar loop only if elements remain after the vector loop.
509middle.block:
510  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
511  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40
512
513for.cond.cleanup:
514  ret void
515
; Scalar form of the same select-then-shift, one element per iteration.
516for.body:
517  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
518  %arrayidx = getelementptr inbounds i8, ptr %control, i64 %indvars.iv
519  %21 = load i8, ptr %arrayidx, align 1
520  %tobool = icmp eq i8 %21, 0
521  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
522  %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv
523  %22 = load i32, ptr %arrayidx2, align 4
524  %shl = shl i32 %22, %cond
525  store i32 %shl, ptr %arrayidx2, align 4
526  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
527  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
528  br i1 %exitcond, label %for.cond.cleanup, label %for.body
529}
530
; vector_variable_shift_left_loop_simpler: reduced variant of the same pattern.
; Each iteration loads 4 control bytes, selects per lane between splats of
; %amt0 and %amt1, shifts a splat of %x left by the selected amounts and stores
; the result to %arr. There is no unrolling and no scalar remainder loop.
; Expected lowering: SSE4.1 and AVX1 compute both (x << amt0) and (x << amt1)
; once in %vector.ph, leaving only the compare, blend and store in the loop;
; AVX2 and XOP blend the amounts and do one per-element shift (vpsllvd /
; vpshld) inside the loop.
531define void @vector_variable_shift_left_loop_simpler(ptr nocapture %arr, ptr nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
532; SSE-LABEL: vector_variable_shift_left_loop_simpler:
533; SSE:       # %bb.0: # %entry
534; SSE-NEXT:    testl %edx, %edx
535; SSE-NEXT:    jle .LBB1_3
536; SSE-NEXT:  # %bb.1: # %vector.ph
537; SSE-NEXT:    movl %edx, %eax
538; SSE-NEXT:    andl $-4, %eax
539; SSE-NEXT:    movd %ecx, %xmm0
540; SSE-NEXT:    movd %r8d, %xmm3
541; SSE-NEXT:    movd %r9d, %xmm1
542; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
543; SSE-NEXT:    xorl %ecx, %ecx
544; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
545; SSE-NEXT:    movdqa %xmm1, %xmm2
546; SSE-NEXT:    pslld %xmm0, %xmm2
547; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
548; SSE-NEXT:    pslld %xmm0, %xmm1
549; SSE-NEXT:    pxor %xmm3, %xmm3
550; SSE-NEXT:    .p2align 4
551; SSE-NEXT:  .LBB1_2: # %vector.body
552; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
553; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
554; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
555; SSE-NEXT:    movdqa %xmm1, %xmm4
556; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm4
557; SSE-NEXT:    movups %xmm4, (%rdi,%rcx,4)
558; SSE-NEXT:    addq $4, %rcx
559; SSE-NEXT:    cmpq %rcx, %rax
560; SSE-NEXT:    jne .LBB1_2
561; SSE-NEXT:  .LBB1_3: # %exit
562; SSE-NEXT:    retq
563;
564; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
565; AVX1:       # %bb.0: # %entry
566; AVX1-NEXT:    testl %edx, %edx
567; AVX1-NEXT:    jle .LBB1_3
568; AVX1-NEXT:  # %bb.1: # %vector.ph
569; AVX1-NEXT:    movl %edx, %eax
570; AVX1-NEXT:    andl $-4, %eax
571; AVX1-NEXT:    vmovd %ecx, %xmm0
572; AVX1-NEXT:    vmovd %r8d, %xmm1
573; AVX1-NEXT:    vmovd %r9d, %xmm2
574; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
575; AVX1-NEXT:    xorl %ecx, %ecx
576; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
577; AVX1-NEXT:    vpslld %xmm0, %xmm2, %xmm0
578; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
579; AVX1-NEXT:    vpslld %xmm1, %xmm2, %xmm1
580; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
581; AVX1-NEXT:    .p2align 4
582; AVX1-NEXT:  .LBB1_2: # %vector.body
583; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
584; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
585; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm3
586; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm3
587; AVX1-NEXT:    vmovups %xmm3, (%rdi,%rcx,4)
588; AVX1-NEXT:    addq $4, %rcx
589; AVX1-NEXT:    cmpq %rcx, %rax
590; AVX1-NEXT:    jne .LBB1_2
591; AVX1-NEXT:  .LBB1_3: # %exit
592; AVX1-NEXT:    retq
593;
594; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
595; AVX2:       # %bb.0: # %entry
596; AVX2-NEXT:    testl %edx, %edx
597; AVX2-NEXT:    jle .LBB1_3
598; AVX2-NEXT:  # %bb.1: # %vector.ph
599; AVX2-NEXT:    movl %edx, %eax
600; AVX2-NEXT:    andl $-4, %eax
601; AVX2-NEXT:    vmovd %ecx, %xmm0
602; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
603; AVX2-NEXT:    vmovd %r8d, %xmm1
604; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
605; AVX2-NEXT:    vmovd %r9d, %xmm2
606; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
607; AVX2-NEXT:    xorl %ecx, %ecx
608; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
609; AVX2-NEXT:    .p2align 4
610; AVX2-NEXT:  .LBB1_2: # %vector.body
611; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
612; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
613; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
614; AVX2-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
615; AVX2-NEXT:    vpsllvd %xmm4, %xmm2, %xmm4
616; AVX2-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
617; AVX2-NEXT:    addq $4, %rcx
618; AVX2-NEXT:    cmpq %rcx, %rax
619; AVX2-NEXT:    jne .LBB1_2
620; AVX2-NEXT:  .LBB1_3: # %exit
621; AVX2-NEXT:    retq
622;
623; XOP-LABEL: vector_variable_shift_left_loop_simpler:
624; XOP:       # %bb.0: # %entry
625; XOP-NEXT:    testl %edx, %edx
626; XOP-NEXT:    jle .LBB1_3
627; XOP-NEXT:  # %bb.1: # %vector.ph
628; XOP-NEXT:    movl %edx, %eax
629; XOP-NEXT:    andl $-4, %eax
630; XOP-NEXT:    vmovd %ecx, %xmm0
631; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
632; XOP-NEXT:    vmovd %r8d, %xmm1
633; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
634; XOP-NEXT:    vmovd %r9d, %xmm2
635; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
636; XOP-NEXT:    xorl %ecx, %ecx
637; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
638; XOP-NEXT:    .p2align 4
639; XOP-NEXT:  .LBB1_2: # %vector.body
640; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
641; XOP-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
642; XOP-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
643; XOP-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
644; XOP-NEXT:    vpshld %xmm4, %xmm2, %xmm4
645; XOP-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
646; XOP-NEXT:    addq $4, %rcx
647; XOP-NEXT:    cmpq %rcx, %rax
648; XOP-NEXT:    jne .LBB1_2
649; XOP-NEXT:  .LBB1_3: # %exit
650; XOP-NEXT:    retq
; Nothing to do when %count is not positive.
651entry:
652  %cmp16 = icmp sgt i32 %count, 0
653  %wide.trip.count = zext i32 %count to i64
654  br i1 %cmp16, label %vector.ph, label %exit
655
; %n.vec is the trip count rounded down to a multiple of 4 (mask 0xFFFFFFFC).
; All three splats (%amt0, %amt1, %x) are loop-invariant.
656vector.ph:
657  %n.vec = and i64 %wide.trip.count, 4294967292
658  %splatinsert18 = insertelement <4 x i32> poison, i32 %amt0, i32 0
659  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> poison, <4 x i32> zeroinitializer
660  %splatinsert20 = insertelement <4 x i32> poison, i32 %amt1, i32 0
661  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> poison, <4 x i32> zeroinitializer
662  %splatinsert22 = insertelement <4 x i32> poison, i32 %x, i32 0
663  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> poison, <4 x i32> zeroinitializer
664  br label %vector.body
665
; The shifted value is the invariant %x splat; only the per-lane shift amount
; depends on the loaded control bytes. The loop leaves for %exit once
; %index.next equals %n.vec.
666vector.body:
667  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
668  %0 = getelementptr inbounds i8, ptr %control, i64 %index
669  %wide.load = load <4 x i8>, ptr %0, align 1
670  %1 = icmp eq <4 x i8> %wide.load, zeroinitializer
671  %2 = select <4 x i1> %1, <4 x i32> %splat1, <4 x i32> %splat2
672  %3 = shl <4 x i32> %splat3, %2
673  %4 = getelementptr inbounds i32, ptr %arr, i64 %index
674  store <4 x i32> %3, ptr %4, align 4
675  %index.next = add i64 %index, 4
676  %5 = icmp eq i64 %index.next, %n.vec
677  br i1 %5, label %exit, label %vector.body
678
679exit:
680  ret void
681}
682