1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
7
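; Each function below loads or receives two unsigned vectors, widens them, computes
; (a + b + 1) >> 1 elementwise, and truncates back, i.e. a rounding average. The CHECK
; lines verify that this pattern is recognized and lowered to PAVGB/PAVGW at the various
; vector widths and subtarget feature levels named in the RUN lines.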
8define void @avg_v4i8(ptr %a, ptr %b) nounwind {
9; SSE2-LABEL: avg_v4i8:
10; SSE2:       # %bb.0:
11; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
12; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
13; SSE2-NEXT:    pavgb %xmm0, %xmm1
14; SSE2-NEXT:    movd %xmm1, (%rax)
15; SSE2-NEXT:    retq
16;
17; AVX-LABEL: avg_v4i8:
18; AVX:       # %bb.0:
19; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
20; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
21; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
22; AVX-NEXT:    vmovd %xmm0, (%rax)
23; AVX-NEXT:    retq
24  %1 = load <4 x i8>, ptr %a
25  %2 = load <4 x i8>, ptr %b
26  %3 = zext <4 x i8> %1 to <4 x i32>
27  %4 = zext <4 x i8> %2 to <4 x i32>
28  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
29  %6 = add nuw nsw <4 x i32> %5, %4
30  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
31  %8 = trunc <4 x i32> %7 to <4 x i8>
32  store <4 x i8> %8, ptr undef, align 4
33  ret void
34}
35
36define void @avg_v8i8(ptr %a, ptr %b) nounwind {
37; SSE2-LABEL: avg_v8i8:
38; SSE2:       # %bb.0:
39; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
40; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
41; SSE2-NEXT:    pavgb %xmm0, %xmm1
42; SSE2-NEXT:    movq %xmm1, (%rax)
43; SSE2-NEXT:    retq
44;
45; AVX-LABEL: avg_v8i8:
46; AVX:       # %bb.0:
47; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
48; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
49; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
50; AVX-NEXT:    vmovq %xmm0, (%rax)
51; AVX-NEXT:    retq
52  %1 = load <8 x i8>, ptr %a
53  %2 = load <8 x i8>, ptr %b
54  %3 = zext <8 x i8> %1 to <8 x i32>
55  %4 = zext <8 x i8> %2 to <8 x i32>
56  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
57  %6 = add nuw nsw <8 x i32> %5, %4
58  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
59  %8 = trunc <8 x i32> %7 to <8 x i8>
60  store <8 x i8> %8, ptr undef, align 4
61  ret void
62}
63
64define void @avg_v16i8(ptr %a, ptr %b) nounwind {
65; SSE2-LABEL: avg_v16i8:
66; SSE2:       # %bb.0:
67; SSE2-NEXT:    movdqa (%rdi), %xmm0
68; SSE2-NEXT:    pavgb (%rsi), %xmm0
69; SSE2-NEXT:    movdqu %xmm0, (%rax)
70; SSE2-NEXT:    retq
71;
72; AVX-LABEL: avg_v16i8:
73; AVX:       # %bb.0:
74; AVX-NEXT:    vmovdqa (%rdi), %xmm0
75; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
76; AVX-NEXT:    vmovdqu %xmm0, (%rax)
77; AVX-NEXT:    retq
78  %1 = load <16 x i8>, ptr %a
79  %2 = load <16 x i8>, ptr %b
80  %3 = zext <16 x i8> %1 to <16 x i32>
81  %4 = zext <16 x i8> %2 to <16 x i32>
82  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
83  %6 = add nuw nsw <16 x i32> %5, %4
84  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
85  %8 = trunc <16 x i32> %7 to <16 x i8>
86  store <16 x i8> %8, ptr undef, align 4
87  ret void
88}
89
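; avg_v24i8 (and avg_v48i8 below) use non-power-of-two element counts, so the operation
; is widened and split into legal pieces; the CHECK lines accordingly show a mix of
; full-register stores and partial stores (movq / subvector extracts).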
90define void @avg_v24i8(ptr %a, ptr %b) nounwind {
91; SSE2-LABEL: avg_v24i8:
92; SSE2:       # %bb.0:
93; SSE2-NEXT:    movdqa (%rdi), %xmm0
94; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
95; SSE2-NEXT:    pavgb (%rsi), %xmm0
96; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
97; SSE2-NEXT:    movq %xmm1, (%rax)
98; SSE2-NEXT:    movdqu %xmm0, (%rax)
99; SSE2-NEXT:    retq
100;
101; AVX1-LABEL: avg_v24i8:
102; AVX1:       # %bb.0:
103; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
104; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
105; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
106; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
107; AVX1-NEXT:    vmovq %xmm1, (%rax)
108; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
109; AVX1-NEXT:    retq
110;
111; AVX2-LABEL: avg_v24i8:
112; AVX2:       # %bb.0:
113; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
114; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
115; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
116; AVX2-NEXT:    vmovq %xmm1, (%rax)
117; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
118; AVX2-NEXT:    vzeroupper
119; AVX2-NEXT:    retq
120;
121; AVX512-LABEL: avg_v24i8:
122; AVX512:       # %bb.0:
123; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
124; AVX512-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
125; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
126; AVX512-NEXT:    vmovq %xmm1, (%rax)
127; AVX512-NEXT:    vmovdqu %xmm0, (%rax)
128; AVX512-NEXT:    vzeroupper
129; AVX512-NEXT:    retq
130  %1 = load <24 x i8>, ptr %a
131  %2 = load <24 x i8>, ptr %b
132  %3 = zext <24 x i8> %1 to <24 x i32>
133  %4 = zext <24 x i8> %2 to <24 x i32>
134  %5 = add nuw nsw <24 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
135  %6 = add nuw nsw <24 x i32> %5, %4
136  %7 = lshr <24 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
137  %8 = trunc <24 x i32> %7 to <24 x i8>
138  store <24 x i8> %8, ptr undef, align 4
139  ret void
140}
141
142define void @avg_v32i8(ptr %a, ptr %b) nounwind {
143; SSE2-LABEL: avg_v32i8:
144; SSE2:       # %bb.0:
145; SSE2-NEXT:    movdqa (%rdi), %xmm0
146; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
147; SSE2-NEXT:    pavgb (%rsi), %xmm0
148; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
149; SSE2-NEXT:    movdqu %xmm1, (%rax)
150; SSE2-NEXT:    movdqu %xmm0, (%rax)
151; SSE2-NEXT:    retq
152;
153; AVX1-LABEL: avg_v32i8:
154; AVX1:       # %bb.0:
155; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
156; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
157; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
158; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
159; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
160; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
161; AVX1-NEXT:    retq
162;
163; AVX2-LABEL: avg_v32i8:
164; AVX2:       # %bb.0:
165; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
166; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
167; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
168; AVX2-NEXT:    vzeroupper
169; AVX2-NEXT:    retq
170;
171; AVX512-LABEL: avg_v32i8:
172; AVX512:       # %bb.0:
173; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
174; AVX512-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
175; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
176; AVX512-NEXT:    vzeroupper
177; AVX512-NEXT:    retq
178  %1 = load <32 x i8>, ptr %a
179  %2 = load <32 x i8>, ptr %b
180  %3 = zext <32 x i8> %1 to <32 x i32>
181  %4 = zext <32 x i8> %2 to <32 x i32>
182  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
183  %6 = add nuw nsw <32 x i32> %5, %4
184  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
185  %8 = trunc <32 x i32> %7 to <32 x i8>
186  store <32 x i8> %8, ptr undef, align 4
187  ret void
188}
189
190define void @avg_v48i8(ptr %a, ptr %b) nounwind {
191; SSE2-LABEL: avg_v48i8:
192; SSE2:       # %bb.0:
193; SSE2-NEXT:    movdqa (%rdi), %xmm0
194; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
195; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
196; SSE2-NEXT:    pavgb (%rsi), %xmm0
197; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
198; SSE2-NEXT:    pavgb 32(%rsi), %xmm2
199; SSE2-NEXT:    movdqu %xmm2, (%rax)
200; SSE2-NEXT:    movdqu %xmm1, (%rax)
201; SSE2-NEXT:    movdqu %xmm0, (%rax)
202; SSE2-NEXT:    retq
203;
204; AVX1-LABEL: avg_v48i8:
205; AVX1:       # %bb.0:
206; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
207; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
208; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
209; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
210; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
211; AVX1-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm2
212; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
213; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
214; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
215; AVX1-NEXT:    retq
216;
217; AVX2-LABEL: avg_v48i8:
218; AVX2:       # %bb.0:
219; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
220; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
221; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm1
222; AVX2-NEXT:    vpavgb 32(%rsi), %xmm1, %xmm1
223; AVX2-NEXT:    vmovdqu %xmm1, (%rax)
224; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
225; AVX2-NEXT:    vzeroupper
226; AVX2-NEXT:    retq
227;
228; AVX512F-LABEL: avg_v48i8:
229; AVX512F:       # %bb.0:
230; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
231; AVX512F-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
232; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm1
233; AVX512F-NEXT:    vpavgb 32(%rsi), %xmm1, %xmm1
234; AVX512F-NEXT:    vmovdqu %xmm1, (%rax)
235; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
236; AVX512F-NEXT:    vzeroupper
237; AVX512F-NEXT:    retq
238;
239; AVX512BW-LABEL: avg_v48i8:
240; AVX512BW:       # %bb.0:
241; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
242; AVX512BW-NEXT:    vpavgb (%rsi), %zmm0, %zmm0
243; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, (%rax)
244; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
245; AVX512BW-NEXT:    vzeroupper
246; AVX512BW-NEXT:    retq
247  %1 = load <48 x i8>, ptr %a
248  %2 = load <48 x i8>, ptr %b
249  %3 = zext <48 x i8> %1 to <48 x i32>
250  %4 = zext <48 x i8> %2 to <48 x i32>
251  %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
252  %6 = add nuw nsw <48 x i32> %5, %4
253  %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
254  %8 = trunc <48 x i32> %7 to <48 x i8>
255  store <48 x i8> %8, ptr undef, align 4
256  ret void
257}
258
259define void @avg_v64i8(ptr %a, ptr %b) nounwind {
260; SSE2-LABEL: avg_v64i8:
261; SSE2:       # %bb.0:
262; SSE2-NEXT:    movdqa (%rdi), %xmm0
263; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
264; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
265; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
266; SSE2-NEXT:    pavgb (%rsi), %xmm0
267; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
268; SSE2-NEXT:    pavgb 32(%rsi), %xmm2
269; SSE2-NEXT:    pavgb 48(%rsi), %xmm3
270; SSE2-NEXT:    movdqu %xmm3, (%rax)
271; SSE2-NEXT:    movdqu %xmm2, (%rax)
272; SSE2-NEXT:    movdqu %xmm1, (%rax)
273; SSE2-NEXT:    movdqu %xmm0, (%rax)
274; SSE2-NEXT:    retq
275;
276; AVX1-LABEL: avg_v64i8:
277; AVX1:       # %bb.0:
278; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
279; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
280; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
281; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
282; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
283; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
284; AVX1-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm2
285; AVX1-NEXT:    vpavgb 48(%rsi), %xmm3, %xmm3
286; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
287; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
288; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
289; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
290; AVX1-NEXT:    retq
291;
292; AVX2-LABEL: avg_v64i8:
293; AVX2:       # %bb.0:
294; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
295; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
296; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
297; AVX2-NEXT:    vpavgb 32(%rsi), %ymm1, %ymm1
298; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
299; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
300; AVX2-NEXT:    vzeroupper
301; AVX2-NEXT:    retq
302;
303; AVX512F-LABEL: avg_v64i8:
304; AVX512F:       # %bb.0:
305; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
306; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
307; AVX512F-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
308; AVX512F-NEXT:    vpavgb 32(%rsi), %ymm1, %ymm1
309; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
310; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
311; AVX512F-NEXT:    vzeroupper
312; AVX512F-NEXT:    retq
313;
314; AVX512BW-LABEL: avg_v64i8:
315; AVX512BW:       # %bb.0:
316; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
317; AVX512BW-NEXT:    vpavgb (%rsi), %zmm0, %zmm0
318; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
319; AVX512BW-NEXT:    vzeroupper
320; AVX512BW-NEXT:    retq
321  %1 = load <64 x i8>, ptr %a
322  %2 = load <64 x i8>, ptr %b
323  %3 = zext <64 x i8> %1 to <64 x i32>
324  %4 = zext <64 x i8> %2 to <64 x i32>
325  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
326  %6 = add nuw nsw <64 x i32> %5, %4
327  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
328  %8 = trunc <64 x i32> %7 to <64 x i8>
329  store <64 x i8> %8, ptr undef, align 4
330  ret void
331}
332
333define void @avg_v4i16(ptr %a, ptr %b) nounwind {
334; SSE2-LABEL: avg_v4i16:
335; SSE2:       # %bb.0:
336; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
337; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
338; SSE2-NEXT:    pavgw %xmm0, %xmm1
339; SSE2-NEXT:    movq %xmm1, (%rax)
340; SSE2-NEXT:    retq
341;
342; AVX-LABEL: avg_v4i16:
343; AVX:       # %bb.0:
344; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
345; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
346; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
347; AVX-NEXT:    vmovq %xmm0, (%rax)
348; AVX-NEXT:    retq
349  %1 = load <4 x i16>, ptr %a
350  %2 = load <4 x i16>, ptr %b
351  %3 = zext <4 x i16> %1 to <4 x i32>
352  %4 = zext <4 x i16> %2 to <4 x i32>
353  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
354  %6 = add nuw nsw <4 x i32> %5, %4
355  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
356  %8 = trunc <4 x i32> %7 to <4 x i16>
357  store <4 x i16> %8, ptr undef, align 4
358  ret void
359}
360
361define void @avg_v8i16(ptr %a, ptr %b) nounwind {
362; SSE2-LABEL: avg_v8i16:
363; SSE2:       # %bb.0:
364; SSE2-NEXT:    movdqa (%rdi), %xmm0
365; SSE2-NEXT:    pavgw (%rsi), %xmm0
366; SSE2-NEXT:    movdqu %xmm0, (%rax)
367; SSE2-NEXT:    retq
368;
369; AVX-LABEL: avg_v8i16:
370; AVX:       # %bb.0:
371; AVX-NEXT:    vmovdqa (%rdi), %xmm0
372; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
373; AVX-NEXT:    vmovdqu %xmm0, (%rax)
374; AVX-NEXT:    retq
375  %1 = load <8 x i16>, ptr %a
376  %2 = load <8 x i16>, ptr %b
377  %3 = zext <8 x i16> %1 to <8 x i32>
378  %4 = zext <8 x i16> %2 to <8 x i32>
379  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
380  %6 = add nuw nsw <8 x i32> %5, %4
381  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
382  %8 = trunc <8 x i32> %7 to <8 x i16>
383  store <8 x i16> %8, ptr undef, align 4
384  ret void
385}
386
387define void @avg_v16i16(ptr %a, ptr %b) nounwind {
388; SSE2-LABEL: avg_v16i16:
389; SSE2:       # %bb.0:
390; SSE2-NEXT:    movdqa (%rdi), %xmm0
391; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
392; SSE2-NEXT:    pavgw (%rsi), %xmm0
393; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
394; SSE2-NEXT:    movdqu %xmm1, (%rax)
395; SSE2-NEXT:    movdqu %xmm0, (%rax)
396; SSE2-NEXT:    retq
397;
398; AVX1-LABEL: avg_v16i16:
399; AVX1:       # %bb.0:
400; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
401; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
402; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
403; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
404; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
405; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
406; AVX1-NEXT:    retq
407;
408; AVX2-LABEL: avg_v16i16:
409; AVX2:       # %bb.0:
410; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
411; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
412; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
413; AVX2-NEXT:    vzeroupper
414; AVX2-NEXT:    retq
415;
416; AVX512-LABEL: avg_v16i16:
417; AVX512:       # %bb.0:
418; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
419; AVX512-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
420; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
421; AVX512-NEXT:    vzeroupper
422; AVX512-NEXT:    retq
423  %1 = load <16 x i16>, ptr %a
424  %2 = load <16 x i16>, ptr %b
425  %3 = zext <16 x i16> %1 to <16 x i32>
426  %4 = zext <16 x i16> %2 to <16 x i32>
427  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
428  %6 = add nuw nsw <16 x i32> %5, %4
429  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
430  %8 = trunc <16 x i32> %7 to <16 x i16>
431  store <16 x i16> %8, ptr undef, align 4
432  ret void
433}
434
435define void @avg_v32i16(ptr %a, ptr %b) nounwind {
436; SSE2-LABEL: avg_v32i16:
437; SSE2:       # %bb.0:
438; SSE2-NEXT:    movdqa (%rdi), %xmm0
439; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
440; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
441; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
442; SSE2-NEXT:    pavgw (%rsi), %xmm0
443; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
444; SSE2-NEXT:    pavgw 32(%rsi), %xmm2
445; SSE2-NEXT:    pavgw 48(%rsi), %xmm3
446; SSE2-NEXT:    movdqu %xmm3, (%rax)
447; SSE2-NEXT:    movdqu %xmm2, (%rax)
448; SSE2-NEXT:    movdqu %xmm1, (%rax)
449; SSE2-NEXT:    movdqu %xmm0, (%rax)
450; SSE2-NEXT:    retq
451;
452; AVX1-LABEL: avg_v32i16:
453; AVX1:       # %bb.0:
454; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
455; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
456; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
457; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
458; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
459; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
460; AVX1-NEXT:    vpavgw 32(%rsi), %xmm2, %xmm2
461; AVX1-NEXT:    vpavgw 48(%rsi), %xmm3, %xmm3
462; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
463; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
464; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
465; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
466; AVX1-NEXT:    retq
467;
468; AVX2-LABEL: avg_v32i16:
469; AVX2:       # %bb.0:
470; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
471; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
472; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
473; AVX2-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
474; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
475; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
476; AVX2-NEXT:    vzeroupper
477; AVX2-NEXT:    retq
478;
479; AVX512F-LABEL: avg_v32i16:
480; AVX512F:       # %bb.0:
481; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
482; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
483; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
484; AVX512F-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
485; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
486; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
487; AVX512F-NEXT:    vzeroupper
488; AVX512F-NEXT:    retq
489;
490; AVX512BW-LABEL: avg_v32i16:
491; AVX512BW:       # %bb.0:
492; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
493; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
494; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
495; AVX512BW-NEXT:    vzeroupper
496; AVX512BW-NEXT:    retq
497  %1 = load <32 x i16>, ptr %a
498  %2 = load <32 x i16>, ptr %b
499  %3 = zext <32 x i16> %1 to <32 x i32>
500  %4 = zext <32 x i16> %2 to <32 x i32>
501  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
502  %6 = add nuw nsw <32 x i32> %5, %4
503  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
504  %8 = trunc <32 x i32> %7 to <32 x i16>
505  store <32 x i16> %8, ptr undef, align 4
506  ret void
507}
508
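; avg_v40i16 is the i16 counterpart of the odd-sized byte tests: 40 halfwords are split
; into 512-bit, 256-bit, or 128-bit PAVGW operations depending on the available features.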
509define void @avg_v40i16(ptr %a, ptr %b) nounwind {
510; SSE2-LABEL: avg_v40i16:
511; SSE2:       # %bb.0:
512; SSE2-NEXT:    movdqa 64(%rdi), %xmm0
513; SSE2-NEXT:    movdqa (%rdi), %xmm1
514; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
515; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
516; SSE2-NEXT:    movdqa 48(%rdi), %xmm4
517; SSE2-NEXT:    pavgw (%rsi), %xmm1
518; SSE2-NEXT:    pavgw 16(%rsi), %xmm2
519; SSE2-NEXT:    pavgw 32(%rsi), %xmm3
520; SSE2-NEXT:    pavgw 48(%rsi), %xmm4
521; SSE2-NEXT:    pavgw 64(%rsi), %xmm0
522; SSE2-NEXT:    movdqu %xmm0, (%rax)
523; SSE2-NEXT:    movdqu %xmm4, (%rax)
524; SSE2-NEXT:    movdqu %xmm3, (%rax)
525; SSE2-NEXT:    movdqu %xmm2, (%rax)
526; SSE2-NEXT:    movdqu %xmm1, (%rax)
527; SSE2-NEXT:    retq
528;
529; AVX1-LABEL: avg_v40i16:
530; AVX1:       # %bb.0:
531; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
532; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
533; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
534; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
535; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
536; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
537; AVX1-NEXT:    vpavgw 32(%rsi), %xmm2, %xmm2
538; AVX1-NEXT:    vpavgw 48(%rsi), %xmm3, %xmm3
539; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm4
540; AVX1-NEXT:    vpavgw 64(%rsi), %xmm4, %xmm4
541; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
542; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
543; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
544; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
545; AVX1-NEXT:    vmovdqu %xmm4, (%rax)
546; AVX1-NEXT:    retq
547;
548; AVX2-LABEL: avg_v40i16:
549; AVX2:       # %bb.0:
550; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
551; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
552; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
553; AVX2-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
554; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm2
555; AVX2-NEXT:    vpavgw 64(%rsi), %xmm2, %xmm2
556; AVX2-NEXT:    vmovdqu %xmm2, (%rax)
557; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
558; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
559; AVX2-NEXT:    vzeroupper
560; AVX2-NEXT:    retq
561;
562; AVX512F-LABEL: avg_v40i16:
563; AVX512F:       # %bb.0:
564; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
565; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
566; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
567; AVX512F-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
568; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
569; AVX512F-NEXT:    vpavgw 64(%rsi), %xmm2, %xmm2
570; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
571; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
572; AVX512F-NEXT:    vmovdqu %xmm2, (%rax)
573; AVX512F-NEXT:    vzeroupper
574; AVX512F-NEXT:    retq
575;
576; AVX512BW-LABEL: avg_v40i16:
577; AVX512BW:       # %bb.0:
578; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
579; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
580; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm1
581; AVX512BW-NEXT:    vpavgw 64(%rsi), %xmm1, %xmm1
582; AVX512BW-NEXT:    vmovdqu %xmm1, (%rax)
583; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
584; AVX512BW-NEXT:    vzeroupper
585; AVX512BW-NEXT:    retq
586  %1 = load <40 x i16>, ptr %a
587  %2 = load <40 x i16>, ptr %b
588  %3 = zext <40 x i16> %1 to <40 x i32>
589  %4 = zext <40 x i16> %2 to <40 x i32>
590  %5 = add nuw nsw <40 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
591  %6 = add nuw nsw <40 x i32> %5, %4
592  %7 = lshr <40 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
593  %8 = trunc <40 x i32> %7 to <40 x i16>
594  store <40 x i16> %8, ptr undef, align 4
595  ret void
596}
597
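; The _2 variants commute the pattern: the two zero-extended inputs are added first and
; the +1 rounding term second. The same PAVG lowering is expected.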
598define void @avg_v4i8_2(ptr %a, ptr %b) nounwind {
599; SSE2-LABEL: avg_v4i8_2:
600; SSE2:       # %bb.0:
601; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
602; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
603; SSE2-NEXT:    pavgb %xmm0, %xmm1
604; SSE2-NEXT:    movd %xmm1, (%rax)
605; SSE2-NEXT:    retq
606;
607; AVX-LABEL: avg_v4i8_2:
608; AVX:       # %bb.0:
609; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
610; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
611; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
612; AVX-NEXT:    vmovd %xmm0, (%rax)
613; AVX-NEXT:    retq
614  %1 = load <4 x i8>, ptr %a
615  %2 = load <4 x i8>, ptr %b
616  %3 = zext <4 x i8> %1 to <4 x i32>
617  %4 = zext <4 x i8> %2 to <4 x i32>
618  %5 = add nuw nsw <4 x i32> %3, %4
619  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
620  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
621  %8 = trunc <4 x i32> %7 to <4 x i8>
622  store <4 x i8> %8, ptr undef, align 4
623  ret void
624}
625
626define void @avg_v8i8_2(ptr %a, ptr %b) nounwind {
627; SSE2-LABEL: avg_v8i8_2:
628; SSE2:       # %bb.0:
629; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
630; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
631; SSE2-NEXT:    pavgb %xmm0, %xmm1
632; SSE2-NEXT:    movq %xmm1, (%rax)
633; SSE2-NEXT:    retq
634;
635; AVX-LABEL: avg_v8i8_2:
636; AVX:       # %bb.0:
637; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
638; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
639; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
640; AVX-NEXT:    vmovq %xmm0, (%rax)
641; AVX-NEXT:    retq
642  %1 = load <8 x i8>, ptr %a
643  %2 = load <8 x i8>, ptr %b
644  %3 = zext <8 x i8> %1 to <8 x i32>
645  %4 = zext <8 x i8> %2 to <8 x i32>
646  %5 = add nuw nsw <8 x i32> %3, %4
647  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
648  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
649  %8 = trunc <8 x i32> %7 to <8 x i8>
650  store <8 x i8> %8, ptr undef, align 4
651  ret void
652}
653
654define void @avg_v16i8_2(ptr %a, ptr %b) nounwind {
655; SSE2-LABEL: avg_v16i8_2:
656; SSE2:       # %bb.0:
657; SSE2-NEXT:    movdqa (%rdi), %xmm0
658; SSE2-NEXT:    pavgb (%rsi), %xmm0
659; SSE2-NEXT:    movdqu %xmm0, (%rax)
660; SSE2-NEXT:    retq
661;
662; AVX-LABEL: avg_v16i8_2:
663; AVX:       # %bb.0:
664; AVX-NEXT:    vmovdqa (%rdi), %xmm0
665; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
666; AVX-NEXT:    vmovdqu %xmm0, (%rax)
667; AVX-NEXT:    retq
668  %1 = load <16 x i8>, ptr %a
669  %2 = load <16 x i8>, ptr %b
670  %3 = zext <16 x i8> %1 to <16 x i32>
671  %4 = zext <16 x i8> %2 to <16 x i32>
672  %5 = add nuw nsw <16 x i32> %3, %4
673  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
674  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
675  %8 = trunc <16 x i32> %7 to <16 x i8>
676  store <16 x i8> %8, ptr undef, align 4
677  ret void
678}
679
680define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
681; SSE2-LABEL: avg_v32i8_2:
682; SSE2:       # %bb.0:
683; SSE2-NEXT:    movdqa (%rdi), %xmm0
684; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
685; SSE2-NEXT:    pavgb (%rsi), %xmm0
686; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
687; SSE2-NEXT:    movdqu %xmm1, (%rax)
688; SSE2-NEXT:    movdqu %xmm0, (%rax)
689; SSE2-NEXT:    retq
690;
691; AVX1-LABEL: avg_v32i8_2:
692; AVX1:       # %bb.0:
693; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
694; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
695; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
696; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
697; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
698; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
699; AVX1-NEXT:    retq
700;
701; AVX2-LABEL: avg_v32i8_2:
702; AVX2:       # %bb.0:
703; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
704; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
705; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
706; AVX2-NEXT:    vzeroupper
707; AVX2-NEXT:    retq
708;
709; AVX512-LABEL: avg_v32i8_2:
710; AVX512:       # %bb.0:
711; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
712; AVX512-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
713; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
714; AVX512-NEXT:    vzeroupper
715; AVX512-NEXT:    retq
716  %1 = load <32 x i8>, ptr %a
717  %2 = load <32 x i8>, ptr %b
718  %3 = zext <32 x i8> %1 to <32 x i32>
719  %4 = zext <32 x i8> %2 to <32 x i32>
720  %5 = add nuw nsw <32 x i32> %3, %4
721  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
722  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
723  %8 = trunc <32 x i32> %7 to <32 x i8>
724  store <32 x i8> %8, ptr undef, align 4
725  ret void
726}
727
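; Note that in avg_v64i8_2 the first widened input (%3) is unused: %5 adds %4 to itself,
; so the expression folds to (2*b + 1) >> 1 == b and the expected code is simply a copy
; of the second source, with no PAVGB at all.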
728define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
729; SSE2-LABEL: avg_v64i8_2:
730; SSE2:       # %bb.0:
731; SSE2-NEXT:    movaps (%rsi), %xmm0
732; SSE2-NEXT:    movaps 16(%rsi), %xmm1
733; SSE2-NEXT:    movaps 32(%rsi), %xmm2
734; SSE2-NEXT:    movaps 48(%rsi), %xmm3
735; SSE2-NEXT:    movups %xmm3, (%rax)
736; SSE2-NEXT:    movups %xmm2, (%rax)
737; SSE2-NEXT:    movups %xmm1, (%rax)
738; SSE2-NEXT:    movups %xmm0, (%rax)
739; SSE2-NEXT:    retq
740;
741; AVX1-LABEL: avg_v64i8_2:
742; AVX1:       # %bb.0:
743; AVX1-NEXT:    vmovaps (%rsi), %ymm0
744; AVX1-NEXT:    vmovaps 32(%rsi), %ymm1
745; AVX1-NEXT:    vmovups %ymm1, (%rax)
746; AVX1-NEXT:    vmovups %ymm0, (%rax)
747; AVX1-NEXT:    vzeroupper
748; AVX1-NEXT:    retq
749;
750; AVX2-LABEL: avg_v64i8_2:
751; AVX2:       # %bb.0:
752; AVX2-NEXT:    vmovaps (%rsi), %ymm0
753; AVX2-NEXT:    vmovaps 32(%rsi), %ymm1
754; AVX2-NEXT:    vmovups %ymm1, (%rax)
755; AVX2-NEXT:    vmovups %ymm0, (%rax)
756; AVX2-NEXT:    vzeroupper
757; AVX2-NEXT:    retq
758;
759; AVX512-LABEL: avg_v64i8_2:
760; AVX512:       # %bb.0:
761; AVX512-NEXT:    vmovaps (%rsi), %zmm0
762; AVX512-NEXT:    vmovups %zmm0, (%rax)
763; AVX512-NEXT:    vzeroupper
764; AVX512-NEXT:    retq
765  %1 = load <64 x i8>, ptr %a
766  %2 = load <64 x i8>, ptr %b
767  %3 = zext <64 x i8> %1 to <64 x i32>
768  %4 = zext <64 x i8> %2 to <64 x i32>
769  %5 = add nuw nsw <64 x i32> %4, %4
770  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
771  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
772  %8 = trunc <64 x i32> %7 to <64 x i8>
773  store <64 x i8> %8, ptr undef, align 4
774  ret void
775}
776
777
778define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
779; SSE2-LABEL: avg_v4i16_2:
780; SSE2:       # %bb.0:
781; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
782; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
783; SSE2-NEXT:    pavgw %xmm0, %xmm1
784; SSE2-NEXT:    movq %xmm1, (%rax)
785; SSE2-NEXT:    retq
786;
787; AVX-LABEL: avg_v4i16_2:
788; AVX:       # %bb.0:
789; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
790; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
791; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
792; AVX-NEXT:    vmovq %xmm0, (%rax)
793; AVX-NEXT:    retq
794  %1 = load <4 x i16>, ptr %a
795  %2 = load <4 x i16>, ptr %b
796  %3 = zext <4 x i16> %1 to <4 x i32>
797  %4 = zext <4 x i16> %2 to <4 x i32>
798  %5 = add nuw nsw <4 x i32> %3, %4
799  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
800  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
801  %8 = trunc <4 x i32> %7 to <4 x i16>
802  store <4 x i16> %8, ptr undef, align 4
803  ret void
804}
805
806define void @avg_v8i16_2(ptr %a, ptr %b) nounwind {
807; SSE2-LABEL: avg_v8i16_2:
808; SSE2:       # %bb.0:
809; SSE2-NEXT:    movdqa (%rdi), %xmm0
810; SSE2-NEXT:    pavgw (%rsi), %xmm0
811; SSE2-NEXT:    movdqu %xmm0, (%rax)
812; SSE2-NEXT:    retq
813;
814; AVX-LABEL: avg_v8i16_2:
815; AVX:       # %bb.0:
816; AVX-NEXT:    vmovdqa (%rdi), %xmm0
817; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
818; AVX-NEXT:    vmovdqu %xmm0, (%rax)
819; AVX-NEXT:    retq
820  %1 = load <8 x i16>, ptr %a
821  %2 = load <8 x i16>, ptr %b
822  %3 = zext <8 x i16> %1 to <8 x i32>
823  %4 = zext <8 x i16> %2 to <8 x i32>
824  %5 = add nuw nsw <8 x i32> %3, %4
825  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
826  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
827  %8 = trunc <8 x i32> %7 to <8 x i16>
828  store <8 x i16> %8, ptr undef, align 4
829  ret void
830}
831
832define void @avg_v16i16_2(ptr %a, ptr %b) nounwind {
833; SSE2-LABEL: avg_v16i16_2:
834; SSE2:       # %bb.0:
835; SSE2-NEXT:    movdqa (%rdi), %xmm0
836; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
837; SSE2-NEXT:    pavgw (%rsi), %xmm0
838; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
839; SSE2-NEXT:    movdqu %xmm1, (%rax)
840; SSE2-NEXT:    movdqu %xmm0, (%rax)
841; SSE2-NEXT:    retq
842;
843; AVX1-LABEL: avg_v16i16_2:
844; AVX1:       # %bb.0:
845; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
846; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
847; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
848; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
849; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
850; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
851; AVX1-NEXT:    retq
852;
853; AVX2-LABEL: avg_v16i16_2:
854; AVX2:       # %bb.0:
855; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
856; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
857; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
858; AVX2-NEXT:    vzeroupper
859; AVX2-NEXT:    retq
860;
861; AVX512-LABEL: avg_v16i16_2:
862; AVX512:       # %bb.0:
863; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
864; AVX512-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
865; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
866; AVX512-NEXT:    vzeroupper
867; AVX512-NEXT:    retq
868  %1 = load <16 x i16>, ptr %a
869  %2 = load <16 x i16>, ptr %b
870  %3 = zext <16 x i16> %1 to <16 x i32>
871  %4 = zext <16 x i16> %2 to <16 x i32>
872  %5 = add nuw nsw <16 x i32> %3, %4
873  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
874  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
875  %8 = trunc <16 x i32> %7 to <16 x i16>
876  store <16 x i16> %8, ptr undef, align 4
877  ret void
878}
879
880define void @avg_v32i16_2(ptr %a, ptr %b) nounwind {
881; SSE2-LABEL: avg_v32i16_2:
882; SSE2:       # %bb.0:
883; SSE2-NEXT:    movdqa (%rdi), %xmm0
884; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
885; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
886; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
887; SSE2-NEXT:    pavgw (%rsi), %xmm0
888; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
889; SSE2-NEXT:    pavgw 32(%rsi), %xmm2
890; SSE2-NEXT:    pavgw 48(%rsi), %xmm3
891; SSE2-NEXT:    movdqu %xmm3, (%rax)
892; SSE2-NEXT:    movdqu %xmm2, (%rax)
893; SSE2-NEXT:    movdqu %xmm1, (%rax)
894; SSE2-NEXT:    movdqu %xmm0, (%rax)
895; SSE2-NEXT:    retq
896;
897; AVX1-LABEL: avg_v32i16_2:
898; AVX1:       # %bb.0:
899; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
900; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
901; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
902; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
903; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
904; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
905; AVX1-NEXT:    vpavgw 32(%rsi), %xmm2, %xmm2
906; AVX1-NEXT:    vpavgw 48(%rsi), %xmm3, %xmm3
907; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
908; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
909; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
910; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
911; AVX1-NEXT:    retq
912;
913; AVX2-LABEL: avg_v32i16_2:
914; AVX2:       # %bb.0:
915; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
916; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
917; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
918; AVX2-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
919; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
920; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
921; AVX2-NEXT:    vzeroupper
922; AVX2-NEXT:    retq
923;
924; AVX512F-LABEL: avg_v32i16_2:
925; AVX512F:       # %bb.0:
926; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
927; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
928; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
929; AVX512F-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
930; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
931; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
932; AVX512F-NEXT:    vzeroupper
933; AVX512F-NEXT:    retq
934;
935; AVX512BW-LABEL: avg_v32i16_2:
936; AVX512BW:       # %bb.0:
937; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
938; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
939; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
940; AVX512BW-NEXT:    vzeroupper
941; AVX512BW-NEXT:    retq
942  %1 = load <32 x i16>, ptr %a
943  %2 = load <32 x i16>, ptr %b
944  %3 = zext <32 x i16> %1 to <32 x i32>
945  %4 = zext <32 x i16> %2 to <32 x i32>
946  %5 = add nuw nsw <32 x i32> %3, %4
947  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
948  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
949  %8 = trunc <32 x i32> %7 to <32 x i16>
950  store <32 x i16> %8, ptr undef, align 4
951  ret void
952}
953
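; The _const variants average a loaded vector against a constant. Because PAVG computes
; (a + b + 1) >> 1, the add of <1,2,...,8> is folded by using <0,1,...,7> as the second
; PAVG operand, materialized inline or taken from the constant pool.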
954define void @avg_v4i8_const(ptr %a) nounwind {
955; SSE2-LABEL: avg_v4i8_const:
956; SSE2:       # %bb.0:
957; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
958; SSE2-NEXT:    pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
959; SSE2-NEXT:    movd %xmm0, (%rax)
960; SSE2-NEXT:    retq
961;
962; AVX-LABEL: avg_v4i8_const:
963; AVX:       # %bb.0:
964; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
965; AVX-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
966; AVX-NEXT:    vmovd %xmm0, (%rax)
967; AVX-NEXT:    retq
968  %1 = load <4 x i8>, ptr %a
969  %2 = zext <4 x i8> %1 to <4 x i32>
970  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
971  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
972  %5 = trunc <4 x i32> %4 to <4 x i8>
973  store <4 x i8> %5, ptr undef, align 4
974  ret void
975}
976
977define void @avg_v8i8_const(ptr %a) nounwind {
978; SSE2-LABEL: avg_v8i8_const:
979; SSE2:       # %bb.0:
980; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
981; SSE2-NEXT:    pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
982; SSE2-NEXT:    movq %xmm0, (%rax)
983; SSE2-NEXT:    retq
984;
985; AVX-LABEL: avg_v8i8_const:
986; AVX:       # %bb.0:
987; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
988; AVX-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
989; AVX-NEXT:    vmovq %xmm0, (%rax)
990; AVX-NEXT:    retq
991  %1 = load <8 x i8>, ptr %a
992  %2 = zext <8 x i8> %1 to <8 x i32>
993  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
994  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
995  %5 = trunc <8 x i32> %4 to <8 x i8>
996  store <8 x i8> %5, ptr undef, align 4
997  ret void
998}
999
1000define void @avg_v16i8_const(ptr %a) nounwind {
1001; SSE2-LABEL: avg_v16i8_const:
1002; SSE2:       # %bb.0:
1003; SSE2-NEXT:    movdqa (%rdi), %xmm0
1004; SSE2-NEXT:    pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1005; SSE2-NEXT:    movdqu %xmm0, (%rax)
1006; SSE2-NEXT:    retq
1007;
1008; AVX-LABEL: avg_v16i8_const:
1009; AVX:       # %bb.0:
1010; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1011; AVX-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1012; AVX-NEXT:    vmovdqu %xmm0, (%rax)
1013; AVX-NEXT:    retq
1014  %1 = load <16 x i8>, ptr %a
1015  %2 = zext <16 x i8> %1 to <16 x i32>
1016  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1017  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1018  %5 = trunc <16 x i32> %4 to <16 x i8>
1019  store <16 x i8> %5, ptr undef, align 4
1020  ret void
1021}
1022
1023define void @avg_v32i8_const(ptr %a) nounwind {
1024; SSE2-LABEL: avg_v32i8_const:
1025; SSE2:       # %bb.0:
1026; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1027; SSE2-NEXT:    movdqa (%rdi), %xmm1
1028; SSE2-NEXT:    pavgb %xmm0, %xmm1
1029; SSE2-NEXT:    pavgb 16(%rdi), %xmm0
1030; SSE2-NEXT:    movdqu %xmm0, (%rax)
1031; SSE2-NEXT:    movdqu %xmm1, (%rax)
1032; SSE2-NEXT:    retq
1033;
1034; AVX1-LABEL: avg_v32i8_const:
1035; AVX1:       # %bb.0:
1036; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1037; AVX1-NEXT:    # xmm0 = mem[0,0]
1038; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm1
1039; AVX1-NEXT:    vpavgb 16(%rdi), %xmm0, %xmm0
1040; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1041; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
1042; AVX1-NEXT:    retq
1043;
1044; AVX2-LABEL: avg_v32i8_const:
1045; AVX2:       # %bb.0:
1046; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1047; AVX2-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1048; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
1049; AVX2-NEXT:    vzeroupper
1050; AVX2-NEXT:    retq
1051;
1052; AVX512-LABEL: avg_v32i8_const:
1053; AVX512:       # %bb.0:
1054; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1055; AVX512-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1056; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
1057; AVX512-NEXT:    vzeroupper
1058; AVX512-NEXT:    retq
1059  %1 = load <32 x i8>, ptr %a
1060  %2 = zext <32 x i8> %1 to <32 x i32>
1061  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1062  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1063  %5 = trunc <32 x i32> %4 to <32 x i8>
1064  store <32 x i8> %5, ptr undef, align 4
1065  ret void
1066}
1067
1068define void @avg_v64i8_const(ptr %a) nounwind {
1069; SSE2-LABEL: avg_v64i8_const:
1070; SSE2:       # %bb.0:
1071; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1072; SSE2-NEXT:    movdqa (%rdi), %xmm1
1073; SSE2-NEXT:    pavgb %xmm0, %xmm1
1074; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
1075; SSE2-NEXT:    pavgb %xmm0, %xmm2
1076; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
1077; SSE2-NEXT:    pavgb %xmm0, %xmm3
1078; SSE2-NEXT:    pavgb 48(%rdi), %xmm0
1079; SSE2-NEXT:    movdqu %xmm0, (%rax)
1080; SSE2-NEXT:    movdqu %xmm3, (%rax)
1081; SSE2-NEXT:    movdqu %xmm2, (%rax)
1082; SSE2-NEXT:    movdqu %xmm1, (%rax)
1083; SSE2-NEXT:    retq
1084;
1085; AVX1-LABEL: avg_v64i8_const:
1086; AVX1:       # %bb.0:
1087; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1088; AVX1-NEXT:    # xmm0 = mem[0,0]
1089; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm1
1090; AVX1-NEXT:    vpavgb 16(%rdi), %xmm0, %xmm2
1091; AVX1-NEXT:    vpavgb 32(%rdi), %xmm0, %xmm3
1092; AVX1-NEXT:    vpavgb 48(%rdi), %xmm0, %xmm0
1093; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1094; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
1095; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
1096; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
1097; AVX1-NEXT:    retq
1098;
1099; AVX2-LABEL: avg_v64i8_const:
1100; AVX2:       # %bb.0:
1101; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1102; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
1103; AVX2-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
1104; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
1105; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
1106; AVX2-NEXT:    vzeroupper
1107; AVX2-NEXT:    retq
1108;
1109; AVX512F-LABEL: avg_v64i8_const:
1110; AVX512F:       # %bb.0:
1111; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1112; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
1113; AVX512F-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
1114; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
1115; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
1116; AVX512F-NEXT:    vzeroupper
1117; AVX512F-NEXT:    retq
1118;
1119; AVX512BW-LABEL: avg_v64i8_const:
1120; AVX512BW:       # %bb.0:
1121; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1122; AVX512BW-NEXT:    vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1123; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
1124; AVX512BW-NEXT:    vzeroupper
1125; AVX512BW-NEXT:    retq
1126  %1 = load <64 x i8>, ptr %a
1127  %2 = zext <64 x i8> %1 to <64 x i32>
1128  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1129  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1130  %5 = trunc <64 x i32> %4 to <64 x i8>
1131  store <64 x i8> %5, ptr undef, align 4
1132  ret void
1133}
1134
1135define void @avg_v4i16_const(ptr %a) nounwind {
1136; SSE2-LABEL: avg_v4i16_const:
1137; SSE2:       # %bb.0:
1138; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1139; SSE2-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1140; SSE2-NEXT:    movq %xmm0, (%rax)
1141; SSE2-NEXT:    retq
1142;
1143; AVX-LABEL: avg_v4i16_const:
1144; AVX:       # %bb.0:
1145; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1146; AVX-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1147; AVX-NEXT:    vmovq %xmm0, (%rax)
1148; AVX-NEXT:    retq
1149  %1 = load <4 x i16>, ptr %a
1150  %2 = zext <4 x i16> %1 to <4 x i32>
1151  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1152  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1153  %5 = trunc <4 x i32> %4 to <4 x i16>
1154  store <4 x i16> %5, ptr undef, align 4
1155  ret void
1156}
1157
1158define void @avg_v8i16_const(ptr %a) nounwind {
1159; SSE2-LABEL: avg_v8i16_const:
1160; SSE2:       # %bb.0:
1161; SSE2-NEXT:    movdqa (%rdi), %xmm0
1162; SSE2-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1163; SSE2-NEXT:    movdqu %xmm0, (%rax)
1164; SSE2-NEXT:    retq
1165;
1166; AVX-LABEL: avg_v8i16_const:
1167; AVX:       # %bb.0:
1168; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1169; AVX-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1170; AVX-NEXT:    vmovdqu %xmm0, (%rax)
1171; AVX-NEXT:    retq
1172  %1 = load <8 x i16>, ptr %a
1173  %2 = zext <8 x i16> %1 to <8 x i32>
1174  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1175  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1176  %5 = trunc <8 x i32> %4 to <8 x i16>
1177  store <8 x i16> %5, ptr undef, align 4
1178  ret void
1179}
1180
1181define void @avg_v16i16_const(ptr %a) nounwind {
1182; SSE2-LABEL: avg_v16i16_const:
1183; SSE2:       # %bb.0:
1184; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1185; SSE2-NEXT:    movdqa (%rdi), %xmm1
1186; SSE2-NEXT:    pavgw %xmm0, %xmm1
1187; SSE2-NEXT:    pavgw 16(%rdi), %xmm0
1188; SSE2-NEXT:    movdqu %xmm0, (%rax)
1189; SSE2-NEXT:    movdqu %xmm1, (%rax)
1190; SSE2-NEXT:    retq
1191;
1192; AVX1-LABEL: avg_v16i16_const:
1193; AVX1:       # %bb.0:
1194; AVX1-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1195; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm1
1196; AVX1-NEXT:    vpavgw 16(%rdi), %xmm0, %xmm0
1197; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1198; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
1199; AVX1-NEXT:    retq
1200;
1201; AVX2-LABEL: avg_v16i16_const:
1202; AVX2:       # %bb.0:
1203; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1204; AVX2-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1205; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
1206; AVX2-NEXT:    vzeroupper
1207; AVX2-NEXT:    retq
1208;
1209; AVX512-LABEL: avg_v16i16_const:
1210; AVX512:       # %bb.0:
1211; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1212; AVX512-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1213; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
1214; AVX512-NEXT:    vzeroupper
1215; AVX512-NEXT:    retq
1216  %1 = load <16 x i16>, ptr %a
1217  %2 = zext <16 x i16> %1 to <16 x i32>
1218  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1219  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1220  %5 = trunc <16 x i32> %4 to <16 x i16>
1221  store <16 x i16> %5, ptr undef, align 4
1222  ret void
1223}
1224
1225define void @avg_v32i16_const(ptr %a) nounwind {
1226; SSE2-LABEL: avg_v32i16_const:
1227; SSE2:       # %bb.0:
1228; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1229; SSE2-NEXT:    movdqa (%rdi), %xmm1
1230; SSE2-NEXT:    pavgw %xmm0, %xmm1
1231; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
1232; SSE2-NEXT:    pavgw %xmm0, %xmm2
1233; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
1234; SSE2-NEXT:    pavgw %xmm0, %xmm3
1235; SSE2-NEXT:    pavgw 48(%rdi), %xmm0
1236; SSE2-NEXT:    movdqu %xmm0, (%rax)
1237; SSE2-NEXT:    movdqu %xmm3, (%rax)
1238; SSE2-NEXT:    movdqu %xmm2, (%rax)
1239; SSE2-NEXT:    movdqu %xmm1, (%rax)
1240; SSE2-NEXT:    retq
1241;
1242; AVX1-LABEL: avg_v32i16_const:
1243; AVX1:       # %bb.0:
1244; AVX1-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1245; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm1
1246; AVX1-NEXT:    vpavgw 16(%rdi), %xmm0, %xmm2
1247; AVX1-NEXT:    vpavgw 32(%rdi), %xmm0, %xmm3
1248; AVX1-NEXT:    vpavgw 48(%rdi), %xmm0, %xmm0
1249; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1250; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
1251; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
1252; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
1253; AVX1-NEXT:    retq
1254;
1255; AVX2-LABEL: avg_v32i16_const:
1256; AVX2:       # %bb.0:
1257; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1258; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
1259; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm1
1260; AVX2-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0
1261; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
1262; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
1263; AVX2-NEXT:    vzeroupper
1264; AVX2-NEXT:    retq
1265;
1266; AVX512F-LABEL: avg_v32i16_const:
1267; AVX512F:       # %bb.0:
1268; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1269; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
1270; AVX512F-NEXT:    vpavgw (%rdi), %ymm0, %ymm1
1271; AVX512F-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0
1272; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
1273; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
1274; AVX512F-NEXT:    vzeroupper
1275; AVX512F-NEXT:    retq
1276;
1277; AVX512BW-LABEL: avg_v32i16_const:
1278; AVX512BW:       # %bb.0:
1279; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1280; AVX512BW-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1281; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
1282; AVX512BW-NEXT:    vzeroupper
1283; AVX512BW-NEXT:    retq
1284  %1 = load <32 x i16>, ptr %a
1285  %2 = zext <32 x i16> %1 to <32 x i32>
1286  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1287  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1288  %5 = trunc <32 x i32> %4 to <32 x i16>
1289  store <32 x i16> %5, ptr undef, align 4
1290  ret void
1291}
1292
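; The _3 variants take and return vectors directly in registers (widening to i16 rather
; than i32), so the widest legal PAVG is used: 128-bit for SSE2/AVX1, 256-bit for
; AVX2/AVX512F, and a single 512-bit op for AVX512BW on the 64-byte case.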
1293define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
1294; SSE2-LABEL: avg_v16i8_3:
1295; SSE2:       # %bb.0:
1296; SSE2-NEXT:    pavgb %xmm1, %xmm0
1297; SSE2-NEXT:    retq
1298;
1299; AVX-LABEL: avg_v16i8_3:
1300; AVX:       # %bb.0:
1301; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
1302; AVX-NEXT:    retq
1303  %za = zext <16 x i8> %a to <16 x i16>
1304  %zb = zext <16 x i8> %b to <16 x i16>
1305  %add = add nuw nsw <16 x i16> %za, %zb
1306  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1307  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1308  %res = trunc <16 x i16> %lshr to <16 x i8>
1309  ret <16 x i8> %res
1310}
1311
1312define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
1313; SSE2-LABEL: avg_v32i8_3:
1314; SSE2:       # %bb.0:
1315; SSE2-NEXT:    pavgb %xmm2, %xmm0
1316; SSE2-NEXT:    pavgb %xmm3, %xmm1
1317; SSE2-NEXT:    retq
1318;
1319; AVX1-LABEL: avg_v32i8_3:
1320; AVX1:       # %bb.0:
1321; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1322; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1323; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
1324; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
1325; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1326; AVX1-NEXT:    retq
1327;
1328; AVX2-LABEL: avg_v32i8_3:
1329; AVX2:       # %bb.0:
1330; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
1331; AVX2-NEXT:    retq
1332;
1333; AVX512-LABEL: avg_v32i8_3:
1334; AVX512:       # %bb.0:
1335; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
1336; AVX512-NEXT:    retq
1337  %za = zext <32 x i8> %a to <32 x i16>
1338  %zb = zext <32 x i8> %b to <32 x i16>
1339  %add = add nuw nsw <32 x i16> %za, %zb
1340  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1341  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1342  %res = trunc <32 x i16> %lshr to <32 x i8>
1343  ret <32 x i8> %res
1344}
1345
1346define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
1347; SSE2-LABEL: avg_v64i8_3:
1348; SSE2:       # %bb.0:
1349; SSE2-NEXT:    pavgb %xmm4, %xmm0
1350; SSE2-NEXT:    pavgb %xmm5, %xmm1
1351; SSE2-NEXT:    pavgb %xmm6, %xmm2
1352; SSE2-NEXT:    pavgb %xmm7, %xmm3
1353; SSE2-NEXT:    retq
1354;
1355; AVX1-LABEL: avg_v64i8_3:
1356; AVX1:       # %bb.0:
1357; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
1358; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
1359; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
1360; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
1361; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1362; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
1363; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1364; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
1365; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
1366; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1367; AVX1-NEXT:    retq
1368;
1369; AVX2-LABEL: avg_v64i8_3:
1370; AVX2:       # %bb.0:
1371; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
1372; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
1373; AVX2-NEXT:    retq
1374;
1375; AVX512F-LABEL: avg_v64i8_3:
1376; AVX512F:       # %bb.0:
1377; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
1378; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
1379; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
1380; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
1381; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1382; AVX512F-NEXT:    retq
1383;
1384; AVX512BW-LABEL: avg_v64i8_3:
1385; AVX512BW:       # %bb.0:
1386; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
1387; AVX512BW-NEXT:    retq
1388  %za = zext <64 x i8> %a to <64 x i16>
1389  %zb = zext <64 x i8> %b to <64 x i16>
1390  %add = add nuw nsw <64 x i16> %za, %zb
1391  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1392  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1393  %res = trunc <64 x i16> %lshr to <64 x i8>
1394  ret <64 x i8> %res
1395}
1396
1397define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
1398; SSE2-LABEL: avg_v512i8_3:
1399; SSE2:       # %bb.0:
1400; SSE2-NEXT:    movq %rdi, %rax
1401; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1402; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1403; SSE2-NEXT:    movdqa %xmm8, 496(%rdi)
1404; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1405; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1406; SSE2-NEXT:    movdqa %xmm8, 480(%rdi)
1407; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1408; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1409; SSE2-NEXT:    movdqa %xmm8, 464(%rdi)
1410; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1411; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1412; SSE2-NEXT:    movdqa %xmm8, 448(%rdi)
1413; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1414; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1415; SSE2-NEXT:    movdqa %xmm8, 432(%rdi)
1416; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1417; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1418; SSE2-NEXT:    movdqa %xmm8, 416(%rdi)
1419; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1420; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1421; SSE2-NEXT:    movdqa %xmm8, 400(%rdi)
1422; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1423; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1424; SSE2-NEXT:    movdqa %xmm8, 384(%rdi)
1425; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1426; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1427; SSE2-NEXT:    movdqa %xmm8, 368(%rdi)
1428; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1429; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1430; SSE2-NEXT:    movdqa %xmm8, 352(%rdi)
1431; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1432; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1433; SSE2-NEXT:    movdqa %xmm8, 336(%rdi)
1434; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1435; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1436; SSE2-NEXT:    movdqa %xmm8, 320(%rdi)
1437; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1438; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1439; SSE2-NEXT:    movdqa %xmm8, 304(%rdi)
1440; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1441; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1442; SSE2-NEXT:    movdqa %xmm8, 288(%rdi)
1443; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1444; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1445; SSE2-NEXT:    movdqa %xmm8, 272(%rdi)
1446; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1447; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1448; SSE2-NEXT:    movdqa %xmm8, 256(%rdi)
1449; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1450; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1451; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1452; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1453; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1454; SSE2-NEXT:    movdqa %xmm8, 224(%rdi)
1455; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1456; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1457; SSE2-NEXT:    movdqa %xmm8, 208(%rdi)
1458; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1459; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1460; SSE2-NEXT:    movdqa %xmm8, 192(%rdi)
1461; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1462; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1463; SSE2-NEXT:    movdqa %xmm8, 176(%rdi)
1464; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1465; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1466; SSE2-NEXT:    movdqa %xmm8, 160(%rdi)
1467; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1468; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1469; SSE2-NEXT:    movdqa %xmm8, 144(%rdi)
1470; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1471; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
1472; SSE2-NEXT:    movdqa %xmm8, 128(%rdi)
1473; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm7
1474; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
1475; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm6
1476; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
1477; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm5
1478; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
1479; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm4
1480; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
1481; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm3
1482; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
1483; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm2
1484; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
1485; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm1
1486; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
1487; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm0
1488; SSE2-NEXT:    movdqa %xmm0, (%rdi)
1489; SSE2-NEXT:    retq
1490;
1491; AVX1-LABEL: avg_v512i8_3:
1492; AVX1:       # %bb.0:
1493; AVX1-NEXT:    pushq %rbp
1494; AVX1-NEXT:    movq %rsp, %rbp
1495; AVX1-NEXT:    andq $-32, %rsp
1496; AVX1-NEXT:    subq $32, %rsp
1497; AVX1-NEXT:    movq %rdi, %rax
1498; AVX1-NEXT:    vmovdqa 256(%rbp), %xmm8
1499; AVX1-NEXT:    vpavgb 768(%rbp), %xmm8, %xmm8
1500; AVX1-NEXT:    vmovdqa %xmm8, 496(%rdi)
1501; AVX1-NEXT:    vmovdqa 240(%rbp), %xmm8
1502; AVX1-NEXT:    vpavgb 752(%rbp), %xmm8, %xmm8
1503; AVX1-NEXT:    vmovdqa %xmm8, 480(%rdi)
1504; AVX1-NEXT:    vmovdqa 224(%rbp), %xmm8
1505; AVX1-NEXT:    vpavgb 736(%rbp), %xmm8, %xmm8
1506; AVX1-NEXT:    vmovdqa %xmm8, 464(%rdi)
1507; AVX1-NEXT:    vmovdqa 208(%rbp), %xmm8
1508; AVX1-NEXT:    vpavgb 720(%rbp), %xmm8, %xmm8
1509; AVX1-NEXT:    vmovdqa %xmm8, 448(%rdi)
1510; AVX1-NEXT:    vmovdqa 192(%rbp), %xmm8
1511; AVX1-NEXT:    vpavgb 704(%rbp), %xmm8, %xmm8
1512; AVX1-NEXT:    vmovdqa %xmm8, 432(%rdi)
1513; AVX1-NEXT:    vmovdqa 176(%rbp), %xmm8
1514; AVX1-NEXT:    vpavgb 688(%rbp), %xmm8, %xmm8
1515; AVX1-NEXT:    vmovdqa %xmm8, 416(%rdi)
1516; AVX1-NEXT:    vmovdqa 160(%rbp), %xmm8
1517; AVX1-NEXT:    vpavgb 672(%rbp), %xmm8, %xmm8
1518; AVX1-NEXT:    vmovdqa %xmm8, 400(%rdi)
1519; AVX1-NEXT:    vmovdqa 144(%rbp), %xmm8
1520; AVX1-NEXT:    vpavgb 656(%rbp), %xmm8, %xmm8
1521; AVX1-NEXT:    vmovdqa %xmm8, 384(%rdi)
1522; AVX1-NEXT:    vmovdqa 128(%rbp), %xmm8
1523; AVX1-NEXT:    vpavgb 640(%rbp), %xmm8, %xmm8
1524; AVX1-NEXT:    vmovdqa %xmm8, 368(%rdi)
1525; AVX1-NEXT:    vmovdqa 112(%rbp), %xmm8
1526; AVX1-NEXT:    vpavgb 624(%rbp), %xmm8, %xmm8
1527; AVX1-NEXT:    vmovdqa %xmm8, 352(%rdi)
1528; AVX1-NEXT:    vmovdqa 96(%rbp), %xmm8
1529; AVX1-NEXT:    vpavgb 608(%rbp), %xmm8, %xmm8
1530; AVX1-NEXT:    vmovdqa %xmm8, 336(%rdi)
1531; AVX1-NEXT:    vmovdqa 80(%rbp), %xmm8
1532; AVX1-NEXT:    vpavgb 592(%rbp), %xmm8, %xmm8
1533; AVX1-NEXT:    vmovdqa %xmm8, 320(%rdi)
1534; AVX1-NEXT:    vmovdqa 64(%rbp), %xmm8
1535; AVX1-NEXT:    vpavgb 576(%rbp), %xmm8, %xmm8
1536; AVX1-NEXT:    vmovdqa %xmm8, 304(%rdi)
1537; AVX1-NEXT:    vmovdqa 48(%rbp), %xmm8
1538; AVX1-NEXT:    vpavgb 560(%rbp), %xmm8, %xmm8
1539; AVX1-NEXT:    vmovdqa %xmm8, 288(%rdi)
1540; AVX1-NEXT:    vmovdqa 32(%rbp), %xmm8
1541; AVX1-NEXT:    vpavgb 544(%rbp), %xmm8, %xmm8
1542; AVX1-NEXT:    vmovdqa %xmm8, 272(%rdi)
1543; AVX1-NEXT:    vmovdqa 16(%rbp), %xmm8
1544; AVX1-NEXT:    vpavgb 528(%rbp), %xmm8, %xmm8
1545; AVX1-NEXT:    vmovdqa %xmm8, 256(%rdi)
1546; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
1547; AVX1-NEXT:    vpavgb 512(%rbp), %xmm8, %xmm8
1548; AVX1-NEXT:    vmovdqa %xmm8, 240(%rdi)
1549; AVX1-NEXT:    vpavgb 496(%rbp), %xmm7, %xmm7
1550; AVX1-NEXT:    vmovdqa %xmm7, 224(%rdi)
1551; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
1552; AVX1-NEXT:    vpavgb 480(%rbp), %xmm7, %xmm7
1553; AVX1-NEXT:    vmovdqa %xmm7, 208(%rdi)
1554; AVX1-NEXT:    vpavgb 464(%rbp), %xmm6, %xmm6
1555; AVX1-NEXT:    vmovdqa %xmm6, 192(%rdi)
1556; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
1557; AVX1-NEXT:    vpavgb 448(%rbp), %xmm6, %xmm6
1558; AVX1-NEXT:    vmovdqa %xmm6, 176(%rdi)
1559; AVX1-NEXT:    vpavgb 432(%rbp), %xmm5, %xmm5
1560; AVX1-NEXT:    vmovdqa %xmm5, 160(%rdi)
1561; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
1562; AVX1-NEXT:    vpavgb 416(%rbp), %xmm5, %xmm5
1563; AVX1-NEXT:    vmovdqa %xmm5, 144(%rdi)
1564; AVX1-NEXT:    vpavgb 400(%rbp), %xmm4, %xmm4
1565; AVX1-NEXT:    vmovdqa %xmm4, 128(%rdi)
1566; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
1567; AVX1-NEXT:    vpavgb 384(%rbp), %xmm4, %xmm4
1568; AVX1-NEXT:    vmovdqa %xmm4, 112(%rdi)
1569; AVX1-NEXT:    vpavgb 368(%rbp), %xmm3, %xmm3
1570; AVX1-NEXT:    vmovdqa %xmm3, 96(%rdi)
1571; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
1572; AVX1-NEXT:    vpavgb 352(%rbp), %xmm3, %xmm3
1573; AVX1-NEXT:    vmovdqa %xmm3, 80(%rdi)
1574; AVX1-NEXT:    vpavgb 336(%rbp), %xmm2, %xmm2
1575; AVX1-NEXT:    vmovdqa %xmm2, 64(%rdi)
1576; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1577; AVX1-NEXT:    vpavgb 320(%rbp), %xmm2, %xmm2
1578; AVX1-NEXT:    vmovdqa %xmm2, 48(%rdi)
1579; AVX1-NEXT:    vpavgb 304(%rbp), %xmm1, %xmm1
1580; AVX1-NEXT:    vmovdqa %xmm1, 32(%rdi)
1581; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1582; AVX1-NEXT:    vpavgb 288(%rbp), %xmm1, %xmm1
1583; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
1584; AVX1-NEXT:    vpavgb 272(%rbp), %xmm0, %xmm0
1585; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
1586; AVX1-NEXT:    movq %rbp, %rsp
1587; AVX1-NEXT:    popq %rbp
1588; AVX1-NEXT:    vzeroupper
1589; AVX1-NEXT:    retq
1590;
1591; AVX2-LABEL: avg_v512i8_3:
1592; AVX2:       # %bb.0:
1593; AVX2-NEXT:    pushq %rbp
1594; AVX2-NEXT:    movq %rsp, %rbp
1595; AVX2-NEXT:    andq $-32, %rsp
1596; AVX2-NEXT:    subq $32, %rsp
1597; AVX2-NEXT:    movq %rdi, %rax
1598; AVX2-NEXT:    vmovdqa 240(%rbp), %ymm8
1599; AVX2-NEXT:    vmovdqa 208(%rbp), %ymm9
1600; AVX2-NEXT:    vmovdqa 176(%rbp), %ymm10
1601; AVX2-NEXT:    vmovdqa 144(%rbp), %ymm11
1602; AVX2-NEXT:    vmovdqa 112(%rbp), %ymm12
1603; AVX2-NEXT:    vmovdqa 80(%rbp), %ymm13
1604; AVX2-NEXT:    vmovdqa 48(%rbp), %ymm14
1605; AVX2-NEXT:    vmovdqa 16(%rbp), %ymm15
1606; AVX2-NEXT:    vpavgb 272(%rbp), %ymm0, %ymm0
1607; AVX2-NEXT:    vpavgb 304(%rbp), %ymm1, %ymm1
1608; AVX2-NEXT:    vpavgb 336(%rbp), %ymm2, %ymm2
1609; AVX2-NEXT:    vpavgb 368(%rbp), %ymm3, %ymm3
1610; AVX2-NEXT:    vpavgb 400(%rbp), %ymm4, %ymm4
1611; AVX2-NEXT:    vpavgb 432(%rbp), %ymm5, %ymm5
1612; AVX2-NEXT:    vpavgb 464(%rbp), %ymm6, %ymm6
1613; AVX2-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
1614; AVX2-NEXT:    vpavgb 528(%rbp), %ymm15, %ymm15
1615; AVX2-NEXT:    vpavgb 560(%rbp), %ymm14, %ymm14
1616; AVX2-NEXT:    vpavgb 592(%rbp), %ymm13, %ymm13
1617; AVX2-NEXT:    vpavgb 624(%rbp), %ymm12, %ymm12
1618; AVX2-NEXT:    vpavgb 656(%rbp), %ymm11, %ymm11
1619; AVX2-NEXT:    vpavgb 688(%rbp), %ymm10, %ymm10
1620; AVX2-NEXT:    vpavgb 720(%rbp), %ymm9, %ymm9
1621; AVX2-NEXT:    vpavgb 752(%rbp), %ymm8, %ymm8
1622; AVX2-NEXT:    vmovdqa %ymm8, 480(%rdi)
1623; AVX2-NEXT:    vmovdqa %ymm9, 448(%rdi)
1624; AVX2-NEXT:    vmovdqa %ymm10, 416(%rdi)
1625; AVX2-NEXT:    vmovdqa %ymm11, 384(%rdi)
1626; AVX2-NEXT:    vmovdqa %ymm12, 352(%rdi)
1627; AVX2-NEXT:    vmovdqa %ymm13, 320(%rdi)
1628; AVX2-NEXT:    vmovdqa %ymm14, 288(%rdi)
1629; AVX2-NEXT:    vmovdqa %ymm15, 256(%rdi)
1630; AVX2-NEXT:    vmovdqa %ymm7, 224(%rdi)
1631; AVX2-NEXT:    vmovdqa %ymm6, 192(%rdi)
1632; AVX2-NEXT:    vmovdqa %ymm5, 160(%rdi)
1633; AVX2-NEXT:    vmovdqa %ymm4, 128(%rdi)
1634; AVX2-NEXT:    vmovdqa %ymm3, 96(%rdi)
1635; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
1636; AVX2-NEXT:    vmovdqa %ymm1, 32(%rdi)
1637; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
1638; AVX2-NEXT:    movq %rbp, %rsp
1639; AVX2-NEXT:    popq %rbp
1640; AVX2-NEXT:    vzeroupper
1641; AVX2-NEXT:    retq
1642;
1643; AVX512F-LABEL: avg_v512i8_3:
1644; AVX512F:       # %bb.0:
1645; AVX512F-NEXT:    pushq %rbp
1646; AVX512F-NEXT:    movq %rsp, %rbp
1647; AVX512F-NEXT:    andq $-64, %rsp
1648; AVX512F-NEXT:    subq $64, %rsp
1649; AVX512F-NEXT:    movq %rdi, %rax
1650; AVX512F-NEXT:    vpavgb 16(%rbp), %ymm0, %ymm8
1651; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1652; AVX512F-NEXT:    vpavgb 48(%rbp), %ymm0, %ymm0
1653; AVX512F-NEXT:    vpavgb 80(%rbp), %ymm1, %ymm9
1654; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1655; AVX512F-NEXT:    vpavgb 112(%rbp), %ymm1, %ymm1
1656; AVX512F-NEXT:    vpavgb 144(%rbp), %ymm2, %ymm10
1657; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1658; AVX512F-NEXT:    vpavgb 176(%rbp), %ymm2, %ymm2
1659; AVX512F-NEXT:    vpavgb 208(%rbp), %ymm3, %ymm11
1660; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
1661; AVX512F-NEXT:    vpavgb 240(%rbp), %ymm3, %ymm3
1662; AVX512F-NEXT:    vpavgb 272(%rbp), %ymm4, %ymm12
1663; AVX512F-NEXT:    vextracti64x4 $1, %zmm4, %ymm4
1664; AVX512F-NEXT:    vpavgb 304(%rbp), %ymm4, %ymm4
1665; AVX512F-NEXT:    vpavgb 336(%rbp), %ymm5, %ymm13
1666; AVX512F-NEXT:    vextracti64x4 $1, %zmm5, %ymm5
1667; AVX512F-NEXT:    vpavgb 368(%rbp), %ymm5, %ymm5
1668; AVX512F-NEXT:    vpavgb 400(%rbp), %ymm6, %ymm14
1669; AVX512F-NEXT:    vextracti64x4 $1, %zmm6, %ymm6
1670; AVX512F-NEXT:    vpavgb 432(%rbp), %ymm6, %ymm6
1671; AVX512F-NEXT:    vpavgb 464(%rbp), %ymm7, %ymm15
1672; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm7
1673; AVX512F-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
1674; AVX512F-NEXT:    vmovdqa %ymm7, 480(%rdi)
1675; AVX512F-NEXT:    vmovdqa %ymm15, 448(%rdi)
1676; AVX512F-NEXT:    vmovdqa %ymm6, 416(%rdi)
1677; AVX512F-NEXT:    vmovdqa %ymm14, 384(%rdi)
1678; AVX512F-NEXT:    vmovdqa %ymm5, 352(%rdi)
1679; AVX512F-NEXT:    vmovdqa %ymm13, 320(%rdi)
1680; AVX512F-NEXT:    vmovdqa %ymm4, 288(%rdi)
1681; AVX512F-NEXT:    vmovdqa %ymm12, 256(%rdi)
1682; AVX512F-NEXT:    vmovdqa %ymm3, 224(%rdi)
1683; AVX512F-NEXT:    vmovdqa %ymm11, 192(%rdi)
1684; AVX512F-NEXT:    vmovdqa %ymm2, 160(%rdi)
1685; AVX512F-NEXT:    vmovdqa %ymm10, 128(%rdi)
1686; AVX512F-NEXT:    vmovdqa %ymm1, 96(%rdi)
1687; AVX512F-NEXT:    vmovdqa %ymm9, 64(%rdi)
1688; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rdi)
1689; AVX512F-NEXT:    vmovdqa %ymm8, (%rdi)
1690; AVX512F-NEXT:    movq %rbp, %rsp
1691; AVX512F-NEXT:    popq %rbp
1692; AVX512F-NEXT:    vzeroupper
1693; AVX512F-NEXT:    retq
1694;
1695; AVX512BW-LABEL: avg_v512i8_3:
1696; AVX512BW:       # %bb.0:
1697; AVX512BW-NEXT:    pushq %rbp
1698; AVX512BW-NEXT:    movq %rsp, %rbp
1699; AVX512BW-NEXT:    andq $-64, %rsp
1700; AVX512BW-NEXT:    subq $64, %rsp
1701; AVX512BW-NEXT:    movq %rdi, %rax
1702; AVX512BW-NEXT:    vpavgb 16(%rbp), %zmm0, %zmm0
1703; AVX512BW-NEXT:    vpavgb 80(%rbp), %zmm1, %zmm1
1704; AVX512BW-NEXT:    vpavgb 144(%rbp), %zmm2, %zmm2
1705; AVX512BW-NEXT:    vpavgb 208(%rbp), %zmm3, %zmm3
1706; AVX512BW-NEXT:    vpavgb 272(%rbp), %zmm4, %zmm4
1707; AVX512BW-NEXT:    vpavgb 336(%rbp), %zmm5, %zmm5
1708; AVX512BW-NEXT:    vpavgb 400(%rbp), %zmm6, %zmm6
1709; AVX512BW-NEXT:    vpavgb 464(%rbp), %zmm7, %zmm7
1710; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdi)
1711; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdi)
1712; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
1713; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
1714; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
1715; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
1716; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
1717; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
1718; AVX512BW-NEXT:    movq %rbp, %rsp
1719; AVX512BW-NEXT:    popq %rbp
1720; AVX512BW-NEXT:    vzeroupper
1721; AVX512BW-NEXT:    retq
1722  %za = zext <512 x i8> %a to <512 x i16>
1723  %zb = zext <512 x i8> %b to <512 x i16>
1724  %add = add nuw nsw <512 x i16> %za, %zb
1725  %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1726  %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1727  %res = trunc <512 x i16> %lshr to <512 x i8>
1728  ret <512 x i8> %res
1729}
1730
1731; This is not an avgceilu, but it's structurally similar and previously caused a crash
1732; because the constants can't be read with APInt::getZExtValue.
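; The i128 -1 elements in the add constant below use all 128 bits, and
; APInt::getZExtValue() asserts unless the value fits in 64 bits. A minimal C++
; sketch of the kind of guard that avoids such a crash (illustrative helper only,
; not the in-tree combine):
;   #include "llvm/ADT/APInt.h"
;   static bool fitsInUInt64(const llvm::APInt &C) {
;     return C.isIntN(64); // only then is C.getZExtValue() safe to call
;   }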
1733define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
1734; SSE2-LABEL: not_avg_v16i8_wide_constants:
1735; SSE2:       # %bb.0:
1736; SSE2-NEXT:    movaps (%rdi), %xmm1
1737; SSE2-NEXT:    movdqa (%rsi), %xmm0
1738; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1739; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1740; SSE2-NEXT:    decl %eax
1741; SSE2-NEXT:    movd %eax, %xmm2
1742; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1743; SSE2-NEXT:    decl %eax
1744; SSE2-NEXT:    movd %eax, %xmm1
1745; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1746; SSE2-NEXT:    decl %eax
1747; SSE2-NEXT:    movd %eax, %xmm3
1748; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1749; SSE2-NEXT:    decl %eax
1750; SSE2-NEXT:    movd %eax, %xmm4
1751; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1752; SSE2-NEXT:    decl %eax
1753; SSE2-NEXT:    movd %eax, %xmm5
1754; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1755; SSE2-NEXT:    decl %eax
1756; SSE2-NEXT:    movd %eax, %xmm6
1757; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1758; SSE2-NEXT:    decl %eax
1759; SSE2-NEXT:    movd %eax, %xmm7
1760; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1761; SSE2-NEXT:    decl %eax
1762; SSE2-NEXT:    movd %eax, %xmm8
1763; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1764; SSE2-NEXT:    decl %eax
1765; SSE2-NEXT:    movd %eax, %xmm10
1766; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1767; SSE2-NEXT:    decl %eax
1768; SSE2-NEXT:    movd %eax, %xmm9
1769; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1770; SSE2-NEXT:    decl %eax
1771; SSE2-NEXT:    movd %eax, %xmm11
1772; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1773; SSE2-NEXT:    decl %eax
1774; SSE2-NEXT:    movd %eax, %xmm12
1775; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1776; SSE2-NEXT:    decl %eax
1777; SSE2-NEXT:    movd %eax, %xmm13
1778; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1779; SSE2-NEXT:    decl %eax
1780; SSE2-NEXT:    movd %eax, %xmm14
1781; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1782; SSE2-NEXT:    decl %eax
1783; SSE2-NEXT:    movd %eax, %xmm15
1784; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1785; SSE2-NEXT:    decl %eax
1786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1787; SSE2-NEXT:    movd %eax, %xmm2
1788; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1789; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
1790; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1791; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1792; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
1793; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1794; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
1795; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1796; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1797; SSE2-NEXT:    pxor %xmm3, %xmm3
1798; SSE2-NEXT:    movdqa %xmm0, %xmm1
1799; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1800; SSE2-NEXT:    movapd %xmm4, %xmm5
1801; SSE2-NEXT:    andpd %xmm1, %xmm5
1802; SSE2-NEXT:    xorpd %xmm4, %xmm1
1803; SSE2-NEXT:    psrlw $1, %xmm1
1804; SSE2-NEXT:    paddw %xmm5, %xmm1
1805; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
1806; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1807; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
1808; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
1809; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1810; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
1811; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
1812; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1813; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1814; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
1815; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
1816; SSE2-NEXT:    movapd %xmm2, %xmm3
1817; SSE2-NEXT:    andpd %xmm0, %xmm3
1818; SSE2-NEXT:    xorpd %xmm2, %xmm0
1819; SSE2-NEXT:    psrlw $1, %xmm0
1820; SSE2-NEXT:    paddw %xmm3, %xmm0
1821; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1822; SSE2-NEXT:    pand %xmm2, %xmm0
1823; SSE2-NEXT:    pand %xmm2, %xmm1
1824; SSE2-NEXT:    packuswb %xmm0, %xmm1
1825; SSE2-NEXT:    movdqu %xmm1, (%rax)
1826; SSE2-NEXT:    retq
1827;
1828; AVX1-LABEL: not_avg_v16i8_wide_constants:
1829; AVX1:       # %bb.0:
1830; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1831; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1832; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1833; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1834; AVX1-NEXT:    vpextrw $7, %xmm3, %edx
1835; AVX1-NEXT:    vpextrw $6, %xmm3, %ecx
1836; AVX1-NEXT:    vpextrw $5, %xmm3, %eax
1837; AVX1-NEXT:    decl %edx
1838; AVX1-NEXT:    vmovd %edx, %xmm4
1839; AVX1-NEXT:    vpextrw $4, %xmm3, %edx
1840; AVX1-NEXT:    decl %ecx
1841; AVX1-NEXT:    vmovd %ecx, %xmm5
1842; AVX1-NEXT:    vpextrw $1, %xmm3, %ecx
1843; AVX1-NEXT:    decl %eax
1844; AVX1-NEXT:    vmovd %eax, %xmm6
1845; AVX1-NEXT:    vpextrw $0, %xmm3, %eax
1846; AVX1-NEXT:    decl %edx
1847; AVX1-NEXT:    vmovd %edx, %xmm7
1848; AVX1-NEXT:    vpextrw $3, %xmm3, %edx
1849; AVX1-NEXT:    decq %rcx
1850; AVX1-NEXT:    vmovq %rcx, %xmm8
1851; AVX1-NEXT:    vpextrw $2, %xmm3, %ecx
1852; AVX1-NEXT:    decq %rax
1853; AVX1-NEXT:    vmovq %rax, %xmm3
1854; AVX1-NEXT:    vpextrw $7, %xmm2, %eax
1855; AVX1-NEXT:    decl %edx
1856; AVX1-NEXT:    vmovd %edx, %xmm9
1857; AVX1-NEXT:    vpextrw $6, %xmm2, %edx
1858; AVX1-NEXT:    decl %ecx
1859; AVX1-NEXT:    vmovd %ecx, %xmm10
1860; AVX1-NEXT:    vpextrw $5, %xmm2, %ecx
1861; AVX1-NEXT:    decl %eax
1862; AVX1-NEXT:    vmovd %eax, %xmm11
1863; AVX1-NEXT:    vpextrw $4, %xmm2, %eax
1864; AVX1-NEXT:    decl %edx
1865; AVX1-NEXT:    vmovd %edx, %xmm12
1866; AVX1-NEXT:    vpextrw $1, %xmm2, %edx
1867; AVX1-NEXT:    decl %ecx
1868; AVX1-NEXT:    vmovd %ecx, %xmm13
1869; AVX1-NEXT:    vpextrw $0, %xmm2, %ecx
1870; AVX1-NEXT:    decl %eax
1871; AVX1-NEXT:    vmovd %eax, %xmm14
1872; AVX1-NEXT:    vpextrw $3, %xmm2, %eax
1873; AVX1-NEXT:    decq %rdx
1874; AVX1-NEXT:    vmovq %rdx, %xmm15
1875; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
1876; AVX1-NEXT:    decq %rcx
1877; AVX1-NEXT:    vmovq %rcx, %xmm2
1878; AVX1-NEXT:    decl %eax
1879; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1880; AVX1-NEXT:    vmovd %eax, %xmm5
1881; AVX1-NEXT:    decl %edx
1882; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1883; AVX1-NEXT:    vmovd %edx, %xmm7
1884; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1885; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
1886; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
1887; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
1888; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1889; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
1890; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
1891; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1892; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1893; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1894; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1895; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
1896; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
1897; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
1898; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
1899; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
1900; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
1901; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1902; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1903; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1904; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm1
1905; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
1906; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
1907; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm2
1908; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1909; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1910; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
1911; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
1912; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1913; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1914; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
1915; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
1916; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1917; AVX1-NEXT:    vzeroupper
1918; AVX1-NEXT:    retq
1919;
1920; AVX2-LABEL: not_avg_v16i8_wide_constants:
1921; AVX2:       # %bb.0:
1922; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1923; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1924; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1925; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1926; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1927; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
1928; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1929; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1930; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1931; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
1932; AVX2-NEXT:    vzeroupper
1933; AVX2-NEXT:    retq
1934;
1935; AVX512F-LABEL: not_avg_v16i8_wide_constants:
1936; AVX512F:       # %bb.0:
1937; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1938; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1939; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1940; AVX512F-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1941; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1942; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
1943; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1944; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
1945; AVX512F-NEXT:    vzeroupper
1946; AVX512F-NEXT:    retq
1947;
1948; AVX512BW-LABEL: not_avg_v16i8_wide_constants:
1949; AVX512BW:       # %bb.0:
1950; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1951; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1952; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1953; AVX512BW-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1954; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1955; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
1956; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1957; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
1958; AVX512BW-NEXT:    vzeroupper
1959; AVX512BW-NEXT:    retq
1960  %1 = load <16 x i8>, ptr %a
1961  %2 = load <16 x i8>, ptr %b
1962  %3 = zext <16 x i8> %1 to <16 x i128>
1963  %4 = zext <16 x i8> %2 to <16 x i128>
1964  %5 = add <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
1965  %6 = add <16 x i128> %5, %4
1966  %7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
1967  %8 = trunc <16 x i128> %7 to <16 x i8>
1968  store <16 x i8> %8, ptr undef, align 4
1969  ret void
1970}
1971
1972; Make sure we don't fail on single-element vectors.
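; A v1i8 average is scalarized; the lowering below computes the rounding average
; directly in GPRs, roughly this C++ (illustrative only):
;   unsigned avg_ceil_u8(unsigned char x, unsigned char y) {
;     return (unsigned(x) + unsigned(y) + 1) >> 1; // matches the movzbl/leal/shrl sequence
;   }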
1973define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) {
1974; CHECK-LABEL: avg_v1i8:
1975; CHECK:       # %bb.0:
1976; CHECK-NEXT:    movzbl %sil, %eax
1977; CHECK-NEXT:    movzbl %dil, %ecx
1978; CHECK-NEXT:    leal 1(%rcx,%rax), %eax
1979; CHECK-NEXT:    shrl %eax
1980; CHECK-NEXT:    # kill: def $al killed $al killed $eax
1981; CHECK-NEXT:    retq
1982  %a = zext <1 x i8> %x to <1 x i16>
1983  %b = zext <1 x i8> %y to <1 x i16>
1984  %c = add <1 x i16> %a, %b
1985  %d = add <1 x i16> %c, <i16 1>
1986  %e = lshr <1 x i16> %d, <i16 1>
1987  %f = trunc <1 x i16> %e to <1 x i8>
1988  ret <1 x i8> %f
1989}
1990
1991; _mm_avg_epu16( _mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2))
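; Roughly the C++ (intrinsics) source this corresponds to; names are illustrative:
;   #include <emmintrin.h>
;   __m128i pr41316(__m128i a, __m128i b) {
;     return _mm_avg_epu16(_mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2));
;   }
; In the generic IR below the +1 rounding term has been folded into an 'or 1' on
; one shifted operand; the backend still has to recognize the idiom and select
; PAVGW/VPAVGW.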
1992define <2 x i64> @PR41316(<2 x i64>, <2 x i64>) {
1993; SSE2-LABEL: PR41316:
1994; SSE2:       # %bb.0:
1995; SSE2-NEXT:    psllw $2, %xmm0
1996; SSE2-NEXT:    psllw $2, %xmm1
1997; SSE2-NEXT:    pavgw %xmm1, %xmm0
1998; SSE2-NEXT:    retq
1999;
2000; AVX-LABEL: PR41316:
2001; AVX:       # %bb.0:
2002; AVX-NEXT:    vpsllw $2, %xmm0, %xmm0
2003; AVX-NEXT:    vpsllw $2, %xmm1, %xmm1
2004; AVX-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
2005; AVX-NEXT:    retq
2006  %3 = bitcast <2 x i64> %0 to <8 x i16>
2007  %4 = shl <8 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
2008  %5 = bitcast <2 x i64> %1 to <8 x i16>
2009  %6 = shl <8 x i16> %5, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
2010  %7 = zext <8 x i16> %6 to <8 x i32>
2011  %8 = or <8 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2012  %9 = zext <8 x i16> %8 to <8 x i32>
2013  %10 = add nuw nsw <8 x i32> %9, %7
2014  %11 = lshr <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2015  %12 = trunc <8 x i32> %11 to <8 x i16>
2016  %13 = bitcast <8 x i16> %12 to <2 x i64>
2017  ret <2 x i64> %13
2018}
2019
2020; shuffle(avg(shuffle(),shuffle())) -> avg(shuffle(),shuffle())
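; PAVGB is lane-wise, so the trailing shuffle can be folded into the operand
; shuffles. Worked lane: before the fold, result byte 0 = %2[12] = avg(%0[12], %1[12])
; = avg(x[4], y[0]); after the fold, the pshufd masks [1,1,3,3] / [0,0,2,2] place
; x[4] and y[0] into byte 0, so pavgb produces the same value there.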
2021define <16 x i8> @fold_avgb_shuffles(<16 x i8> %x, <16 x i8> %y) {
2022; SSE2-LABEL: fold_avgb_shuffles:
2023; SSE2:       # %bb.0: # %entry
2024; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
2025; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2026; SSE2-NEXT:    pavgb %xmm1, %xmm0
2027; SSE2-NEXT:    retq
2028;
2029; AVX-LABEL: fold_avgb_shuffles:
2030; AVX:       # %bb.0: # %entry
2031; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
2032; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2033; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
2034; AVX-NEXT:    retq
2035entry:
2036   %0 = shufflevector <16 x i8> %x, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
2037   %1 = shufflevector <16 x i8> %y, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
2038   %2 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
2039   %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
2040   ret <16 x i8> %3
2041}
2042declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
2043
2044define <8 x i16> @fold_avgw_shuffles(<8 x i16> %x, <8 x i16> %y) {
2045; SSE2-LABEL: fold_avgw_shuffles:
2046; SSE2:       # %bb.0: # %entry
2047; SSE2-NEXT:    pavgw %xmm1, %xmm0
2048; SSE2-NEXT:    retq
2049;
2050; AVX-LABEL: fold_avgw_shuffles:
2051; AVX:       # %bb.0: # %entry
2052; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
2053; AVX-NEXT:    retq
2054entry:
2055   %0 = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
2056   %1 = shufflevector <8 x i16> %y, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
2057   %2 = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
2058   %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
2059   ret <8 x i16> %3
2060}
2061declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>)
2062
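; PR52131: a chained rounding-average idiom, t = (zext(a) + zext(b) + 1) >> 1
; masked back to 16 bits, then (zext(c) + t + 1) >> 1; both halves should still
; select PAVGW instead of widening to 32-bit arithmetic.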
2063define <8 x i16> @PR52131_pavg_chain(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
2064; SSE2-LABEL: PR52131_pavg_chain:
2065; SSE2:       # %bb.0:
2066; SSE2-NEXT:    pavgw %xmm1, %xmm0
2067; SSE2-NEXT:    pavgw %xmm2, %xmm0
2068; SSE2-NEXT:    retq
2069;
2070; AVX-LABEL: PR52131_pavg_chain:
2071; AVX:       # %bb.0:
2072; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
2073; AVX-NEXT:    vpavgw %xmm0, %xmm2, %xmm0
2074; AVX-NEXT:    retq
2075  %i = zext <8 x i16> %a to <8 x i32>
2076  %i1 = zext <8 x i16> %b to <8 x i32>
2077  %i2 = add nuw nsw <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2078  %i3 = add nuw nsw <8 x i32> %i2, %i1
2079  %i4 = lshr <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2080  %i5 = and <8 x i32> %i4, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2081  %i6 = zext <8 x i16> %c to <8 x i32>
2082  %i7 = add nuw nsw <8 x i32> %i6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2083  %i8 = add nuw nsw <8 x i32> %i7, %i5
2084  %i9 = lshr <8 x i32> %i8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2085  %i10 = trunc <8 x i32> %i9 to <8 x i16>
2086  ret <8 x i16> %i10
2087}
2088
2089define <8 x i16> @PR52131_pavg_chainlike_but_not_zext(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
2090; SSE2-LABEL: PR52131_pavg_chainlike_but_not_zext:
2091; SSE2:       # %bb.0:
2092; SSE2-NEXT:    pavgw %xmm1, %xmm0
2093; SSE2-NEXT:    pavgw %xmm2, %xmm0
2094; SSE2-NEXT:    retq
2095;
2096; AVX-LABEL: PR52131_pavg_chainlike_but_not_zext:
2097; AVX:       # %bb.0:
2098; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
2099; AVX-NEXT:    vpavgw %xmm0, %xmm2, %xmm0
2100; AVX-NEXT:    retq
2101  %i = zext <8 x i16> %a to <8 x i32>
2102  %i1 = zext <8 x i16> %b to <8 x i32>
2103  %i2 = add nuw nsw <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2104  %i3 = add nuw nsw <8 x i32> %i2, %i1
2105  %i4 = lshr <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2106  %i5 = and <8 x i32> %i4, <i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071>
2107  %i6 = zext <8 x i16> %c to <8 x i32>
2108  %i7 = add nuw nsw <8 x i32> %i6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2109  %i8 = add nuw nsw <8 x i32> %i7, %i5
2110  %i9 = lshr <8 x i32> %i8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2111  %i10 = trunc <8 x i32> %i9 to <8 x i16>
2112  ret <8 x i16> %i10
2113}
2114
2115define <8 x i16> @PR52131_pavg_with_mask(<8 x i32> %a, <8 x i16> %b) {
2116; SSE2-LABEL: PR52131_pavg_with_mask:
2117; SSE2:       # %bb.0:
2118; SSE2-NEXT:    pslld $16, %xmm1
2119; SSE2-NEXT:    psrad $16, %xmm1
2120; SSE2-NEXT:    pslld $16, %xmm0
2121; SSE2-NEXT:    psrad $16, %xmm0
2122; SSE2-NEXT:    packssdw %xmm1, %xmm0
2123; SSE2-NEXT:    pavgw %xmm2, %xmm0
2124; SSE2-NEXT:    retq
2125;
2126; AVX1-LABEL: PR52131_pavg_with_mask:
2127; AVX1:       # %bb.0:
2128; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2129; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2130; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2131; AVX1-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
2132; AVX1-NEXT:    vzeroupper
2133; AVX1-NEXT:    retq
2134;
2135; AVX2-LABEL: PR52131_pavg_with_mask:
2136; AVX2:       # %bb.0:
2137; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2138; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2139; AVX2-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
2140; AVX2-NEXT:    vzeroupper
2141; AVX2-NEXT:    retq
2142;
2143; AVX512-LABEL: PR52131_pavg_with_mask:
2144; AVX512:       # %bb.0:
2145; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2146; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2147; AVX512-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
2148; AVX512-NEXT:    vzeroupper
2149; AVX512-NEXT:    retq
2150  %i = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2151  %i3 = zext <8 x i16> %b to <8 x i32>
2152  %i4 = add nuw nsw <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2153  %i5 = add nuw nsw <8 x i32> %i4, %i
2154  %i6 = lshr <8 x i32> %i5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2155  %i7 = trunc <8 x i32> %i6 to <8 x i16>
2156  ret <8 x i16> %i7
2157}
2158
2159define <8 x i16> @PR52131_not_zext_with_constant(<8 x i32> %a) {
2160; SSE2-LABEL: PR52131_not_zext_with_constant:
2161; SSE2:       # %bb.0:
2162; SSE2-NEXT:    pslld $16, %xmm1
2163; SSE2-NEXT:    psrad $16, %xmm1
2164; SSE2-NEXT:    pslld $16, %xmm0
2165; SSE2-NEXT:    psrad $16, %xmm0
2166; SSE2-NEXT:    packssdw %xmm1, %xmm0
2167; SSE2-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2168; SSE2-NEXT:    retq
2169;
2170; AVX1-LABEL: PR52131_not_zext_with_constant:
2171; AVX1:       # %bb.0:
2172; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2173; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2174; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2175; AVX1-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2176; AVX1-NEXT:    vzeroupper
2177; AVX1-NEXT:    retq
2178;
2179; AVX2-LABEL: PR52131_not_zext_with_constant:
2180; AVX2:       # %bb.0:
2181; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2182; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2183; AVX2-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2184; AVX2-NEXT:    vzeroupper
2185; AVX2-NEXT:    retq
2186;
2187; AVX512-LABEL: PR52131_not_zext_with_constant:
2188; AVX512:       # %bb.0:
2189; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2190; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2191; AVX512-NEXT:    vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2192; AVX512-NEXT:    vzeroupper
2193; AVX512-NEXT:    retq
2194  %i = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2195  %i1 = add nuw nsw <8 x i32> %i, <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43>
2196  %i2 = lshr <8 x i32> %i1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2197  %i3 = trunc <8 x i32> %i2 to <8 x i16>
2198  ret <8 x i16> %i3
2199}
2200
2201define i64 @PR95284(i32 %a0) {
2202; CHECK-LABEL: PR95284:
2203; CHECK:       # %bb.0:
2204; CHECK-NEXT:    movl %edi, %eax
2205; CHECK-NEXT:    decq %rax
2206; CHECK-NEXT:    shrq %rax
2207; CHECK-NEXT:    incq %rax
2208; CHECK-NEXT:    andq $-2, %rax
2209; CHECK-NEXT:    retq
2210  %ext = zext nneg i32 %a0 to i64
2211  %dec = add i64 %ext, -1
2212  %srl = lshr i64 %dec, 1
2213  %inc = add nuw nsw i64 %srl, 1
2214  %res = and i64 %inc, 9223372036854775806
2215  ret i64 %res
2216}
2217