xref: /llvm-project/llvm/test/CodeGen/X86/avg-mask.ll (revision c22dc71b120b066c0066b8517014149a001cc2b0)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
4
; Merge-masked rounded average, <16 x i8>:
; lane i = (zext(a[i]) + zext(b[i]) + 1) >> 1 when %mask bit i is set, else src[i].
; Pattern should lower to a masked VPAVGB on AVX512BW+VL.
define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  ; reinterpret the i16 mask as one predicate bit per lane
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}
32
; Zero-masked rounded average, <16 x i8>:
; lane i = (zext(a[i]) + zext(b[i]) + 1) >> 1 when %mask bit i is set, else 0.
define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
59
; Merge-masked rounded average, <32 x i8>: avg into %src under a 32-bit lane mask.
; AVX512F has only 16-bit mask registers, so the i32 mask is split into two
; kmovw halves and rebuilt as a vector predicate.
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  ; reinterpret the i32 mask as one predicate bit per lane
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}
91
; Zero-masked rounded average, <32 x i8>: avg where the mask bit is set, 0 elsewhere.
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
122
; Merge-masked rounded average, <64 x i8>: full 512-bit case with an i64 lane mask.
; On AVX512F the i64 mask must be split into four 16-bit kmovw chunks; AVX512BW
; handles it with a single kmovq + masked VPAVGB.
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  ; reinterpret the i64 mask as one predicate bit per lane
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}
170
; Zero-masked rounded average, <64 x i8>: avg where the i64 mask bit is set, 0 elsewhere.
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i16 so the +1 rounding add cannot overflow
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}
217
; Merge-masked rounded average, <8 x i16>:
; lane i = (zext(a[i]) + zext(b[i]) + 1) >> 1 when %mask bit i is set, else src[i].
; Word-element variant; should lower to a masked VPAVGW on AVX512BW+VL.
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  ; reinterpret the i8 mask as one predicate bit per lane
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}
245
; Zero-masked rounded average, <8 x i16>: avg where the mask bit is set, 0 elsewhere.
define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
272
; Merge-masked rounded average, <16 x i16>: 256-bit word variant with an i16 lane mask.
define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  ; reinterpret the i16 mask as one predicate bit per lane
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}
299
; Zero-masked rounded average, <16 x i16>: avg where the mask bit is set, 0 elsewhere.
define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
325
; Merge-masked rounded average, <32 x i16>: 512-bit word variant with an i32 lane mask.
; AVX512F splits the i32 mask into two kmovw halves; AVX512BW uses one masked VPAVGW.
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  ; reinterpret the i32 mask as one predicate bit per lane
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}
361
; Zero-masked rounded average, <32 x i16>: avg where the mask bit is set, 0 elsewhere.
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  ; widen to i32 so the +1 rounding add cannot overflow
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  ; zero-masking: unselected lanes come from zeroinitializer
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
396