; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP,XOP-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512VL-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512VLBW

; These test cases are inspired by C++2a std::midpoint().
; See https://bugs.llvm.org/show_bug.cgi?id=40965

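; As a reading aid, here is a minimal scalar sketch in C of the pattern that
; every function below vectorizes. This is a sketch only: the names are
; illustrative and mirror the IR temporaries %t3..%a10; they are not part of
; the test.
;
;   int32_t midpoint(int32_t a1, int32_t a2) {
;     int32_t sign = a1 > a2 ? -1 : 1;                      /* %t4 */
;     uint32_t lo = a1 > a2 ? (uint32_t)a2 : (uint32_t)a1;  /* %t5 = min */
;     uint32_t hi = a1 > a2 ? (uint32_t)a1 : (uint32_t)a2;  /* %t6 = max */
;     return a1 + sign * (int32_t)((hi - lo) >> 1);         /* %a10 */
;   }
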
; Using 128-bit vector regs.

; ---------------------------------------------------------------------------- ;
; 32-bit width. 128 / 32 = 4 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

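; Lowering notes (inferred from the CHECK lines below): plain SSE2 lacks
; pminsd/pmaxsd/pmulld, so it forms |a1 - a2| from the pcmpgtd mask m via
; m - ((a1 - a2) ^ m), which yields a1 - a2 when m is all-ones and a2 - a1
; when m is zero, and multiplies by the +/-1 vector with a pmuludq/pshufd
; sequence; SSE4.1 and AVX use pminsd/pmaxsd plus pmulld directly, XOP folds
; the multiply-add into vpmacsdd, and AVX-512 keeps the compare in a k
; register and conditionally negates via a masked vpsubd.
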
define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_signed_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm4
; SSE2-NEXT:    psubd %xmm4, %xmm2
; SSE2-NEXT:    psrld $1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i32_signed_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pminsd %xmm1, %xmm3
; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm3, %xmm1
; SSE41-NEXT:    psrld $1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm2
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i32_signed_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i32_signed_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i32_signed_reg_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i32_signed_reg_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i32_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i32_signed_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

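; Note on the unsigned variant below: pre-AVX-512 SSE/AVX have no unsigned
; dword compare, so SSE2 flips the sign bits (pxor with the 2147483648 splat)
; and reuses signed pcmpgtd, while SSE4.1/AVX derive the "a1 u> a2" mask as
; NOT(a1 == pminud(a1, a2)) via pcmpeqd and an all-ones pxor, and XOP simply
; uses vpcomgtud.
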
define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_unsigned_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    psrld $1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i32_unsigned_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm3, %xmm4
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT:    pmaxud %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    psrld $1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm4
; SSE41-NEXT:    paddd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i32_unsigned_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i32_unsigned_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1,1,1,1]
; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vpcomgtud %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i32_unsigned_reg_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpcomgtud %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i32_unsigned_reg_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpcomgtud %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp ugt <4 x i32> %a1, %a2
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul <4 x i32> %t8, %t4
  %a10 = add <4 x i32> %t9, %a1
  ret <4 x i32> %a10
}

; Values are loaded. Only check signed case.

define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_signed_mem_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm4
; SSE2-NEXT:    psubd %xmm4, %xmm2
; SSE2-NEXT:    psrld $1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i32_signed_mem_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pminsd %xmm0, %xmm3
; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
; SSE41-NEXT:    psubd %xmm3, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i32_signed_mem_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i32_signed_mem_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; AVX2-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT:    vpcomgtd %xmm0, %xmm1, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i32_signed_mem_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT:    vpcomgtd %xmm0, %xmm1, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; XOPAVX1-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i32_signed_mem_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT:    vpcomgtd %xmm0, %xmm1, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; XOPAVX2-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i32_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsd %xmm0, %xmm1, %xmm2
; AVX512F-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX512F-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i32_signed_mem_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
; AVX512VL-NEXT:    vpminsd %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubd %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsd %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %a1 = load <4 x i32>, ptr %a1_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i32_signed_reg_mem:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm4
; SSE2-NEXT:    psubd %xmm4, %xmm2
; SSE2-NEXT:    psrld $1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i32_signed_reg_mem:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pminsd %xmm1, %xmm3
; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm3, %xmm1
; SSE41-NEXT:    psrld $1, %xmm1
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i32_signed_reg_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i32_signed_reg_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i32_signed_reg_mem:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i32_signed_reg_mem:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i32_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i32_signed_reg_mem:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %a2 = load <4 x i32>, ptr %a2_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i32_signed_mem_mem:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm4
; SSE2-NEXT:    psubd %xmm4, %xmm2
; SSE2-NEXT:    psrld $1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i32_signed_mem_mem:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    movdqa (%rsi), %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pminsd %xmm0, %xmm3
; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
; SSE41-NEXT:    psubd %xmm3, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i32_signed_mem_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i32_signed_mem_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; XOP-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; XOP-FALLBACK-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i32_signed_mem_mem:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm0
; XOPAVX1-NEXT:    vmovdqa (%rsi), %xmm1
; XOPAVX1-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i32_signed_mem_mem:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm0
; XOPAVX2-NEXT:    vmovdqa (%rsi), %xmm1
; XOPAVX2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i32_signed_mem_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512F-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i32_signed_mem_mem:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %a1 = load <4 x i32>, ptr %a1_addr
  %a2 = load <4 x i32>, ptr %a2_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

; ---------------------------------------------------------------------------- ;
; 64-bit width. 128 / 64 = 2 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

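; Lowering notes for the i64 tests (again inferred from the CHECK lines):
; pcmpgtq only arrives with SSE4.2, so SSE2/SSE4.1 synthesize the 64-bit
; compare from pcmpgtd/pcmpeqd on the dword halves after pxor-ing a sign-bit
; constant, and since a full 64-bit vector multiply (vpmullq) needs AVX-512DQ,
; the multiply by the +/-1 vector is decomposed into pmuludq partial products
; (psrlq $33/$32 to split, psllq $32/paddq to recombine).
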
define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind {
; SSE2-LABEL: vec128_i64_signed_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubq %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psubq %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    psrlq $1, %xmm3
; SSE2-NEXT:    psrlq $33, %xmm4
; SSE2-NEXT:    pmuludq %xmm1, %xmm4
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    paddq %xmm4, %xmm2
; SSE2-NEXT:    psllq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    paddq %xmm3, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i64_signed_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psubq %xmm1, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    pand %xmm5, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT:    por %xmm1, %xmm2
; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psubq %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm3
; SSE41-NEXT:    psrlq $1, %xmm3
; SSE41-NEXT:    psrlq $33, %xmm4
; SSE41-NEXT:    pmuludq %xmm1, %xmm4
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm3, %xmm2
; SSE41-NEXT:    paddq %xmm4, %xmm2
; SSE41-NEXT:    psllq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm3
; SSE41-NEXT:    paddq %xmm3, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: vec128_i64_signed_reg_reg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm4
; AVX-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i64_signed_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm4
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i64_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i64_signed_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
  %t7 = sub <2 x i64> %t6, %t5
  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
  ret <2 x i64> %a10
}

define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind {
; SSE2-LABEL: vec128_i64_unsigned_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubq %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psubq %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    psrlq $1, %xmm3
; SSE2-NEXT:    psrlq $33, %xmm4
; SSE2-NEXT:    pmuludq %xmm1, %xmm4
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    paddq %xmm4, %xmm2
; SSE2-NEXT:    psllq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    paddq %xmm3, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psubq %xmm1, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    pand %xmm5, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT:    por %xmm1, %xmm2
; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psubq %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm3
; SSE41-NEXT:    psrlq $1, %xmm3
; SSE41-NEXT:    psrlq $33, %xmm4
; SSE41-NEXT:    pmuludq %xmm1, %xmm4
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm3, %xmm2
; SSE41-NEXT:    paddq %xmm4, %xmm2
; SSE41-NEXT:    psllq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm3
; SSE41-NEXT:    paddq %xmm3, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i64_unsigned_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlq $1, %xmm1, %xmm4
; AVX2-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec128_i64_unsigned_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtuq %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm4
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i64_unsigned_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i64_unsigned_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp ugt <2 x i64> %a1, %a2
  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
  %t7 = sub <2 x i64> %t6, %t5
  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
  %t9 = mul <2 x i64> %t8, %t4
  %a10 = add <2 x i64> %t9, %a1
  ret <2 x i64> %a10
}

; Values are loaded. Only check signed case.

define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwind {
; SSE2-LABEL: vec128_i64_signed_mem_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psubq %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
; SSE2-NEXT:    por %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psubq %xmm3, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    psrlq $1, %xmm0
; SSE2-NEXT:    psrlq $33, %xmm5
; SSE2-NEXT:    pmuludq %xmm4, %xmm5
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    paddq %xmm5, %xmm2
; SSE2-NEXT:    psllq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i64_signed_mem_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psubq %xmm0, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT:    por %xmm0, %xmm2
; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT:    por %xmm2, %xmm4
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm5
; SSE41-NEXT:    psubq %xmm3, %xmm5
; SSE41-NEXT:    movdqa %xmm5, %xmm0
; SSE41-NEXT:    psrlq $1, %xmm0
; SSE41-NEXT:    psrlq $33, %xmm5
; SSE41-NEXT:    pmuludq %xmm4, %xmm5
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    paddq %xmm5, %xmm2
; SSE41-NEXT:    psllq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm4, %xmm0
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: vec128_i64_signed_mem_reg:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpsrlq $1, %xmm0, %xmm4
; AVX-NEXT:    vpsrlq $33, %xmm0, %xmm0
; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; AVX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i64_signed_mem_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcomgtq %xmm0, %xmm1, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm4
; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
; XOP-NEXT:    vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i64_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlq $1, %xmm0, %xmm2
; AVX512F-NEXT:    vpsrlq $33, %xmm0, %xmm0
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec128_i64_signed_mem_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
; AVX512VL-NEXT:    vpminsq %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT:    vpmaxsq %xmm0, %xmm1, %xmm0
1349; AVX512VL-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
1350; AVX512VL-NEXT:    vpsrlq $1, %xmm0, %xmm0
1351; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1352; AVX512VL-NEXT:    vpsubq %xmm0, %xmm2, %xmm0 {%k1}
1353; AVX512VL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1354; AVX512VL-NEXT:    retq
1355;
1356; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
1357; AVX512BW-FALLBACK:       # %bb.0:
1358; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1359; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
1360; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
1361; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1362; AVX512BW-FALLBACK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
1363; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
1364; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
1365; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
1366; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
1367; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm2
1368; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
1369; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
1370; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
1371; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
1372; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
1373; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
1374; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
1375; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1376; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
1377; AVX512BW-FALLBACK-NEXT:    vzeroupper
1378; AVX512BW-FALLBACK-NEXT:    retq
1379  %a1 = load <2 x i64>, ptr %a1_addr
1380  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1381  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1382  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1383  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1384  %t7 = sub <2 x i64> %t6, %t5
1385  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1386  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1387  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1388  ret <2 x i64> %a10
1389}
1390
1391define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwind {
1392; SSE2-LABEL: vec128_i64_signed_reg_mem:
1393; SSE2:       # %bb.0:
1394; SSE2-NEXT:    movdqa (%rdi), %xmm1
1395; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1396; SSE2-NEXT:    movdqa %xmm0, %xmm3
1397; SSE2-NEXT:    pxor %xmm2, %xmm3
1398; SSE2-NEXT:    pxor %xmm1, %xmm2
1399; SSE2-NEXT:    movdqa %xmm3, %xmm4
1400; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
1401; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1402; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
1403; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1404; SSE2-NEXT:    pand %xmm5, %xmm2
1405; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
1406; SSE2-NEXT:    por %xmm2, %xmm3
1407; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
1408; SSE2-NEXT:    por %xmm3, %xmm2
1409; SSE2-NEXT:    movdqa %xmm0, %xmm4
1410; SSE2-NEXT:    psubq %xmm1, %xmm4
1411; SSE2-NEXT:    pxor %xmm3, %xmm4
1412; SSE2-NEXT:    movdqa %xmm3, %xmm1
1413; SSE2-NEXT:    psubq %xmm4, %xmm1
1414; SSE2-NEXT:    movdqa %xmm1, %xmm4
1415; SSE2-NEXT:    psrlq $1, %xmm4
1416; SSE2-NEXT:    psrlq $33, %xmm1
1417; SSE2-NEXT:    pmuludq %xmm2, %xmm1
1418; SSE2-NEXT:    psrlq $32, %xmm3
1419; SSE2-NEXT:    pmuludq %xmm4, %xmm3
1420; SSE2-NEXT:    paddq %xmm1, %xmm3
1421; SSE2-NEXT:    psllq $32, %xmm3
1422; SSE2-NEXT:    pmuludq %xmm2, %xmm4
1423; SSE2-NEXT:    paddq %xmm4, %xmm0
1424; SSE2-NEXT:    paddq %xmm3, %xmm0
1425; SSE2-NEXT:    retq
1426;
1427; SSE41-LABEL: vec128_i64_signed_reg_mem:
1428; SSE41:       # %bb.0:
1429; SSE41-NEXT:    movdqa (%rdi), %xmm1
1430; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
1431; SSE41-NEXT:    movdqa %xmm0, %xmm3
1432; SSE41-NEXT:    pxor %xmm2, %xmm3
1433; SSE41-NEXT:    pxor %xmm1, %xmm2
1434; SSE41-NEXT:    movdqa %xmm3, %xmm4
1435; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
1436; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1437; SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
1438; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1439; SSE41-NEXT:    pand %xmm5, %xmm2
1440; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
1441; SSE41-NEXT:    por %xmm2, %xmm3
1442; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [1,1]
1443; SSE41-NEXT:    por %xmm3, %xmm2
1444; SSE41-NEXT:    movdqa %xmm0, %xmm4
1445; SSE41-NEXT:    psubq %xmm1, %xmm4
1446; SSE41-NEXT:    pxor %xmm3, %xmm4
1447; SSE41-NEXT:    movdqa %xmm3, %xmm1
1448; SSE41-NEXT:    psubq %xmm4, %xmm1
1449; SSE41-NEXT:    movdqa %xmm1, %xmm4
1450; SSE41-NEXT:    psrlq $1, %xmm4
1451; SSE41-NEXT:    psrlq $33, %xmm1
1452; SSE41-NEXT:    pmuludq %xmm2, %xmm1
1453; SSE41-NEXT:    psrlq $32, %xmm3
1454; SSE41-NEXT:    pmuludq %xmm4, %xmm3
1455; SSE41-NEXT:    paddq %xmm1, %xmm3
1456; SSE41-NEXT:    psllq $32, %xmm3
1457; SSE41-NEXT:    pmuludq %xmm2, %xmm4
1458; SSE41-NEXT:    paddq %xmm4, %xmm0
1459; SSE41-NEXT:    paddq %xmm3, %xmm0
1460; SSE41-NEXT:    retq
1461;
1462; AVX-LABEL: vec128_i64_signed_reg_mem:
1463; AVX:       # %bb.0:
1464; AVX-NEXT:    vmovdqa (%rdi), %xmm1
1465; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
1466; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1467; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1468; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1469; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
1470; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm4
1471; AVX-NEXT:    vpsrlq $33, %xmm1, %xmm1
1472; AVX-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1473; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm2
1474; AVX-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
1475; AVX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1476; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
1477; AVX-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
1478; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1479; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1480; AVX-NEXT:    retq
1481;
1482; XOP-LABEL: vec128_i64_signed_reg_mem:
1483; XOP:       # %bb.0:
1484; XOP-NEXT:    vmovdqa (%rdi), %xmm1
1485; XOP-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm2
1486; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1487; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1488; XOP-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1489; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
1490; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm4
1491; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
1492; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1493; XOP-NEXT:    vpsrlq $32, %xmm2, %xmm2
1494; XOP-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
1495; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1496; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
1497; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
1498; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1499; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1500; XOP-NEXT:    retq
1501;
1502; AVX512F-LABEL: vec128_i64_signed_reg_mem:
1503; AVX512F:       # %bb.0:
1504; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1505; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1506; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
1507; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1508; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
1509; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
1510; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
1511; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
1512; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1513; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
1514; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
1515; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1516; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
1517; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
1518; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
1519; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
1520; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
1521; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1522; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1523; AVX512F-NEXT:    vzeroupper
1524; AVX512F-NEXT:    retq
1525;
1526; AVX512VL-LABEL: vec128_i64_signed_reg_mem:
1527; AVX512VL:       # %bb.0:
1528; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
1529; AVX512VL-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
1530; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
1531; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
1532; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1533; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
1534; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1535; AVX512VL-NEXT:    vpsubq %xmm1, %xmm2, %xmm1 {%k1}
1536; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
1537; AVX512VL-NEXT:    retq
1538;
1539; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
1540; AVX512BW-FALLBACK:       # %bb.0:
1541; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1542; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
1543; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
1544; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1545; AVX512BW-FALLBACK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
1546; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
1547; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
1548; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
1549; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1550; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
1551; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
1552; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1553; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
1554; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
1555; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
1556; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
1557; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
1558; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1559; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1560; AVX512BW-FALLBACK-NEXT:    vzeroupper
1561; AVX512BW-FALLBACK-NEXT:    retq
1562  %a2 = load <2 x i64>, ptr %a2_addr
1563  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1564  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1565  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1566  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1567  %t7 = sub <2 x i64> %t6, %t5
1568  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1569  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1570  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1571  ret <2 x i64> %a10
1572}
1573
1574define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
1575; SSE2-LABEL: vec128_i64_signed_mem_mem:
1576; SSE2:       # %bb.0:
1577; SSE2-NEXT:    movdqa (%rdi), %xmm1
1578; SSE2-NEXT:    movdqa (%rsi), %xmm0
1579; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1580; SSE2-NEXT:    movdqa %xmm1, %xmm3
1581; SSE2-NEXT:    psubq %xmm0, %xmm3
1582; SSE2-NEXT:    pxor %xmm2, %xmm0
1583; SSE2-NEXT:    pxor %xmm1, %xmm2
1584; SSE2-NEXT:    movdqa %xmm2, %xmm4
1585; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
1586; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1587; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
1588; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1589; SSE2-NEXT:    pand %xmm5, %xmm0
1590; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1591; SSE2-NEXT:    por %xmm0, %xmm2
1592; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
1593; SSE2-NEXT:    por %xmm2, %xmm4
1594; SSE2-NEXT:    pxor %xmm2, %xmm3
1595; SSE2-NEXT:    movdqa %xmm2, %xmm5
1596; SSE2-NEXT:    psubq %xmm3, %xmm5
1597; SSE2-NEXT:    movdqa %xmm5, %xmm0
1598; SSE2-NEXT:    psrlq $1, %xmm0
1599; SSE2-NEXT:    psrlq $33, %xmm5
1600; SSE2-NEXT:    pmuludq %xmm4, %xmm5
1601; SSE2-NEXT:    psrlq $32, %xmm2
1602; SSE2-NEXT:    pmuludq %xmm0, %xmm2
1603; SSE2-NEXT:    paddq %xmm5, %xmm2
1604; SSE2-NEXT:    psllq $32, %xmm2
1605; SSE2-NEXT:    pmuludq %xmm4, %xmm0
1606; SSE2-NEXT:    paddq %xmm1, %xmm0
1607; SSE2-NEXT:    paddq %xmm2, %xmm0
1608; SSE2-NEXT:    retq
1609;
1610; SSE41-LABEL: vec128_i64_signed_mem_mem:
1611; SSE41:       # %bb.0:
1612; SSE41-NEXT:    movdqa (%rdi), %xmm1
1613; SSE41-NEXT:    movdqa (%rsi), %xmm0
1614; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
1615; SSE41-NEXT:    movdqa %xmm1, %xmm3
1616; SSE41-NEXT:    psubq %xmm0, %xmm3
1617; SSE41-NEXT:    pxor %xmm2, %xmm0
1618; SSE41-NEXT:    pxor %xmm1, %xmm2
1619; SSE41-NEXT:    movdqa %xmm2, %xmm4
1620; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
1621; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1622; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
1623; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1624; SSE41-NEXT:    pand %xmm5, %xmm0
1625; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1626; SSE41-NEXT:    por %xmm0, %xmm2
1627; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm4 = [1,1]
1628; SSE41-NEXT:    por %xmm2, %xmm4
1629; SSE41-NEXT:    pxor %xmm2, %xmm3
1630; SSE41-NEXT:    movdqa %xmm2, %xmm5
1631; SSE41-NEXT:    psubq %xmm3, %xmm5
1632; SSE41-NEXT:    movdqa %xmm5, %xmm0
1633; SSE41-NEXT:    psrlq $1, %xmm0
1634; SSE41-NEXT:    psrlq $33, %xmm5
1635; SSE41-NEXT:    pmuludq %xmm4, %xmm5
1636; SSE41-NEXT:    psrlq $32, %xmm2
1637; SSE41-NEXT:    pmuludq %xmm0, %xmm2
1638; SSE41-NEXT:    paddq %xmm5, %xmm2
1639; SSE41-NEXT:    psllq $32, %xmm2
1640; SSE41-NEXT:    pmuludq %xmm4, %xmm0
1641; SSE41-NEXT:    paddq %xmm1, %xmm0
1642; SSE41-NEXT:    paddq %xmm2, %xmm0
1643; SSE41-NEXT:    retq
1644;
1645; AVX-LABEL: vec128_i64_signed_mem_mem:
1646; AVX:       # %bb.0:
1647; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1648; AVX-NEXT:    vmovdqa (%rsi), %xmm1
1649; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
1650; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1651; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1652; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1653; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
1654; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm4
1655; AVX-NEXT:    vpsrlq $33, %xmm1, %xmm1
1656; AVX-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1657; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm2
1658; AVX-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
1659; AVX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1660; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
1661; AVX-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
1662; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1663; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1664; AVX-NEXT:    retq
1665;
1666; XOP-LABEL: vec128_i64_signed_mem_mem:
1667; XOP:       # %bb.0:
1668; XOP-NEXT:    vmovdqa (%rdi), %xmm0
1669; XOP-NEXT:    vmovdqa (%rsi), %xmm1
1670; XOP-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm2
1671; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1672; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1673; XOP-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1674; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
1675; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm4
1676; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
1677; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1678; XOP-NEXT:    vpsrlq $32, %xmm2, %xmm2
1679; XOP-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
1680; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1681; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
1682; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm2
1683; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1684; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1685; XOP-NEXT:    retq
1686;
1687; AVX512F-LABEL: vec128_i64_signed_mem_mem:
1688; AVX512F:       # %bb.0:
1689; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1690; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
1691; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
1692; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1693; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
1694; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
1695; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
1696; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
1697; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1698; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
1699; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
1700; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1701; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
1702; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
1703; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
1704; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
1705; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
1706; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1707; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1708; AVX512F-NEXT:    vzeroupper
1709; AVX512F-NEXT:    retq
1710;
1711; AVX512VL-LABEL: vec128_i64_signed_mem_mem:
1712; AVX512VL:       # %bb.0:
1713; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
1714; AVX512VL-NEXT:    vmovdqa (%rsi), %xmm1
1715; AVX512VL-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
1716; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
1717; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
1718; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1719; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
1720; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1721; AVX512VL-NEXT:    vpsubq %xmm1, %xmm2, %xmm1 {%k1}
1722; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
1723; AVX512VL-NEXT:    retq
1724;
1725; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
1726; AVX512BW-FALLBACK:       # %bb.0:
1727; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
1728; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
1729; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
1730; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1731; AVX512BW-FALLBACK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,1]
1732; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
1733; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
1734; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
1735; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1736; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
1737; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
1738; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1739; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
1740; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
1741; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
1742; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
1743; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
1744; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1745; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1746; AVX512BW-FALLBACK-NEXT:    vzeroupper
1747; AVX512BW-FALLBACK-NEXT:    retq
1748  %a1 = load <2 x i64>, ptr %a1_addr
1749  %a2 = load <2 x i64>, ptr %a2_addr
1750  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1751  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1752  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1753  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1754  %t7 = sub <2 x i64> %t6, %t5
1755  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1756  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1757  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1758  ret <2 x i64> %a10
1759}
1760
1761; ---------------------------------------------------------------------------- ;
1762; 16-bit width. 128 / 16 = 8 elts.
1763; ---------------------------------------------------------------------------- ;
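
; i16 is the most direct mapping of the pattern: pmin/pmax pick the
; endpoints, psrlw $1 halves the difference, and pmullw applies the +/-1
; multiplier natively, so no multiply emulation is needed (a summary of
; the CHECK lines below, not an extra test).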

; Values come from regs

define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
; SSE-LABEL: vec128_i16_signed_reg_reg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pcmpgtw %xmm1, %xmm2
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pminsw %xmm1, %xmm3
; SSE-NEXT:    pmaxsw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm3, %xmm1
; SSE-NEXT:    psrlw $1, %xmm1
; SSE-NEXT:    pmullw %xmm1, %xmm2
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vec128_i16_signed_reg_reg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i16_signed_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_reg_reg:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
; SSE2-LABEL: vec128_i16_unsigned_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubusw %xmm1, %xmm3
; SSE2-NEXT:    psubusw %xmm0, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    paddw %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i16_unsigned_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pminuw %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm3, %xmm4
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
; SSE41-NEXT:    psubw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pmullw %xmm1, %xmm4
; SSE41-NEXT:    paddw %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: vec128_i16_unsigned_reg_reg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm3
; AVX-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i16_unsigned_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtuw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm3
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %t3 = icmp ugt <8 x i16> %a1, %a2
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul <8 x i16> %t8, %t4
  %a10 = add <8 x i16> %t9, %a1
  ret <8 x i16> %a10
}
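
; Note: there is no unsigned i16 compare before AVX-512, so the codegen
; above derives the "a ugt b" mask indirectly: SSE4.1/AVX compute
; umin(a, b), test it for equality with a (giving "a ule b"), and invert
; with an all-ones pxor, while SSE2 flips the sign bit of both operands
; (xor with 0x8000) and reuses the signed pcmpgtw.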

; Values are loaded. Only check signed case.

define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwind {
; SSE-LABEL: vec128_i16_signed_mem_reg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pcmpgtw %xmm0, %xmm2
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pminsw %xmm0, %xmm3
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vec128_i16_signed_mem_reg:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpminsw %xmm0, %xmm1, %xmm3
; AVX-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i16_signed_mem_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcomgtw %xmm0, %xmm1, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpminsw %xmm0, %xmm1, %xmm3
; XOP-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT:    vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsw %xmm0, %xmm1, %xmm3
; AVX512F-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512F-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsw %xmm0, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm0, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_mem_reg:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtw %xmm0, %xmm1, %k1
; AVX512VLBW-NEXT:    vpminsw %xmm0, %xmm1, %xmm2
; AVX512VLBW-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubw %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VLBW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
  %a1 = load <8 x i16>, ptr %a1_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwind {
; SSE-LABEL: vec128_i16_signed_reg_mem:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pcmpgtw %xmm1, %xmm2
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pminsw %xmm1, %xmm3
; SSE-NEXT:    pmaxsw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm3, %xmm1
; SSE-NEXT:    psrlw $1, %xmm1
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vec128_i16_signed_reg_mem:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i16_signed_reg_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_reg_mem:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %a2 = load <8 x i16>, ptr %a2_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE-LABEL: vec128_i16_signed_mem_mem:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pcmpgtw %xmm0, %xmm2
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pminsw %xmm0, %xmm3
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vec128_i16_signed_mem_mem:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa (%rsi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: vec128_i16_signed_mem_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm0
; XOP-NEXT:    vmovdqa (%rsi), %xmm1
; XOP-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_signed_mem_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_mem_mem:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLBW-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %a1 = load <8 x i16>, ptr %a1_addr
  %a2 = load <8 x i16>, ptr %a2_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

; ---------------------------------------------------------------------------- ;
; 8-bit width. 128 / 8 = 16 elts.
; ---------------------------------------------------------------------------- ;
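
; i8 is the awkward case: x86 has no vector i8 shift or multiply, so the
; halving below is done as psrlw $1 plus a pand mask to clear the bits
; shifted in across byte boundaries, and the +/-1 multiply is widened to
; i16 lanes (punpck + pmullw + packuswb on SSE2, pmaddubsw on SSE4.1/AVX1,
; or pmovzxbw + pmullw on AVX2+). This comment summarizes the CHECK lines
; below; it adds no new test.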

; Values come from regs

define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubb %xmm1, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm4
; SSE2-NEXT:    psubb %xmm4, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw %xmm1, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm4, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i8_signed_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pminsb %xmm1, %xmm3
; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
; SSE41-NEXT:    psubb %xmm3, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmaddubsw %xmm4, %xmm5
; SSE41-NEXT:    pand %xmm3, %xmm5
; SSE41-NEXT:    pandn %xmm2, %xmm3
; SSE41-NEXT:    pmaddubsw %xmm3, %xmm1
; SSE41-NEXT:    psllw $8, %xmm1
; SSE41-NEXT:    por %xmm1, %xmm5
; SSE41-NEXT:    paddb %xmm5, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i8_signed_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_signed_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i8_signed_reg_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i8_signed_reg_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT:    vzeroupper
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX512F-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2523; AVX512BW-FALLBACK-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
2524; AVX512BW-FALLBACK-NEXT:    vpmovwb %zmm1, %ymm1
2525; AVX512BW-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
2526; AVX512BW-FALLBACK-NEXT:    vzeroupper
2527; AVX512BW-FALLBACK-NEXT:    retq
2528;
2529; AVX512VLBW-LABEL: vec128_i8_signed_reg_reg:
2530; AVX512VLBW:       # %bb.0:
2531; AVX512VLBW-NEXT:    vpcmpgtb %xmm1, %xmm0, %k1
2532; AVX512VLBW-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
2533; AVX512VLBW-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
2534; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
2535; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
2536; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
2537; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2538; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm1 {%k1}
2539; AVX512VLBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
2540; AVX512VLBW-NEXT:    retq
2541  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
2542  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2543  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
2544  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
2545  %t7 = sub <16 x i8> %t6, %t5
2546  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2547  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
2548  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
2549  ret <16 x i8> %a10
2550}
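
; Scalar sketch of the pattern exercised above (an illustrative annotation,
; not checked output): %t5/%t6 select the smaller/larger operand, the
; difference is halved with a logical shift, and the compare result is
; multiplied back in as +/-1:
;   t   = (smax(a1, a2) - smin(a1, a2)) >> 1   (logical shift right)
;   mid = a1 + t * (a1 > a2 ? -1 : 1)          (signed compare)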

define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_unsigned_reg_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pminub %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pcmpeqb %xmm3, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm2
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pmaxub %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm3, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw %xmm3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm4, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i8_unsigned_reg_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pminub %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm3, %xmm4
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT:    pmaxub %xmm0, %xmm1
; SSE41-NEXT:    psubb %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmaddubsw %xmm3, %xmm5
; SSE41-NEXT:    pand %xmm2, %xmm5
; SSE41-NEXT:    pandn %xmm4, %xmm2
; SSE41-NEXT:    pmaddubsw %xmm2, %xmm1
; SSE41-NEXT:    psllw $8, %xmm1
; SSE41-NEXT:    por %xmm1, %xmm5
; SSE41-NEXT:    paddb %xmm5, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_unsigned_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vpcomgtub %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i8_unsigned_reg_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpcomgtub %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i8_unsigned_reg_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpcomgtub %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT:    vzeroupper
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem)
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpcmpnleub %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %t3 = icmp ugt <16 x i8> %a1, %a2
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul <16 x i8> %t8, %t4
  %a10 = add <16 x i8> %t9, %a1
  ret <16 x i8> %a10
}
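
; The unsigned variant above keeps the same shape as the signed one; only the
; compare and the min/max flavor change (ugt with umin/umax instead of sgt
; with smin/smax).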

; Values are loaded. Only check signed case.
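; In the *_mem_* variants the midpoint arithmetic itself is unchanged; the IR
; only adds loads such as "%a1 = load <16 x i8>, ptr %a1_addr" before the
; compare, so the interesting difference is which operands get folded.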

define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_signed_mem_reg:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdi), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psubb %xmm1, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm4
; SSE2-NEXT:    psubb %xmm4, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw %xmm1, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm3, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i8_signed_mem_reg:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pminsb %xmm0, %xmm3
; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
; SSE41-NEXT:    psubb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pmaddubsw %xmm4, %xmm5
; SSE41-NEXT:    pand %xmm3, %xmm5
; SSE41-NEXT:    pandn %xmm2, %xmm3
; SSE41-NEXT:    pmaddubsw %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    por %xmm5, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i8_signed_mem_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_signed_mem_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; AVX2-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT:    vpcomgtb %xmm0, %xmm1, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm4, %xmm0, %xmm4
; XOP-FALLBACK-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOP-FALLBACK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i8_signed_mem_reg:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT:    vpcomgtb %xmm0, %xmm1, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; XOPAVX1-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT:    vpmaddubsw %xmm4, %xmm0, %xmm4
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOPAVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i8_signed_mem_reg:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT:    vpcomgtb %xmm0, %xmm1, %xmm2
; XOPAVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; XOPAVX2-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vzeroupper
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsb %xmm0, %xmm1, %xmm3
; AVX512F-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i8_signed_mem_reg:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtb %xmm0, %xmm1, %k1
; AVX512VLBW-NEXT:    vpminsb %xmm0, %xmm1, %xmm2
; AVX512VLBW-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubb %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VLBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
  %a1 = load <16 x i8>, ptr %a1_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}

define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_mem:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubb %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm4
; SSE2-NEXT:    psubb %xmm4, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw %xmm2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    packuswb %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vec128_i8_signed_reg_mem:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pminsb %xmm1, %xmm3
; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
; SSE41-NEXT:    psubb %xmm3, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmaddubsw %xmm4, %xmm5
; SSE41-NEXT:    pand %xmm3, %xmm5
; SSE41-NEXT:    pandn %xmm2, %xmm3
; SSE41-NEXT:    pmaddubsw %xmm3, %xmm1
; SSE41-NEXT:    psllw $8, %xmm1
; SSE41-NEXT:    por %xmm5, %xmm1
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vec128_i8_signed_reg_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_signed_reg_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; XOP-FALLBACK:       # %bb.0:
; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT:    retq
;
; XOPAVX1-LABEL: vec128_i8_signed_reg_mem:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vec128_i8_signed_reg_mem:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT:    vzeroupper
; XOPAVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX512F-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
; AVX512VLBW-LABEL: vec128_i8_signed_reg_mem:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtb %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %a2 = load <16 x i8>, ptr %a2_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}
3245
3246define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
3247; SSE2-LABEL: vec128_i8_signed_mem_mem:
3248; SSE2:       # %bb.0:
3249; SSE2-NEXT:    movdqa (%rdi), %xmm1
3250; SSE2-NEXT:    movdqa (%rsi), %xmm2
3251; SSE2-NEXT:    movdqa %xmm1, %xmm3
3252; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
3253; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3254; SSE2-NEXT:    por %xmm3, %xmm0
3255; SSE2-NEXT:    movdqa %xmm1, %xmm4
3256; SSE2-NEXT:    psubb %xmm2, %xmm4
3257; SSE2-NEXT:    pxor %xmm3, %xmm4
3258; SSE2-NEXT:    psubb %xmm4, %xmm3
3259; SSE2-NEXT:    psrlw $1, %xmm3
3260; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3261; SSE2-NEXT:    movdqa %xmm3, %xmm2
3262; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3263; SSE2-NEXT:    movdqa %xmm0, %xmm4
3264; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3265; SSE2-NEXT:    pmullw %xmm2, %xmm4
3266; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3267; SSE2-NEXT:    pand %xmm2, %xmm4
3268; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3269; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3270; SSE2-NEXT:    pmullw %xmm3, %xmm0
3271; SSE2-NEXT:    pand %xmm2, %xmm0
3272; SSE2-NEXT:    packuswb %xmm4, %xmm0
3273; SSE2-NEXT:    paddb %xmm1, %xmm0
3274; SSE2-NEXT:    retq
3275;
3276; SSE41-LABEL: vec128_i8_signed_mem_mem:
3277; SSE41:       # %bb.0:
3278; SSE41-NEXT:    movdqa (%rdi), %xmm1
3279; SSE41-NEXT:    movdqa (%rsi), %xmm0
3280; SSE41-NEXT:    movdqa %xmm1, %xmm2
3281; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
3282; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
3283; SSE41-NEXT:    movdqa %xmm1, %xmm3
3284; SSE41-NEXT:    pminsb %xmm0, %xmm3
3285; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
3286; SSE41-NEXT:    psubb %xmm3, %xmm0
3287; SSE41-NEXT:    psrlw $1, %xmm0
3288; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3289; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3290; SSE41-NEXT:    movdqa %xmm2, %xmm4
3291; SSE41-NEXT:    pand %xmm3, %xmm4
3292; SSE41-NEXT:    movdqa %xmm0, %xmm5
3293; SSE41-NEXT:    pmaddubsw %xmm4, %xmm5
3294; SSE41-NEXT:    pand %xmm3, %xmm5
3295; SSE41-NEXT:    pandn %xmm2, %xmm3
3296; SSE41-NEXT:    pmaddubsw %xmm3, %xmm0
3297; SSE41-NEXT:    psllw $8, %xmm0
3298; SSE41-NEXT:    por %xmm5, %xmm0
3299; SSE41-NEXT:    paddb %xmm1, %xmm0
3300; SSE41-NEXT:    retq
3301;
3302; AVX1-LABEL: vec128_i8_signed_mem_mem:
3303; AVX1:       # %bb.0:
3304; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
3305; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
3306; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
3307; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3308; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3309; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3310; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3311; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
3312; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
3313; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3314; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
3315; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
3316; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
3317; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
3318; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
3319; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
3320; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
3321; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3322; AVX1-NEXT:    retq
3323;
3324; AVX2-LABEL: vec128_i8_signed_mem_mem:
3325; AVX2:       # %bb.0:
3326; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3327; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
3328; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
3329; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3330; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3331; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3332; AVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3333; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
3334; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
3335; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
3336; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
3337; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
3338; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3339; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3340; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3341; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3342; AVX2-NEXT:    vzeroupper
3343; AVX2-NEXT:    retq
3344;
3345; XOP-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
3346; XOP-FALLBACK:       # %bb.0:
3347; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
3348; XOP-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
3349; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
3350; XOP-FALLBACK-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3351; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3352; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3353; XOP-FALLBACK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3354; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
3355; XOP-FALLBACK-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
3356; XOP-FALLBACK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3357; XOP-FALLBACK-NEXT:    vpandn %xmm2, %xmm3, %xmm4
3358; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
3359; XOP-FALLBACK-NEXT:    vpand %xmm3, %xmm2, %xmm2
3360; XOP-FALLBACK-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
3361; XOP-FALLBACK-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
3362; XOP-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3363; XOP-FALLBACK-NEXT:    retq
3364;
3365; XOPAVX1-LABEL: vec128_i8_signed_mem_mem:
3366; XOPAVX1:       # %bb.0:
3367; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm0
3368; XOPAVX1-NEXT:    vmovdqa (%rsi), %xmm1
3369; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
3370; XOPAVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3371; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3372; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3373; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3374; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
3375; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
3376; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3377; XOPAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm4
3378; XOPAVX1-NEXT:    vpmaddubsw %xmm4, %xmm1, %xmm4
3379; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
3380; XOPAVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
3381; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
3382; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3383; XOPAVX1-NEXT:    retq
3384;
3385; XOPAVX2-LABEL: vec128_i8_signed_mem_mem:
3386; XOPAVX2:       # %bb.0:
3387; XOPAVX2-NEXT:    vmovdqa (%rdi), %xmm0
3388; XOPAVX2-NEXT:    vmovdqa (%rsi), %xmm1
3389; XOPAVX2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm2
3390; XOPAVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3391; XOPAVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3392; XOPAVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3393; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3394; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
3395; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
3396; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
3397; XOPAVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
3398; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
3399; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3400; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3401; XOPAVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3402; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3403; XOPAVX2-NEXT:    vzeroupper
3404; XOPAVX2-NEXT:    retq
3405;
3406; AVX512F-LABEL: vec128_i8_signed_mem_mem:
3407; AVX512F:       # %bb.0:
3408; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3409; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
3410; AVX512F-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm2
3411; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
3412; AVX512F-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
3413; AVX512F-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
3414; AVX512F-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
3415; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
3416; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
3417; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
3418; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
3419; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
3420; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
3421; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
3422; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
3423; AVX512F-NEXT:    vzeroupper
3424; AVX512F-NEXT:    retq
3425;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
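; With AVX512BW but without AVX512VL, a byte compare into a mask register
; is only legal on zmm operands, hence the zmm forms of vpcmpgtb and
; vmovdqu8; the latter blends all-ones over splat(1) under %k1 to
; materialize the +/-1 multiplier, and vpmovwb narrows the widened i16
; product back to bytes.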
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT:    vzeroupper
; AVX512BW-FALLBACK-NEXT:    retq
;
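; AVX512BW+VL yields the tightest sequence: vpcmpgtb writes %k1 directly
; from xmm operands, and the masked vpsubb from zero conditionally negates
; the halved difference in place, so no +/-1 multiplier is materialized.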
; AVX512VLBW-LABEL: vec128_i8_signed_mem_mem:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLBW-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VLBW-NEXT:    vpcmpgtb %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
  %a1 = load <16 x i8>, ptr %a1_addr
  %a2 = load <16 x i8>, ptr %a2_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}
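; All of the lowerings above implement the same IR pattern, roughly the
; vector form of the scalar midpoint idiom:
;   t = (smax(a1, a2) - smin(a1, a2)) lshr 1
;   r = a1 + t * (a1 sgt a2 ? -1 : 1)
; i.e. a1 + (a2 - a1)/2 computed without intermediate overflow, with the
; halving truncated so the result rounds toward %a1.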