; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

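; Rough sketch of the signed magic-number transform exercised below
; (illustrative only, not part of the checked output): for i64,
; 0x4924924924924925 == ceil(2^65 / 7), so
;   q = (sext_i128(x) * 0x4924924924924925) >> 65  ; imulq, then sarq $1 on the high half
;   q = q + (q >>u 63)                             ; shrq $63 / addq rounds toward zero
; There is no v2i64 multiply-high instruction, so each lane is moved to a GPR
; and the scalar sequence is applied per element.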
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

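; For i32 the magic is 2454267027 == 0x92492493 == ceil(2^34 / 7). It exceeds
; 2^31, so as a signed multiplier it is negative and the dividend has to be
; added back after the multiply-high; psrad $2 then applies the post-shift and
; psrld $31 adds the sign bit. SSE2 lacks pmuldq, so it emulates the signed
; multiply-high with pmuludq plus a pcmpgtd/pand correction.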
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrad $2, %xmm0
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

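; i16 is the simplest case: 18725 == ceil(2^17 / 7) is a positive i16, so
; pmulhw yields the high half directly and only psraw $1 plus the psrlw $15
; sign-bit add remain.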
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

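; There is no byte multiply-high, so the i8 magic -109 (0xFF93 as a word, or
; 37632 == 0x9300 when pre-shifted into the high byte) is applied by widening
; each half to i16, multiplying with pmulhw/pmullw, and packing the high bytes
; back. The magic is negative, so the dividend is added back, and the
; arithmetic shift right by 2 on bytes is emulated with psrlw $2 + pand
; followed by the xor/subtract-by-32 sign-extension trick.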
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_div7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; sdiv by non-splat constant
;

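; With distinct divisors every lane gets its own magic (the constant vectors
; annotated below) and its own post-shift, performed as a pmullw by a per-lane
; power of two on sign-extended words; the add-back is applied only to lanes
; with a negative magic via the 255/0 pand mask. AVX-512BW can use a genuine
; per-lane variable shift (vpsravw) instead.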
define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_divconstant_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64]
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT:    psraw $8, %xmm2
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm1, %xmm2
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;

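; The remainder is derived from the quotient as x - 7*q, and 7*q is never a
; real multiply: the scalar code forms q - 8*q == -7*q with leaq (,%rdx,8) +
; subq and adds x, while the vector versions use pslld/psllw $3 followed by a
; subtract.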
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $3, %xmm1, %xmm2
; AVX2-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $3, %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm1
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

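; Bytes have no psllb, so 8*q is built as psllw $3 plus a pand that clears the
; bits shifted in from the adjacent byte lane.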
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_rem7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm3, %xmm2
; SSE-NEXT:    psrlw $7, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    psubb %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $3, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psubb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by non-splat constant
;

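; Same shape as the non-splat sdiv above, then the per-lane quotients are
; multiplied by the original divisors <7,8,9,...,14,14,...,9,9,7> and
; subtracted from x; SSE2 widens and uses pmullw, while the SSE4.1/AVX1 paths
; split the odd/even byte products through pmaddubsw.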
define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,13,12,11,10,9,9,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,8,9,10,11,12,13,14]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT:    psllw $8, %xmm2
; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

; This test is just to show what a scalarized v16i8 division looks like.
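; idivb leaves the quotient in %al and the remainder in %ah, hence the
; movsbl %ah, %eax before each element is reinserted.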
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrb $1, %xmm1, %ecx
; SSE41-NEXT:    pextrb $1, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %ecx
; SSE41-NEXT:    movd %xmm1, %edx
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %dl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    movd %eax, %xmm2
; SSE41-NEXT:    pinsrb $1, %ecx, %xmm2
; SSE41-NEXT:    pextrb $2, %xmm1, %ecx
; SSE41-NEXT:    pextrb $2, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm2
; SSE41-NEXT:    pextrb $3, %xmm1, %ecx
; SSE41-NEXT:    pextrb $3, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $3, %eax, %xmm2
; SSE41-NEXT:    pextrb $4, %xmm1, %ecx
; SSE41-NEXT:    pextrb $4, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $4, %eax, %xmm2
; SSE41-NEXT:    pextrb $5, %xmm1, %ecx
; SSE41-NEXT:    pextrb $5, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $5, %eax, %xmm2
; SSE41-NEXT:    pextrb $6, %xmm1, %ecx
; SSE41-NEXT:    pextrb $6, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $6, %eax, %xmm2
; SSE41-NEXT:    pextrb $7, %xmm1, %ecx
; SSE41-NEXT:    pextrb $7, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $7, %eax, %xmm2
; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
; SSE41-NEXT:    pextrb $8, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $8, %eax, %xmm2
; SSE41-NEXT:    pextrb $9, %xmm1, %ecx
; SSE41-NEXT:    pextrb $9, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $9, %eax, %xmm2
; SSE41-NEXT:    pextrb $10, %xmm1, %ecx
; SSE41-NEXT:    pextrb $10, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $10, %eax, %xmm2
; SSE41-NEXT:    pextrb $11, %xmm1, %ecx
; SSE41-NEXT:    pextrb $11, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $11, %eax, %xmm2
; SSE41-NEXT:    pextrb $12, %xmm1, %ecx
; SSE41-NEXT:    pextrb $12, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $12, %eax, %xmm2
; SSE41-NEXT:    pextrb $13, %xmm1, %ecx
; SSE41-NEXT:    pextrb $13, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $13, %eax, %xmm2
; SSE41-NEXT:    pextrb $14, %xmm1, %ecx
; SSE41-NEXT:    pextrb $14, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $14, %eax, %xmm2
; SSE41-NEXT:    pextrb $15, %xmm1, %ecx
; SSE41-NEXT:    pextrb $15, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $15, %eax, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm1, %ecx
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %ecx
; AVX-NEXT:    vmovd %xmm1, %edx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %dl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vmovd %eax, %xmm2
; AVX-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $2, %xmm1, %ecx
; AVX-NEXT:    vpextrb $2, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $3, %xmm1, %ecx
; AVX-NEXT:    vpextrb $3, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $4, %xmm1, %ecx
; AVX-NEXT:    vpextrb $4, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $5, %xmm1, %ecx
; AVX-NEXT:    vpextrb $5, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $6, %xmm1, %ecx
; AVX-NEXT:    vpextrb $6, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $7, %xmm1, %ecx
; AVX-NEXT:    vpextrb $7, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $8, %xmm1, %ecx
; AVX-NEXT:    vpextrb $8, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $9, %xmm1, %ecx
; AVX-NEXT:    vpextrb $9, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $10, %xmm1, %ecx
; AVX-NEXT:    vpextrb $10, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $11, %xmm1, %ecx
; AVX-NEXT:    vpextrb $11, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $12, %xmm1, %ecx
; AVX-NEXT:    vpextrb $12, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $13, %xmm1, %ecx
; AVX-NEXT:    vpextrb $13, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $14, %xmm1, %ecx
; AVX-NEXT:    vpextrb $14, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $15, %xmm1, %ecx
; AVX-NEXT:    vpextrb $15, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT:    retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}
