; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

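; The non-power-of-two lanes below are expected to lower to fixed-point
; "magic reciprocal" multiplies rather than real divisions; the checks spell
; out the sequences. A sketch of the arithmetic for the x urem 95 lane,
; assuming the 16-bit value is zero-extended first:
;   q   = (x * 44151) >> 22    ; 44151 = ceil(2^22 / 95), so q == x / 95
;   rem = x - 95 * q
; Lanes with an even divisor (124 = 4 * 31, 98 = 2 * 49) shift the input right
; before multiplying by the magic constant for the odd factor, and the 1003
; lane needs the extra sub/shr/add fixup because its magic constant does not
; fit in 16 bits.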
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    shrl $2, %ecx
; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
; SSE-NEXT:    shrl $19, %ecx
; SSE-NEXT:    imull $124, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    movzwl %cx, %edx
; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
; SSE-NEXT:    shrl $22, %edx
; SSE-NEXT:    imull $95, %edx, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    shrl %ecx
; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
; SSE-NEXT:    shrl $17, %ecx
; SSE-NEXT:    imull $98, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $2, %eax, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    movzwl %dx, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    shrl $9, %edx
; SSE-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_urem_vec_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    shrl $2, %ecx
; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
; AVX-NEXT:    shrl $19, %ecx
; AVX-NEXT:    imull $124, %ecx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vmovd %xmm0, %ecx
; AVX-NEXT:    movzwl %cx, %edx
; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
; AVX-NEXT:    shrl $22, %edx
; AVX-NEXT:    imull $95, %edx, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    shrl %ecx
; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
; AVX-NEXT:    shrl $17, %ecx
; AVX-NEXT:    imull $98, %ecx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    movzwl %dx, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    shrl $9, %edx
; AVX-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
  ret <4 x i16> %1
}

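; With a splat divisor the same computation should stay fully vectorized:
; pmulhuw by 44151 yields (x * 44151) >> 16 per lane, psrlw $6 completes the
; shift down to >> 22, and pmullw/psubw reconstruct x - 95 * (x / 95).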
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psrlw $6, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
; AVX-NEXT:    vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  ret <4 x i16> %1
}


; Don't fold if we can combine urem with udiv.
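; When the quotient is needed as well, the remainder should be derived from it
; (rem = x - 95 * q) so the multiply/shift sequence is emitted only once, and
; the trailing add combines the urem and udiv results.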
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-LABEL: combine_urem_udiv:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psrlw $6, %xmm1
; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT:    pmullw %xmm1, %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_urem_udiv:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
; AVX-NEXT:    vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %3 = add <4 x i16> %1, %2
  ret <4 x i16> %3
}

; Don't fold for divisors that are a power of two.
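; A power-of-two lane is just a mask, x urem 2^k == x & (2^k - 1), hence the
; ands with 63, 31 and 7 below; only the 95 lane still needs the magic-number
; multiply.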
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_power_of_two:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    andl $31, %eax
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    andl $7, %eax
; SSE-NEXT:    pinsrw $2, %eax, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
; SSE-NEXT:    shrl $22, %ecx
; SSE-NEXT:    imull $95, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: dont_fold_urem_power_of_two:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpextrw $1, %xmm0, %eax
; AVX1-NEXT:    andl $31, %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vpextrw $2, %xmm0, %eax
; AVX1-NEXT:    andl $7, %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vpextrw $3, %xmm0, %eax
; AVX1-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
; AVX1-NEXT:    shrl $22, %ecx
; AVX1-NEXT:    imull $95, %ecx, %ecx
; AVX1-NEXT:    subl %ecx, %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: dont_fold_urem_power_of_two:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
; AVX2-NEXT:    andl $31, %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
; AVX2-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
; AVX2-NEXT:    shrl $22, %ecx
; AVX2-NEXT:    imull $95, %ecx, %ecx
; AVX2-NEXT:    subl %ecx, %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
  ret <4 x i16> %1
}

; Don't fold if the divisor is one.
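; x urem 1 is always 0, so that lane simply comes from the zeroed register; the
; 654, 23 and 5423 lanes still go through scalar magic-number sequences.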
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_one:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    movzwl %dx, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    shrl $4, %edx
; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
; SSE-NEXT:    shll $3, %ecx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
; SSE-NEXT:    shrl $25, %ecx
; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pinsrw $2, %edx, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
; SSE-NEXT:    shrl $26, %ecx
; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_urem_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    movzwl %dx, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    shrl $4, %edx
; AVX-NEXT:    leal (%rdx,%rdx,2), %ecx
; AVX-NEXT:    shll $3, %ecx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    addl %eax, %edx
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
; AVX-NEXT:    shrl $25, %ecx
; AVX-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
; AVX-NEXT:    shrl $26, %ecx
; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold if the divisor is 2^16.
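; The i16 constant 65536 wraps to 0, making that lane a division by zero, so
; the whole operation is undefined and only the return is expected.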
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_i16_smax:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 1, i16 65536, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold i64 urem.
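; There is no vector multiply-high for i64 on these subtargets, so each element
; is expected to be extracted and reduced with a scalar 64x64->128-bit mulq
; against its 64-bit magic constant.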
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_urem_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %xmm1, %rcx
; SSE-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    subq %rdx, %rax
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    addq %rdx, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    leaq (%rax,%rax,2), %rdx
; SSE-NEXT:    shlq $3, %rdx
; SSE-NEXT:    subq %rdx, %rax
; SSE-NEXT:    addq %rcx, %rax
; SSE-NEXT:    movq %rax, %xmm2
; SSE-NEXT:    pextrq $1, %xmm1, %rcx
; SSE-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    shrq $12, %rdx
; SSE-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    pextrq $1, %xmm0, %rcx
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    shrq $7, %rdx
; SSE-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm0
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: dont_fold_urem_i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $4, %rax
; AVX1-NEXT:    leaq (%rax,%rax,2), %rdx
; AVX1-NEXT:    shlq $3, %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    shrq $12, %rdx
; AVX1-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    shrq $7, %rdx
; AVX1-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: dont_fold_urem_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $4, %rax
; AVX2-NEXT:    leaq (%rax,%rax,2), %rdx
; AVX2-NEXT:    shlq $3, %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    shrq $12, %rdx
; AVX2-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    shrq $7, %rdx
; AVX2-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
  ret <4 x i64> %1
}