; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"
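
; Reader's note: neither NEON nor base SVE has a vector integer remainder
; instruction, so the srem/urem below are expanded as "a - (a / b) * b".
; SVE integer division only exists for 32- and 64-bit elements, so i8/i16
; operands are first sign/zero-extended to 32 bits, divided with predicated
; sdiv/udiv, narrowed back, and the final subtract is folded into mls/msb.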

;
; SREM
;

; Vector vXi8 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
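; (For VBITS_GE_256 and up the i32 quotient is currently repacked into v8i8
; one element at a time via umov/mov; compare the uzp1-based narrowing used
; in srem_v16i8 below.)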
define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
; VBITS_GE_256-NEXT:    fmov s3, w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
; VBITS_GE_256-NEXT:    mov v3.b[1], w9
; VBITS_GE_256-NEXT:    mov v3.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
; VBITS_GE_256-NEXT:    mov v3.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
; VBITS_GE_256-NEXT:    mov v3.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
; VBITS_GE_256-NEXT:    mov v3.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
; VBITS_GE_256-NEXT:    mov v3.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
; VBITS_GE_256-NEXT:    mov v3.b[7], w8
; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
; VBITS_GE_512-NEXT:    fmov s3, w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
; VBITS_GE_512-NEXT:    mov v3.b[1], w9
; VBITS_GE_512-NEXT:    mov v3.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
; VBITS_GE_512-NEXT:    mov v3.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
; VBITS_GE_512-NEXT:    mov v3.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
; VBITS_GE_512-NEXT:    mov v3.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
; VBITS_GE_512-NEXT:    mov v3.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
; VBITS_GE_512-NEXT:    mov v3.b[7], w8
; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sshll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    sshll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT:    sshll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z5.s, z3.h
; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = srem <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = srem <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = srem <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT:    sunpklo z5.h, z5.b
; CHECK-NEXT:    sunpklo z7.s, z5.h
; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    sunpklo z5.s, z5.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT:    sunpklo z3.h, z3.b
; CHECK-NEXT:    sunpklo z6.s, z3.h
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = srem <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
; VBITS_GE_256-NEXT:    mov v3.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
; VBITS_GE_256-NEXT:    mov v3.h[2], w9
; VBITS_GE_256-NEXT:    mov v3.h[3], w8
; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
; VBITS_GE_512-NEXT:    mov v3.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
; VBITS_GE_512-NEXT:    mov v3.h[2], w9
; VBITS_GE_512-NEXT:    mov v3.h[3], w8
; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT:    ret
  %res = srem <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @srem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    sshll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    sshll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    sshll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    mov z4.d, z0.d
; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov z3.d, z1.d
; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = srem <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @srem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = srem <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @srem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = srem <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    mov z4.d, z0.d
; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    sunpklo z4.s, z4.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = srem <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 sdiv is not legal for NEON, so use SVE when available.
define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv is not legal for NEON, so use SVE when available.
define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = srem <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @srem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    sdiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    sdiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    sdiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = srem <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @srem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = srem <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = srem <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = srem <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @srem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    sdiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    sdiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    sdiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT:    movprfx z1, z2
; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = srem <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @srem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = srem <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = srem <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UREM
;

; Vector vXi8 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
; VBITS_GE_256-NEXT:    fmov s3, w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
; VBITS_GE_256-NEXT:    mov v3.b[1], w9
; VBITS_GE_256-NEXT:    mov v3.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
; VBITS_GE_256-NEXT:    mov v3.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
; VBITS_GE_256-NEXT:    mov v3.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
; VBITS_GE_256-NEXT:    mov v3.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
; VBITS_GE_256-NEXT:    mov v3.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
; VBITS_GE_256-NEXT:    mov v3.b[7], w8
; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
; VBITS_GE_512-NEXT:    fmov s3, w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
; VBITS_GE_512-NEXT:    mov v3.b[1], w9
; VBITS_GE_512-NEXT:    mov v3.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
; VBITS_GE_512-NEXT:    mov v3.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
; VBITS_GE_512-NEXT:    mov v3.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
; VBITS_GE_512-NEXT:    mov v3.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
; VBITS_GE_512-NEXT:    mov v3.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
; VBITS_GE_512-NEXT:    mov v3.b[7], w8
; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    ushll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ushll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT:    ushll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z4.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z5.s, z3.h
; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = urem <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = urem <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = urem <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT:    uunpklo z5.h, z5.b
; CHECK-NEXT:    uunpklo z7.s, z5.h
; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    uunpklo z5.s, z5.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT:    uunpklo z3.h, z3.b
; CHECK-NEXT:    uunpklo z6.s, z3.h
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = urem <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
; VBITS_GE_256-NEXT:    mov v3.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
; VBITS_GE_256-NEXT:    mov v3.h[2], w9
; VBITS_GE_256-NEXT:    mov v3.h[3], w8
; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
; VBITS_GE_512-NEXT:    mov v3.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
; VBITS_GE_512-NEXT:    mov v3.h[2], w9
; VBITS_GE_512-NEXT:    mov v3.h[3], w8
; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT:    ret
  %res = urem <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @urem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    ushll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    ushll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    ushll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    mov z4.d, z0.d
; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov z3.d, z1.d
; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = urem <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @urem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = urem <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @urem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = urem <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    mov z4.d, z0.d
; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    uunpklo z4.s, z4.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = urem <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 udiv is not legal for NEON, so use SVE when available.
define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 udiv is not legal for NEON, so use SVE when available.
1367define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
1368; CHECK-LABEL: urem_v4i32:
1369; CHECK:       // %bb.0:
1370; CHECK-NEXT:    ptrue p0.s, vl4
1371; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
1372; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
1373; CHECK-NEXT:    movprfx z2, z0
1374; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
1375; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
1376; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
1377; CHECK-NEXT:    ret
1378  %res = urem <4 x i32> %op1, %op2
1379  ret <4 x i32> %res
1380}
1381
1382define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1383; CHECK-LABEL: urem_v8i32:
1384; CHECK:       // %bb.0:
1385; CHECK-NEXT:    ptrue p0.s, vl8
1386; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1387; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
1388; CHECK-NEXT:    movprfx z2, z0
1389; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
1390; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
1391; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1392; CHECK-NEXT:    ret
1393  %op1 = load <8 x i32>, ptr %a
1394  %op2 = load <8 x i32>, ptr %b
1395  %res = urem <8 x i32> %op1, %op2
1396  store <8 x i32> %res, ptr %a
1397  ret void
1398}
1399
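; urem_v16i32 carries no vscale_range attribute, so all three RUN
; configurations apply: 128-bit vectors split the operation into four
; NEON-sized chunks, 256-bit vectors into two predicated halves, and a
; 512-bit machine handles it with a single udiv/mls pair.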
define void @urem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    udiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    udiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    udiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = urem <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @urem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = urem <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = urem <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
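; NEON provides no vector integer divide at all, so even the
; single-element v1i64 case below goes through the predicated SVE udiv,
; with the remainder recovered by the usual udiv + mls expansion.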
define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = urem <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

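; In the VBITS_GE_128 lowering below, the first remainder is formed with
; msb, the multiply-subtract variant that overwrites the multiplicand
; (z0.d = z1.d - z0.d * z4.d), which lets the divisor register double as
; the destination.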
define void @urem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    udiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    udiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    udiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT:    movprfx z1, z2
; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = urem <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @urem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = urem <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @urem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = urem <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
