; xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; SDIV
;

; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; i8 operands are sign-extended to i32 elements (sshll for the 128-bit case,
; sunpklo otherwise), divided with the predicated SVE sdiv, then narrowed back.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
; VBITS_GE_256-NEXT:    fmov s0, w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
; VBITS_GE_256-NEXT:    mov v0.b[1], w9
; VBITS_GE_256-NEXT:    mov v0.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
; VBITS_GE_256-NEXT:    mov v0.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
; VBITS_GE_256-NEXT:    mov v0.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
; VBITS_GE_256-NEXT:    mov v0.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
; VBITS_GE_256-NEXT:    mov v0.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
; VBITS_GE_256-NEXT:    mov v0.b[7], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
; VBITS_GE_512-NEXT:    fmov s0, w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
; VBITS_GE_512-NEXT:    mov v0.b[1], w9
; VBITS_GE_512-NEXT:    mov v0.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
; VBITS_GE_512-NEXT:    mov v0.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
; VBITS_GE_512-NEXT:    mov v0.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
; VBITS_GE_512-NEXT:    mov v0.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
; VBITS_GE_512-NEXT:    mov v0.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
; VBITS_GE_512-NEXT:    mov v0.b[7], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; <16 x i8> sdiv: each half is sign-extended to i32 elements, divided with the
; SVE sdiv, and results are recombined (uzp1/splice) back into a 16 x i8 vector.
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

; In-memory <32 x i8> sdiv: operands widened to i32 elements via sunpklo and
; divided with a single SVE sdiv; st1b narrows the i32 result back on store.
define void @sdiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sdiv <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

; In-memory <64 x i8> sdiv: same single widened SVE sdiv pattern as the 32 x i8
; case, enabled by the larger vscale_range(16,0) lower bound.
define void @sdiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sdiv <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

; In-memory <128 x i8> sdiv: the vector is split in two halves (ext #128), each
; half widened to i32 and divided, then halves are rejoined with uzp1/splice.
define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    st1b { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sdiv <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

; In-memory <256 x i8> sdiv: four-way split (two ext levels), four SVE sdiv/sdivr
; operations on i32 elements, then results spliced back together for the store.
define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    sunpklo z5.s, z0.h
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT:    splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sdiv <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; i16 operands are sign-extended to i32, divided with SVE sdiv/sdivr, then
; narrowed (xtn, or element-wise moves for the >=256-bit configurations).
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
; VBITS_GE_256-NEXT:    mov v0.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
; VBITS_GE_256-NEXT:    mov v0.h[2], w9
; VBITS_GE_256-NEXT:    mov v0.h[3], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
; VBITS_GE_512-NEXT:    mov v0.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
; VBITS_GE_512-NEXT:    mov v0.h[2], w9
; VBITS_GE_512-NEXT:    mov v0.h[3], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; <8 x i16> sdiv: 128-bit config splits into two 4 x i32 divides; wider configs
; do a single sunpklo-widened SVE sdiv followed by uzp1 narrowing.
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

; In-memory <16 x i16> sdiv, no vscale_range so all three run configs differ:
; four NEON-split divides (128), split-and-splice (256), or one widened divide
; with a truncating st1h (512).
define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    sshll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sdiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

; In-memory <32 x i16> sdiv: single widened SVE divide with a truncating st1h.
define void @sdiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sdiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

; In-memory <64 x i16> sdiv: single widened SVE divide with a truncating st1h.
define void @sdiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sdiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

; In-memory <128 x i16> sdiv: split into two halves (ext #128), each widened to
; i32 and divided, then rejoined with uzp1/splice before the store.
define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sdiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
; i32 elements divide directly with the SVE sdiv — no widening needed.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
; i32 elements divide directly with the SVE sdiv — no widening needed.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

; In-memory <8 x i32> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sdiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

; In-memory <16 x i32> sdiv, no vscale_range: four fixed-length SVE divides for
; 128-bit, two for 256-bit, or one full-width divide for 512-bit vectors.
define void @sdiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    sdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sdiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

; In-memory <32 x i32> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sdiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

; In-memory <64 x i32> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sdiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
; Single-element case still goes through the predicated SVE sdiv (vl1).
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

; In-memory <4 x i64> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sdiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

; In-memory <8 x i64> sdiv, no vscale_range: four fixed-length SVE divides for
; 128-bit, two for 256-bit, or one full-width divide for 512-bit vectors.
define void @sdiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    sdivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT:    sdiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    sdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sdiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

; In-memory <16 x i64> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sdiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

; In-memory <32 x i64> sdiv: a single predicated SVE sdiv, load-op-store.
define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sdiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UDIV
;

; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; Unsigned mirror of sdiv_v8i8: zero-extend (ushll/uunpklo) to i32 elements,
; divide with the predicated SVE udiv, then narrow the result back to i8.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
; VBITS_GE_256-NEXT:    fmov s0, w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
; VBITS_GE_256-NEXT:    mov v0.b[1], w9
; VBITS_GE_256-NEXT:    mov v0.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
; VBITS_GE_256-NEXT:    mov v0.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
; VBITS_GE_256-NEXT:    mov v0.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
; VBITS_GE_256-NEXT:    mov v0.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
; VBITS_GE_256-NEXT:    mov v0.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
; VBITS_GE_256-NEXT:    mov v0.b[7], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
; VBITS_GE_512-NEXT:    fmov s0, w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
; VBITS_GE_512-NEXT:    mov v0.b[1], w9
; VBITS_GE_512-NEXT:    mov v0.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
; VBITS_GE_512-NEXT:    mov v0.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
; VBITS_GE_512-NEXT:    mov v0.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
; VBITS_GE_512-NEXT:    mov v0.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
; VBITS_GE_512-NEXT:    mov v0.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
; VBITS_GE_512-NEXT:    mov v0.b[7], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; udiv <16 x i8>: widened to i32 elements for the SVE udiv; wider SVE vectors
; need progressively fewer divides (four at 128-bit, two at 256-bit, one at 512-bit).
define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}
892
; udiv <32 x i8> via memory: with vscale_range(8,0) the whole vector fits as
; i32 elements in one SVE register, so extending ld1b loads feed a single udiv.
define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = udiv <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}
908
; udiv <64 x i8> via memory: same single-udiv pattern as v32i8, enabled by the
; larger minimum vector length from vscale_range(16,0).
define void @udiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = udiv <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}
924
; udiv <128 x i8>: too many elements for one i32-element register, so the
; inputs are split (ext), each half unpacked to i32 and divided, then the two
; results are spliced back together before the truncating store.
define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT:    ld1b { z1.h }, p0/z, [x0]
; CHECK-NEXT:    uunpklo z2.s, z0.h
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z1.s, z1.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    udivr z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT:    st1b { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = udiv <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}
952
; udiv <256 x i8>: the widest i8 case — needs two levels of splitting
; (b->h then h->s), four i32 udivs, and two levels of splice to reassemble.
define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    uunpklo z1.h, z1.b
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    uunpklo z5.s, z0.h
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    uunpklo z1.s, z1.h
; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT:    splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = udiv <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}
1001
1002; Vector vXi16 udiv are not legal for NEON so use SVE when available.
1003; FIXME: We should be able to improve the codegen for >= 256 bits here.
; udiv <4 x i16>: widened to <4 x i32> and divided with SVE; the >=256-bit
; configurations currently rebuild the result element-by-element (see FIXME above).
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
; VBITS_GE_256-NEXT:    mov v0.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
; VBITS_GE_256-NEXT:    mov v0.h[2], w9
; VBITS_GE_256-NEXT:    mov v0.h[3], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
; VBITS_GE_512-NEXT:    mov v0.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
; VBITS_GE_512-NEXT:    mov v0.h[2], w9
; VBITS_GE_512-NEXT:    mov v0.h[3], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}
1048
; udiv <8 x i16>: two i32 udivs at 128-bit SVE; one udiv plus uzp1 narrowing
; once the register is at least 256 bits wide.
define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}
1088
; udiv <16 x i16> via memory: four udivs at 128-bit, a split/splice pair at
; 256-bit, and a single extending-load udiv at 512-bit.
define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    ushll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = udiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}
1148
; udiv <32 x i16> via memory: single udiv with extending ld1h loads when the
; minimum vector length (vscale_range(8,0)) covers all 32 i32 elements.
define void @udiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = udiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}
1164
; udiv <64 x i16> via memory: same single-udiv extending-load pattern, with
; vscale_range(16,0) providing room for 64 i32 elements.
define void @udiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = udiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}
1180
; udiv <128 x i16>: exceeds one i32-element register, so the operands are
; split (ext), each half divided, and the results spliced before the store.
define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    uunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = udiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}
1208
1209; Vector v2i32 udiv are not legal for NEON so use SVE when available.
; udiv <2 x i32>: i32 elements are natively supported by SVE udiv, so the NEON
; register is just reinterpreted as a Z register — no widening needed.
define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}
1222
1223; Vector v4i32 udiv are not legal for NEON so use SVE when available.
; udiv <4 x i32>: single predicated SVE udiv on the q-register reinterpreted
; as a Z register.
define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}
1236
; udiv <8 x i32> via memory: loads, a single predicated udiv, and a store —
; the whole vector fits with vscale_range(2,0).
define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = udiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}
1252
; udiv <16 x i32> via memory: four udivs on q-register chunks at 128-bit, two
; at 256-bit (second half addressed via an index register), one at 512-bit.
define void @udiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    udiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = udiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}
1298
; udiv <32 x i32> via memory: single load/udiv/store sequence at vl32.
define void @udiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = udiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}
1314
; udiv <64 x i32> via memory: single load/udiv/store sequence at vl64.
define void @udiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = udiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}
1330
1331; Vector i64 udiv are not legal for NEON so use SVE when available.
; udiv <1 x i64>: lowered to the SVE 64-bit-element udiv with a vl1 predicate.
define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}
1344
1345; Vector i64 udiv are not legal for NEON so use SVE when available.
; udiv <2 x i64>: single SVE 64-bit-element udiv with a vl2 predicate.
define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}
1358
; udiv <4 x i64> via memory: single load/udiv/store sequence at vl4.
define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = udiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}
1374
; udiv <8 x i64> via memory: four 2-element udivs at 128-bit, two 4-element
; udivs at 256-bit, one 8-element udiv at 512-bit.
define void @udiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    udivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT:    udiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    udiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = udiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}
1420
; udiv <16 x i64> via memory: single load/udiv/store sequence at vl16.
define void @udiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = udiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}
1436
; udiv <32 x i64> via memory: single load/udiv/store sequence at vl32.
define void @udiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = udiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}
1452
; This used to crash because isUnaryPredicate and BuildUDIV did not know how
; a SPLAT_VECTOR of fixed vector type should be handled.
; Regression test (see comment above): udiv by a constant splat with minsize
; (#1) — expects an actual udiv against a splatted immediate rather than a
; crash or a multiply-based expansion.
define void @udiv_constantsplat_v8i32(ptr %a) vscale_range(2,0) #1 {
; CHECK-LABEL: udiv_constantsplat_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mov z1.s, #95 // =0x5f
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
  store <8 x i32> %res, ptr %a
  ret void
}
1469
; #0 enables SVE; #1 additionally requests minsize (used by udiv_constantsplat_v8i32).
attributes #0 = { "target-features"="+sve" }
attributes #1 = { "target-features"="+sve" minsize }
1472