; xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll (revision b24af43fdfa1b1242b7cb77540462212227c57c4)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
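; The same IR is compiled three times, with a guaranteed minimum SVE register
; width of 256, 512 and 2048 bits. The 2048-bit run reuses the VBITS_GE_512
; prefixes because, once a whole fixed-length vector fits in a single SVE
; register, the generated code (ptrue with an explicit vector length) no
; longer depends on the actual register width. vscale_range(N,0) on a function
; promises vscale >= N, i.e. SVE registers of at least N*128 bits; the #0
; attribute set (defined beyond this excerpt) enables SVE itself.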

target triple = "aarch64-unknown-linux-gnu"

;
; SMAX
;
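; @llvm.smax returns the larger of its operands under a signed comparison; a
; target without native support would expand it to a compare-and-select, e.g.
; for the v8i8 case below:
;   %cmp = icmp sgt <8 x i8> %op1, %op2
;   %res = select <8 x i1> %cmp, <8 x i8> %op1, <8 x i8> %op2
; AArch64 matches it directly instead: NEON smax for 64/128-bit vectors, and
; a predicated SVE smax behind a ptrue of the exact fixed vector length for
; anything wider.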

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @smax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

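; With only 256-bit registers guaranteed (VBITS_GE_256), a 512-bit operand is
; split into two halves, the upper half addressed at the 32-byte offset held
; in w8. SVE smax is destructive (the first source doubles as the
; destination), so movprfx copies z2 into z1 to give the second smax a free
; destination.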
define void @smax_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smax_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.smax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smax_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.smax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @smax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

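; For elements wider than a byte the index register counts elements, so the
; addressing mode scales it by the element size: lsl #1 for halfwords below,
; lsl #2 for words and lsl #3 for doublewords later on.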
define void @smax_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smax_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smax_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @smax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smax_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smax_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.smax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.smax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 max is not legal for NEON, so use SVE when available.
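; The "kill" lines below are register-allocator annotations rather than
; instructions: d0/d1 are the low 64 bits of z0/z1, so the NEON inputs are
; simply reinterpreted as SVE registers, combined under a single-element (vl1)
; predicate, and the result is read back out of d0.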
define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @smax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smax_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smax_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.smax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smax_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.smax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SMIN
;
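; SMIN mirrors SMAX: the same signed comparison selecting the smaller operand,
; with identical lowering choices at every vector width.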

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @smin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @smin_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smin_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.smin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smin_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.smin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @smin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @smin_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smin_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smin_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @smin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smin_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smin_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.smin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @smin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smin_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smin_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smin_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.smin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMAX
;
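; UMAX repeats the SMAX patterns with an unsigned comparison; only the opcode
; changes, the predication and splitting are identical.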

; Don't use SVE for 64-bit vectors.
define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @umax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umax_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umax_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umax_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @umax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umax_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umax_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umax_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @umax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umax_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umax_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @umax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umax_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umax_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umax_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMIN
;
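; UMIN is the unsigned counterpart of SMIN; again only the opcode differs from
; the patterns above.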

; Don't use SVE for 64-bit vectors.
define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @umin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umin_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umin_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umin_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @umin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umin_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umin_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umin_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @umin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umin_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umin_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 min is not legal for NEON, so use SVE when available.
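; The 'kill' annotations below are compiler-emitted liveness notes marking
; the NEON d/q registers being viewed as the low bits of the corresponding
; SVE z registers.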
define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @umin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

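; As with umin_v16i32, a 256-bit implementation splits the <8 x i64> data
; into two VL4 chunks, here scaling the high-half offset by the 8-byte
; element size (lsl #3).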
define void @umin_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umin_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umin_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

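; All tests require SVE; the per-function vscale_range attributes constrain
; the minimum (and, when nonzero, maximum) vector length assumed by each test.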
attributes #0 = { "target-features"="+sve" }

declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.smin.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.smin.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.smin.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.smin.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.smin.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.smin.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.smax.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.smax.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.smax.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smax.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.smax.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.smax.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smax.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.smax.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.umin.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.umin.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.umin.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.umin.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.umin.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.umin.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.umin.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.umax.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.umax.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.umax.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.umax.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.umax.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.umax.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.umax.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>)