; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
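; Note: functions pinned with vscale_range(min,0) (vscale is at least <min>,
; with no upper bound) presumably generate the same code for every RUN line,
; which is why they are checked with the plain CHECK prefix. Only the
; functions without a vscale_range attribute vary with
; -aarch64-sve-vector-bits-min; the min=2048 run shares the VBITS_GE_512
; prefixes because the widest operand those functions use is 512 bits, which
; already fits in a single register once the vector length reaches 512 bits.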

target triple = "aarch64-unknown-linux-gnu"

;
; ANDV
;

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @andv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
  ret i8 %res
}

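; With 512-bit data but only 256-bit registers (VBITS_GE_256), the operand is
; split into two loads whose halves are folded together with one unpredicated
; bitwise op on the .d form (bitwise ops are element-size agnostic), leaving a
; single in-register reduction. The i16/i32/i64 variants and the EORV/ORV
; sections below follow the same pattern.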
define i8 @andv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @andv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @andv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @andv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @andv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @andv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @andv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @andv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @andv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @andv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @andv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @andv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @andv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @andv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @andv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; EORV
;

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @eorv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @eorv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @eorv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @eorv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; ORV
;

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @orv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @orv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @orv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @orv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @orv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @orv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @orv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @orv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @orv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @orv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @orv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @orv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @orv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @orv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @orv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @orv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)