xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll (revision db158c7c830807caeeb0691739c41f1d522029e9)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5
6target triple = "aarch64-unknown-linux-gnu"
7
;
; RBIT
;
11
; v8i8: input arrives in a NEON D register; it is widened to Z (the "kill"
; lines), bit-reversed with a vl8-predicated RBIT, then narrowed back.
define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
  ret <8 x i8> %res
}
23
; v16i8: Q-register input widened to Z for a vl16-predicated RBIT.
define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
  ret <16 x i8> %res
}
35
; v32i8 operand lives in memory: ld1b / predicated rbit / st1b under vl32.
define void @bitreverse_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
  store <32 x i8> %res, ptr %a
  ret void
}
49
; No vscale_range: at 256-bit VL the v64i8 op splits into two vl32 halves
; (second half addressed via x8 = 32); at >=512-bit VL one vl64 vector is used.
define void @bitreverse_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
  store <64 x i8> %res, ptr %a
  ret void
}
75
; vscale_range(8,0) guarantees VL >= 1024 bits, so v128i8 fits one vl128 op.
define void @bitreverse_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
  store <128 x i8> %res, ptr %a
  ret void
}
89
; vscale_range(16,0) guarantees VL >= 2048 bits, so v256i8 fits one vl256 op.
define void @bitreverse_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
  store <256 x i8> %res, ptr %a
  ret void
}
103
; v4i16: D-register input widened to Z for a vl4-predicated halfword RBIT.
define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}
115
; v8i16: Q-register input widened to Z for a vl8-predicated halfword RBIT.
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}
127
; v16i16 in memory: ld1h / predicated rbit / st1h under vl16.
define void @bitreverse_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}
141
; No vscale_range: at 256-bit VL v32i16 splits into two vl16 halves (second
; half at element offset x8 = 16, scaled lsl #1); at >=512-bit VL one vl32 op.
define void @bitreverse_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}
167
; vscale_range(8,0): VL >= 1024 bits, so v64i16 fits one vl64 halfword op.
define void @bitreverse_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}
181
; vscale_range(16,0): VL >= 2048 bits, so v128i16 fits one vl128 halfword op.
define void @bitreverse_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}
195
; v2i32: D-register input widened to Z for a vl2-predicated word RBIT.
define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}
207
; v4i32: Q-register input widened to Z for a vl4-predicated word RBIT.
define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}
219
; v8i32 in memory: ld1w / predicated rbit / st1w under vl8.
define void @bitreverse_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}
233
; No vscale_range: at 256-bit VL v16i32 splits into two vl8 halves (second
; half at element offset x8 = 8, scaled lsl #2); at >=512-bit VL one vl16 op.
define void @bitreverse_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}
259
; vscale_range(8,0): VL >= 1024 bits, so v32i32 fits one vl32 word op.
define void @bitreverse_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}
273
; vscale_range(16,0): VL >= 2048 bits, so v64i32 fits one vl64 word op.
define void @bitreverse_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}
287
; v1i64: single element in a D register; vl1-predicated doubleword RBIT.
define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}
299
; v2i64: Q-register input widened to Z for a vl2-predicated doubleword RBIT.
define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}
311
; v4i64 in memory: ld1d / predicated rbit / st1d under vl4.
define void @bitreverse_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}
325
; No vscale_range: at 256-bit VL v8i64 splits into two vl4 halves (second
; half at element offset x8 = 4, scaled lsl #3); at >=512-bit VL one vl8 op.
define void @bitreverse_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}
351
; vscale_range(8,0): VL >= 1024 bits, so v16i64 fits one vl16 doubleword op.
define void @bitreverse_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}
365
; vscale_range(16,0): VL >= 2048 bits, so v32i64 fits one vl32 doubleword op.
define void @bitreverse_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}
379
;
; REVB
;
383
; Don't use SVE for 64-bit vectors: NEON rev16 does the byte swap directly.
define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev16 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}
393
; Don't use SVE for 128-bit vectors: NEON rev16 does the byte swap directly.
define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev16 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}
403
; v16i16 in memory: ld1h / predicated revb / st1h under vl16.
define void @bswap_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}
417
; No vscale_range: at 256-bit VL v32i16 splits into two vl16 halves (second
; half at element offset x8 = 16, scaled lsl #1); at >=512-bit VL one vl32 op.
define void @bswap_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    revb z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}
443
; vscale_range(8,0): VL >= 1024 bits, so v64i16 fits one vl64 revb op.
define void @bswap_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}
457
; vscale_range(16,0): VL >= 2048 bits, so v128i16 fits one vl128 revb op.
define void @bswap_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}
471
; Don't use SVE for 64-bit vectors: NEON rev32 does the byte swap directly.
define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}
481
; Don't use SVE for 128-bit vectors: NEON rev32 does the byte swap directly.
define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}
491
; v8i32 in memory: ld1w / predicated revb / st1w under vl8.
define void @bswap_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}
505
; No vscale_range: at 256-bit VL v16i32 splits into two vl8 halves (second
; half at element offset x8 = 8, scaled lsl #2); at >=512-bit VL one vl16 op.
define void @bswap_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    revb z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}
531
; vscale_range(8,0): VL >= 1024 bits, so v32i32 fits one vl32 revb op.
define void @bswap_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}
545
; vscale_range(16,0): VL >= 2048 bits, so v64i32 fits one vl64 revb op.
define void @bswap_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}
559
; Don't use SVE for 64-bit vectors: NEON rev64 does the byte swap directly.
define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev64 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}
569
; Don't use SVE for 128-bit vectors: NEON rev64 does the byte swap directly.
define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev64 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}
579
; v4i64 in memory: ld1d / predicated revb / st1d under vl4.
define void @bswap_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}
593
; No vscale_range: at 256-bit VL v8i64 splits into two vl4 halves (second
; half at element offset x8 = 4, scaled lsl #3); at >=512-bit VL one vl8 op.
define void @bswap_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    revb z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}
619
; vscale_range(8,0): VL >= 1024 bits, so v16i64 fits one vl16 revb op.
define void @bswap_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}
633
; vscale_range(16,0): VL >= 2048 bits, so v32i64 fits one vl32 revb op.
define void @bswap_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}
647
attributes #0 = { "target-features"="+sve" }

; Declarations for the llvm.bitreverse intrinsics exercised above (RBIT tests).
declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>)
declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
declare <128 x i8> @llvm.bitreverse.v128i8(<128 x i8>)
declare <256 x i8> @llvm.bitreverse.v256i8(<256 x i8>)
declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>)
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>)
declare <64 x i16> @llvm.bitreverse.v64i16(<64 x i16>)
declare <128 x i16> @llvm.bitreverse.v128i16(<128 x i16>)
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>)
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>)
declare <32 x i32> @llvm.bitreverse.v32i32(<32 x i32>)
declare <64 x i32> @llvm.bitreverse.v64i32(<64 x i32>)
declare <1 x i64> @llvm.bitreverse.v1i64(<1 x i64>)
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>)
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bitreverse.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bitreverse.v32i64(<32 x i64>)

; Declarations for the llvm.bswap intrinsics exercised above (REVB/rev tests).
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
declare <32 x i16> @llvm.bswap.v32i16(<32 x i16>)
declare <64 x i16> @llvm.bswap.v64i16(<64 x i16>)
declare <128 x i16> @llvm.bswap.v128i16(<128 x i16>)
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
declare <16 x i32> @llvm.bswap.v16i32(<16 x i32>)
declare <32 x i32> @llvm.bswap.v32i32(<32 x i32>)
declare <64 x i32> @llvm.bswap.v64i32(<64 x i32>)
declare <1 x i64> @llvm.bswap.v1i64(<1 x i64>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)
693