xref: /llvm-project/llvm/test/CodeGen/AArch64/arm64-rev.ll (revision 3d18c8cd265c0c0bf1d85226c4770a2dd0f86e8f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4
; i32 bswap lowers to a single 32-bit REV.
5define i32 @test_rev_w(i32 %a) nounwind {
6; CHECK-LABEL: test_rev_w:
7; CHECK:       // %bb.0: // %entry
8; CHECK-NEXT:    rev w0, w0
9; CHECK-NEXT:    ret
10entry:
11  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
12  ret i32 %0
13}
14
; i64 bswap lowers to a single 64-bit REV.
15define i64 @test_rev_x(i64 %a) nounwind {
16; CHECK-LABEL: test_rev_x:
17; CHECK:       // %bb.0: // %entry
18; CHECK-NEXT:    rev x0, x0
19; CHECK-NEXT:    ret
20entry:
21  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
22  ret i64 %0
23}
24
25; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
26; of %a are zero. This optimizes rev + lsr 16 to rev16.
; NOTE(review): the current SD checks below still show rev + lsr #16 rather
; than rev16; the rev16 form appears in test_rev_w_srl16_add. GISel also does
; not yet fold away the explicit zero-extend (extra AND).
27define i32 @test_rev_w_srl16(i16 %a) {
28; CHECK-SD-LABEL: test_rev_w_srl16:
29; CHECK-SD:       // %bb.0: // %entry
30; CHECK-SD-NEXT:    rev w8, w0
31; CHECK-SD-NEXT:    lsr w0, w8, #16
32; CHECK-SD-NEXT:    ret
33;
34; CHECK-GI-LABEL: test_rev_w_srl16:
35; CHECK-GI:       // %bb.0: // %entry
36; CHECK-GI-NEXT:    and w8, w0, #0xffff
37; CHECK-GI-NEXT:    rev w8, w8
38; CHECK-GI-NEXT:    lsr w0, w8, #16
39; CHECK-GI-NEXT:    ret
40entry:
41  %0 = zext i16 %a to i32
42  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
43  %2 = lshr i32 %1, 16
44  ret i32 %2
45}
46
; Same pattern as above but the zero-extended value comes from an i16 load
; (ldrh already zero-extends), so SD and GI produce identical code.
47define i32 @test_rev_w_srl16_load(ptr %a) {
48; CHECK-LABEL: test_rev_w_srl16_load:
49; CHECK:       // %bb.0: // %entry
50; CHECK-NEXT:    ldrh w8, [x0]
51; CHECK-NEXT:    rev w8, w8
52; CHECK-NEXT:    lsr w0, w8, #16
53; CHECK-NEXT:    ret
54entry:
55  %0 = load i16, ptr %a
56  %1 = zext i16 %0 to i32
57  %2 = tail call i32 @llvm.bswap.i32(i32 %1)
58  %3 = lshr i32 %2, 16
59  ret i32 %3
60}
61
; Here the high 16 bits of the bswap input are provably zero (sum of two
; zero-extended i8s), so SDAG folds rev + lsr #16 into rev16. GISel does not
; perform this fold yet.
62define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
63; CHECK-SD-LABEL: test_rev_w_srl16_add:
64; CHECK-SD:       // %bb.0: // %entry
65; CHECK-SD-NEXT:    and w8, w0, #0xff
66; CHECK-SD-NEXT:    add w8, w8, w1, uxtb
67; CHECK-SD-NEXT:    rev16 w0, w8
68; CHECK-SD-NEXT:    ret
69;
70; CHECK-GI-LABEL: test_rev_w_srl16_add:
71; CHECK-GI:       // %bb.0: // %entry
72; CHECK-GI-NEXT:    and w8, w1, #0xff
73; CHECK-GI-NEXT:    add w8, w8, w0, uxtb
74; CHECK-GI-NEXT:    rev w8, w8
75; CHECK-GI-NEXT:    lsr w0, w8, #16
76; CHECK-GI-NEXT:    ret
77entry:
78  %0 = zext i8 %a to i32
79  %1 = zext i8 %b to i32
80  %2 = add i32 %0, %1
81  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
82  %4 = lshr i32 %3, 16
83  ret i32 %4
84}
85
86; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
87; of %a are zero. This optimizes rev + lsr 32 to rev32.
; NOTE(review): current checks still show rev + lsr #32; the rev32 form
; appears in test_rev_x_srl32_shift below.
88define i64 @test_rev_x_srl32(i32 %a) {
89; CHECK-SD-LABEL: test_rev_x_srl32:
90; CHECK-SD:       // %bb.0: // %entry
91; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
92; CHECK-SD-NEXT:    rev x8, x0
93; CHECK-SD-NEXT:    lsr x0, x8, #32
94; CHECK-SD-NEXT:    ret
95;
96; CHECK-GI-LABEL: test_rev_x_srl32:
97; CHECK-GI:       // %bb.0: // %entry
98; CHECK-GI-NEXT:    mov w8, w0
99; CHECK-GI-NEXT:    rev x8, x8
100; CHECK-GI-NEXT:    lsr x0, x8, #32
101; CHECK-GI-NEXT:    ret
102entry:
103  %0 = zext i32 %a to i64
104  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
105  %2 = lshr i64 %1, 32
106  ret i64 %2
107}
108
; As test_rev_x_srl32 but the zero-extend comes from an i32 load (ldr w8
; implicitly zeroes the upper 32 bits); SD and GI agree here.
109define i64 @test_rev_x_srl32_load(ptr %a) {
110; CHECK-LABEL: test_rev_x_srl32_load:
111; CHECK:       // %bb.0: // %entry
112; CHECK-NEXT:    ldr w8, [x0]
113; CHECK-NEXT:    rev x8, x8
114; CHECK-NEXT:    lsr x0, x8, #32
115; CHECK-NEXT:    ret
116entry:
117  %0 = load i32, ptr %a
118  %1 = zext i32 %0 to i64
119  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
120  %3 = lshr i64 %2, 32
121  ret i64 %3
122}
123
; High 32 bits are known zero via the shl/lshr pair (ubfx), so SDAG folds
; rev + lsr #32 into rev32. GISel does not perform this fold yet.
124define i64 @test_rev_x_srl32_shift(i64 %a) {
125; CHECK-SD-LABEL: test_rev_x_srl32_shift:
126; CHECK-SD:       // %bb.0: // %entry
127; CHECK-SD-NEXT:    ubfx x8, x0, #2, #29
128; CHECK-SD-NEXT:    rev32 x0, x8
129; CHECK-SD-NEXT:    ret
130;
131; CHECK-GI-LABEL: test_rev_x_srl32_shift:
132; CHECK-GI:       // %bb.0: // %entry
133; CHECK-GI-NEXT:    ubfx x8, x0, #2, #29
134; CHECK-GI-NEXT:    rev x8, x8
135; CHECK-GI-NEXT:    lsr x0, x8, #32
136; CHECK-GI-NEXT:    ret
137entry:
138  %0 = shl i64 %a, 33
139  %1 = lshr i64 %0, 35
140  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
141  %3 = lshr i64 %2, 32
142  ret i64 %3
143}
144
145declare i32 @llvm.bswap.i32(i32) nounwind readnone
146declare i64 @llvm.bswap.i64(i64) nounwind readnone
147
; Byte-swap-within-halfwords expressed as shift/and/or; SDAG matches it to a
; single REV16, GISel still emits the expanded form.
148define i32 @test_rev16_w(i32 %X) nounwind {
149; CHECK-SD-LABEL: test_rev16_w:
150; CHECK-SD:       // %bb.0: // %entry
151; CHECK-SD-NEXT:    rev16 w0, w0
152; CHECK-SD-NEXT:    ret
153;
154; CHECK-GI-LABEL: test_rev16_w:
155; CHECK-GI:       // %bb.0: // %entry
156; CHECK-GI-NEXT:    lsr w8, w0, #8
157; CHECK-GI-NEXT:    lsl w9, w0, #8
158; CHECK-GI-NEXT:    and w10, w8, #0xff0000
159; CHECK-GI-NEXT:    and w11, w9, #0xff000000
160; CHECK-GI-NEXT:    and w8, w8, #0xff
161; CHECK-GI-NEXT:    and w9, w9, #0xff00
162; CHECK-GI-NEXT:    orr w10, w11, w10
163; CHECK-GI-NEXT:    orr w8, w9, w8
164; CHECK-GI-NEXT:    orr w0, w10, w8
165; CHECK-GI-NEXT:    ret
166entry:
167  %tmp1 = lshr i32 %X, 8
; The i32->i32 bitcast below is a no-op kept from the original test input.
168  %X15 = bitcast i32 %X to i32
169  %tmp4 = shl i32 %X15, 8
170  %tmp2 = and i32 %tmp1, 16711680
171  %tmp5 = and i32 %tmp4, -16777216
172  %tmp9 = and i32 %tmp1, 255
173  %tmp13 = and i32 %tmp4, 65280
174  %tmp6 = or i32 %tmp5, %tmp2
175  %tmp10 = or i32 %tmp6, %tmp13
176  %tmp14 = or i32 %tmp10, %tmp9
177  ret i32 %tmp14
178}
179
180; 64-bit REV16 is *not* a swap then a 16-bit rotation:
181;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
182;   01234567 ->(rev16) 10325476
; So a bswap followed by a 16-bit rotate must stay rev + ror, not become rev16.
183define i64 @test_rev16_x(i64 %a) nounwind {
184; CHECK-LABEL: test_rev16_x:
185; CHECK:       // %bb.0: // %entry
186; CHECK-NEXT:    rev x8, x0
187; CHECK-NEXT:    ror x0, x8, #16
188; CHECK-NEXT:    ret
189entry:
190  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
191  %1 = lshr i64 %0, 16
192  %2 = shl i64 %0, 48
193  %3 = or i64 %1, %2
194  ret i64 %3
195}
196
; bswap + rotate-by-32 is exactly REV32 (swap bytes within each 32-bit word).
197define i64 @test_rev32_x(i64 %a) nounwind {
198; CHECK-LABEL: test_rev32_x:
199; CHECK:       // %bb.0: // %entry
200; CHECK-NEXT:    rev32 x0, x0
201; CHECK-NEXT:    ret
202entry:
203  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
204  %1 = lshr i64 %0, 32
205  %2 = shl i64 %0, 32
206  %3 = or i64 %1, %2
207  ret i64 %3
208}
209
; Full-reverse shuffle of a 64-bit vector of bytes -> REV64.8b.
210define <8 x i8> @test_vrev64D8(ptr %A) nounwind {
211; CHECK-LABEL: test_vrev64D8:
212; CHECK:       // %bb.0:
213; CHECK-NEXT:    ldr d0, [x0]
214; CHECK-NEXT:    rev64.8b v0, v0
215; CHECK-NEXT:    ret
216	%tmp1 = load <8 x i8>, ptr %A
217	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
218	ret <8 x i8> %tmp2
219}
220
; Full-reverse shuffle of a 64-bit vector of halfwords -> REV64.4h.
221define <4 x i16> @test_vrev64D16(ptr %A) nounwind {
222; CHECK-LABEL: test_vrev64D16:
223; CHECK:       // %bb.0:
224; CHECK-NEXT:    ldr d0, [x0]
225; CHECK-NEXT:    rev64.4h v0, v0
226; CHECK-NEXT:    ret
227	%tmp1 = load <4 x i16>, ptr %A
228	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
229	ret <4 x i16> %tmp2
230}
231
; Full-reverse shuffle of a 64-bit vector of words -> REV64.2s.
232define <2 x i32> @test_vrev64D32(ptr %A) nounwind {
233; CHECK-LABEL: test_vrev64D32:
234; CHECK:       // %bb.0:
235; CHECK-NEXT:    ldr d0, [x0]
236; CHECK-NEXT:    rev64.2s v0, v0
237; CHECK-NEXT:    ret
238	%tmp1 = load <2 x i32>, ptr %A
239	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
240	ret <2 x i32> %tmp2
241}
242
; Same as the i32 case: float element type also selects REV64.2s.
243define <2 x float> @test_vrev64Df(ptr %A) nounwind {
244; CHECK-LABEL: test_vrev64Df:
245; CHECK:       // %bb.0:
246; CHECK-NEXT:    ldr d0, [x0]
247; CHECK-NEXT:    rev64.2s v0, v0
248; CHECK-NEXT:    ret
249	%tmp1 = load <2 x float>, ptr %A
250	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
251	ret <2 x float> %tmp2
252}
253
; 128-bit REV64: bytes reversed within each 64-bit half -> REV64.16b.
254define <16 x i8> @test_vrev64Q8(ptr %A) nounwind {
255; CHECK-LABEL: test_vrev64Q8:
256; CHECK:       // %bb.0:
257; CHECK-NEXT:    ldr q0, [x0]
258; CHECK-NEXT:    rev64.16b v0, v0
259; CHECK-NEXT:    ret
260	%tmp1 = load <16 x i8>, ptr %A
261	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
262	ret <16 x i8> %tmp2
263}
264
; 128-bit REV64: halfwords reversed within each 64-bit half -> REV64.8h.
265define <8 x i16> @test_vrev64Q16(ptr %A) nounwind {
266; CHECK-LABEL: test_vrev64Q16:
267; CHECK:       // %bb.0:
268; CHECK-NEXT:    ldr q0, [x0]
269; CHECK-NEXT:    rev64.8h v0, v0
270; CHECK-NEXT:    ret
271	%tmp1 = load <8 x i16>, ptr %A
272	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
273	ret <8 x i16> %tmp2
274}
275
; 128-bit REV64: words reversed within each 64-bit half -> REV64.4s.
276define <4 x i32> @test_vrev64Q32(ptr %A) nounwind {
277; CHECK-LABEL: test_vrev64Q32:
278; CHECK:       // %bb.0:
279; CHECK-NEXT:    ldr q0, [x0]
280; CHECK-NEXT:    rev64.4s v0, v0
281; CHECK-NEXT:    ret
282	%tmp1 = load <4 x i32>, ptr %A
283	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
284	ret <4 x i32> %tmp2
285}
286
; Float variant of the previous test; element type does not change selection.
287define <4 x float> @test_vrev64Qf(ptr %A) nounwind {
288; CHECK-LABEL: test_vrev64Qf:
289; CHECK:       // %bb.0:
290; CHECK-NEXT:    ldr q0, [x0]
291; CHECK-NEXT:    rev64.4s v0, v0
292; CHECK-NEXT:    ret
293	%tmp1 = load <4 x float>, ptr %A
294	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
295	ret <4 x float> %tmp2
296}
297
; Bytes reversed within each 32-bit word -> REV32.8b.
298define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
299; CHECK-LABEL: test_vrev32D8:
300; CHECK:       // %bb.0:
301; CHECK-NEXT:    ldr d0, [x0]
302; CHECK-NEXT:    rev32.8b v0, v0
303; CHECK-NEXT:    ret
304	%tmp1 = load <8 x i8>, ptr %A
305	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
306	ret <8 x i8> %tmp2
307}
308
; Halfwords swapped within each 32-bit word -> REV32.4h.
309define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
310; CHECK-LABEL: test_vrev32D16:
311; CHECK:       // %bb.0:
312; CHECK-NEXT:    ldr d0, [x0]
313; CHECK-NEXT:    rev32.4h v0, v0
314; CHECK-NEXT:    ret
315	%tmp1 = load <4 x i16>, ptr %A
316	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
317	ret <4 x i16> %tmp2
318}
319
; 128-bit: bytes reversed within each 32-bit word -> REV32.16b.
320define <16 x i8> @test_vrev32Q8(ptr %A) nounwind {
321; CHECK-LABEL: test_vrev32Q8:
322; CHECK:       // %bb.0:
323; CHECK-NEXT:    ldr q0, [x0]
324; CHECK-NEXT:    rev32.16b v0, v0
325; CHECK-NEXT:    ret
326	%tmp1 = load <16 x i8>, ptr %A
327	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
328	ret <16 x i8> %tmp2
329}
330
; 128-bit: halfwords swapped within each 32-bit word -> REV32.8h.
331define <8 x i16> @test_vrev32Q16(ptr %A) nounwind {
332; CHECK-LABEL: test_vrev32Q16:
333; CHECK:       // %bb.0:
334; CHECK-NEXT:    ldr q0, [x0]
335; CHECK-NEXT:    rev32.8h v0, v0
336; CHECK-NEXT:    ret
337	%tmp1 = load <8 x i16>, ptr %A
338	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
339	ret <8 x i16> %tmp2
340}
341
; Bytes swapped within each 16-bit halfword -> REV16.8b.
342define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
343; CHECK-LABEL: test_vrev16D8:
344; CHECK:       // %bb.0:
345; CHECK-NEXT:    ldr d0, [x0]
346; CHECK-NEXT:    rev16.8b v0, v0
347; CHECK-NEXT:    ret
348	%tmp1 = load <8 x i8>, ptr %A
349	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
350	ret <8 x i8> %tmp2
351}
352
; 128-bit: bytes swapped within each 16-bit halfword -> REV16.16b.
353define <16 x i8> @test_vrev16Q8(ptr %A) nounwind {
354; CHECK-LABEL: test_vrev16Q8:
355; CHECK:       // %bb.0:
356; CHECK-NEXT:    ldr q0, [x0]
357; CHECK-NEXT:    rev16.16b v0, v0
358; CHECK-NEXT:    ret
359	%tmp1 = load <16 x i8>, ptr %A
360	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
361	ret <16 x i8> %tmp2
362}
363
364; Undef shuffle indices should not prevent matching to VREV:
; Mask lanes 1 and 2 are undef; the remaining lanes still uniquely identify
; the REV64.8b pattern.
366define <8 x i8> @test_vrev64D8_undef(ptr %A) nounwind {
367; CHECK-LABEL: test_vrev64D8_undef:
368; CHECK:       // %bb.0:
369; CHECK-NEXT:    ldr d0, [x0]
370; CHECK-NEXT:    rev64.8b v0, v0
371; CHECK-NEXT:    ret
372	%tmp1 = load <8 x i8>, ptr %A
373	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
374	ret <8 x i8> %tmp2
375}
376
; As above, partially-undef mask still matches REV32.8h.
377define <8 x i16> @test_vrev32Q16_undef(ptr %A) nounwind {
378; CHECK-LABEL: test_vrev32Q16_undef:
379; CHECK:       // %bb.0:
380; CHECK-NEXT:    ldr q0, [x0]
381; CHECK-NEXT:    rev32.8h v0, v0
382; CHECK-NEXT:    ret
383	%tmp1 = load <8 x i16>, ptr %A
384	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
385	ret <8 x i16> %tmp2
386}
387
388; vrev <4 x i16> should use REV32 and not REV64
; In practice the extract/insert/store sequence is lowered to two lane stores
; (st1.h); SD and GI only differ in the order of the two stores.
389define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp {
390; CHECK-SD-LABEL: test_vrev64:
391; CHECK-SD:       // %bb.0: // %entry
392; CHECK-SD-NEXT:    ldr q0, [x0]
393; CHECK-SD-NEXT:    add x8, x1, #2
394; CHECK-SD-NEXT:    st1.h { v0 }[5], [x8]
395; CHECK-SD-NEXT:    st1.h { v0 }[6], [x1]
396; CHECK-SD-NEXT:    ret
397;
398; CHECK-GI-LABEL: test_vrev64:
399; CHECK-GI:       // %bb.0: // %entry
400; CHECK-GI-NEXT:    ldr q0, [x0]
401; CHECK-GI-NEXT:    add x8, x1, #2
402; CHECK-GI-NEXT:    st1.h { v0 }[6], [x1]
403; CHECK-GI-NEXT:    st1.h { v0 }[5], [x8]
404; CHECK-GI-NEXT:    ret
405entry:
406  %tmp2 = load <8 x i16>, ptr %source, align 4
407  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
408  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
409  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
410  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
411  store <2 x i16> %tmp11, ptr %dst, align 4
412  ret void
413}
414
415; Test vrev of float4
; Two-source shuffle (constant zero vector + loaded vector); SDAG lowers it
; with dup + lane load, GISel falls back to a constant-pool TBL.
416define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind noinline ssp {
417; CHECK-SD-LABEL: float_vrev64:
418; CHECK-SD:       // %bb.0: // %entry
419; CHECK-SD-NEXT:    movi.2d v0, #0000000000000000
420; CHECK-SD-NEXT:    add x8, x0, #12
421; CHECK-SD-NEXT:    dup.4s v0, v0[0]
422; CHECK-SD-NEXT:    ld1.s { v0 }[1], [x8]
423; CHECK-SD-NEXT:    str q0, [x1, #176]
424; CHECK-SD-NEXT:    ret
425;
426; CHECK-GI-LABEL: float_vrev64:
427; CHECK-GI:       // %bb.0: // %entry
428; CHECK-GI-NEXT:    movi d0, #0000000000000000
429; CHECK-GI-NEXT:    adrp x8, .LCPI28_0
430; CHECK-GI-NEXT:    ldr q1, [x0]
431; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI28_0]
432; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
433; CHECK-GI-NEXT:    str q0, [x1, #176]
434; CHECK-GI-NEXT:    ret
435entry:
436  %tmp2 = load <4 x float>, ptr %source, align 4
437  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
438  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %dest, i32 11
439  store <4 x float> %tmp5, ptr %arrayidx8, align 4
440  ret void
441}
442
443
; Vector bswap of v4i32 = byte reverse within each 32-bit lane -> REV32.16b.
444define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
445; CHECK-LABEL: test_vrev32_bswap:
446; CHECK:       // %bb.0:
447; CHECK-NEXT:    rev32.16b v0, v0
448; CHECK-NEXT:    ret
449  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
450  ret <4 x i32> %bswap
451}
452
453declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
454
455; Reduced regression from D114354
; i16 bswap feeding a truncating store: SDAG selects rev16 + strh, GISel
; expands to rev + lsr #16 + strh. The undef pointer/phi values are
; intentional — this is a crash/selection reduction, not a semantic test.
456define void @test_rev16_truncstore() {
457; CHECK-SD-LABEL: test_rev16_truncstore:
458; CHECK-SD:       // %bb.0: // %entry
459; CHECK-SD-NEXT:    cbnz wzr, .LBB30_2
460; CHECK-SD-NEXT:  .LBB30_1: // %cleanup
461; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
462; CHECK-SD-NEXT:    ldrh w8, [x8]
463; CHECK-SD-NEXT:    rev16 w8, w8
464; CHECK-SD-NEXT:    strh w8, [x8]
465; CHECK-SD-NEXT:    cbz wzr, .LBB30_1
466; CHECK-SD-NEXT:  .LBB30_2: // %fail
467; CHECK-SD-NEXT:    ret
468;
469; CHECK-GI-LABEL: test_rev16_truncstore:
470; CHECK-GI:       // %bb.0: // %entry
471; CHECK-GI-NEXT:    tbnz wzr, #0, .LBB30_2
472; CHECK-GI-NEXT:  .LBB30_1: // %cleanup
473; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
474; CHECK-GI-NEXT:    ldrh w8, [x8]
475; CHECK-GI-NEXT:    rev w8, w8
476; CHECK-GI-NEXT:    lsr w8, w8, #16
477; CHECK-GI-NEXT:    strh w8, [x8]
478; CHECK-GI-NEXT:    tbz wzr, #0, .LBB30_1
479; CHECK-GI-NEXT:  .LBB30_2: // %fail
480; CHECK-GI-NEXT:    ret
481entry:
482  br label %body

483
484body:
485  %out.6269.i = phi ptr [ undef, %cleanup ], [ undef, %entry ]
486  %0 = load i16, ptr undef, align 2
487  %1 = icmp eq i16 undef, -10240
488  br i1 %1, label %fail, label %cleanup

489
490cleanup:
491  %or130.i = call i16 @llvm.bswap.i16(i16 %0)
492  store i16 %or130.i, ptr %out.6269.i, align 2
493  br label %body

494
495fail:
496  ret void
497}
498declare i16 @llvm.bswap.i16(i16)
499
500; Reduced regression from D120192
; bswap of a value masked to its high 16 bits: SDAG narrows to a 16-bit load
; plus rev16, GISel keeps the 32-bit load + and + rev.
501define void @test_bswap32_narrow(ptr %p0, ptr %p1) nounwind {
502; CHECK-SD-LABEL: test_bswap32_narrow:
503; CHECK-SD:       // %bb.0:
504; CHECK-SD-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
505; CHECK-SD-NEXT:    ldrh w8, [x0, #2]
506; CHECK-SD-NEXT:    mov x19, x1
507; CHECK-SD-NEXT:    rev16 w0, w8
508; CHECK-SD-NEXT:    bl gid_tbl_len
509; CHECK-SD-NEXT:    strh wzr, [x19]
510; CHECK-SD-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
511; CHECK-SD-NEXT:    ret
512;
513; CHECK-GI-LABEL: test_bswap32_narrow:
514; CHECK-GI:       // %bb.0:
515; CHECK-GI-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
516; CHECK-GI-NEXT:    ldr w8, [x0]
517; CHECK-GI-NEXT:    mov x19, x1
518; CHECK-GI-NEXT:    and w8, w8, #0xffff0000
519; CHECK-GI-NEXT:    rev w0, w8
520; CHECK-GI-NEXT:    bl gid_tbl_len
521; CHECK-GI-NEXT:    strh wzr, [x19]
522; CHECK-GI-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
523; CHECK-GI-NEXT:    ret
524  %ld = load i32, ptr %p0, align 4
525  %and = and i32 %ld, -65536
526  %bswap = tail call i32 @llvm.bswap.i32(i32 %and)
527  %and16 = zext i32 %bswap to i64
528  %call17 = tail call i32 @gid_tbl_len(i64 %and16)
529  store i16 0, ptr %p1, align 4
530  ret void
531}
532declare i32 @gid_tbl_len(...)
533
534; 64-bit REV16 is *not* a swap then a 16-bit rotation:
535;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
536;   01234567 ->(rev16) 10325476
537; Optimize patterns where rev16 can be generated for a 64-bit input.
; The two masks below are 0x00FF00FF00FF00FF and its complement, i.e. the
; canonical shift/and/or spelling of a per-halfword byte swap.
538define i64 @test_rev16_x_hwbyteswaps(i64 %a) nounwind {
539; CHECK-LABEL: test_rev16_x_hwbyteswaps:
540; CHECK:       // %bb.0: // %entry
541; CHECK-NEXT:    rev16 x0, x0
542; CHECK-NEXT:    ret
543entry:
544  %0 = lshr i64 %a, 8
545  %1 = and i64 %0, 71777214294589695
546  %2 = shl i64 %a, 8
547  %3 = and i64 %2, -71777214294589696
548  %4 = or i64 %1, %3
549  ret i64 %4
550}
551
552; Optimize pattern with multiple and/or to a simple pattern which can enable generation of rev16.
; Same per-halfword byte swap as above, but spelled as eight single-byte
; masks combined left-to-right. Neither selector currently collapses this
; to a single rev16; these checks document the current (unoptimized) output.
553define i64 @test_rev16_x_hwbyteswaps_complex1(i64 %a) nounwind {
554; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex1:
555; CHECK-SD:       // %bb.0: // %entry
556; CHECK-SD-NEXT:    lsr x8, x0, #8
557; CHECK-SD-NEXT:    lsr x9, x0, #48
558; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
559; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
560; CHECK-SD-NEXT:    and x8, x8, #0xff0000
561; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
562; CHECK-SD-NEXT:    lsr x9, x0, #32
563; CHECK-SD-NEXT:    orr x10, x10, x11
564; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
565; CHECK-SD-NEXT:    lsr x9, x0, #16
566; CHECK-SD-NEXT:    orr x8, x10, x8
567; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
568; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
569; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
570; CHECK-SD-NEXT:    orr x0, x8, x9
571; CHECK-SD-NEXT:    ret
572;
573; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex1:
574; CHECK-GI:       // %bb.0: // %entry
575; CHECK-GI-NEXT:    lsr x8, x0, #8
576; CHECK-GI-NEXT:    lsl x9, x0, #8
577; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
578; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
579; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
580; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
581; CHECK-GI-NEXT:    and x14, x8, #0xff0000
582; CHECK-GI-NEXT:    orr x10, x10, x11
583; CHECK-GI-NEXT:    and x11, x9, #0xff000000
584; CHECK-GI-NEXT:    orr x12, x12, x13
585; CHECK-GI-NEXT:    and x8, x8, #0xff
586; CHECK-GI-NEXT:    orr x11, x14, x11
587; CHECK-GI-NEXT:    orr x10, x10, x12
588; CHECK-GI-NEXT:    and x9, x9, #0xff00
589; CHECK-GI-NEXT:    orr x8, x11, x8
590; CHECK-GI-NEXT:    orr x8, x10, x8
591; CHECK-GI-NEXT:    orr x0, x8, x9
592; CHECK-GI-NEXT:    ret
593entry:
594  %0 = lshr i64 %a, 8
595  %1 = and i64 %0, 71776119061217280
596  %2 = shl i64 %a, 8
597  %3 = and i64 %2, -72057594037927936
598  %4 = or i64 %1, %3
599  %5 = and i64 %0, 1095216660480
600  %6 = or i64 %4, %5
601  %7 = and i64 %2, 280375465082880
602  %8 = or i64 %6, %7
603  %9 = and i64 %0, 16711680
604  %10 = or i64 %8, %9
605  %11 = and i64 %2, 4278190080
606  %12 = or i64 %10, %11
607  %13 = and i64 %0, 255
608  %14 = or i64 %12, %13
609  %15 = and i64 %2, 65280
610  %16 = or i64 %14, %15
611  ret i64 %16
612}
613
; Variant of complex1 with all lshr-derived bytes combined first, then all
; shl-derived bytes. SDAG at least re-forms the 0xff00ff00ff00ff mask for
; the lshr half; GISel keeps everything expanded.
614define i64 @test_rev16_x_hwbyteswaps_complex2(i64 %a) nounwind {
615; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex2:
616; CHECK-SD:       // %bb.0: // %entry
617; CHECK-SD-NEXT:    lsr x8, x0, #8
618; CHECK-SD-NEXT:    lsr x9, x0, #48
619; CHECK-SD-NEXT:    lsr x10, x0, #32
620; CHECK-SD-NEXT:    and x8, x8, #0xff00ff00ff00ff
621; CHECK-SD-NEXT:    bfi x8, x9, #56, #8
622; CHECK-SD-NEXT:    lsr x9, x0, #16
623; CHECK-SD-NEXT:    bfi x8, x10, #40, #8
624; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
625; CHECK-SD-NEXT:    bfi x8, x0, #8, #8
626; CHECK-SD-NEXT:    mov x0, x8
627; CHECK-SD-NEXT:    ret
628;
629; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex2:
630; CHECK-GI:       // %bb.0: // %entry
631; CHECK-GI-NEXT:    lsr x8, x0, #8
632; CHECK-GI-NEXT:    lsl x9, x0, #8
633; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
634; CHECK-GI-NEXT:    and x11, x8, #0xff00000000
635; CHECK-GI-NEXT:    and x12, x8, #0xff0000
636; CHECK-GI-NEXT:    and x8, x8, #0xff
637; CHECK-GI-NEXT:    and x13, x9, #0xff00000000000000
638; CHECK-GI-NEXT:    orr x10, x10, x11
639; CHECK-GI-NEXT:    and x11, x9, #0xff0000000000
640; CHECK-GI-NEXT:    orr x8, x12, x8
641; CHECK-GI-NEXT:    and x12, x9, #0xff000000
642; CHECK-GI-NEXT:    orr x11, x13, x11
643; CHECK-GI-NEXT:    orr x8, x10, x8
644; CHECK-GI-NEXT:    and x9, x9, #0xff00
645; CHECK-GI-NEXT:    orr x10, x11, x12
646; CHECK-GI-NEXT:    orr x8, x8, x10
647; CHECK-GI-NEXT:    orr x0, x8, x9
648; CHECK-GI-NEXT:    ret
649entry:
650  %0 = lshr i64 %a, 8
651  %1 = and i64 %0, 71776119061217280
652  %2 = shl i64 %a, 8
653  %3 = and i64 %0, 1095216660480
654  %4 = or i64 %1, %3
655  %5 = and i64 %0, 16711680
656  %6 = or i64 %4, %5
657  %7 = and i64 %0, 255
658  %8 = or i64 %6, %7
659  %9 = and i64 %2, -72057594037927936
660  %10 = or i64 %8, %9
661  %11 = and i64 %2, 280375465082880
662  %12 = or i64 %10, %11
663  %13 = and i64 %2, 4278190080
664  %14 = or i64 %12, %13
665  %15 = and i64 %2, 65280
666  %16 = or i64 %14, %15
667  ret i64 %16
668}
669
670; Optimize pattern with multiple and/or to a simple pattern which can enable generation of rev16.
; Same as complex1 but with the operands of each or swapped, checking that
; operand order does not change the (currently unoptimized) selection.
671define i64 @test_rev16_x_hwbyteswaps_complex3(i64 %a) nounwind {
672; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex3:
673; CHECK-SD:       // %bb.0: // %entry
674; CHECK-SD-NEXT:    lsr x8, x0, #8
675; CHECK-SD-NEXT:    lsr x9, x0, #48
676; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
677; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
678; CHECK-SD-NEXT:    and x8, x8, #0xff0000
679; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
680; CHECK-SD-NEXT:    lsr x9, x0, #32
681; CHECK-SD-NEXT:    orr x10, x11, x10
682; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
683; CHECK-SD-NEXT:    lsr x9, x0, #16
684; CHECK-SD-NEXT:    orr x8, x8, x10
685; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
686; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
687; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
688; CHECK-SD-NEXT:    orr x0, x9, x8
689; CHECK-SD-NEXT:    ret
690;
691; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex3:
692; CHECK-GI:       // %bb.0: // %entry
693; CHECK-GI-NEXT:    lsr x8, x0, #8
694; CHECK-GI-NEXT:    lsl x9, x0, #8
695; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
696; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
697; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
698; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
699; CHECK-GI-NEXT:    and x14, x8, #0xff0000
700; CHECK-GI-NEXT:    orr x10, x11, x10
701; CHECK-GI-NEXT:    and x11, x9, #0xff000000
702; CHECK-GI-NEXT:    orr x12, x13, x12
703; CHECK-GI-NEXT:    and x8, x8, #0xff
704; CHECK-GI-NEXT:    orr x11, x11, x14
705; CHECK-GI-NEXT:    orr x10, x12, x10
706; CHECK-GI-NEXT:    and x9, x9, #0xff00
707; CHECK-GI-NEXT:    orr x8, x8, x11
708; CHECK-GI-NEXT:    orr x8, x8, x10
709; CHECK-GI-NEXT:    orr x0, x9, x8
710; CHECK-GI-NEXT:    ret
711entry:
712  %0 = lshr i64 %a, 8
713  %1 = and i64 %0, 71776119061217280
714  %2 = shl i64 %a, 8
715  %3 = and i64 %2, -72057594037927936
716  %4 = or i64 %3, %1
717  %5 = and i64 %0, 1095216660480
718  %6 = or i64 %5, %4
719  %7 = and i64 %2, 280375465082880
720  %8 = or i64 %7, %6
721  %9 = and i64 %0, 16711680
722  %10 = or i64 %9, %8
723  %11 = and i64 %2, 4278190080
724  %12 = or i64 %11, %10
725  %13 = and i64 %0, 255
726  %14 = or i64 %13, %12
727  %15 = and i64 %2, 65280
728  %16 = or i64 %15, %14
729  ret i64 %16
730}
731
; Partial (incomplete) byte-swap pattern — only three byte lanes are
; combined, so no rev16 should be formed; checks pin the and/or lowering.
732define i64 @test_or_and_combine1(i64 %a) nounwind {
733; CHECK-SD-LABEL: test_or_and_combine1:
734; CHECK-SD:       // %bb.0: // %entry
735; CHECK-SD-NEXT:    lsr x8, x0, #8
736; CHECK-SD-NEXT:    lsr x9, x0, #24
737; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
738; CHECK-SD-NEXT:    and x8, x8, #0xff0000
739; CHECK-SD-NEXT:    bfi x10, x9, #32, #8
740; CHECK-SD-NEXT:    orr x0, x10, x8
741; CHECK-SD-NEXT:    ret
742;
743; CHECK-GI-LABEL: test_or_and_combine1:
744; CHECK-GI:       // %bb.0: // %entry
745; CHECK-GI-NEXT:    lsr x8, x0, #8
746; CHECK-GI-NEXT:    lsl x9, x0, #8
747; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
748; CHECK-GI-NEXT:    and x9, x9, #0xff00000000
749; CHECK-GI-NEXT:    and x8, x8, #0xff0000
750; CHECK-GI-NEXT:    orr x9, x10, x9
751; CHECK-GI-NEXT:    orr x0, x9, x8
752; CHECK-GI-NEXT:    ret
753entry:
754  %0 = lshr i64 %a, 8
755  %1 = and i64 %0, 71776119061217280
756  %2 = shl i64 %a, 8
757  %3 = and i64 %2, 1095216660480
758  %4 = or i64 %1, %3
759  %5 = and i64 %0, 16711680
760  %6 = or i64 %4, %5
761  ret i64 %6
762}
763
; Another negative test: one or operand is the unmasked shl result, which
; breaks the byte-swap structure, so no rev16 may be formed.
764define i64 @test_or_and_combine2(i64 %a, i64 %b) nounwind {
765; CHECK-LABEL: test_or_and_combine2:
766; CHECK:       // %bb.0: // %entry
767; CHECK-NEXT:    lsr x8, x0, #8
768; CHECK-NEXT:    lsl x9, x0, #8
769; CHECK-NEXT:    and x10, x8, #0xff000000000000
770; CHECK-NEXT:    and x11, x9, #0xff00000000
771; CHECK-NEXT:    and x8, x8, #0xff0000
772; CHECK-NEXT:    orr x9, x10, x9
773; CHECK-NEXT:    orr x8, x11, x8
774; CHECK-NEXT:    orr x0, x9, x8
775; CHECK-NEXT:    ret
776entry:
777  %0 = lshr i64 %a, 8
778  %1 = and i64 %0, 71776119061217280
779  %2 = shl i64 %a, 8
780  %3 = or i64 %1, %2
781  %4 = and i64 %2, 1095216660480
782  %5 = or i64 %3, %4
783  %6 = and i64 %0, 16711680
784  %7 = or i64 %5, %6
785  ret i64 %7
786}
787
; PR55484: (or (lshr x, 8), (shl x, 8)) truncated to i16 then sign-extended
; must not be matched to a plain rev16 (the sext of the swapped halfword
; matters); both selectors keep the shift/or and use sxth.
788define i32 @pr55484(i32 %0) {
789; CHECK-SD-LABEL: pr55484:
790; CHECK-SD:       // %bb.0:
791; CHECK-SD-NEXT:    lsr w8, w0, #8
792; CHECK-SD-NEXT:    orr w8, w8, w0, lsl #8
793; CHECK-SD-NEXT:    sxth w0, w8
794; CHECK-SD-NEXT:    ret
795;
796; CHECK-GI-LABEL: pr55484:
797; CHECK-GI:       // %bb.0:
798; CHECK-GI-NEXT:    lsl w8, w0, #8
799; CHECK-GI-NEXT:    orr w8, w8, w0, lsr #8
800; CHECK-GI-NEXT:    sxth w0, w8
801; CHECK-GI-NEXT:    ret
802  %2 = lshr i32 %0, 8
803  %3 = shl i32 %0, 8
804  %4 = or i32 %2, %3
805  %5 = trunc i32 %4 to i16
806  %6 = sext i16 %5 to i32
807  ret i32 %6
808}
809