xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bswap.ll (revision 703e9e97d937f3bb25d4318d86e357a665e72731)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
5; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
6
; Declarations of the llvm.bswap intrinsic overloads called by the test
; functions in this file. Every declaration spells out its attributes
; explicitly instead of referring to a numbered attribute group.
7declare i16 @llvm.bswap.i16(i16) nounwind readnone
8declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
9declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone
10declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone
11declare i32 @llvm.bswap.i32(i32) nounwind readnone
12declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
13declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
14declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
15declare i64 @llvm.bswap.i64(i64) nounwind readnone
16declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
17declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
18declare i48 @llvm.bswap.i48(i48) nounwind readnone
19
; Kernel test: loads one i32 from %in, applies llvm.bswap.i32 and stores the
; result to %out. The SI checks expect two v_alignbit_b32 instructions (shift
; amounts 8 and 24) merged by v_bfi_b32 with mask 0xff00ff; the VI and GFX11
; checks expect a single v_perm_b32 with selector 0x10203.
20define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
21; SI-LABEL: test_bswap_i32:
22; SI:       ; %bb.0:
23; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
24; SI-NEXT:    s_waitcnt lgkmcnt(0)
25; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
26; SI-NEXT:    s_mov_b32 s3, 0xf000
27; SI-NEXT:    s_mov_b32 s2, -1
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
30; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
31; SI-NEXT:    s_mov_b32 s4, 0xff00ff
32; SI-NEXT:    v_bfi_b32 v0, s4, v1, v0
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: test_bswap_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
39; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
40; VI-NEXT:    s_mov_b32 s7, 0xf000
41; VI-NEXT:    s_mov_b32 s6, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
44; VI-NEXT:    s_mov_b32 s4, s0
45; VI-NEXT:    s_mov_b32 s5, s1
46; VI-NEXT:    s_waitcnt lgkmcnt(0)
47; VI-NEXT:    v_perm_b32 v0, 0, s2, v0
48; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
49; VI-NEXT:    s_endpgm
50;
51; GFX11-LABEL: test_bswap_i32:
52; GFX11:       ; %bb.0:
53; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
54; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
56; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
57; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX11-NEXT:    v_perm_b32 v0, 0, s2, 0x10203
59; GFX11-NEXT:    s_mov_b32 s2, -1
60; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
61; GFX11-NEXT:    s_endpgm
62  %val = load i32, ptr addrspace(1) %in, align 4
63  %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
64  store i32 %bswap, ptr addrspace(1) %out, align 4
65  ret void
66}
67
; Kernel test: loads a <2 x i32> from %in, applies llvm.bswap.v2i32 and stores
; the result to %out. The checks expect the scalar i32 sequence once per
; element (alignbit/alignbit/bfi on SI, one v_perm_b32 with selector 0x10203
; on VI and GFX11) followed by a single 64-bit store.
68define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
69; SI-LABEL: test_bswap_v2i32:
70; SI:       ; %bb.0:
71; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
72; SI-NEXT:    s_waitcnt lgkmcnt(0)
73; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
74; SI-NEXT:    s_mov_b32 s3, 0xf000
75; SI-NEXT:    s_mov_b32 s2, -1
76; SI-NEXT:    s_mov_b32 s6, 0xff00ff
77; SI-NEXT:    s_waitcnt lgkmcnt(0)
78; SI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
79; SI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
80; SI-NEXT:    v_alignbit_b32 v2, s4, s4, 8
81; SI-NEXT:    v_alignbit_b32 v3, s4, s4, 24
82; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
83; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
84; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
85; SI-NEXT:    s_endpgm
86;
87; VI-LABEL: test_bswap_v2i32:
88; VI:       ; %bb.0:
89; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
90; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
91; VI-NEXT:    s_mov_b32 s7, 0xf000
92; VI-NEXT:    s_mov_b32 s6, -1
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
95; VI-NEXT:    s_mov_b32 s4, s0
96; VI-NEXT:    s_mov_b32 s5, s1
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    v_perm_b32 v1, 0, s3, v0
99; VI-NEXT:    v_perm_b32 v0, 0, s2, v0
100; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
101; VI-NEXT:    s_endpgm
102;
103; GFX11-LABEL: test_bswap_v2i32:
104; GFX11:       ; %bb.0:
105; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
106; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
108; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
109; GFX11-NEXT:    s_mov_b32 s2, -1
110; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
112; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
113; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
114; GFX11-NEXT:    s_endpgm
115  %val = load <2 x i32>, ptr addrspace(1) %in, align 8
116  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
117  store <2 x i32> %bswap, ptr addrspace(1) %out, align 8
118  ret void
119}
120
; Kernel test: loads a <4 x i32> from %in, applies llvm.bswap.v4i32 and stores
; the result to %out. The checks expect one byte-swap sequence per element
; (four v_bfi_b32 on SI, four v_perm_b32 with selector 0x10203 on VI and
; GFX11) followed by a single 128-bit store.
121define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
122; SI-LABEL: test_bswap_v4i32:
123; SI:       ; %bb.0:
124; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
125; SI-NEXT:    s_waitcnt lgkmcnt(0)
126; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
127; SI-NEXT:    s_mov_b32 s3, 0xf000
128; SI-NEXT:    s_mov_b32 s2, -1
129; SI-NEXT:    s_mov_b32 s8, 0xff00ff
130; SI-NEXT:    s_waitcnt lgkmcnt(0)
131; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
132; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
133; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
134; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
135; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
136; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
137; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
138; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
139; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
140; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
141; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
142; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
143; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
144; SI-NEXT:    s_endpgm
145;
146; VI-LABEL: test_bswap_v4i32:
147; VI:       ; %bb.0:
148; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
149; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
150; VI-NEXT:    s_mov_b32 s7, 0xf000
151; VI-NEXT:    s_mov_b32 s6, -1
152; VI-NEXT:    s_waitcnt lgkmcnt(0)
153; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
154; VI-NEXT:    s_mov_b32 s4, s0
155; VI-NEXT:    s_mov_b32 s5, s1
156; VI-NEXT:    s_waitcnt lgkmcnt(0)
157; VI-NEXT:    v_perm_b32 v3, 0, s11, v0
158; VI-NEXT:    v_perm_b32 v2, 0, s10, v0
159; VI-NEXT:    v_perm_b32 v1, 0, s9, v0
160; VI-NEXT:    v_perm_b32 v0, 0, s8, v0
161; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
162; VI-NEXT:    s_endpgm
163;
164; GFX11-LABEL: test_bswap_v4i32:
165; GFX11:       ; %bb.0:
166; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
167; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
169; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
170; GFX11-NEXT:    s_mov_b32 s2, -1
171; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX11-NEXT:    v_perm_b32 v3, 0, s7, 0x10203
173; GFX11-NEXT:    v_perm_b32 v2, 0, s6, 0x10203
174; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
175; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
176; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
177; GFX11-NEXT:    s_endpgm
178  %val = load <4 x i32>, ptr addrspace(1) %in, align 16
179  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
180  store <4 x i32> %bswap, ptr addrspace(1) %out, align 16
181  ret void
182}
183
; Kernel test: loads an <8 x i32> from %in, applies llvm.bswap.v8i32 and
; stores the result to %out. The checks expect eight per-element byte-swap
; sequences and two 128-bit stores (at offset 16 and offset 0); on GFX11 the
; two stores are preceded by s_clause 0x1.
184define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
185; SI-LABEL: test_bswap_v8i32:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
188; SI-NEXT:    s_waitcnt lgkmcnt(0)
189; SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
190; SI-NEXT:    s_mov_b32 s3, 0xf000
191; SI-NEXT:    s_mov_b32 s2, -1
192; SI-NEXT:    s_mov_b32 s12, 0xff00ff
193; SI-NEXT:    s_waitcnt lgkmcnt(0)
194; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
195; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
196; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
197; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
198; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
199; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
200; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
201; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
202; SI-NEXT:    v_alignbit_b32 v9, s11, s11, 8
203; SI-NEXT:    v_alignbit_b32 v10, s11, s11, 24
204; SI-NEXT:    v_alignbit_b32 v11, s10, s10, 8
205; SI-NEXT:    v_alignbit_b32 v12, s10, s10, 24
206; SI-NEXT:    v_alignbit_b32 v13, s9, s9, 8
207; SI-NEXT:    v_alignbit_b32 v14, s9, s9, 24
208; SI-NEXT:    v_alignbit_b32 v15, s8, s8, 8
209; SI-NEXT:    v_alignbit_b32 v16, s8, s8, 24
210; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
211; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
212; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
213; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
214; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
215; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
216; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
217; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
218; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
219; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
220; SI-NEXT:    s_endpgm
221;
222; VI-LABEL: test_bswap_v8i32:
223; VI:       ; %bb.0:
224; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
225; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
226; VI-NEXT:    s_mov_b32 s15, 0xf000
227; VI-NEXT:    s_mov_b32 s14, -1
228; VI-NEXT:    s_waitcnt lgkmcnt(0)
229; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
230; VI-NEXT:    s_mov_b32 s12, s8
231; VI-NEXT:    s_mov_b32 s13, s9
232; VI-NEXT:    s_waitcnt lgkmcnt(0)
233; VI-NEXT:    v_perm_b32 v3, 0, s3, v4
234; VI-NEXT:    v_perm_b32 v2, 0, s2, v4
235; VI-NEXT:    v_perm_b32 v1, 0, s1, v4
236; VI-NEXT:    v_perm_b32 v0, 0, s0, v4
237; VI-NEXT:    v_perm_b32 v7, 0, s7, v4
238; VI-NEXT:    v_perm_b32 v6, 0, s6, v4
239; VI-NEXT:    v_perm_b32 v5, 0, s5, v4
240; VI-NEXT:    v_perm_b32 v4, 0, s4, v4
241; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
242; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
243; VI-NEXT:    s_endpgm
244;
245; GFX11-LABEL: test_bswap_v8i32:
246; GFX11:       ; %bb.0:
247; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
248; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
250; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
251; GFX11-NEXT:    s_mov_b32 s10, -1
252; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX11-NEXT:    v_perm_b32 v7, 0, s7, 0x10203
254; GFX11-NEXT:    v_perm_b32 v6, 0, s6, 0x10203
255; GFX11-NEXT:    v_perm_b32 v5, 0, s5, 0x10203
256; GFX11-NEXT:    v_perm_b32 v4, 0, s4, 0x10203
257; GFX11-NEXT:    v_perm_b32 v3, 0, s3, 0x10203
258; GFX11-NEXT:    v_perm_b32 v2, 0, s2, 0x10203
259; GFX11-NEXT:    v_perm_b32 v1, 0, s1, 0x10203
260; GFX11-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
261; GFX11-NEXT:    s_clause 0x1
262; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
263; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
264; GFX11-NEXT:    s_endpgm
265  %val = load <8 x i32>, ptr addrspace(1) %in, align 32
266  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
267  store <8 x i32> %bswap, ptr addrspace(1) %out, align 32
268  ret void
269}
270
; Kernel test: loads one i64 from %in, applies llvm.bswap.i64 and stores the
; result to %out. The checks expect each 32-bit half to be byte-swapped
; separately and the halves exchanged: the first loaded register feeds the
; second result register and vice versa (for example v1 from s2 and v0 from
; s3 in the VI checks).
271define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
272; SI-LABEL: test_bswap_i64:
273; SI:       ; %bb.0:
274; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
275; SI-NEXT:    s_waitcnt lgkmcnt(0)
276; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
277; SI-NEXT:    s_mov_b32 s3, 0xf000
278; SI-NEXT:    s_mov_b32 s2, -1
279; SI-NEXT:    s_mov_b32 s6, 0xff00ff
280; SI-NEXT:    s_waitcnt lgkmcnt(0)
281; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
282; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
283; SI-NEXT:    v_alignbit_b32 v2, s5, s5, 8
284; SI-NEXT:    v_alignbit_b32 v3, s5, s5, 24
285; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
286; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
287; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
288; SI-NEXT:    s_endpgm
289;
290; VI-LABEL: test_bswap_i64:
291; VI:       ; %bb.0:
292; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
293; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
294; VI-NEXT:    s_mov_b32 s7, 0xf000
295; VI-NEXT:    s_mov_b32 s6, -1
296; VI-NEXT:    s_waitcnt lgkmcnt(0)
297; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
298; VI-NEXT:    s_mov_b32 s4, s0
299; VI-NEXT:    s_mov_b32 s5, s1
300; VI-NEXT:    s_waitcnt lgkmcnt(0)
301; VI-NEXT:    v_perm_b32 v1, 0, s2, v0
302; VI-NEXT:    v_perm_b32 v0, 0, s3, v0
303; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
304; VI-NEXT:    s_endpgm
305;
306; GFX11-LABEL: test_bswap_i64:
307; GFX11:       ; %bb.0:
308; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
309; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
311; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
312; GFX11-NEXT:    s_mov_b32 s2, -1
313; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
315; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
316; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
317; GFX11-NEXT:    s_endpgm
318  %val = load i64, ptr addrspace(1) %in, align 8
319  %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
320  store i64 %bswap, ptr addrspace(1) %out, align 8
321  ret void
322}
323
; Kernel test: loads a <2 x i64> from %in, applies llvm.bswap.v2i64 and stores
; the result to %out. The checks expect, for each 64-bit element, both 32-bit
; halves to be byte-swapped and exchanged with each other, followed by a
; single 128-bit store.
324define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
325; SI-LABEL: test_bswap_v2i64:
326; SI:       ; %bb.0:
327; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
328; SI-NEXT:    s_waitcnt lgkmcnt(0)
329; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
330; SI-NEXT:    s_mov_b32 s3, 0xf000
331; SI-NEXT:    s_mov_b32 s2, -1
332; SI-NEXT:    s_mov_b32 s8, 0xff00ff
333; SI-NEXT:    s_waitcnt lgkmcnt(0)
334; SI-NEXT:    v_alignbit_b32 v0, s6, s6, 8
335; SI-NEXT:    v_alignbit_b32 v1, s6, s6, 24
336; SI-NEXT:    v_alignbit_b32 v2, s7, s7, 8
337; SI-NEXT:    v_alignbit_b32 v4, s7, s7, 24
338; SI-NEXT:    v_alignbit_b32 v5, s4, s4, 8
339; SI-NEXT:    v_alignbit_b32 v6, s4, s4, 24
340; SI-NEXT:    v_alignbit_b32 v7, s5, s5, 8
341; SI-NEXT:    v_alignbit_b32 v8, s5, s5, 24
342; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
343; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
344; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
345; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
346; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
347; SI-NEXT:    s_endpgm
348;
349; VI-LABEL: test_bswap_v2i64:
350; VI:       ; %bb.0:
351; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
352; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
353; VI-NEXT:    s_mov_b32 s7, 0xf000
354; VI-NEXT:    s_mov_b32 s6, -1
355; VI-NEXT:    s_waitcnt lgkmcnt(0)
356; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
357; VI-NEXT:    s_mov_b32 s4, s0
358; VI-NEXT:    s_mov_b32 s5, s1
359; VI-NEXT:    s_waitcnt lgkmcnt(0)
360; VI-NEXT:    v_perm_b32 v3, 0, s10, v0
361; VI-NEXT:    v_perm_b32 v2, 0, s11, v0
362; VI-NEXT:    v_perm_b32 v1, 0, s8, v0
363; VI-NEXT:    v_perm_b32 v0, 0, s9, v0
364; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
365; VI-NEXT:    s_endpgm
366;
367; GFX11-LABEL: test_bswap_v2i64:
368; GFX11:       ; %bb.0:
369; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
370; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
372; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
373; GFX11-NEXT:    s_mov_b32 s2, -1
374; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX11-NEXT:    v_perm_b32 v3, 0, s6, 0x10203
376; GFX11-NEXT:    v_perm_b32 v2, 0, s7, 0x10203
377; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
378; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
379; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
380; GFX11-NEXT:    s_endpgm
381  %val = load <2 x i64>, ptr addrspace(1) %in, align 16
382  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
383  store <2 x i64> %bswap, ptr addrspace(1) %out, align 16
384  ret void
385}
386
; Kernel test: loads a <4 x i64> from %in, applies llvm.bswap.v4i64 and stores
; the result to %out. The checks expect, for each 64-bit element, both 32-bit
; halves to be byte-swapped and exchanged with each other, followed by two
; 128-bit stores (at offset 16 and offset 0); on GFX11 the two stores are
; preceded by s_clause 0x1.
387define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
388; SI-LABEL: test_bswap_v4i64:
389; SI:       ; %bb.0:
390; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
391; SI-NEXT:    s_waitcnt lgkmcnt(0)
392; SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
393; SI-NEXT:    s_mov_b32 s3, 0xf000
394; SI-NEXT:    s_mov_b32 s2, -1
395; SI-NEXT:    s_mov_b32 s12, 0xff00ff
396; SI-NEXT:    s_waitcnt lgkmcnt(0)
397; SI-NEXT:    v_alignbit_b32 v0, s6, s6, 8
398; SI-NEXT:    v_alignbit_b32 v1, s6, s6, 24
399; SI-NEXT:    v_alignbit_b32 v2, s7, s7, 8
400; SI-NEXT:    v_alignbit_b32 v4, s7, s7, 24
401; SI-NEXT:    v_alignbit_b32 v5, s4, s4, 8
402; SI-NEXT:    v_alignbit_b32 v6, s4, s4, 24
403; SI-NEXT:    v_alignbit_b32 v7, s5, s5, 8
404; SI-NEXT:    v_alignbit_b32 v8, s5, s5, 24
405; SI-NEXT:    v_alignbit_b32 v9, s10, s10, 8
406; SI-NEXT:    v_alignbit_b32 v10, s10, s10, 24
407; SI-NEXT:    v_alignbit_b32 v11, s11, s11, 8
408; SI-NEXT:    v_alignbit_b32 v12, s11, s11, 24
409; SI-NEXT:    v_alignbit_b32 v13, s8, s8, 8
410; SI-NEXT:    v_alignbit_b32 v14, s8, s8, 24
411; SI-NEXT:    v_alignbit_b32 v15, s9, s9, 8
412; SI-NEXT:    v_alignbit_b32 v16, s9, s9, 24
413; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
414; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
415; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
416; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
417; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
418; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
419; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
420; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
421; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
422; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
423; SI-NEXT:    s_endpgm
424;
425; VI-LABEL: test_bswap_v4i64:
426; VI:       ; %bb.0:
427; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
428; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
429; VI-NEXT:    s_mov_b32 s15, 0xf000
430; VI-NEXT:    s_mov_b32 s14, -1
431; VI-NEXT:    s_waitcnt lgkmcnt(0)
432; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
433; VI-NEXT:    s_mov_b32 s12, s8
434; VI-NEXT:    s_mov_b32 s13, s9
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    v_perm_b32 v3, 0, s2, v4
437; VI-NEXT:    v_perm_b32 v2, 0, s3, v4
438; VI-NEXT:    v_perm_b32 v1, 0, s0, v4
439; VI-NEXT:    v_perm_b32 v0, 0, s1, v4
440; VI-NEXT:    v_perm_b32 v7, 0, s6, v4
441; VI-NEXT:    v_perm_b32 v6, 0, s7, v4
442; VI-NEXT:    v_perm_b32 v5, 0, s4, v4
443; VI-NEXT:    v_perm_b32 v4, 0, s5, v4
444; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
445; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
446; VI-NEXT:    s_endpgm
447;
448; GFX11-LABEL: test_bswap_v4i64:
449; GFX11:       ; %bb.0:
450; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
451; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
453; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
454; GFX11-NEXT:    s_mov_b32 s10, -1
455; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX11-NEXT:    v_perm_b32 v7, 0, s6, 0x10203
457; GFX11-NEXT:    v_perm_b32 v6, 0, s7, 0x10203
458; GFX11-NEXT:    v_perm_b32 v5, 0, s4, 0x10203
459; GFX11-NEXT:    v_perm_b32 v4, 0, s5, 0x10203
460; GFX11-NEXT:    v_perm_b32 v3, 0, s2, 0x10203
461; GFX11-NEXT:    v_perm_b32 v2, 0, s3, 0x10203
462; GFX11-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
463; GFX11-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
464; GFX11-NEXT:    s_clause 0x1
465; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
466; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
467; GFX11-NEXT:    s_endpgm
468  %val = load <4 x i64>, ptr addrspace(1) %in, align 32
469  %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
470  store <4 x i64> %bswap, ptr addrspace(1) %out, align 32
471  ret void
472}
473
; Truncates the i32 argument to i16, byte-swaps it with llvm.bswap.i16,
; reinterprets the 16 bits as half and extends to float. The SI checks expect
; a full 32-bit swap followed by a logical right shift by 16 before the
; conversion; the VI and GFX11 checks expect v_perm_b32 with selector
; 0xc0c0001. The two GFX11 runs differ only in the conversion operand (v0.l
; with +real-true16, v0 with -real-true16), hence the separate prefixes.
; NOTE(review): the name suggests a regression test for a truncate that went
; missing when the i16 bswap was promoted - confirm against commit history.
474define float @missing_truncate_promote_bswap(i32 %arg) {
475; SI-LABEL: missing_truncate_promote_bswap:
476; SI:       ; %bb.0: ; %bb
477; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
479; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
480; SI-NEXT:    s_mov_b32 s4, 0xff00ff
481; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
482; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
483; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
484; SI-NEXT:    s_setpc_b64 s[30:31]
485;
486; VI-LABEL: missing_truncate_promote_bswap:
487; VI:       ; %bb.0: ; %bb
488; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
490; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
491; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
492; VI-NEXT:    s_setpc_b64 s[30:31]
493;
494; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
495; GFX11-REAL16:       ; %bb.0: ; %bb
496; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497; GFX11-REAL16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
498; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
499; GFX11-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
500; GFX11-REAL16-NEXT:    s_setpc_b64 s[30:31]
501;
502; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
503; GFX11-FAKE16:       ; %bb.0: ; %bb
504; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505; GFX11-FAKE16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
506; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
507; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
508; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
509bb:
510  %tmp = trunc i32 %arg to i16
511  %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
512  %tmp2 = bitcast i16 %tmp1 to half
513  %tmp3 = fpext half %tmp2 to float
514  ret float %tmp3
515}
516
; Non-kernel function: returns llvm.bswap.i16 of its i16 argument. The SI
; checks expect a full 32-bit swap followed by a logical right shift by 16;
; the VI and GFX11 checks expect a single v_perm_b32 with selector 0xc0c0001.
517define i16 @v_bswap_i16(i16 %src) {
518; SI-LABEL: v_bswap_i16:
519; SI:       ; %bb.0:
520; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
522; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
523; SI-NEXT:    s_mov_b32 s4, 0xff00ff
524; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
525; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
526; SI-NEXT:    s_setpc_b64 s[30:31]
527;
528; VI-LABEL: v_bswap_i16:
529; VI:       ; %bb.0:
530; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
532; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
533; VI-NEXT:    s_setpc_b64 s[30:31]
534;
535; GFX11-LABEL: v_bswap_i16:
536; GFX11:       ; %bb.0:
537; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
539; GFX11-NEXT:    s_setpc_b64 s[30:31]
540  %bswap = call i16 @llvm.bswap.i16(i16 %src)
541  ret i16 %bswap
542}
543
; Byte-swaps an i16 argument and zero-extends the result to i32. The checks
; expect no separate zero-extension instruction: the sequence ends with the
; logical right shift by 16 on SI and with the v_perm_b32 (selector
; 0xc0c0001) on VI and GFX11.
544define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
545; SI-LABEL: v_bswap_i16_zext_to_i32:
546; SI:       ; %bb.0:
547; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
549; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
550; SI-NEXT:    s_mov_b32 s4, 0xff00ff
551; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
552; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
553; SI-NEXT:    s_setpc_b64 s[30:31]
554;
555; VI-LABEL: v_bswap_i16_zext_to_i32:
556; VI:       ; %bb.0:
557; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
558; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
559; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
560; VI-NEXT:    s_setpc_b64 s[30:31]
561;
562; GFX11-LABEL: v_bswap_i16_zext_to_i32:
563; GFX11:       ; %bb.0:
564; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
566; GFX11-NEXT:    s_setpc_b64 s[30:31]
567  %bswap = call i16 @llvm.bswap.i16(i16 %src)
568  %zext = zext i16 %bswap to i32
569  ret i32 %zext
570}
571
; Byte-swaps an i16 argument and sign-extends the result to i32. The SI
; checks expect the final shift to be arithmetic (v_ashrrev_i32 by 16); the
; VI and GFX11 checks expect the v_perm_b32 (selector 0xc0c0001) to be
; followed by v_bfe_i32 extracting 16 bits from bit 0.
572define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
573; SI-LABEL: v_bswap_i16_sext_to_i32:
574; SI:       ; %bb.0:
575; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
577; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
578; SI-NEXT:    s_mov_b32 s4, 0xff00ff
579; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
580; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
581; SI-NEXT:    s_setpc_b64 s[30:31]
582;
583; VI-LABEL: v_bswap_i16_sext_to_i32:
584; VI:       ; %bb.0:
585; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
587; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
588; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
589; VI-NEXT:    s_setpc_b64 s[30:31]
590;
591; GFX11-LABEL: v_bswap_i16_sext_to_i32:
592; GFX11:       ; %bb.0:
593; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
595; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
596; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
597; GFX11-NEXT:    s_setpc_b64 s[30:31]
598  %bswap = call i16 @llvm.bswap.i16(i16 %src)
599  %sext = sext i16 %bswap to i32
600  ret i32 %sext
601}
602
; Byte-swaps each element of a <2 x i16> argument. The VI and GFX11 checks
; expect a single v_perm_b32 with selector 0x2030001 on one register; the SI
; checks expect each element in its own register to get a 32-bit swap, with
; the results then recombined using a shift and v_alignbit_b32.
603define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
604; SI-LABEL: v_bswap_v2i16:
605; SI:       ; %bb.0:
606; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
607; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
608; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
609; SI-NEXT:    s_mov_b32 s4, 0xff00ff
610; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
611; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
612; SI-NEXT:    v_bfi_b32 v0, s4, v0, v2
613; SI-NEXT:    v_bfi_b32 v1, s4, v1, v3
614; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
615; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
616; SI-NEXT:    s_setpc_b64 s[30:31]
617;
618; VI-LABEL: v_bswap_v2i16:
619; VI:       ; %bb.0:
620; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621; VI-NEXT:    s_mov_b32 s4, 0x2030001
622; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
623; VI-NEXT:    s_setpc_b64 s[30:31]
624;
625; GFX11-LABEL: v_bswap_v2i16:
626; GFX11:       ; %bb.0:
627; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
629; GFX11-NEXT:    s_setpc_b64 s[30:31]
630  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
631  ret <2 x i16> %bswap
632}
633
; Byte-swaps each element of a <3 x i16> argument. The VI and GFX11 checks
; expect two v_perm_b32 instructions with selector 0x2030001 (on v0 and v1);
; the SI checks expect three 32-bit swaps (on v0, v1 and v2) followed by
; shift and v_alignbit_b32 instructions that recombine the results.
634define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
635; SI-LABEL: v_bswap_v3i16:
636; SI:       ; %bb.0:
637; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638; SI-NEXT:    v_alignbit_b32 v3, v0, v0, 8
639; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
640; SI-NEXT:    s_mov_b32 s4, 0xff00ff
641; SI-NEXT:    v_alignbit_b32 v4, v1, v1, 8
642; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
643; SI-NEXT:    v_alignbit_b32 v5, v2, v2, 8
644; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
645; SI-NEXT:    v_bfi_b32 v0, s4, v0, v3
646; SI-NEXT:    v_bfi_b32 v1, s4, v1, v4
647; SI-NEXT:    v_bfi_b32 v2, s4, v2, v5
648; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
649; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
650; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
651; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
652; SI-NEXT:    s_setpc_b64 s[30:31]
653;
654; VI-LABEL: v_bswap_v3i16:
655; VI:       ; %bb.0:
656; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657; VI-NEXT:    s_mov_b32 s4, 0x2030001
658; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
659; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
660; VI-NEXT:    s_setpc_b64 s[30:31]
661;
662; GFX11-LABEL: v_bswap_v3i16:
663; GFX11:       ; %bb.0:
664; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
666; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
667; GFX11-NEXT:    s_setpc_b64 s[30:31]
668  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
669  ret <3 x i16> %bswap
670}
671
; Byte-swaps each element of a <4 x i16> argument. The VI and GFX11 checks
; expect two v_perm_b32 instructions with selector 0x2030001 (on v0 and v1);
; the SI checks expect four 32-bit swaps (on v0 through v3) followed by shift
; and v_alignbit_b32 instructions that recombine the results.
672define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
673; SI-LABEL: v_bswap_v4i16:
674; SI:       ; %bb.0:
675; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
677; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
678; SI-NEXT:    s_mov_b32 s4, 0xff00ff
679; SI-NEXT:    v_alignbit_b32 v5, v3, v3, 8
680; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
681; SI-NEXT:    v_alignbit_b32 v6, v0, v0, 8
682; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
683; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
684; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
685; SI-NEXT:    v_bfi_b32 v2, s4, v2, v4
686; SI-NEXT:    v_bfi_b32 v3, s4, v3, v5
687; SI-NEXT:    v_bfi_b32 v0, s4, v0, v6
688; SI-NEXT:    v_bfi_b32 v1, s4, v1, v7
689; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
690; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
691; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
692; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
693; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
694; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
695; SI-NEXT:    s_setpc_b64 s[30:31]
696;
697; VI-LABEL: v_bswap_v4i16:
698; VI:       ; %bb.0:
699; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
700; VI-NEXT:    s_mov_b32 s4, 0x2030001
701; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
702; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
703; VI-NEXT:    s_setpc_b64 s[30:31]
704;
705; GFX11-LABEL: v_bswap_v4i16:
706; GFX11:       ; %bb.0:
707; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
709; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
710; GFX11-NEXT:    s_setpc_b64 s[30:31]
711  %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src)
712  ret <4 x i16> %bswap
713}
714
; Truncates the i64 argument to i48, byte-swaps it with llvm.bswap.i48 and
; zero-extends the result back to i64. The checks expect both 32-bit input
; registers to be fully byte-swapped (v_perm_b32 with selector 0x10203 on VI
; and GFX11, alignbit/alignbit/bfi on SI), then a v_alignbit_b32 by 16 to
; form the low result register and a logical right shift by 16 to form the
; high result register.
715define i64 @v_bswap_i48(i64 %src) {
716; SI-LABEL: v_bswap_i48:
717; SI:       ; %bb.0:
718; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
720; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
721; SI-NEXT:    s_mov_b32 s4, 0xff00ff
722; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
723; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
724; SI-NEXT:    v_bfi_b32 v2, s4, v0, v2
725; SI-NEXT:    v_bfi_b32 v0, s4, v1, v3
726; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
727; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
728; SI-NEXT:    s_setpc_b64 s[30:31]
729;
730; VI-LABEL: v_bswap_i48:
731; VI:       ; %bb.0:
732; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733; VI-NEXT:    s_mov_b32 s4, 0x10203
734; VI-NEXT:    v_perm_b32 v2, 0, v0, s4
735; VI-NEXT:    v_perm_b32 v0, 0, v1, s4
736; VI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
737; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
738; VI-NEXT:    s_setpc_b64 s[30:31]
739;
740; GFX11-LABEL: v_bswap_i48:
741; GFX11:       ; %bb.0:
742; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743; GFX11-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
744; GFX11-NEXT:    v_perm_b32 v0, 0, v1, 0x10203
745; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
746; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
747; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
748; GFX11-NEXT:    s_setpc_b64 s[30:31]
749  %trunc = trunc i64 %src to i48
750  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
751  %zext = zext i48 %bswap to i64
752  ret i64 %zext
753}
754