xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll (revision 8f6a1a07cb85980013c70d5af6d28f5fcf75e732)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck -check-prefix=GFX7 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10 %s
7
; Byte-swap of an i32 that arrives in s0 (inreg argument, amdgpu_ps function).
; In every configuration the expected code does the swap with VALU
; instructions and copies the result back to s0 with v_readfirstlane_b32.
; The GFX7 checks expect two v_alignbit_b32 (by 8 and 24) combined by
; v_bfi_b32 with mask 0xff00ff. The GFX8 and GFX9 checks expect v_perm_b32
; with selector 0x10203 held in an SGPR, and the GFX10 checks expect the same
; selector as a literal operand.
8define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
9; GFX7-LABEL: s_bswap_i32:
10; GFX7:       ; %bb.0:
11; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
12; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 24
13; GFX7-NEXT:    s_mov_b32 s0, 0xff00ff
14; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
15; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
16; GFX7-NEXT:    ; return to shader part epilog
17;
18; GFX8-LABEL: s_bswap_i32:
19; GFX8:       ; %bb.0:
20; GFX8-NEXT:    v_mov_b32_e32 v0, s0
21; GFX8-NEXT:    s_mov_b32 s0, 0x10203
22; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
23; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
24; GFX8-NEXT:    ; return to shader part epilog
25;
26; GFX9-LABEL: s_bswap_i32:
27; GFX9:       ; %bb.0:
28; GFX9-NEXT:    v_mov_b32_e32 v0, s0
29; GFX9-NEXT:    s_mov_b32 s0, 0x10203
30; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
31; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
32; GFX9-NEXT:    ; return to shader part epilog
33;
34; GFX10-LABEL: s_bswap_i32:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
37; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
38; GFX10-NEXT:    ; return to shader part epilog
39  %bswap = call i32 @llvm.bswap.i32(i32 %src)
40  ret i32 %bswap
41}
42
; Byte-swap of an i32 that arrives in v0 (default calling convention).
; The result is produced in v0 and the function returns through
; s_setpc_b64 s[30:31]. The GFX7 checks expect two v_alignbit_b32 plus a
; v_bfi_b32 with mask 0xff00ff in s4. The GFX8 and GFX9 checks expect a single
; v_perm_b32 with selector 0x10203 in s4, and the GFX10 checks expect the
; selector as a literal operand.
43define i32 @v_bswap_i32(i32 %src) {
44; GFX7-LABEL: v_bswap_i32:
45; GFX7:       ; %bb.0:
46; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
48; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
49; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
50; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
51; GFX7-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX8-LABEL: v_bswap_i32:
54; GFX8:       ; %bb.0:
55; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX8-NEXT:    s_mov_b32 s4, 0x10203
57; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
58; GFX8-NEXT:    s_setpc_b64 s[30:31]
59;
60; GFX9-LABEL: v_bswap_i32:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX9-NEXT:    s_mov_b32 s4, 0x10203
64; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
65; GFX9-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX10-LABEL: v_bswap_i32:
68; GFX10:       ; %bb.0:
69; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
71; GFX10-NEXT:    s_setpc_b64 s[30:31]
72  %bswap = call i32 @llvm.bswap.i32(i32 %src)
73  ret i32 %bswap
74}
75
; Byte-swap of a <2 x i32> whose elements arrive in s0 and s1 (inreg
; argument, amdgpu_ps function). The checks swap each element independently
; with the same per-dword sequence and return the results in s0 and s1 via
; v_readfirstlane_b32. In the GFX7, GFX8 and GFX9 checks a single s_mov_b32
; of the mask (0xff00ff) or selector (0x10203) is shared by both elements.
; The GFX10 checks use the selector as a literal in each v_perm_b32.
76define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
77; GFX7-LABEL: s_bswap_v2i32:
78; GFX7:       ; %bb.0:
79; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
80; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 24
81; GFX7-NEXT:    s_mov_b32 s0, 0xff00ff
82; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
83; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 8
84; GFX7-NEXT:    v_alignbit_b32 v2, s1, s1, 24
85; GFX7-NEXT:    v_bfi_b32 v1, s0, v2, v1
86; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
87; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
88; GFX7-NEXT:    ; return to shader part epilog
89;
90; GFX8-LABEL: s_bswap_v2i32:
91; GFX8:       ; %bb.0:
92; GFX8-NEXT:    v_mov_b32_e32 v0, s0
93; GFX8-NEXT:    s_mov_b32 s0, 0x10203
94; GFX8-NEXT:    v_mov_b32_e32 v1, s1
95; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
96; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s0
97; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
98; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
99; GFX8-NEXT:    ; return to shader part epilog
100;
101; GFX9-LABEL: s_bswap_v2i32:
102; GFX9:       ; %bb.0:
103; GFX9-NEXT:    v_mov_b32_e32 v0, s0
104; GFX9-NEXT:    s_mov_b32 s0, 0x10203
105; GFX9-NEXT:    v_mov_b32_e32 v1, s1
106; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
107; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s0
108; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
109; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
110; GFX9-NEXT:    ; return to shader part epilog
111;
112; GFX10-LABEL: s_bswap_v2i32:
113; GFX10:       ; %bb.0:
114; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
115; GFX10-NEXT:    v_perm_b32 v1, 0, s1, 0x10203
116; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
117; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
118; GFX10-NEXT:    ; return to shader part epilog
119  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
120  ret <2 x i32> %bswap
121}
122
; Byte-swap of a <2 x i32> whose elements arrive in v0 and v1 (default
; calling convention). Each element is swapped in place. The GFX7 checks
; expect the v_alignbit_b32 / v_bfi_b32 sequence per element with the mask
; 0xff00ff loaded once into s4. The GFX8 and GFX9 checks expect one
; v_perm_b32 per element sharing selector 0x10203 in s4, and the GFX10 checks
; use the selector as a literal.
123define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
124; GFX7-LABEL: v_bswap_v2i32:
125; GFX7:       ; %bb.0:
126; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127; GFX7-NEXT:    v_alignbit_b32 v2, v0, v0, 8
128; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
129; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
130; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v2
131; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
132; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
133; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
134; GFX7-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX8-LABEL: v_bswap_v2i32:
137; GFX8:       ; %bb.0:
138; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX8-NEXT:    s_mov_b32 s4, 0x10203
140; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
141; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
142; GFX8-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX9-LABEL: v_bswap_v2i32:
145; GFX9:       ; %bb.0:
146; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX9-NEXT:    s_mov_b32 s4, 0x10203
148; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
149; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
150; GFX9-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX10-LABEL: v_bswap_v2i32:
153; GFX10:       ; %bb.0:
154; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
156; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
157; GFX10-NEXT:    s_setpc_b64 s[30:31]
158  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
159  ret <2 x i32> %bswap
160}
161
; Byte-swap of an i64 that arrives in s0 and s1 (inreg argument, amdgpu_ps
; function). The checks byte-swap each 32-bit half separately and exchange
; the halves: the swapped s1 is returned in s0 and the swapped s0 in s1,
; both through v_readfirstlane_b32. The per-half sequence is v_alignbit_b32 /
; v_bfi_b32 with mask 0xff00ff in the GFX7 checks and v_perm_b32 with
; selector 0x10203 in the others (SGPR operand for GFX8 and GFX9, literal for
; GFX10).
162define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
163; GFX7-LABEL: s_bswap_i64:
164; GFX7:       ; %bb.0:
165; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
166; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 24
167; GFX7-NEXT:    s_mov_b32 s1, 0xff00ff
168; GFX7-NEXT:    v_bfi_b32 v0, s1, v1, v0
169; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 8
170; GFX7-NEXT:    v_alignbit_b32 v2, s0, s0, 24
171; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
172; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
173; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
174; GFX7-NEXT:    ; return to shader part epilog
175;
176; GFX8-LABEL: s_bswap_i64:
177; GFX8:       ; %bb.0:
178; GFX8-NEXT:    v_mov_b32_e32 v0, s1
179; GFX8-NEXT:    s_mov_b32 s1, 0x10203
180; GFX8-NEXT:    v_mov_b32_e32 v1, s0
181; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s1
182; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
183; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
184; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
185; GFX8-NEXT:    ; return to shader part epilog
186;
187; GFX9-LABEL: s_bswap_i64:
188; GFX9:       ; %bb.0:
189; GFX9-NEXT:    v_mov_b32_e32 v0, s1
190; GFX9-NEXT:    s_mov_b32 s1, 0x10203
191; GFX9-NEXT:    v_mov_b32_e32 v1, s0
192; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s1
193; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
194; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
195; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
196; GFX9-NEXT:    ; return to shader part epilog
197;
198; GFX10-LABEL: s_bswap_i64:
199; GFX10:       ; %bb.0:
200; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
201; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
202; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
203; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
204; GFX10-NEXT:    ; return to shader part epilog
205  %bswap = call i64 @llvm.bswap.i64(i64 %src)
206  ret i64 %bswap
207}
208
; Byte-swap of an i64 that arrives in v0 and v1 (default calling convention).
; The checks byte-swap each 32-bit half and exchange them: the swapped v1 is
; built in v2 and moved to v0 by a trailing v_mov_b32, while the swapped v0
; is written to v1. The per-half sequence is v_alignbit_b32 / v_bfi_b32 with
; mask 0xff00ff in the GFX7 checks and v_perm_b32 with selector 0x10203 in
; the others.
209define i64 @v_bswap_i64(i64 %src) {
210; GFX7-LABEL: v_bswap_i64:
211; GFX7:       ; %bb.0:
212; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
214; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
215; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
216; GFX7-NEXT:    v_bfi_b32 v2, s4, v1, v2
217; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
218; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
219; GFX7-NEXT:    v_bfi_b32 v1, s4, v0, v1
220; GFX7-NEXT:    v_mov_b32_e32 v0, v2
221; GFX7-NEXT:    s_setpc_b64 s[30:31]
222;
223; GFX8-LABEL: v_bswap_i64:
224; GFX8:       ; %bb.0:
225; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; GFX8-NEXT:    s_mov_b32 s4, 0x10203
227; GFX8-NEXT:    v_perm_b32 v2, 0, v1, s4
228; GFX8-NEXT:    v_perm_b32 v1, 0, v0, s4
229; GFX8-NEXT:    v_mov_b32_e32 v0, v2
230; GFX8-NEXT:    s_setpc_b64 s[30:31]
231;
232; GFX9-LABEL: v_bswap_i64:
233; GFX9:       ; %bb.0:
234; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; GFX9-NEXT:    s_mov_b32 s4, 0x10203
236; GFX9-NEXT:    v_perm_b32 v2, 0, v1, s4
237; GFX9-NEXT:    v_perm_b32 v1, 0, v0, s4
238; GFX9-NEXT:    v_mov_b32_e32 v0, v2
239; GFX9-NEXT:    s_setpc_b64 s[30:31]
240;
241; GFX10-LABEL: v_bswap_i64:
242; GFX10:       ; %bb.0:
243; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244; GFX10-NEXT:    v_perm_b32 v2, 0, v1, 0x10203
245; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
246; GFX10-NEXT:    v_mov_b32_e32 v0, v2
247; GFX10-NEXT:    s_setpc_b64 s[30:31]
248  %bswap = call i64 @llvm.bswap.i64(i64 %src)
249  ret i64 %bswap
250}
251
; Byte-swap of a <2 x i64> that arrives in s0..s3 (inreg argument, amdgpu_ps
; function). The checks byte-swap all four dwords and exchange them pairwise:
; swapped s1 -> s0, swapped s0 -> s1, swapped s3 -> s2, swapped s2 -> s3, each
; returned through v_readfirstlane_b32. The per-dword sequence is
; v_alignbit_b32 / v_bfi_b32 with mask 0xff00ff in the GFX7 checks and
; v_perm_b32 with selector 0x10203 in the others.
252define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
253; GFX7-LABEL: s_bswap_v2i64:
254; GFX7:       ; %bb.0:
255; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
256; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 24
257; GFX7-NEXT:    s_mov_b32 s1, 0xff00ff
258; GFX7-NEXT:    v_bfi_b32 v0, s1, v1, v0
259; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 8
260; GFX7-NEXT:    v_alignbit_b32 v2, s0, s0, 24
261; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
262; GFX7-NEXT:    v_alignbit_b32 v2, s3, s3, 8
263; GFX7-NEXT:    v_alignbit_b32 v3, s3, s3, 24
264; GFX7-NEXT:    v_bfi_b32 v2, s1, v3, v2
265; GFX7-NEXT:    v_alignbit_b32 v3, s2, s2, 8
266; GFX7-NEXT:    v_alignbit_b32 v4, s2, s2, 24
267; GFX7-NEXT:    v_bfi_b32 v3, s1, v4, v3
268; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
269; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
270; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
271; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
272; GFX7-NEXT:    ; return to shader part epilog
273;
274; GFX8-LABEL: s_bswap_v2i64:
275; GFX8:       ; %bb.0:
276; GFX8-NEXT:    v_mov_b32_e32 v0, s1
277; GFX8-NEXT:    s_mov_b32 s1, 0x10203
278; GFX8-NEXT:    v_mov_b32_e32 v1, s0
279; GFX8-NEXT:    v_mov_b32_e32 v2, s3
280; GFX8-NEXT:    v_mov_b32_e32 v3, s2
281; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s1
282; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
283; GFX8-NEXT:    v_perm_b32 v2, 0, v2, s1
284; GFX8-NEXT:    v_perm_b32 v3, 0, v3, s1
285; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
286; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
287; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
288; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
289; GFX8-NEXT:    ; return to shader part epilog
290;
291; GFX9-LABEL: s_bswap_v2i64:
292; GFX9:       ; %bb.0:
293; GFX9-NEXT:    v_mov_b32_e32 v0, s1
294; GFX9-NEXT:    s_mov_b32 s1, 0x10203
295; GFX9-NEXT:    v_mov_b32_e32 v1, s0
296; GFX9-NEXT:    v_mov_b32_e32 v2, s3
297; GFX9-NEXT:    v_mov_b32_e32 v3, s2
298; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s1
299; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
300; GFX9-NEXT:    v_perm_b32 v2, 0, v2, s1
301; GFX9-NEXT:    v_perm_b32 v3, 0, v3, s1
302; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
303; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
304; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
305; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
306; GFX9-NEXT:    ; return to shader part epilog
307;
308; GFX10-LABEL: s_bswap_v2i64:
309; GFX10:       ; %bb.0:
310; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
311; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
312; GFX10-NEXT:    v_perm_b32 v2, 0, s3, 0x10203
313; GFX10-NEXT:    v_perm_b32 v3, 0, s2, 0x10203
314; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
315; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
316; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
317; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
318; GFX10-NEXT:    ; return to shader part epilog
319  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
320  ret <2 x i64> %bswap
321}
322
; Byte-swap of a <2 x i64> that arrives in v0..v3 (default calling
; convention). The checks byte-swap all four dwords and exchange them
; pairwise within each element. The swapped v1 and v3 are built in the
; temporaries v4 and v5 and moved into v0 and v2 by trailing v_mov_b32
; instructions, while the swapped v0 and v2 are written to v1 and v3.
; The per-dword sequence is v_alignbit_b32 / v_bfi_b32 with mask 0xff00ff in
; the GFX7 checks and v_perm_b32 with selector 0x10203 in the others.
323define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
324; GFX7-LABEL: v_bswap_v2i64:
325; GFX7:       ; %bb.0:
326; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327; GFX7-NEXT:    v_alignbit_b32 v4, v1, v1, 8
328; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
329; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
330; GFX7-NEXT:    v_bfi_b32 v4, s4, v1, v4
331; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
332; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
333; GFX7-NEXT:    v_bfi_b32 v1, s4, v0, v1
334; GFX7-NEXT:    v_alignbit_b32 v0, v3, v3, 8
335; GFX7-NEXT:    v_alignbit_b32 v3, v3, v3, 24
336; GFX7-NEXT:    v_bfi_b32 v5, s4, v3, v0
337; GFX7-NEXT:    v_alignbit_b32 v0, v2, v2, 8
338; GFX7-NEXT:    v_alignbit_b32 v2, v2, v2, 24
339; GFX7-NEXT:    v_bfi_b32 v3, s4, v2, v0
340; GFX7-NEXT:    v_mov_b32_e32 v0, v4
341; GFX7-NEXT:    v_mov_b32_e32 v2, v5
342; GFX7-NEXT:    s_setpc_b64 s[30:31]
343;
344; GFX8-LABEL: v_bswap_v2i64:
345; GFX8:       ; %bb.0:
346; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347; GFX8-NEXT:    s_mov_b32 s4, 0x10203
348; GFX8-NEXT:    v_perm_b32 v4, 0, v1, s4
349; GFX8-NEXT:    v_perm_b32 v5, 0, v3, s4
350; GFX8-NEXT:    v_perm_b32 v1, 0, v0, s4
351; GFX8-NEXT:    v_perm_b32 v3, 0, v2, s4
352; GFX8-NEXT:    v_mov_b32_e32 v0, v4
353; GFX8-NEXT:    v_mov_b32_e32 v2, v5
354; GFX8-NEXT:    s_setpc_b64 s[30:31]
355;
356; GFX9-LABEL: v_bswap_v2i64:
357; GFX9:       ; %bb.0:
358; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359; GFX9-NEXT:    s_mov_b32 s4, 0x10203
360; GFX9-NEXT:    v_perm_b32 v4, 0, v1, s4
361; GFX9-NEXT:    v_perm_b32 v5, 0, v3, s4
362; GFX9-NEXT:    v_perm_b32 v1, 0, v0, s4
363; GFX9-NEXT:    v_perm_b32 v3, 0, v2, s4
364; GFX9-NEXT:    v_mov_b32_e32 v0, v4
365; GFX9-NEXT:    v_mov_b32_e32 v2, v5
366; GFX9-NEXT:    s_setpc_b64 s[30:31]
367;
368; GFX10-LABEL: v_bswap_v2i64:
369; GFX10:       ; %bb.0:
370; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371; GFX10-NEXT:    v_perm_b32 v4, 0, v1, 0x10203
372; GFX10-NEXT:    v_perm_b32 v5, 0, v3, 0x10203
373; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
374; GFX10-NEXT:    v_perm_b32 v3, 0, v2, 0x10203
375; GFX10-NEXT:    v_mov_b32_e32 v0, v4
376; GFX10-NEXT:    v_mov_b32_e32 v2, v5
377; GFX10-NEXT:    s_setpc_b64 s[30:31]
378  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
379  ret <2 x i64> %bswap
380}
381
; Byte-swap of an i16 that arrives in s0 (inreg argument, amdgpu_ps function).
; The GFX7 checks stay entirely on the scalar unit: s_lshl_b32 by 8,
; s_bfe_u32 with operand 0x80008, then s_or_b32, with no readfirstlane.
; The other configurations expect v_perm_b32 with selector 0xc0c0001 and a
; v_readfirstlane_b32 back into s0 (selector in an SGPR for GFX8 and GFX9,
; literal for GFX10).
382define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
383; GFX7-LABEL: s_bswap_i16:
384; GFX7:       ; %bb.0:
385; GFX7-NEXT:    s_lshl_b32 s1, s0, 8
386; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
387; GFX7-NEXT:    s_or_b32 s0, s0, s1
388; GFX7-NEXT:    ; return to shader part epilog
389;
390; GFX8-LABEL: s_bswap_i16:
391; GFX8:       ; %bb.0:
392; GFX8-NEXT:    v_mov_b32_e32 v0, s0
393; GFX8-NEXT:    s_mov_b32 s0, 0xc0c0001
394; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
395; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
396; GFX8-NEXT:    ; return to shader part epilog
397;
398; GFX9-LABEL: s_bswap_i16:
399; GFX9:       ; %bb.0:
400; GFX9-NEXT:    v_mov_b32_e32 v0, s0
401; GFX9-NEXT:    s_mov_b32 s0, 0xc0c0001
402; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
403; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
404; GFX9-NEXT:    ; return to shader part epilog
405;
406; GFX10-LABEL: s_bswap_i16:
407; GFX10:       ; %bb.0:
408; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0xc0c0001
409; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
410; GFX10-NEXT:    ; return to shader part epilog
411  %bswap = call i16 @llvm.bswap.i16(i16 %src)
412  ret i16 %bswap
413}
414
; Byte-swap of an i16 that arrives in v0 (default calling convention).
; The GFX7 checks expect v_lshlrev_b32 by 8, v_bfe_u32 v0, v0, 8, 8 and a
; v_or_b32 of the two. The other configurations expect a single v_perm_b32
; with selector 0xc0c0001 (in s4 for GFX8 and GFX9, literal for GFX10).
415define i16 @v_bswap_i16(i16 %src) {
416; GFX7-LABEL: v_bswap_i16:
417; GFX7:       ; %bb.0:
418; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
420; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
421; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
422; GFX7-NEXT:    s_setpc_b64 s[30:31]
423;
424; GFX8-LABEL: v_bswap_i16:
425; GFX8:       ; %bb.0:
426; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
428; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
429; GFX8-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX9-LABEL: v_bswap_i16:
432; GFX9:       ; %bb.0:
433; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
435; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
436; GFX9-NEXT:    s_setpc_b64 s[30:31]
437;
438; GFX10-LABEL: v_bswap_i16:
439; GFX10:       ; %bb.0:
440; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
442; GFX10-NEXT:    s_setpc_b64 s[30:31]
443  %bswap = call i16 @llvm.bswap.i16(i16 %src)
444  ret i16 %bswap
445}
446
; Byte-swap of a <2 x i16> inreg argument in an amdgpu_ps function, with the
; result bitcast to i32 for the return. In the GFX7 checks the two elements
; are read from s0 and s1, each is swapped with s_lshl_b32 / s_bfe_u32 /
; s_or_b32, then both are masked with 0xffff and packed into s0 with
; s_lshl_b32 by 16 and s_or_b32. The other configurations read a single
; register s0 and expect one v_perm_b32 with selector 0x2030001 followed by
; v_readfirstlane_b32.
447define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
448; GFX7-LABEL: s_bswap_v2i16:
449; GFX7:       ; %bb.0:
450; GFX7-NEXT:    s_lshl_b32 s2, s0, 8
451; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
452; GFX7-NEXT:    s_or_b32 s0, s0, s2
453; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
454; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80008
455; GFX7-NEXT:    s_or_b32 s1, s1, s2
456; GFX7-NEXT:    s_and_b32 s1, 0xffff, s1
457; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
458; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
459; GFX7-NEXT:    s_or_b32 s0, s0, s1
460; GFX7-NEXT:    ; return to shader part epilog
461;
462; GFX8-LABEL: s_bswap_v2i16:
463; GFX8:       ; %bb.0:
464; GFX8-NEXT:    v_mov_b32_e32 v0, s0
465; GFX8-NEXT:    s_mov_b32 s0, 0x2030001
466; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
467; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
468; GFX8-NEXT:    ; return to shader part epilog
469;
470; GFX9-LABEL: s_bswap_v2i16:
471; GFX9:       ; %bb.0:
472; GFX9-NEXT:    v_mov_b32_e32 v0, s0
473; GFX9-NEXT:    s_mov_b32 s0, 0x2030001
474; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
475; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
476; GFX9-NEXT:    ; return to shader part epilog
477;
478; GFX10-LABEL: s_bswap_v2i16:
479; GFX10:       ; %bb.0:
480; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x2030001
481; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
482; GFX10-NEXT:    ; return to shader part epilog
483  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
484  %cast = bitcast <2 x i16> %bswap to i32
485  ret i32 %cast
486}
487
; Byte-swap of an i16 in v0 followed by zero extension to i32.
; The GFX7 checks expect the v_lshlrev_b32 / v_bfe_u32 / v_or_b32 swap plus a
; v_and_b32 with 0xffff for the extension. The other configurations expect
; only the v_perm_b32 with selector 0xc0c0001; no separate instruction is
; emitted for the zext there.
488define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
489; GFX7-LABEL: v_bswap_i16_zext_to_i32:
490; GFX7:       ; %bb.0:
491; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
493; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
494; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
495; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
496; GFX7-NEXT:    s_setpc_b64 s[30:31]
497;
498; GFX8-LABEL: v_bswap_i16_zext_to_i32:
499; GFX8:       ; %bb.0:
500; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
502; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
503; GFX8-NEXT:    s_setpc_b64 s[30:31]
504;
505; GFX9-LABEL: v_bswap_i16_zext_to_i32:
506; GFX9:       ; %bb.0:
507; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
509; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
510; GFX9-NEXT:    s_setpc_b64 s[30:31]
511;
512; GFX10-LABEL: v_bswap_i16_zext_to_i32:
513; GFX10:       ; %bb.0:
514; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
516; GFX10-NEXT:    s_setpc_b64 s[30:31]
517  %bswap = call i16 @llvm.bswap.i16(i16 %src)
518  %zext = zext i16 %bswap to i32
519  ret i32 %zext
520}
521
; Byte-swap of an i16 in v0 followed by sign extension to i32.
; Every configuration expects the 16-bit swap (v_lshlrev_b32 / v_bfe_u32 /
; v_or_b32 in the GFX7 checks, v_perm_b32 with selector 0xc0c0001 in the
; others) followed by v_bfe_i32 v0, v0, 0, 16 for the extension.
522define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
523; GFX7-LABEL: v_bswap_i16_sext_to_i32:
524; GFX7:       ; %bb.0:
525; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
527; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
528; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
529; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
530; GFX7-NEXT:    s_setpc_b64 s[30:31]
531;
532; GFX8-LABEL: v_bswap_i16_sext_to_i32:
533; GFX8:       ; %bb.0:
534; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
536; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
537; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
538; GFX8-NEXT:    s_setpc_b64 s[30:31]
539;
540; GFX9-LABEL: v_bswap_i16_sext_to_i32:
541; GFX9:       ; %bb.0:
542; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
544; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
545; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
546; GFX9-NEXT:    s_setpc_b64 s[30:31]
547;
548; GFX10-LABEL: v_bswap_i16_sext_to_i32:
549; GFX10:       ; %bb.0:
550; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
552; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
553; GFX10-NEXT:    s_setpc_b64 s[30:31]
554  %bswap = call i16 @llvm.bswap.i16(i16 %src)
555  %sext = sext i16 %bswap to i32
556  ret i32 %sext
557}
558
; Byte-swap of a <2 x i16> argument (default calling convention).
; In the GFX7 checks the two elements are handled separately in v0 and v1,
; each with v_lshlrev_b32 / v_bfe_u32 / v_or_b32, and no repacking is
; expected. The other configurations operate on v0 only and expect one
; v_perm_b32 with selector 0x2030001.
559define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
560; GFX7-LABEL: v_bswap_v2i16:
561; GFX7:       ; %bb.0:
562; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
564; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
565; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
566; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
567; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
568; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
569; GFX7-NEXT:    s_setpc_b64 s[30:31]
570;
571; GFX8-LABEL: v_bswap_v2i16:
572; GFX8:       ; %bb.0:
573; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
575; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
576; GFX8-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX9-LABEL: v_bswap_v2i16:
579; GFX9:       ; %bb.0:
580; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
582; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
583; GFX9-NEXT:    s_setpc_b64 s[30:31]
584;
585; GFX10-LABEL: v_bswap_v2i16:
586; GFX10:       ; %bb.0:
587; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
589; GFX10-NEXT:    s_setpc_b64 s[30:31]
590  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
591  ret <2 x i16> %bswap
592}
593
; Byte-swap of a <3 x i16> argument (default calling convention).
; In the GFX7 checks the three elements are handled separately in v0, v1 and
; v2, each with v_lshlrev_b32 / v_bfe_u32 / v_or_b32. The other
; configurations expect two v_perm_b32 with selector 0x2030001, one on v0 and
; one on v1.
594define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
595; GFX7-LABEL: v_bswap_v3i16:
596; GFX7:       ; %bb.0:
597; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v0
599; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
600; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
601; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
602; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
603; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
604; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
605; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
606; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
607; GFX7-NEXT:    s_setpc_b64 s[30:31]
608;
609; GFX8-LABEL: v_bswap_v3i16:
610; GFX8:       ; %bb.0:
611; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
613; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
614; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
615; GFX8-NEXT:    s_setpc_b64 s[30:31]
616;
617; GFX9-LABEL: v_bswap_v3i16:
618; GFX9:       ; %bb.0:
619; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
621; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
622; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
623; GFX9-NEXT:    s_setpc_b64 s[30:31]
624;
625; GFX10-LABEL: v_bswap_v3i16:
626; GFX10:       ; %bb.0:
627; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
629; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
630; GFX10-NEXT:    s_setpc_b64 s[30:31]
631  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
632  ret <3 x i16> %bswap
633}
634
; Byte-swap of an i48: the i64 argument is truncated to i48, swapped with
; llvm.bswap.i48, and zero-extended back to i64 for the return.
; In every configuration the checks byte-swap both input dwords as for a
; 64-bit swap (swapped v1 into v1, swapped v0 into v2) and then shift the
; pair v[1:2] right by 16 into v[0:1]. The shift is v_lshr_b64 in the GFX7
; checks and v_lshrrev_b64 in the others.
635define i64 @v_bswap_i48(i64 %src) {
636; GFX7-LABEL: v_bswap_i48:
637; GFX7:       ; %bb.0:
638; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
640; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
641; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
642; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
643; GFX7-NEXT:    v_alignbit_b32 v2, v0, v0, 8
644; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
645; GFX7-NEXT:    v_bfi_b32 v2, s4, v0, v2
646; GFX7-NEXT:    v_lshr_b64 v[0:1], v[1:2], 16
647; GFX7-NEXT:    s_setpc_b64 s[30:31]
648;
649; GFX8-LABEL: v_bswap_i48:
650; GFX8:       ; %bb.0:
651; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX8-NEXT:    s_mov_b32 s4, 0x10203
653; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
654; GFX8-NEXT:    v_perm_b32 v2, 0, v0, s4
655; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
656; GFX8-NEXT:    s_setpc_b64 s[30:31]
657;
658; GFX9-LABEL: v_bswap_i48:
659; GFX9:       ; %bb.0:
660; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
661; GFX9-NEXT:    s_mov_b32 s4, 0x10203
662; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
663; GFX9-NEXT:    v_perm_b32 v2, 0, v0, s4
664; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
665; GFX9-NEXT:    s_setpc_b64 s[30:31]
666;
667; GFX10-LABEL: v_bswap_i48:
668; GFX10:       ; %bb.0:
669; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
671; GFX10-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
672; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
673; GFX10-NEXT:    s_setpc_b64 s[30:31]
674  %trunc = trunc i64 %src to i48
675  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
676  %zext = zext i48 %bswap to i64
677  ret i64 %zext
678}
679
; Declarations of the llvm.bswap intrinsic overloads called by the test
; functions in this file. All of them carry attribute group #1.
680declare i16 @llvm.bswap.i16(i16) #1
681declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1
682declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1
683declare i32 @llvm.bswap.i32(i32) #1
684declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #1
685declare i64 @llvm.bswap.i64(i64) #1
686declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #1
687declare i48 @llvm.bswap.i48(i48) #1
688
; NOTE(review): attribute group #0 is not referenced by any definition or
; declaration visible in this file - confirm whether it can be dropped.
689attributes #0 = { convergent nounwind readnone }
; Attribute group applied to every llvm.bswap declaration above.
690attributes #1 = { nounwind readnone speculatable willreturn }
691