xref: /llvm-project/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
5
6
; Scalar pack of two loaded values: both i32 inputs are read with volatile
; loads through addrspace(4) pointers, truncated to i16, bitcast to half and
; inserted into a <2 x half> (%lo in element 0, %hi in element 1). The vector
; is bitcast to i32 and consumed by inline asm through an "s" constraint.
; Expected code: a single s_pack_ll_b32_b16 on gfx900, and an
; s_and_b32 / s_lshl_b32 / s_or_b32 sequence on fiji and kaveri.
7define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
8; GFX9-LABEL: s_pack_v2f16:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
13; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
16; GFX9-NEXT:    ;;#ASMSTART
17; GFX9-NEXT:    ; use s0
18; GFX9-NEXT:    ;;#ASMEND
19; GFX9-NEXT:    s_endpgm
20;
21; GFX8-LABEL: s_pack_v2f16:
22; GFX8:       ; %bb.0:
23; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
24; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
26; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
27; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
29; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
30; GFX8-NEXT:    s_or_b32 s0, s0, s1
31; GFX8-NEXT:    ;;#ASMSTART
32; GFX8-NEXT:    ; use s0
33; GFX8-NEXT:    ;;#ASMEND
34; GFX8-NEXT:    s_endpgm
35;
36; GFX7-LABEL: s_pack_v2f16:
37; GFX7:       ; %bb.0:
38; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
39; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
41; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
42; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
44; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
45; GFX7-NEXT:    s_or_b32 s0, s0, s1
46; GFX7-NEXT:    ;;#ASMSTART
47; GFX7-NEXT:    ; use s0
48; GFX7-NEXT:    ;;#ASMEND
49; GFX7-NEXT:    s_endpgm
50  %val0 = load volatile i32, ptr addrspace(4) %in0
51  %val1 = load volatile i32, ptr addrspace(4) %in1
52  %lo.i = trunc i32 %val0 to i16
53  %hi.i = trunc i32 %val1 to i16
54  %lo = bitcast i16 %lo.i to half
55  %hi = bitcast i16 %hi.i to half
56  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
57  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
58  %vec.i32 = bitcast <2 x half> %vec.1 to i32
59
60  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
61  ret void
62}
63
; Scalar pack with a constant low element: element 0 is the literal half
; 0xH1234 and element 1 comes from an i32 loaded (non-volatile) through an
; addrspace(4) pointer, truncated to i16 and bitcast to half. The packed
; value is consumed by inline asm through an "s" constraint.
; Expected code: s_pack_ll_b32_b16 with 0x1234 as the first source on gfx900;
; s_lshl_b32 by 16 followed by s_or_b32 with 0x1234 on fiji and kaveri.
64define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 {
65; GFX9-LABEL: s_pack_v2f16_imm_lo:
66; GFX9:       ; %bb.0:
67; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
68; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
70; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1234, s0
72; GFX9-NEXT:    ;;#ASMSTART
73; GFX9-NEXT:    ; use s0
74; GFX9-NEXT:    ;;#ASMEND
75; GFX9-NEXT:    s_endpgm
76;
77; GFX8-LABEL: s_pack_v2f16_imm_lo:
78; GFX8:       ; %bb.0:
79; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
80; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
82; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
84; GFX8-NEXT:    s_or_b32 s0, s0, 0x1234
85; GFX8-NEXT:    ;;#ASMSTART
86; GFX8-NEXT:    ; use s0
87; GFX8-NEXT:    ;;#ASMEND
88; GFX8-NEXT:    s_endpgm
89;
90; GFX7-LABEL: s_pack_v2f16_imm_lo:
91; GFX7:       ; %bb.0:
92; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
93; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
95; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
97; GFX7-NEXT:    s_or_b32 s0, s0, 0x1234
98; GFX7-NEXT:    ;;#ASMSTART
99; GFX7-NEXT:    ; use s0
100; GFX7-NEXT:    ;;#ASMEND
101; GFX7-NEXT:    s_endpgm
102  %val1 = load i32, ptr addrspace(4) %in1
103  %hi.i = trunc i32 %val1 to i16
104  %hi = bitcast i16 %hi.i to half
105  %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
106  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
107  %vec.i32 = bitcast <2 x half> %vec.1 to i32
108
109  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
110  ret void
111}
112
; Scalar pack with a constant high element: element 0 comes from an i32
; loaded (non-volatile) through an addrspace(4) pointer, truncated to i16 and
; bitcast to half; element 1 is the literal half 0xH1234. The packed value is
; consumed by inline asm through an "s" constraint.
; Expected code: s_pack_ll_b32_b16 with 0x1234 as the second source on gfx900;
; s_and_b32 with 0xffff followed by s_or_b32 with 0x12340000 on fiji and
; kaveri.
113define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 {
114; GFX9-LABEL: s_pack_v2f16_imm_hi:
115; GFX9:       ; %bb.0:
116; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
117; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
119; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1234
121; GFX9-NEXT:    ;;#ASMSTART
122; GFX9-NEXT:    ; use s0
123; GFX9-NEXT:    ;;#ASMEND
124; GFX9-NEXT:    s_endpgm
125;
126; GFX8-LABEL: s_pack_v2f16_imm_hi:
127; GFX8:       ; %bb.0:
128; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
131; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
133; GFX8-NEXT:    s_or_b32 s0, s0, 0x12340000
134; GFX8-NEXT:    ;;#ASMSTART
135; GFX8-NEXT:    ; use s0
136; GFX8-NEXT:    ;;#ASMEND
137; GFX8-NEXT:    s_endpgm
138;
139; GFX7-LABEL: s_pack_v2f16_imm_hi:
140; GFX7:       ; %bb.0:
141; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
142; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
144; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
146; GFX7-NEXT:    s_or_b32 s0, s0, 0x12340000
147; GFX7-NEXT:    ;;#ASMSTART
148; GFX7-NEXT:    ; use s0
149; GFX7-NEXT:    ;;#ASMEND
150; GFX7-NEXT:    s_endpgm
151  %val0 = load i32, ptr addrspace(4) %in0
152  %lo.i = trunc i32 %val0 to i16
153  %lo = bitcast i16 %lo.i to half
154  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
155  %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
156  %vec.i32 = bitcast <2 x half> %vec.1 to i32
157
158  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
159  ret void
160}
161
; Vector pack of two loaded values: each input is indexed by the
; sign-extended workitem id (llvm.amdgcn.workitem.id.x) into an addrspace(1)
; i32 array and read with a volatile load. The values are truncated to i16,
; bitcast to half and inserted into a <2 x half> (%lo in element 0, %hi in
; element 1); the bitcast i32 is consumed by inline asm through a "v"
; constraint.
; Expected code: v_perm_b32 with the selector held in s0 on gfx900
; (0x5040100) and fiji (0x1000504, data operands in the opposite order);
; v_and_b32 / v_lshlrev_b32 / v_or_b32 on kaveri.
162define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
163; GFX9-LABEL: v_pack_v2f16:
164; GFX9:       ; %bb.0:
165; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
166; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
169; GFX9-NEXT:    s_waitcnt vmcnt(0)
170; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
171; GFX9-NEXT:    s_waitcnt vmcnt(0)
172; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
173; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
174; GFX9-NEXT:    ;;#ASMSTART
175; GFX9-NEXT:    ; use v0
176; GFX9-NEXT:    ;;#ASMEND
177; GFX9-NEXT:    s_endpgm
178;
179; GFX8-LABEL: v_pack_v2f16:
180; GFX8:       ; %bb.0:
181; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
182; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
183; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX8-NEXT:    v_mov_b32_e32 v1, s1
185; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
186; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
187; GFX8-NEXT:    v_mov_b32_e32 v3, s3
188; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
189; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
190; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
191; GFX8-NEXT:    s_waitcnt vmcnt(0)
192; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
193; GFX8-NEXT:    s_waitcnt vmcnt(0)
194; GFX8-NEXT:    s_mov_b32 s0, 0x1000504
195; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s0
196; GFX8-NEXT:    ;;#ASMSTART
197; GFX8-NEXT:    ; use v0
198; GFX8-NEXT:    ;;#ASMEND
199; GFX8-NEXT:    s_endpgm
200;
201; GFX7-LABEL: v_pack_v2f16:
202; GFX7:       ; %bb.0:
203; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
204; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
205; GFX7-NEXT:    s_mov_b32 s6, 0
206; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
207; GFX7-NEXT:    v_mov_b32_e32 v1, 0
208; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
210; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
211; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
212; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
213; GFX7-NEXT:    s_waitcnt vmcnt(0)
214; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
215; GFX7-NEXT:    s_waitcnt vmcnt(0)
216; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
217; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
218; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
219; GFX7-NEXT:    ;;#ASMSTART
220; GFX7-NEXT:    ; use v0
221; GFX7-NEXT:    ;;#ASMEND
222; GFX7-NEXT:    s_endpgm
223  %tid = call i32 @llvm.amdgcn.workitem.id.x()
224  %tid.ext = sext i32 %tid to i64
225  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
226  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
227  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
228  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
229  %lo.i = trunc i32 %val0 to i16
230  %hi.i = trunc i32 %val1 to i16
231  %lo = bitcast i16 %lo.i to half
232  %hi = bitcast i16 %hi.i to half
233  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
234  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
235  %vec.i32 = bitcast <2 x half> %vec.1 to i32
236  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
237  ret void
238}
239
; Same pack as the two-load vector case (per-workitem volatile loads from two
; addrspace(1) arrays, truncated to i16, bitcast to half and inserted as
; elements 0 and 1), but the packed i32 feeds ordinary IR users instead of
; inline asm: 9 is added to it and the sum is written with a volatile store
; to an undef addrspace(1) pointer.
; Expected code: the pack sequence (v_perm_b32 on gfx900 and fiji,
; v_and_b32 / v_lshlrev_b32 / v_or_b32 on kaveri) followed by an add of 9 and
; a buffer_store_dword.
240define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
241; GFX9-LABEL: v_pack_v2f16_user:
242; GFX9:       ; %bb.0:
243; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
244; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
247; GFX9-NEXT:    s_waitcnt vmcnt(0)
248; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
249; GFX9-NEXT:    s_waitcnt vmcnt(0)
250; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
251; GFX9-NEXT:    s_mov_b32 s3, 0xf000
252; GFX9-NEXT:    s_mov_b32 s2, -1
253; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
254; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
255; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
256; GFX9-NEXT:    s_waitcnt vmcnt(0)
257; GFX9-NEXT:    s_endpgm
258;
259; GFX8-LABEL: v_pack_v2f16_user:
260; GFX8:       ; %bb.0:
261; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
262; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
263; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX8-NEXT:    v_mov_b32_e32 v1, s1
265; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
266; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
267; GFX8-NEXT:    v_mov_b32_e32 v3, s3
268; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
269; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
270; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
271; GFX8-NEXT:    s_waitcnt vmcnt(0)
272; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
273; GFX8-NEXT:    s_waitcnt vmcnt(0)
274; GFX8-NEXT:    s_mov_b32 s0, 0x1000504
275; GFX8-NEXT:    s_mov_b32 s3, 0x1100f000
276; GFX8-NEXT:    s_mov_b32 s2, -1
277; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s0
278; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
279; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
280; GFX8-NEXT:    s_waitcnt vmcnt(0)
281; GFX8-NEXT:    s_endpgm
282;
283; GFX7-LABEL: v_pack_v2f16_user:
284; GFX7:       ; %bb.0:
285; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
286; GFX7-NEXT:    s_mov_b32 s6, 0
287; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
288; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
289; GFX7-NEXT:    v_mov_b32_e32 v1, 0
290; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
292; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
293; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
294; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
295; GFX7-NEXT:    s_waitcnt vmcnt(0)
296; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
297; GFX7-NEXT:    s_waitcnt vmcnt(0)
298; GFX7-NEXT:    s_mov_b32 s6, -1
299; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
300; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
301; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
302; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
303; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
304; GFX7-NEXT:    s_waitcnt vmcnt(0)
305; GFX7-NEXT:    s_endpgm
306  %tid = call i32 @llvm.amdgcn.workitem.id.x()
307  %tid.ext = sext i32 %tid to i64
308  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
309  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
310  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
311  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
312  %lo.i = trunc i32 %val0 to i16
313  %hi.i = trunc i32 %val1 to i16
314  %lo = bitcast i16 %lo.i to half
315  %hi = bitcast i16 %hi.i to half
316  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
317  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
318  %vec.i32 = bitcast <2 x half> %vec.1 to i32
319  %foo = add i32 %vec.i32, 9
320  store volatile i32 %foo, ptr addrspace(1) undef
321  ret void
322}
323
; Vector pack with a constant low element: element 0 is the literal half
; 0xH1234 and element 1 comes from a per-workitem volatile i32 load from an
; addrspace(1) array, truncated to i16 and bitcast to half. The packed value
; is consumed by inline asm through a "v" constraint.
; Expected code: on gfx900 the constant is materialized with s_movk_i32 and
; combined by v_perm_b32 (selector 0x5040100 in v1); on fiji and kaveri a
; v_lshlrev_b32 by 16 followed by v_or_b32 with 0x1234.
324define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
325; GFX9-LABEL: v_pack_v2f16_imm_lo:
326; GFX9:       ; %bb.0:
327; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
328; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
329; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
332; GFX9-NEXT:    s_waitcnt vmcnt(0)
333; GFX9-NEXT:    s_movk_i32 s0, 0x1234
334; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
335; GFX9-NEXT:    ;;#ASMSTART
336; GFX9-NEXT:    ; use v0
337; GFX9-NEXT:    ;;#ASMEND
338; GFX9-NEXT:    s_endpgm
339;
340; GFX8-LABEL: v_pack_v2f16_imm_lo:
341; GFX8:       ; %bb.0:
342; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
343; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
344; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX8-NEXT:    v_mov_b32_e32 v1, s1
346; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
347; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
348; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
349; GFX8-NEXT:    s_waitcnt vmcnt(0)
350; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
351; GFX8-NEXT:    v_or_b32_e32 v0, 0x1234, v0
352; GFX8-NEXT:    ;;#ASMSTART
353; GFX8-NEXT:    ; use v0
354; GFX8-NEXT:    ;;#ASMEND
355; GFX8-NEXT:    s_endpgm
356;
357; GFX7-LABEL: v_pack_v2f16_imm_lo:
358; GFX7:       ; %bb.0:
359; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
360; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
361; GFX7-NEXT:    s_mov_b32 s2, 0
362; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
363; GFX7-NEXT:    v_mov_b32_e32 v1, 0
364; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
366; GFX7-NEXT:    s_waitcnt vmcnt(0)
367; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
368; GFX7-NEXT:    v_or_b32_e32 v0, 0x1234, v0
369; GFX7-NEXT:    ;;#ASMSTART
370; GFX7-NEXT:    ; use v0
371; GFX7-NEXT:    ;;#ASMEND
372; GFX7-NEXT:    s_endpgm
373  %tid = call i32 @llvm.amdgcn.workitem.id.x()
374  %tid.ext = sext i32 %tid to i64
375  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
376  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
377  %hi.i = trunc i32 %val1 to i16
378  %hi = bitcast i16 %hi.i to half
379  %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
380  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
381  %vec.i32 = bitcast <2 x half> %vec.1 to i32
382  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
383  ret void
384}
385
; Vector pack whose low element is the half constant 4.0; element 1 comes
; from a per-workitem volatile i32 load from an addrspace(1) array, truncated
; to i16 and bitcast to half. The packed value is consumed by inline asm
; through a "v" constraint.
; Expected code: the constant shows up as the literal 0x4400 for every
; subtarget: moved into s0 with s_movk_i32 ahead of v_perm_b32 on gfx900, and
; as the literal operand of v_or_b32 (after a v_lshlrev_b32 by 16) on fiji
; and kaveri.
386define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
387; GFX9-LABEL: v_pack_v2f16_inline_imm_lo:
388; GFX9:       ; %bb.0:
389; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
390; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
391; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
394; GFX9-NEXT:    s_waitcnt vmcnt(0)
395; GFX9-NEXT:    s_movk_i32 s0, 0x4400
396; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
397; GFX9-NEXT:    ;;#ASMSTART
398; GFX9-NEXT:    ; use v0
399; GFX9-NEXT:    ;;#ASMEND
400; GFX9-NEXT:    s_endpgm
401;
402; GFX8-LABEL: v_pack_v2f16_inline_imm_lo:
403; GFX8:       ; %bb.0:
404; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
405; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
406; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
407; GFX8-NEXT:    v_mov_b32_e32 v1, s1
408; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
409; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
410; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
411; GFX8-NEXT:    s_waitcnt vmcnt(0)
412; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
413; GFX8-NEXT:    v_or_b32_e32 v0, 0x4400, v0
414; GFX8-NEXT:    ;;#ASMSTART
415; GFX8-NEXT:    ; use v0
416; GFX8-NEXT:    ;;#ASMEND
417; GFX8-NEXT:    s_endpgm
418;
419; GFX7-LABEL: v_pack_v2f16_inline_imm_lo:
420; GFX7:       ; %bb.0:
421; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
422; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
423; GFX7-NEXT:    s_mov_b32 s2, 0
424; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
425; GFX7-NEXT:    v_mov_b32_e32 v1, 0
426; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
428; GFX7-NEXT:    s_waitcnt vmcnt(0)
429; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
430; GFX7-NEXT:    v_or_b32_e32 v0, 0x4400, v0
431; GFX7-NEXT:    ;;#ASMSTART
432; GFX7-NEXT:    ; use v0
433; GFX7-NEXT:    ;;#ASMEND
434; GFX7-NEXT:    s_endpgm
435  %tid = call i32 @llvm.amdgcn.workitem.id.x()
436  %tid.ext = sext i32 %tid to i64
437  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
438  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
439  %hi.i = trunc i32 %val1 to i16
440  %hi = bitcast i16 %hi.i to half
441  %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
442  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
443  %vec.i32 = bitcast <2 x half> %vec.1 to i32
444  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
445  ret void
446}
447
; Vector pack with a constant high element: element 0 comes from a
; per-workitem volatile i32 load from an addrspace(1) array, truncated to i16
; and bitcast to half; element 1 is the literal half 0xH1234. The packed
; value is consumed by inline asm through a "v" constraint.
; Expected code: s_movk_i32 0x1234 feeding v_perm_b32 on gfx900; on fiji a
; v_mov_b32 of 0x12340000 combined by v_or_b32_sdwa with src0_sel:WORD_0; on
; kaveri v_and_b32 with 0xffff followed by v_or_b32 with 0x12340000.
448define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
449; GFX9-LABEL: v_pack_v2f16_imm_hi:
450; GFX9:       ; %bb.0:
451; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
452; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
453; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
454; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
456; GFX9-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-NEXT:    s_movk_i32 s0, 0x1234
458; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
459; GFX9-NEXT:    ;;#ASMSTART
460; GFX9-NEXT:    ; use v0
461; GFX9-NEXT:    ;;#ASMEND
462; GFX9-NEXT:    s_endpgm
463;
464; GFX8-LABEL: v_pack_v2f16_imm_hi:
465; GFX8:       ; %bb.0:
466; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
467; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
468; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX8-NEXT:    v_mov_b32_e32 v1, s1
470; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
471; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
472; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
473; GFX8-NEXT:    s_waitcnt vmcnt(0)
474; GFX8-NEXT:    v_mov_b32_e32 v1, 0x12340000
475; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
476; GFX8-NEXT:    ;;#ASMSTART
477; GFX8-NEXT:    ; use v0
478; GFX8-NEXT:    ;;#ASMEND
479; GFX8-NEXT:    s_endpgm
480;
481; GFX7-LABEL: v_pack_v2f16_imm_hi:
482; GFX7:       ; %bb.0:
483; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
484; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
485; GFX7-NEXT:    s_mov_b32 s2, 0
486; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
487; GFX7-NEXT:    v_mov_b32_e32 v1, 0
488; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
490; GFX7-NEXT:    s_waitcnt vmcnt(0)
491; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
492; GFX7-NEXT:    v_or_b32_e32 v0, 0x12340000, v0
493; GFX7-NEXT:    ;;#ASMSTART
494; GFX7-NEXT:    ; use v0
495; GFX7-NEXT:    ;;#ASMEND
496; GFX7-NEXT:    s_endpgm
497  %tid = call i32 @llvm.amdgcn.workitem.id.x()
498  %tid.ext = sext i32 %tid to i64
499  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
500  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
501  %lo.i = trunc i32 %val0 to i16
502  %lo = bitcast i16 %lo.i to half
503  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
504  %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
505  %vec.i32 = bitcast <2 x half> %vec.1 to i32
506  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
507  ret void
508}
509
; Vector pack whose high element is the half constant 1.0; element 0 comes
; from a per-workitem volatile i32 load from an addrspace(1) array, truncated
; to i16 and bitcast to half. The packed value is consumed by inline asm
; through a "v" constraint.
; Expected code: s_movk_i32 0x3c00 feeding v_perm_b32 on gfx900; on fiji the
; high-half constant is produced by "v_bfrev_b32 v1, 60" and merged by
; v_or_b32_sdwa with src0_sel:WORD_0; on kaveri v_and_b32 with 0xffff
; followed by v_or_b32 with 0x3c000000.
510define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 {
511; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi:
512; GFX9:       ; %bb.0:
513; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
514; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
515; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
518; GFX9-NEXT:    s_waitcnt vmcnt(0)
519; GFX9-NEXT:    s_movk_i32 s0, 0x3c00
520; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
521; GFX9-NEXT:    ;;#ASMSTART
522; GFX9-NEXT:    ; use v0
523; GFX9-NEXT:    ;;#ASMEND
524; GFX9-NEXT:    s_endpgm
525;
526; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi:
527; GFX8:       ; %bb.0:
528; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
529; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
530; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX8-NEXT:    v_mov_b32_e32 v1, s1
532; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
533; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
534; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
535; GFX8-NEXT:    s_waitcnt vmcnt(0)
536; GFX8-NEXT:    v_bfrev_b32_e32 v1, 60
537; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
538; GFX8-NEXT:    ;;#ASMSTART
539; GFX8-NEXT:    ; use v0
540; GFX8-NEXT:    ;;#ASMEND
541; GFX8-NEXT:    s_endpgm
542;
543; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi:
544; GFX7:       ; %bb.0:
545; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
546; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
547; GFX7-NEXT:    s_mov_b32 s2, 0
548; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
549; GFX7-NEXT:    v_mov_b32_e32 v1, 0
550; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
552; GFX7-NEXT:    s_waitcnt vmcnt(0)
553; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
554; GFX7-NEXT:    v_or_b32_e32 v0, 0x3c000000, v0
555; GFX7-NEXT:    ;;#ASMSTART
556; GFX7-NEXT:    ; use v0
557; GFX7-NEXT:    ;;#ASMEND
558; GFX7-NEXT:    s_endpgm
559  %tid = call i32 @llvm.amdgcn.workitem.id.x()
560  %tid.ext = sext i32 %tid to i64
561  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
562  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
563  %lo.i = trunc i32 %val0 to i16
564  %lo = bitcast i16 %lo.i to half
565  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
566  %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1
567  %vec.i32 = bitcast <2 x half> %vec.1 to i32
568  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
569  ret void
570}
571
; Vector pack whose high element is the half bit pattern 0xH0040 (integer
; value 64); element 0 comes from a per-workitem volatile i32 load from an
; addrspace(1) array, truncated to i16 and bitcast to half. The packed value
; is consumed by inline asm through a "v" constraint.
; Expected code: on gfx900 the constant appears directly as the operand 64 of
; v_perm_b32, with no separate move; on fiji a v_mov_b32 of 0x400000 merged
; by v_or_b32_sdwa with src0_sel:WORD_0; on kaveri v_and_b32 with 0xffff
; followed by v_or_b32 with 0x400000.
572define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
573; GFX9-LABEL: v_pack_v2f16_inline_imm_hi:
574; GFX9:       ; %bb.0:
575; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
576; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
577; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
578; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
580; GFX9-NEXT:    s_waitcnt vmcnt(0)
581; GFX9-NEXT:    v_perm_b32 v0, 64, v0, v1
582; GFX9-NEXT:    ;;#ASMSTART
583; GFX9-NEXT:    ; use v0
584; GFX9-NEXT:    ;;#ASMEND
585; GFX9-NEXT:    s_endpgm
586;
587; GFX8-LABEL: v_pack_v2f16_inline_imm_hi:
588; GFX8:       ; %bb.0:
589; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
590; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
591; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX8-NEXT:    v_mov_b32_e32 v1, s1
593; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
594; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
595; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
596; GFX8-NEXT:    s_waitcnt vmcnt(0)
597; GFX8-NEXT:    v_mov_b32_e32 v1, 0x400000
598; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
599; GFX8-NEXT:    ;;#ASMSTART
600; GFX8-NEXT:    ; use v0
601; GFX8-NEXT:    ;;#ASMEND
602; GFX8-NEXT:    s_endpgm
603;
604; GFX7-LABEL: v_pack_v2f16_inline_imm_hi:
605; GFX7:       ; %bb.0:
606; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
607; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
608; GFX7-NEXT:    s_mov_b32 s2, 0
609; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
610; GFX7-NEXT:    v_mov_b32_e32 v1, 0
611; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
613; GFX7-NEXT:    s_waitcnt vmcnt(0)
614; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
615; GFX7-NEXT:    v_or_b32_e32 v0, 0x400000, v0
616; GFX7-NEXT:    ;;#ASMSTART
617; GFX7-NEXT:    ; use v0
618; GFX7-NEXT:    ;;#ASMEND
619; GFX7-NEXT:    s_endpgm
620  %tid = call i32 @llvm.amdgcn.workitem.id.x()
621  %tid.ext = sext i32 %tid to i64
622  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
623  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
624  %lo.i = trunc i32 %val0 to i16
625  %lo = bitcast i16 %lo.i to half
626  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
627  %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1
628  %vec.i32 = bitcast <2 x half> %vec.1 to i32
629  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
630  ret void
631}
632
; Intrinsic used by the v_* kernels to obtain the per-workitem index that
; selects the array element to load.
633declare i32 @llvm.amdgcn.workitem.id.x() #1
634
; Attribute group #0 is attached to every kernel definition and inline asm
; call in this file; group #1 is attached to the intrinsic declaration.
635attributes #0 = { nounwind }
636attributes #1 = { nounwind readnone }
637